国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

669 lines
19KB

  1. <?php if(!defined('DEDEINC')) exit('dedecms');
  2. /**
  3. * 采集小助手
  4. *
  5. * @version $Id: charset.helper.php 1 2010-07-05 11:43:09Z tianya $
  6. * @package DedeCMS.Helpers
  7. * @copyright Copyright (c) 2020, DedeBIZ.COM
  8. * @license https://www.dedebiz.com/license
  9. * @link https://www.dedebiz.com
  10. */
  11. require_once(DEDEINC."/dedehttpdown.class.php");
  12. require_once(DEDEINC."/dedetag.class.php");
  13. require_once(DEDEINC."/charset.func.php");
  /**
   * Download a remote image over plain HTTP and save it to a local file.
   *
   * A hand-built GET request is sent to port 80 of the host taken from $gurl.
   * 3xx answers are followed through their Location header (at most three
   * hops); any answer that is not 2xx makes the function fail.
   *
   * @access    public
   * @param     string  $gurl       URL of the image (only http:// is understood by GetHostInfo)
   * @param     string  $rfurl      URL sent as Referer; also used to fetch a cookie when $gcookie is empty
   * @param     string  $filename   Local path the response body is written to
   * @param     string  $gcookie    Complete "Cookie: ..." request header line (optional)
   * @param     int     $JumpCount  Number of redirects already followed
   * @param     int     $maxtime    Maximum number of seconds spent reading the body
   * @return    bool    TRUE when a non-empty body was saved, FALSE otherwise
   */
  function DownImageKeep($gurl, $rfurl, $filename, $gcookie="", $JumpCount=0, $maxtime=30)
  {
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      if($ghost=='')
      {
          return FALSE;
      }
      $gquery = $urlinfos['query'];
      if($gcookie=="" && !empty($rfurl))
      {
          $gcookie = RefurlCookie($rfurl);
      }

      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Referer: $rfurl\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      // The cookie is appended as a raw header line, so refuse anything that
      // could smuggle extra headers into the request.
      if($gcookie!="" && !preg_match("/[\r\n]/", $gcookie))
      {
          $sessionQuery .= $gcookie."\r\n";
      }
      $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";

      $errno = 0;
      $errstr = "";
      $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10);
      if(!$m_fp)
      {
          // Connection failed: nothing to read from.
          return FALSE;
      }
      fwrite($m_fp, $sessionQuery);

      // Status line: "HTTP/1.1 200 OK"
      $m_httphead = array();
      $httpstas = explode(" ", (string)fgets($m_fp, 256));
      $m_httphead["http-edition"] = trim($httpstas[0]);
      $m_httphead["http-state"] = isset($httpstas[1]) ? trim($httpstas[1]) : '';

      // Response headers, stored under lower-cased names. Reading stops at the
      // blank line that ends the header section or after 100 lines.
      $lnum = 0;
      while(!feof($m_fp))
      {
          $line = trim((string)fgets($m_fp, 4096));
          if($line == "" || $lnum > 100)
          {
              break;
          }
          $lnum++;
          // Everything before the first ':' is the name, the rest is the value.
          $parts = explode(':', $line, 2);
          $hkey = trim($parts[0]);
          if($hkey != "")
          {
              $m_httphead[strtolower($hkey)] = isset($parts[1]) ? trim($parts[1]) : '';
          }
      }

      // Redirect: request the target named by Location and hand its result back.
      if(preg_match("/^3/", $m_httphead["http-state"]))
      {
          fclose($m_fp);
          if(!empty($m_httphead["location"]) && $JumpCount < 3)
          {
              $nexturl = FillUrl($gurl, $m_httphead["location"]);
              return DownImageKeep($nexturl, $rfurl, $filename, $gcookie, $JumpCount + 1, $maxtime);
          }
          return FALSE;
      }
      if(!preg_match("/^2/", $m_httphead["http-state"]))
      {
          fclose($m_fp);
          return FALSE;
      }

      // -1 means the server announced no length: read until the peer closes
      // the connection or $maxtime is exceeded.
      // NOTE(review): a "Transfer-Encoding: chunked" body is not decoded here.
      $contentLength = isset($m_httphead['content-length']) ? intval($m_httphead['content-length']) : -1;

      // Save the file
      $fp = fopen($filename,"w") or die("写入文件:{$filename} 失败!");
      $okdata = "";
      $starttime = time();
      while(!feof($m_fp))
      {
          $want = 8192;
          if($contentLength >= 0)
          {
              $remain = $contentLength - strlen($okdata);
              // Announced size reached
              if($remain <= 0)
              {
                  break;
              }
              $want = min($want, $remain);
          }
          $chunk = fread($m_fp, $want);
          if($chunk === FALSE)
          {
              break;
          }
          $okdata .= $chunk;
          // Time budget exhausted
          if(time() - $starttime > $maxtime)
          {
              break;
          }
      }
      fclose($m_fp);

      if($okdata != "")
      {
          fwrite($fp, $okdata);
      }
      fclose($fp);
      if($okdata == "")
      {
          // Do not leave an empty file behind.
          @unlink($filename);
          return FALSE;
      }
      return TRUE;
  }
  /**
   * Fetch a page and turn the cookies it sets into a "Cookie:" request header line.
   *
   * The result is cached in the globals $gcookie / $lastRfurl, so asking again
   * for the same URL does not hit the network while a cookie is known.
   * Every "Set-Cookie" response header contributes its name=value pair;
   * attributes such as path or expires are dropped. A connection failure
   * yields an empty string instead of aborting the script.
   *
   * @access    public
   * @param     string  $gurl  URL of the page to request (http:// only, port 80)
   * @return    string  "Cookie: a=b; c=d" or '' when no cookie could be obtained
   */
  function RefurlCookie($gurl)
  {
      global $gcookie,$lastRfurl;
      $gurl = trim($gurl);
      if(!empty($gcookie) && $lastRfurl==$gurl)
      {
          return $gcookie;
      }
      $lastRfurl = $gurl;
      if($gurl=='')
      {
          return '';
      }

      // From here on the cached value belongs to another URL; clear it so a
      // failure below cannot hand out a stale cookie on the next call.
      $gcookie = "";

      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      if($ghost=='')
      {
          return '';
      }
      $gquery = $urlinfos['query'];

      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      $sessionQuery .= "Connection: Close\r\n\r\n";

      $errno = 0;
      $errstr = "";
      $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10);
      if(!$m_fp)
      {
          return '';
      }
      fwrite($m_fp, $sessionQuery);

      // Scan the response headers only: stop at the blank line that ends them
      // or after 100 lines.
      $pairs = array();
      $lnum = 0;
      while(!feof($m_fp))
      {
          $line = trim((string)fgets($m_fp, 4096));
          if($line == "" || $lnum > 100)
          {
              break;
          }
          $lnum++;
          // Keep only the name=value part that precedes the first ';'.
          if(preg_match("/^set-cookie:\s*([^;]+)/i", $line, $found))
          {
              $pair = trim($found[1]);
              if($pair != '')
              {
                  $pairs[] = $pair;
              }
          }
      }
      fclose($m_fp);

      if(count($pairs) > 0)
      {
          $gcookie = 'Cookie: '.implode('; ', $pairs);
      }
      return $gcookie;
  }
  /**
   * Split a URL into its host part and its request target.
   *
   * A leading "http://" is removed; the text up to the first '/' becomes
   * 'host' and everything from that '/' onwards (path and query string)
   * becomes 'query'. A URL without any path gets the query "/".
   *
   * @access    public
   * @param     string  $gurl  URL to split
   * @return    array   array('host' => ..., 'query' => ...)
   */
  function GetHostInfo($gurl)
  {
      $gurl = preg_replace("/^http:\/\//i", "", trim($gurl));
      $slash = strpos($gurl, '/');
      if($slash === FALSE)
      {
          // No path at all: request the site root rather than "/<host>".
          return array('host' => $gurl, 'query' => '/');
      }
      return array(
          'host'  => (string)substr($gurl, 0, $slash),
          'query' => (string)substr($gurl, $slash),
      );
  }
  /**
   * Convert the <img> tags found in an HTML fragment into DEDE album markup.
   *
   * The result starts with a {dede:pagestyle} tag and a {dede:comments} note,
   * followed by one {dede:img} tag per image source found in $body.
   * When the globals $cfg_album_width / $cfg_ddimg_width are empty they are
   * set to 800 and 150.
   *
   * @access    public
   * @param     string  $body  HTML content (taken by reference, not modified)
   * @return    string  DEDE album markup
   */
  function TurnImageTag(&$body)
  {
      global $cfg_album_width,$cfg_ddimg_width;
      if(empty($cfg_album_width))
      {
          $cfg_album_width = 800;
      }
      if(empty($cfg_ddimg_width))
      {
          $cfg_ddimg_width = 150;
      }

      // No thumbnail name is known here; the ddimg attribute is emitted empty.
      $litpicname = '';
      $ttx = '';

      // Group 2 holds a quoted src value, group 3 an unquoted one.
      $patten = "/<\\s*img\\s.*?src\\s*=\\s*([\"\\'])?(?(1)(.*?)\\1|([^\\s\\>\"\\']+))/isx";
      preg_match_all($patten,$body,$images);
      $quoted = isset($images[2]) ? $images[2] : array();
      $unquoted = isset($images[3]) ? $images[3] : array();
      foreach($quoted as $key => $value)
      {
          // Compare against '' so that a source of "0" is not mistaken for "missing".
          if($value === '' && isset($unquoted[$key]))
          {
              $value = $unquoted[$key];
          }
          $ttx .= "{dede:img ddimg='$litpicname' text='图 ".($key+1)."'}".$value."{/dede:img}"."\r\n";
      }
      $ttx = "\r\n{dede:pagestyle maxwidth='{$cfg_album_width}' ddmaxwidth='{$cfg_ddimg_width}' row='3' col='3' value='2'/}\r\n{dede:comments}图集类型会采集时生成此配置是正常的,不过如果后面没有跟着img标记则表示规则无效{/dede:comments}\r\n".$ttx;
      return $ttx;
  }
  /**
   * Convert the <a href="..."> links of an HTML fragment into {dede:link} tags.
   *
   * The link text becomes the "text" attribute (single quotes replaced by
   * back-ticks). When the text contains markup or is longer than 40 bytes a
   * generated name ("服务器" plus the 1-based position of the link) is used
   * instead.
   *
   * @access    public
   * @param     string  $body  HTML content (taken by reference, not modified)
   * @return    string  One {dede:link} line per link, '' when none was found
   */
  function TurnLinkTag(&$body)
  {
      $ttx = '';
      $handid = '服务器';
      // [^>]*? (not +?) so that a tag with nothing after the href value,
      // e.g. <a href="x">, is recognised as well.
      $found = preg_match_all("/<a href=['\"](.+?)['\"]([^>]*?)>(.+?)<\/a>/is", $body, $match, PREG_SET_ORDER);
      if(!$found)
      {
          return $ttx;
      }
      foreach($match as $i => $link)
      {
          $servername = str_replace("'", "`", $link[3]);
          if(preg_match("/[<>]/", $servername) || strlen($servername)>40)
          {
              $servername = $handid.($i+1);
          }
          $ttx .= "{dede:link text='$servername'} {$link[1]} {/dede:link}\r\n";
      }
      return $ttx;
  }
  /**
   * Remove the XML CDATA wrappers from a string.
   *
   * Every "<![CDATA[" opener and every "]]>" closer is deleted; the text
   * between them is kept unchanged.
   *
   * @access    public
   * @param     string  $str  Text that may contain CDATA sections
   * @return    string  The text without CDATA markers
   */
  function RpCdata($str)
  {
      $markers = array('<![CDATA[', ']]>');
      return str_replace($markers, '', $str);
  }
  /**
   * Download an RSS feed and extract link, title and image of every item.
   *
   * The feed is converted to the site charset ($cfg_soft_lang) when its
   * declared encoding differs. Entries are matched by position: the n-th
   * <link> found after an "<item" is paired with the n-th <title> and the
   * n-th <description>.
   *
   * @access    public
   * @param     string  $rssurl  URL of the RSS feed
   * @return    array   List of array('link' => ..., 'title' => ..., 'image' => ...)
   */
  function GetRssLinks($rssurl)
  {
      global $cfg_soft_lang;
      $dhd = new DedeHttpDown();
      $dhd->OpenUrl($rssurl);
      $rsshtml = $dhd->GetHtml();

      // Work out the feed encoding; fall back to the site charset.
      $sitelang = strtolower(trim((string)$cfg_soft_lang));
      preg_match("/encoding=[\"']([^\"']*)[\"']/is",$rsshtml,$infos);
      if(isset($infos[1]))
      {
          $pcode = strtolower(trim($infos[1]));
      }
      else
      {
          $pcode = $sitelang;
      }

      if($sitelang=='gb2312')
      {
          if($pcode=='utf-8')
          {
              $rsshtml = utf82gb($rsshtml);
          }
          else if($pcode=='big5')
          {
              $rsshtml = big52gb($rsshtml);
          }
      }
      else if($sitelang=='utf-8')
      {
          if($pcode=='gbk'||$pcode=='gb2312')
          {
              $rsshtml = gb2utf8($rsshtml);
          }
          else if($pcode=='big5')
          {
              $rsshtml = gb2utf8(big52gb($rsshtml));
          }
      }

      $rsarr = array();
      preg_match_all("/<item(.*)<title>(.*)<\/title>/isU",$rsshtml,$titles);
      preg_match_all("/<item(.*)<link>(.*)<\/link>/isU",$rsshtml,$links);
      preg_match_all("/<item(.*)<description>(.*)<\/description>/isU",$rsshtml,$descriptions);
      if(!isset($links[2]))
      {
          return '';
      }
      foreach($links[2] as $k=>$v)
      {
          $link = RpCdata($v);
          $rsarr[$k]['link'] = $link;
          if(isset($titles[2][$k]))
          {
              $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
          }
          else
          {
              // No title for this entry: use the last path segment of the link.
              $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", $link);
          }
          if(isset($descriptions[2][$k]))
          {
              $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k],$rssurl);
          }
          else
          {
              $rsarr[$k]['image'] = '';
          }
      }
      return $rsarr;
  }
  /**
   * Extract the first image of an RSS item description as an absolute URL.
   *
   * @access    public
   * @param     string  $descriptions  Description markup of one RSS item
   * @param     string  $refurl        URL used as base for relative image paths
   * @return    string  URL built by FillUrl(), or '' when no <img> is present
   */
  function GetddImgFromRss($descriptions,$refurl)
  {
      if($descriptions=='')
      {
          return '';
      }
      preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU",$descriptions,$imgs);
      if(!isset($imgs[2][0]))
      {
          return '';
      }
      // The ungreedy pattern can leave quote characters around the value.
      $src = preg_replace("/[\"']/", '', $imgs[2][0]);
      // Slashes are deliberately left alone here: collapsing them would turn
      // "http://" into "http:/" and make FillUrl treat an absolute URL as a
      // relative path. FillUrl collapses duplicate slashes in the path itself.
      return FillUrl($refurl, $src);
  }
  /**
   * Resolve a (possibly relative) URL against the URL of the page it was found on.
   *
   * Rules applied to $surl:
   *  - ''                     : the base directory of $refurl is returned (without scheme)
   *  - http:// or https://    : already absolute, only duplicate slashes in the path are collapsed
   *  - //host/path            : protocol-relative, the scheme of $refurl is used
   *  - /path                  : relative to the host of $refurl
   *  - ./path, ../path, path  : relative to the directory of $refurl
   *    ("../" is kept as is, it is not folded into the path)
   * A fragment ("#...") that does not start the string is removed.
   *
   * The last segment of $refurl counts as a directory unless it contains
   * a '.' or a '?'.
   *
   * @access    public
   * @param     string  $refurl  URL of the referring page
   * @param     string  $surl    URL to complete
   * @return    string  Completed URL; '' for a "." / ".." style value of at most two characters
   */
  function FillUrl($refurl,$surl)
  {
      $refurl = trim($refurl);
      $surl = trim($surl);

      $urls = parse_url($refurl);
      if(!is_array($urls))
      {
          $urls = array();
      }
      $scheme = (isset($urls['scheme']) && strtolower($urls['scheme'])=='https') ? 'https' : 'http';

      // parse_url's path is not used: it mishandles http://xxxx/nnn/aaa?fdsafd,
      // so the referer is split by hand instead.
      $paths = explode('/', preg_replace("/^https?:\/\//i", "", $refurl));
      $n = count($paths);

      // A referer without scheme has no 'host' entry; its first segment is the host.
      $host = isset($urls['host']) ? $urls['host'] : $paths[0];
      $defaultPort = ($scheme=='https') ? '443' : '80';
      $basehost = ((!isset($urls['port']) || $urls['port']==$defaultPort) ? $host : $host.':'.$urls['port']);

      // Directory of the referring page: every middle segment, plus the last
      // one when it does not look like a file name or a query string.
      $basepath = $basehost;
      for($i=1; $i < ($n-1); $i++)
      {
          if(strpos($paths[$i], '?') === FALSE)
          {
              $basepath .= '/'.$paths[$i];
          }
      }
      // $n > 1: a bare host such as "localhost" is not a path segment.
      if($n > 1 && !preg_match("/[\?\.]/", $paths[$n-1]))
      {
          $basepath .= '/'.$paths[$n-1];
      }

      if($surl=='')
      {
          return $basepath;
      }
      $pos = strpos($surl, "#");
      if($pos>0)
      {
          $surl = substr($surl, 0, $pos);
      }

      // Already absolute
      if(preg_match("/^(https?):\/\/(.*)$/is", $surl, $abs))
      {
          return strtolower($abs[1]).'://'.preg_replace("/\/{1,}/", '/', $abs[2]);
      }
      // Protocol-relative: names another host, inherits the referer's scheme
      if(substr($surl, 0, 2)=='//')
      {
          return $scheme.'://'.preg_replace("/\/{1,}/", '/', ltrim($surl, '/'));
      }

      if($surl[0]=='/')
      {
          // Relative to the site root
          $okurl = $basehost.$surl;
      }
      else if($surl[0]=='.')
      {
          if(strlen($surl)<=2)
          {
              return '';
          }
          else if($surl[1]=='/')
          {
              // "./path": drop the leading dot
              $okurl = $basepath.substr($surl, 1);
          }
          else
          {
              $okurl = $basepath.'/'.$surl;
          }
      }
      else
      {
          $okurl = $basepath.'/'.$surl;
      }
      return $scheme.'://'.preg_replace("/\/{1,}/", '/', $okurl);
  }
  /**
   * Build the list of list-page URLs described by a collection rule.
   *
   * Three sources are combined, in this order:
   *  1. $handurl  - manually entered URLs, one per line (only http:// lines are kept)
   *  2. $regxurl without "(*)" / "(#)" - used as a single URL
   *  3. $regxurl with placeholders:
   *     - $usemore == 0: "(*)" runs from $startid to $endid in steps of $addv
   *       (zero padding of $startid is preserved)
   *     - otherwise $batchrule holds one "[...]" group per column, made of
   *       ';'-separated "key=>value" pairs:
   *         (#)    => text substituted for "(#)" in $regxurl
   *         (*)    => range for "(*)", e.g. 1-20
   *         typeid => column id, or a column name that is looked up in #@__arctype
   *         addurl => extra URLs, separated by '|', listed before the generated ones
   *
   * When the global $islisten equals 1 only the first URL of each source is produced.
   *
   * @access    public
   * @param     string  $regxurl    URL pattern
   * @param     string  $handurl    Manually specified URLs, one per line
   * @param     int     $startid    First value for "(*)"
   * @param     int     $endid      Last value for "(*)"
   * @param     int     $addv       Step between two values (values <= 0 are treated as 1)
   * @param     int     $usemore    Non-zero to use the multi-column rules in $batchrule
   * @param     string  $batchrule  Multi-column rule text
   * @return    array   List of array(0 => url, 1 => typeid)
   */
  function GetUrlFromListRule($regxurl='',$handurl='',$startid=0,$endid=0,$addv=1,$usemore=0,$batchrule='')
  {
      global $dsql,$islisten;
      $lists = array();
      $n = 0;
      $islisten = (empty($islisten) ? 0 : $islisten);

      // 1. Manually specified URLs
      if($handurl!='')
      {
          foreach(explode("\n", $handurl) as $manualurl)
          {
              $manualurl = trim($manualurl);
              if(!preg_match("/^http:\/\//i", $manualurl))
              {
                  continue;
              }
              $lists[$n][0] = $manualurl;
              $lists[$n][1] = 0;
              $n++;
              if($islisten==1)
              {
                  break;
              }
          }
      }
      if($regxurl=='')
      {
          return $lists;
      }

      // 2. Neither (#) nor (*) present: the pattern is a plain URL
      if(!preg_match("/\(\*\)/i", $regxurl) && !preg_match("/\(#\)/", $regxurl))
      {
          $lists[$n][0] = $regxurl;
          $lists[$n][1] = 0;
          return $lists;
      }

      if($addv <= 0)
      {
          $addv = 1;
      }

      // 3a. Single column: expand (*) over the requested range
      if($usemore==0)
      {
          while($startid <= $endid)
          {
              $lists[$n][0] = str_replace("(*)",sprintf('%0'.strlen($startid).'d',$startid),$regxurl);
              $lists[$n][1] = 0;
              $startid = sprintf('%0'.strlen($startid).'d',$startid + $addv);
              $n++;
              if($n>2000 || $islisten==1)
              {
                  break;
              }
          }
          return $lists;
      }

      // 3b. Several columns, one "[...]" group each
      foreach(explode(']', trim($batchrule)) as $rawrule)
      {
          $rawrule = preg_replace("/^\[|\]$/", '', trim($rawrule));
          $fields = explode(';', $rawrule);
          if(count($fields)<3)
          {
              continue;
          }
          $brtag = '';
          $startid = 0;
          $endid = 0;
          $typeid = 0;
          $addurls = array();
          foreach($fields as $field)
          {
              // Limit 2 keeps a "=>" inside the value intact; pieces without
              // "=>" (for example after a trailing ';') are ignored.
              $pair = explode('=>', trim($field), 2);
              if(count($pair)<2)
              {
                  continue;
              }
              $k = trim($pair[0]);
              $v = trim($pair[1]);
              if($k=='(#)')
              {
                  $brtag = $v;
              }
              else if($k=='typeid')
              {
                  $typeid = $v;
              }
              else if($k=='addurl')
              {
                  $addurls = explode('|', $v);
              }
              else if($k=='(*)')
              {
                  $v = preg_replace("/[ \r\n\t]/", '', $v);
                  $range = explode('-', $v, 2);
                  // Only digit strings are accepted, so a malformed range can
                  // neither break the arithmetic below nor run away.
                  $startid = preg_match('/^[0-9]+$/', $range[0]) ? $range[0] : 0;
                  $endid = (isset($range[1]) && preg_match('/^[0-9]+$/', $range[1])) ? $range[1] : 0;
              }
          }

          // The column may be given by name instead of id
          if(preg_match('/[^0-9]/', $typeid))
          {
              // NOTE(review): $typeid is interpolated into the SQL text as is;
              // confirm that the rule text is escaped before it reaches here.
              $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
              if(is_array($arr))
              {
                  $typeid = $arr['id'];
              }
              else
              {
                  $typeid = 0;
              }
          }

          // Extra URLs come first
          $mjj = 0;
          foreach($addurls as $addurl)
          {
              $addurl = trim($addurl);
              if($addurl=='')
              {
                  continue;
              }
              $lists[$n][0] = $addurl;
              $lists[$n][1] = $typeid;
              $n++;
              $mjj++;
              if($islisten==1)
              {
                  break;
              }
          }

          // Not in listen mode, or listen mode without any manual extra URL
          if($islisten!=1 || $mjj==0 )
          {
              // A URL substituted for (#) may itself contain (*)
              while($startid <= $endid)
              {
                  $lists[$n][0] = str_replace("(#)",$brtag,$regxurl);
                  $lists[$n][0] = str_replace("(*)",sprintf('%0'.strlen($startid).'d',$startid),$lists[$n][0]);
                  $lists[$n][1] = $typeid;
                  $startid = sprintf('%0'.strlen($startid).'d',$startid + $addv);
                  $n++;
                  if($islisten==1)
                  {
                      break;
                  }
                  if($n>20000)
                  {
                      break;
                  }
              }
          }
      }
      return $lists;
  }