国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

670 lignes
19KB

  1. <?php if(!defined('DEDEINC')) exit('dedecms');
  2. /**
  3. * 采集小助手
  4. *
  5. * @version $Id: charset.helper.php 1 2010-07-05 11:43:09Z tianya $
  6. * @package DedeCMS.Helpers
  7. * @copyright Copyright (c) 2007 - 2018, DesDev, Inc.
  8. * @copyright Copyright (c) 2020, DedeBIZ.COM
  9. * @license https://www.dedebiz.com/license/v6
  10. * @link https://www.dedebiz.com
  11. */
  12. require_once(DEDEINC."/dedehttpdown.class.php");
  13. require_once(DEDEINC."/dedetag.class.php");
  14. require_once(DEDEINC."/charset.func.php");
  /**
   * Download a remote image over plain HTTP and save it to a local file.
   *
   * The request is sent with the given referer and, when available, the cookie
   * header obtained from the referring page. Up to three redirects are followed.
   * Only plain "http://" targets can be fetched; HTTPS redirects are rejected.
   *
   * @access public
   * @param string $gurl      URL of the image to download
   * @param string $rfurl     referer URL sent with the request (also used to obtain a cookie)
   * @param string $filename  local path the image is written to
   * @param string $gcookie   complete "Cookie: ..." header line; looked up from $rfurl when empty
   * @param int    $JumpCount number of redirects already followed
   * @param int    $maxtime   maximum number of seconds spent reading the body
   * @return bool  TRUE when data was written to $filename, FALSE otherwise
   */
  function DownImageKeep($gurl, $rfurl, $filename, $gcookie="", $JumpCount=0, $maxtime=30)
  {
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      if($ghost=='')
      {
          return FALSE;
      }
      $gquery = $urlinfos['query'];
      if($gcookie=="" && !empty($rfurl))
      {
          $gcookie = RefurlCookie($rfurl);
      }

      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Referer: $rfurl\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      // Never forward a cookie value containing line breaks (header injection).
      if($gcookie!="" && !preg_match("/[\r\n]/", $gcookie))
      {
          $sessionQuery .= $gcookie."\r\n";
      }
      $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";

      // The host part may carry an explicit port ("example.com:8080").
      $connectHost = $ghost;
      $connectPort = 80;
      if(preg_match("/^(.+):(\d+)$/", $ghost, $hostParts))
      {
          $connectHost = $hostParts[1];
          $connectPort = (int)$hostParts[2];
      }

      $errno = 0;
      $errstr = "";
      $m_fp = fsockopen($connectHost, $connectPort, $errno, $errstr, 10);
      if(!$m_fp)
      {
          return FALSE;
      }
      // Make blocking reads give up instead of hanging on a stalled server.
      stream_set_timeout($m_fp, max(1, (int)$maxtime));
      fwrite($m_fp, $sessionQuery);

      // Parse the status line and the response headers.
      $m_httphead = array();
      $httpstas = explode(" ", (string)fgets($m_fp, 8192));
      $m_httphead["http-edition"] = trim($httpstas[0]);
      $m_httphead["http-state"] = isset($httpstas[1]) ? trim($httpstas[1]) : '';
      $lnum = 0;
      while(!feof($m_fp) && $lnum <= 100)
      {
          $line = trim((string)fgets($m_fp, 8192));
          if($line == "")
          {
              break;
          }
          $lnum++;
          // Split at the first colon only; the value may itself contain colons.
          $headerParts = explode(':', $line, 2);
          $hkey = trim($headerParts[0]);
          if($hkey != "")
          {
              $m_httphead[strtolower($hkey)] = isset($headerParts[1]) ? trim($headerParts[1]) : '';
          }
      }

      // Redirect: follow the Location header (at most three hops).
      if(preg_match("/^3/", $m_httphead["http-state"]))
      {
          fclose($m_fp);
          if(!isset($m_httphead["location"]) || $JumpCount >= 3)
          {
              return FALSE;
          }
          $location = $m_httphead["location"];
          if(!preg_match("/^[a-z][a-z0-9+.\-]*:\/\//i", $location))
          {
              // Relative target: resolve it against the URL just requested.
              $location = FillUrl($gurl, $location);
          }
          if(!preg_match("/^http:\/\//i", $location))
          {
              // This function speaks plain HTTP only.
              return FALSE;
          }
          return DownImageKeep($location, $rfurl, $filename, $gcookie, $JumpCount + 1, $maxtime);
      }
      if(!preg_match("/^2/", $m_httphead["http-state"]))
      {
          fclose($m_fp);
          return FALSE;
      }

      $chunked = isset($m_httphead['transfer-encoding'])
          && stripos($m_httphead['transfer-encoding'], 'chunked') !== FALSE;
      // -1 means "length unknown": read until the server closes or the time runs out.
      $contentLength = (!$chunked && isset($m_httphead['content-length']))
          ? (int)$m_httphead['content-length'] : -1;

      // Save the file
      $fp = fopen($filename,"w") or die("写入文件:{$filename} 失败!");
      $okdata = "";
      $starttime = time();
      while(!feof($m_fp) && time()-$starttime <= $maxtime)
      {
          if($chunked)
          {
              $sizeLine = fgets($m_fp, 256);
              if($sizeLine === FALSE)
              {
                  break;
              }
              $sizeLine = trim($sizeLine);
              if($sizeLine === '')
              {
                  // CRLF that terminates the previous chunk
                  continue;
              }
              // Drop optional chunk extensions ("1a;name=value").
              $hexSize = trim(preg_replace("/;.*$/", '', $sizeLine));
              if(!ctype_xdigit($hexSize))
              {
                  break;
              }
              $want = hexdec($hexSize);
              if($want == 0)
              {
                  // last-chunk marker
                  break;
              }
          }
          else if($contentLength >= 0)
          {
              $want = $contentLength - strlen($okdata);
              if($want <= 0)
              {
                  break;
              }
          }
          else
          {
              $want = 8192;
          }
          while($want > 0 && !feof($m_fp))
          {
              $piece = fread($m_fp, min(8192, $want));
              if($piece === FALSE || $piece === '')
              {
                  // read timeout or connection closed
                  break 2;
              }
              $okdata .= $piece;
              $want -= strlen($piece);
              if(time()-$starttime > $maxtime)
              {
                  break 2;
              }
          }
      }
      fclose($m_fp);

      if($okdata != "")
      {
          fwrite($fp, $okdata);
      }
      fclose($fp);
      if($okdata == "")
      {
          // Nothing received: do not leave an empty file behind.
          @unlink($filename);
          return FALSE;
      }
      return TRUE;
  }
  /**
   * Fetch a page and return the cookies it sets as a ready-to-send request header.
   *
   * The result is cached in the globals $gcookie / $lastRfurl, so repeated calls
   * for the same URL reuse a previously obtained non-empty cookie.
   *
   * @access public
   * @param string $gurl URL of the page to request
   * @return string a complete "Cookie: name=value; ..." header line (no CRLF),
   *                or '' when the URL is empty, unreachable or sets no cookie
   */
  function RefurlCookie($gurl)
  {
      global $gcookie,$lastRfurl;
      $gurl = trim($gurl);
      if(!empty($gcookie) && $lastRfurl==$gurl)
      {
          return $gcookie;
      }
      $lastRfurl = $gurl;
      if($gurl=='')
      {
          return '';
      }

      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      $gquery = $urlinfos['query'];
      if($ghost=='')
      {
          return '';
      }

      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      $sessionQuery .= "Connection: Close\r\n\r\n";

      // The host part may carry an explicit port ("example.com:8080").
      $connectHost = $ghost;
      $connectPort = 80;
      if(preg_match("/^(.+):(\d+)$/", $ghost, $hostParts))
      {
          $connectHost = $hostParts[1];
          $connectPort = (int)$hostParts[2];
      }

      $errno = 0;
      $errstr = "";
      $m_fp = fsockopen($connectHost, $connectPort, $errno, $errstr, 10);
      if(!$m_fp)
      {
          // The cookie is optional; an unreachable referer must not abort the caller.
          return '';
      }
      fwrite($m_fp, $sessionQuery);

      // Scan the response headers (at most 100 lines) for Set-Cookie.
      $pairs = array();
      $lnum = 0;
      while(!feof($m_fp) && $lnum <= 100)
      {
          $line = trim((string)fgets($m_fp, 8192));
          if($line == "")
          {
              break;
          }
          $lnum++;
          // Keep only "name=value"; attributes such as path/expires are not sent back.
          if(preg_match("/^set-cookie:\s*([^;]+)/i", $line, $found))
          {
              $pairs[] = trim($found[1]);
          }
      }
      fclose($m_fp);

      $gcookie = (count($pairs) > 0 ? 'Cookie: '.implode('; ', $pairs) : '');
      return $gcookie;
  }
  /**
   * Split a URL into its host part and its request target.
   *
   * A leading "http://" or "https://" is removed. Everything up to the first
   * "/" is the host (including an explicit port, if any); the rest, starting
   * with that "/", is the query. A URL without any path yields the query "/".
   *
   * @access public
   * @param string $gurl URL to split
   * @return array array('host' => string, 'query' => string)
   */
  function GetHostInfo($gurl)
  {
      $gurl = preg_replace("/^https?:\/\//i", "", trim($gurl));
      $garr = array();
      $slashPos = strpos($gurl, '/');
      if($slashPos === FALSE)
      {
          // No path at all: request the site root instead of "/<host>".
          $garr['host'] = $gurl;
          $garr['query'] = '/';
      }
      else
      {
          $garr['host'] = substr($gurl, 0, $slashPos);
          $garr['query'] = substr($gurl, $slashPos);
      }
      return $garr;
  }
  /**
   * Convert the <img> tags found in an HTML fragment into DedeCMS album markup.
   *
   * The result starts with a {dede:pagestyle} block (sized from the globals
   * $cfg_album_width / $cfg_ddimg_width, defaulting to 800 / 150) and a
   * {dede:comments} note, followed by one {dede:img} tag per image source.
   * The two globals are assigned their defaults when they are empty.
   *
   * @access public
   * @param string $body HTML content to scan (taken by reference, not modified)
   * @return string the generated DedeCMS markup
   */
  function TurnImageTag(&$body)
  {
      global $cfg_album_width,$cfg_ddimg_width;
      if(empty($cfg_album_width))
      {
          $cfg_album_width = 800;
      }
      if(empty($cfg_ddimg_width))
      {
          $cfg_ddimg_width = 150;
      }
      // No thumbnail name is known at this stage; the ddimg attribute stays empty.
      $litpicname = '';
      $ttx = '';
      // Group 2 holds a quoted src value, group 3 an unquoted one.
      $patten = "/<\\s*img\\s.*?src\\s*=\\s*([\"\\'])?(?(1)(.*?)\\1|([^\\s\\>\"\\']+))/isx";
      preg_match_all($patten,$body,$images);
      if(!empty($images[2]))
      {
          foreach($images[2] as $key => $quoted)
          {
              $src = $quoted;
              if($src === '' && isset($images[3][$key]))
              {
                  $src = $images[3][$key];
              }
              $ttx .= "{dede:img ddimg='$litpicname' text='图 ".($key+1)."'}".$src."{/dede:img}"."\r\n";
          }
      }
      $ttx = "\r\n{dede:pagestyle maxwidth='{$cfg_album_width}' ddmaxwidth='{$cfg_ddimg_width}' row='3' col='3' value='2'/}\r\n{dede:comments}图集类型会采集时生成此配置是正常的,不过如果后面没有跟着img标记则表示规则无效{/dede:comments}\r\n".$ttx;
      return $ttx;
  }
  /**
   * Convert the <a href="..."> links of an HTML fragment into {dede:link} tags.
   *
   * The link text becomes the tag's text attribute (single quotes are replaced
   * by back-ticks). When the text contains markup or is longer than 40 bytes,
   * a generated name ("服务器" followed by the 1-based link number) is used instead.
   *
   * @access public
   * @param string $body HTML content to scan (taken by reference, not modified)
   * @return string one {dede:link} line per link, '' when no link is found
   */
  function TurnLinkTag(&$body)
  {
      $ttx = '';
      $handid = '服务器';
      // "[^>]*?" (not "+?") so that anchors without further attributes match too.
      $found = preg_match_all("/<a href=['\"](.+?)['\"]([^>]*?)>(.+?)<\/a>/is", $body, $links, PREG_SET_ORDER);
      if(!$found)
      {
          return $ttx;
      }
      foreach($links as $i => $link)
      {
          $servername = str_replace("'", "`", $link[3]);
          if(preg_match("/[<>]/", $servername) || strlen($servername)>40)
          {
              $servername = $handid.($i+1);
          }
          $ttx .= "{dede:link text='$servername'} {$link[1]} {/dede:link}\r\n";
      }
      return $ttx;
  }
  /**
   * Strip the XML CDATA wrappers from a string.
   *
   * Every "<![CDATA[" opener and every "]]>" closer is removed; the text
   * between them is kept unchanged.
   *
   * @access public
   * @param string $str text that may contain CDATA sections
   * @return string the text without CDATA markers
   */
  function RpCdata($str)
  {
      $markers = array('<![CDATA[', ']]>');
      return str_replace($markers, '', $str);
  }
  /**
   * Download an RSS feed and extract link, title and preview image of each item.
   *
   * The feed is converted to the system charset ($cfg_soft_lang) when its XML
   * declaration names a different one (utf-8, gbk/gb2312 and big5 are handled).
   * Every item is parsed on its own, so an item that lacks a title or a
   * description cannot shift the data of the items that follow it.
   *
   * @access public
   * @param string $rssurl URL of the RSS feed
   * @return array list of array('link' => string, 'title' => string, 'image' => string);
   *               empty when the feed contains no item with a <link>
   */
  function GetRssLinks($rssurl)
  {
      global $cfg_soft_lang;
      $dhd = new DedeHttpDown();
      $dhd->OpenUrl($rssurl);
      $rsshtml = $dhd->GetHtml();

      // Detect the feed charset; assume the system charset when none is declared.
      $syslang = strtolower($cfg_soft_lang);
      if(preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos))
      {
          $pcode = strtolower(trim($infos[1]));
      }
      else
      {
          $pcode = $syslang;
      }
      if($syslang=='gb2312')
      {
          if($pcode=='utf-8')
          {
              $rsshtml = utf82gb($rsshtml);
          }
          else if($pcode=='big5')
          {
              $rsshtml = big52gb($rsshtml);
          }
      }
      else if($syslang=='utf-8')
      {
          if($pcode=='gbk' || $pcode=='gb2312')
          {
              $rsshtml = gb2utf8($rsshtml);
          }
          else if($pcode=='big5')
          {
              $rsshtml = gb2utf8(big52gb($rsshtml));
          }
      }

      $rsarr = array();
      // Cut the feed at every "<item" opener ("<items>" of RSS 1.0 is not an item).
      // The first piece is the channel header and is skipped.
      $chunks = preg_split("/<item[\s>]/i", (string)$rsshtml);
      if(!is_array($chunks))
      {
          return $rsarr;
      }
      array_shift($chunks);
      foreach($chunks as $chunk)
      {
          if(!preg_match("/<link>(.*)<\/link>/isU", $chunk, $linkMatch))
          {
              continue;
          }
          $entry = array();
          $entry['link'] = RpCdata($linkMatch[1]);
          if(preg_match("/<title>(.*)<\/title>/isU", $chunk, $titleMatch))
          {
              $entry['title'] = RpCdata($titleMatch[1]);
          }
          else
          {
              // No title: fall back to the last path segment of the link.
              $entry['title'] = preg_replace("/^(.*)\//i", "", $entry['link']);
          }
          if(preg_match("/<description>(.*)<\/description>/isU", $chunk, $descMatch))
          {
              $entry['image'] = GetddImgFromRss($descMatch[1], $rssurl);
          }
          else
          {
              $entry['image'] = '';
          }
          $rsarr[] = $entry;
      }
      return $rsarr;
  }
  /**
   * Extract the URL of the first image from an RSS item description.
   *
   * The src value of the first <img> tag is unquoted, duplicate slashes inside
   * its path are collapsed, and the result is completed against $refurl
   * through FillUrl().
   *
   * @access public
   * @param string $descriptions HTML of the item description
   * @param string $refurl       URL the image address is resolved against
   * @return string absolute image URL, '' when the description has no image
   */
  function GetddImgFromRss($descriptions,$refurl)
  {
      if($descriptions=='')
      {
          return '';
      }
      if(!preg_match("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $descriptions, $img))
      {
          return '';
      }
      // The lazy pattern may leave the opening quote inside the capture.
      $src = preg_replace("/[\"']/", '', $img[2]);
      // Collapse repeated slashes, but keep the "//" that follows the scheme
      // ("http://") and a leading "//" untouched.
      $src = preg_replace("/(?<=[^:\/])\/{2,}/", '/', $src);
      return FillUrl($refurl, $src);
  }
  /**
   * Complete a (possibly relative) URL against the URL of the page it came from.
   *
   * Handles absolute http/https URLs, protocol-relative ("//host/x"),
   * root-relative ("/x"), "./x", "../x" and plain relative paths. The scheme
   * of $refurl is kept for relative targets. "." and ".." path segments are
   * resolved and repeated slashes in the path are collapsed; the query string
   * is passed through unchanged.
   *
   * The last segment of $refurl counts as a directory only when it contains
   * neither "." nor "?" ("http://h/a/b" -> base "h/a/b", but
   * "http://h/a/b.html" and "http://h/a/b?x=1" -> base "h/a").
   *
   * @access public
   * @param string $refurl URL of the referring page
   * @param string $surl   URL or path to complete
   * @return string absolute URL; the base path WITHOUT scheme when $surl is empty;
   *                '' when $surl starts with "." and is at most two characters long
   */
  function FillUrl($refurl,$surl)
  {
      // A fragment never takes part in reference resolution.
      $refurl = preg_replace("/#.*$/s", '', trim($refurl));
      $surl = trim($surl);
      $scheme = (preg_match("/^https:\/\//i", $refurl) ? 'https' : 'http');

      $paths = explode('/', preg_replace("/^https?:\/\//i", '', $refurl));
      $n = count($paths);

      $urls = parse_url($refurl);
      if(is_array($urls) && isset($urls['host']))
      {
          $basehost = $urls['host'];
          $defaultPort = ($scheme=='https' ? 443 : 80);
          if(isset($urls['port']) && $urls['port']!=$defaultPort)
          {
              $basehost .= ':'.$urls['port'];
          }
      }
      else
      {
          // No scheme in $refurl: take everything before the first "/" as the host.
          $basehost = preg_replace("/[\?].*$/", '', $paths[0]);
      }

      // The path is rebuilt from the split URL instead of parse_url()'s "path",
      // because URLs such as http://xxxx/nnn/aaa?fdsafd need the special handling
      // described in the docblock.
      $basepath = $basehost;
      for($i=1; $i < ($n-1); $i++)
      {
          if(!preg_match("/[\?]/", $paths[$i]))
          {
              $basepath .= '/'.$paths[$i];
          }
      }
      // $paths[0] is the host itself and must never be appended as a directory.
      if($n > 1 && !preg_match("/[\?\.]/", $paths[$n-1]))
      {
          $basepath .= '/'.$paths[$n-1];
      }
      if($surl=='')
      {
          return $basepath;
      }

      $pos = strpos($surl, "#");
      if($pos>0)
      {
          $surl = substr($surl, 0, $pos);
      }

      if(preg_match("/^(https?):\/\//i", $surl, $abs))
      {
          // Already absolute: keep its own scheme.
          $scheme = strtolower($abs[1]);
          $okurl = substr($surl, strlen($abs[0]));
      }
      else if(substr($surl, 0, 2)=='//')
      {
          // Protocol-relative: other host, scheme of the referring page.
          $okurl = ltrim($surl, '/');
      }
      else if($surl[0]=='/')
      {
          // "/" addresses the site root
          $okurl = $basehost.$surl;
      }
      else if($surl[0]=='.')
      {
          if(strlen($surl)<=2)
          {
              return '';
          }
          else if($surl[1]=='/')
          {
              $okurl = $basepath.substr($surl, 1);
          }
          else
          {
              $okurl = $basepath.'/'.$surl;
          }
      }
      else
      {
          $okurl = $basepath.'/'.$surl;
      }

      // Normalize the path only; the query string must stay byte-identical.
      $tail = '';
      $qpos = strpos($okurl, '?');
      if($qpos !== FALSE)
      {
          $tail = substr($okurl, $qpos);
          $okurl = substr($okurl, 0, $qpos);
      }
      $segments = explode('/', preg_replace("/\/{1,}/", '/', $okurl));
      $host = array_shift($segments);
      $resolved = array();
      $last = count($segments) - 1;
      foreach($segments as $idx => $segment)
      {
          if($segment === '.' || $segment === '..')
          {
              if($segment === '..')
              {
                  array_pop($resolved);
              }
              if($idx === $last)
              {
                  // "a/b/.." denotes the directory "a/": keep the trailing slash.
                  $resolved[] = '';
              }
              continue;
          }
          $resolved[] = $segment;
      }
      $okurl = $host.(count($resolved) > 0 ? '/'.implode('/', $resolved) : '');
      return $scheme.'://'.$okurl.$tail;
  }
  /**
   * Build the list of list-page URLs to collect from a URL rule.
   *
   * Three sources are combined, in this order:
   *  1. $handurl  - manually entered URLs, one per line (http:// or https://);
   *  2. $regxurl  - a URL pattern. Without "(*)" / "(#)" it is used as is.
   *                 With $usemore == 0 the "(*)" marker is replaced by every
   *                 number from $startid to $endid (step $addv, zero-padded to
   *                 the width of $startid, at most about 2000 URLs);
   *  3. $batchrule (only when $usemore != 0) - one rule per "[...]" group, made
   *                 of "key=>value" fields separated by ";":
   *                   (#)=>text that replaces "(#)" in $regxurl
   *                   (*)=>range "start-end" (or a single number)
   *                   typeid=>category id or category name
   *                   addurl=>extra URLs separated by "|"
   *
   * When the global $islisten is 1 (listen mode) only the first URL of each
   * source is produced. The global $islisten is normalized to 0 when empty.
   *
   * @access public
   * @param string $regxurl   URL pattern containing "(*)" and/or "(#)"
   * @param string $handurl   manually specified URLs, separated by line breaks
   * @param int    $startid   first number substituted for "(*)"
   * @param int    $endid     last number substituted for "(*)"
   * @param int    $addv      increment between two numbers (values <= 0 become 1)
   * @param int    $usemore   non-zero to use the multi-category rules in $batchrule
   * @param string $batchrule multi-category rule text
   * @return array list of array(0 => url, 1 => category id)
   */
  function GetUrlFromListRule($regxurl='',$handurl='',$startid=0,$endid=0,$addv=1,$usemore=0,$batchrule='')
  {
      global $dsql,$islisten;
      $lists = array();
      $n = 0;
      $islisten = (empty($islisten) ? 0 : $islisten);

      // 1. Manually entered URLs
      if($handurl!='')
      {
          foreach(explode("\n", $handurl) as $manualUrl)
          {
              $manualUrl = trim($manualUrl);
              if(!preg_match("/^https?:\/\//i", $manualUrl))
              {
                  continue;
              }
              $lists[$n][0] = $manualUrl;
              $lists[$n][1] = 0;
              $n++;
              if($islisten==1)
              {
                  break;
              }
          }
      }
      if($regxurl=='')
      {
          return $lists;
      }

      // 2a. Pattern without (#) and (*): a single fixed URL
      if(!preg_match("/\(\*\)/i", $regxurl) && !preg_match("/\(#\)/", $regxurl))
      {
          $lists[$n][0] = $regxurl;
          $lists[$n][1] = 0;
          return $lists;
      }
      if($addv <= 0)
      {
          $addv = 1;
      }

      // 2b. Single numeric range, no multi-category rules
      if($usemore==0)
      {
          while($startid <= $endid)
          {
              // Keep the zero padding of the start value ("08" -> "09" -> "10").
              $width = strlen((string)$startid);
              $lists[$n][0] = str_replace("(*)", sprintf('%0'.$width.'d', $startid), $regxurl);
              $lists[$n][1] = 0;
              $startid = sprintf('%0'.$width.'d', (int)$startid + $addv);
              $n++;
              if($n>2000 || $islisten==1)
              {
                  break;
              }
          }
          return $lists;
      }

      // 3. Multi-category rules:
      //    [(#)=>text for (#); (*)=>range, e.g. 1-20; typeid=>category; addurl=>extra URLs separated by |]
      foreach(explode(']', trim($batchrule)) as $ruleText)
      {
          $ruleText = preg_replace("/^\[|\]$/", '', trim($ruleText));
          $fields = explode(';', $ruleText);
          if(count($fields)<3)
          {
              continue;
          }
          $brtag = '';
          $startid = 0;
          $endid = 0;
          $typeid = 0;
          $addurls = array();
          foreach($fields as $field)
          {
              // Split at the first "=>" only; skip fields without a value.
              $pair = explode('=>', trim($field), 2);
              if(count($pair) < 2)
              {
                  continue;
              }
              $k = trim($pair[0]);
              $v = trim($pair[1]);
              if($k=='(#)')
              {
                  $brtag = $v;
              }
              else if($k=='typeid')
              {
                  $typeid = $v;
              }
              else if($k=='addurl')
              {
                  $addurls = explode('|', $v);
              }
              else if($k=='(*)')
              {
                  $range = explode('-', preg_replace("/[ \r\n\t]/", '', $v));
                  $startid = $range[0];
                  // A single number means a range of exactly one page.
                  $endid = (isset($range[1]) ? $range[1] : $range[0]);
              }
          }

          // The category may be given by name instead of id.
          // NOTE(review): $typeid is interpolated into the SQL text; this relies on
          // the rule text having been escaped before it reaches this function - confirm.
          if(preg_match('/[^0-9]/', $typeid))
          {
              $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
              if(is_array($arr))
              {
                  $typeid = $arr['id'];
              }
              else
              {
                  $typeid = 0;
              }
          }

          // Extra URLs come first
          $mjj = 0;
          foreach($addurls as $addurl)
          {
              $addurl = trim($addurl);
              if($addurl=='')
              {
                  continue;
              }
              $lists[$n][0] = $addurl;
              $lists[$n][1] = $typeid;
              $n++;
              $mjj++;
              if($islisten==1)
              {
                  break;
              }
          }

          // Not in listen mode, or listen mode without any manual extra URL
          if($islisten!=1 || $mjj==0)
          {
              // URLs from the pattern; the (#) text may itself contain (*)
              while($startid <= $endid)
              {
                  $width = strlen((string)$startid);
                  $url = str_replace("(#)", $brtag, $regxurl);
                  $lists[$n][0] = str_replace("(*)", sprintf('%0'.$width.'d', $startid), $url);
                  $lists[$n][1] = $typeid;
                  $startid = sprintf('%0'.$width.'d', (int)$startid + $addv);
                  $n++;
                  if($islisten==1)
                  {
                      break;
                  }
                  if($n>20000)
                  {
                      break;
                  }
              }
          }
      }
      return $lists;
  }