国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1212 lines
41KB

  1. <?php if(!defined('DEDEINC')) exit('dedecms');
  2. /**
  3. * Dede采集类
  4. *
  5. * @version $Id: dedecollection.class.php 1 20:20 2010年7月7日Z tianya $
  6. * @package DedeCMS.Libraries
  7. * @copyright Copyright (c) 2020, DedeBIZ.COM
  8. * @license https://www.dedebiz.com/license/v6
  9. * @link https://www.dedebiz.com
  10. */
  11. require_once(DEDEINC."/dedecollection.func.php"); //采集扩展函数
  12. require_once(DEDEINC."/image.func.php");
  13. require_once(DEDEINC."/dedehtml2.class.php");
  14. @set_time_limit(0);
  15. /**
  16. * Dede采集类
  17. *
  18. * @package DedeCollection
  19. * @subpackage DedeCMS.Libraries
  20. * @link https://www.dedebiz.com
  21. */
  22. class DedeCollection
  23. {
  24. var $artNotes = array(); //文章采集的字段信息
  25. var $spNotes = array(); //文章采集的字段信息
  26. var $lists = array(); //采集节点的来源列表处理信息
  27. var $noteInfos = array(); //采集节点的基本配置信息
  28. var $dsql = '';
  29. var $noteId = '';
  30. var $cDedeHtml = '';
  31. var $cHttpDown = '';
  32. var $mediaCount = 0;
  33. var $tmpUnitValue = '';
  34. var $tmpLinks = array();
  35. var $tmpHtml = '';
  36. var $breImage = '';
  37. var $errString = '';
  /**
   * Build the collector and attach its collaborators.
   *
   * Stores the database handle found in $GLOBALS['dsql'] and creates one
   * DedeHttpDown and one DedeHtml2 instance that the other methods reuse.
   */
  function __construct()
  {
      $database = $GLOBALS['dsql'];
      $downloader = new DedeHttpDown();
      $htmlParser = new DedeHtml2();

      $this->dsql = $database;
      $this->cHttpDown = $downloader;
      $this->cDedeHtml = $htmlParser;
  }
  /**
   * Legacy constructor name (same as the class); it only forwards to __construct().
   */
  function DedeCollection()
  {
      call_user_func(array($this, '__construct'));
  }
  /**
   * Release resources.
   *
   * Currently does nothing; kept so callers can invoke it unconditionally.
   *
   * @return void
   */
  function Close()
  {
      return;
  }
  /**
   * Load one collection node from the database and parse its configuration.
   *
   * The node id is cast to an integer before it is placed in the SQL text, and
   * the same integer is remembered in $this->noteId for the later queries.
   * When no row is found the method returns without touching the list/item
   * configuration instead of dereferencing a non-array result.
   *
   * @access public
   * @param int $nid collection node id
   * @return void
   */
  function LoadNote($nid)
  {
      $nid = intval($nid);
      $this->noteId = $nid;

      $row = $this->dsql->GetOne("SELECT * FROM `#@__co_note` WHERE nid='$nid'");
      if (!is_array($row)) {
          // Unknown node: nothing to parse.
          return;
      }

      $this->LoadListConfig($row['listconfig']);
      $this->LoadItemConfig($row['itemconfig']);
  }
  /**
   * Parse the node's basic settings and its list-page (index) rules.
   *
   * A "noteinfo" tag fills $this->noteInfos from its attributes. A "listrule"
   * tag fills $this->lists from its attributes and from the inner tags
   * addurls / regxrule / areastart / areaend / batchrule, then resolves the
   * list URLs into $this->lists['url'] (via GetUrlFromListRule, or the RSS
   * address when sourcetype is "rss").
   *
   * Inner-tag keys that the configuration does not provide are defaulted to ''
   * so later reads (GetUrlFromListRule arguments, Testlists, GetSourceUrl) do
   * not hit undefined indexes.
   *
   * @access public
   * @param string $configString configuration string
   * @return void
   */
  function LoadListConfig($configString)
  {
      // Attributes copied one-to-one from the tags.
      $noteAtts = array('notename', 'matchtype', 'channelid', 'refurl',
                        'sourcelang', 'cosort', 'isref', 'exptime');
      $listAtts = array('sourcetype', 'rssurl', 'regxurl', 'startid', 'endid', 'addv',
                        'urlrule', 'musthas', 'nothas', 'listpic', 'usemore');
      // Inner tags of "listrule" whose trimmed text is stored under the same key.
      $innerTags = array('addurls', 'regxrule', 'areastart', 'areaend', 'batchrule');

      $dtp = new DedeTagParse();
      $dtp2 = new DedeTagParse();
      $dtp->LoadString($configString);

      // The loop bound is inclusive, exactly as in the original code.
      for ($i = 0; $i <= $dtp->Count; $i++) {
          $ctag = $dtp->CTags[$i];
          $tagName = $ctag->GetName();

          if ($tagName == "noteinfo") {
              foreach ($noteAtts as $att) {
                  $this->noteInfos[$att] = $ctag->GetAtt($att);
              }
          } else if ($tagName == "listrule") {
              foreach ($listAtts as $att) {
                  $this->lists[$att] = $ctag->GetAtt($att);
              }

              $dtp2->LoadString($ctag->GetInnerText());
              for ($j = 0; $j <= $dtp2->Count; $j++) {
                  $ctag2 = $dtp2->CTags[$j];
                  $tname = $ctag2->GetName();
                  if (in_array($tname, $innerTags, true)) {
                      $this->lists[$tname] = trim($ctag2->GetInnerText());
                  }
              }

              // Default anything the configuration left out (never overwrite a set value).
              foreach ($innerTags as $key) {
                  if (!isset($this->lists[$key])) {
                      $this->lists[$key] = '';
                  }
              }

              // Resolve the list-page addresses.
              if ($this->lists['sourcetype'] != 'rss') {
                  $this->lists['url'] = GetUrlFromListRule(
                      $this->lists['regxurl'],
                      $this->lists['addurls'],
                      $this->lists['startid'],
                      $this->lists['endid'],
                      $this->lists['addv'],
                      $this->lists['usemore'],
                      $this->lists['batchrule']
                  );
              } else {
                  $this->lists['url'] = $this->lists['rssurl'];
              }
          }
      }

      $dtp->Clear();
      $dtp2->Clear();
  }
  /**
   * Parse the article-page field rules.
   *
   * Recognised top-level tags:
   *  - sppage: pagination rule (inner text), its sptype attribute, and the
   *    srul/erul attributes stored in $this->spNotes;
   *  - previewurl, keywordtrim, descriptiontrim: inner text stored under the tag name;
   *  - item: one field rule, stored in $this->artNotes[<field>] with the keys
   *    value, isunit, isdown, trim (list of [pattern, replacement]), match, function.
   * Items with an empty "field" attribute are skipped.
   *
   * @access public
   * @param string $configString configuration string
   * @return void
   */
  function LoadItemConfig($configString)
  {
      $outer = new DedeTagParse();
      $inner = new DedeTagParse();
      $outer->LoadString($configString);

      for ($pos = 0; $pos <= $outer->Count; $pos++) {
          $tag = $outer->CTags[$pos];
          $name = $tag->GetName();

          if ($name == 'sppage') {
              $this->artNotes['sppage'] = $tag->GetInnerText();
              $this->artNotes['sptype'] = $tag->GetAtt('sptype');
              $this->spNotes['srul'] = $tag->GetAtt('srul');
              $this->spNotes['erul'] = $tag->GetAtt('erul');
              continue;
          }
          if ($name == 'previewurl') {
              $this->artNotes['previewurl'] = $tag->GetInnerText();
              continue;
          }
          if ($name == 'keywordtrim') {
              $this->artNotes['keywordtrim'] = $tag->GetInnerText();
              continue;
          }
          if ($name == 'descriptiontrim') {
              $this->artNotes['descriptiontrim'] = $tag->GetInnerText();
              continue;
          }
          if ($name != 'item') {
              continue;
          }

          $field = $tag->GetAtt('field');
          if ($field == '') {
              continue;
          }

          $fieldValue = $tag->GetAtt('value');
          $fieldIsUnit = $tag->GetAtt('isunit');
          $fieldIsDown = $tag->GetAtt('isdown');
          $trimRules = array();
          $matchRule = '';
          $userFunction = '';

          // Collect the nested trim / match / function rules of this item.
          $inner->LoadString($tag->GetInnerText());
          for ($n = 0; $n <= $inner->Count; $n++) {
              $child = $inner->CTags[$n];
              $childName = $child->GetName();
              if ($childName == 'trim') {
                  $trimRules[] = array(
                      str_replace('#n#', '&nbsp;', $child->GetInnerText()),
                      $child->GetAtt('replace'),
                  );
              } else if ($childName == 'match') {
                  $matchRule = str_replace('#n#', '&nbsp;', $child->GetInnerText());
              } else if ($childName == 'function') {
                  $userFunction = $child->GetInnerText();
              }
          }

          $this->artNotes[$field]['value'] = $fieldValue;
          $this->artNotes[$field]['isunit'] = $fieldIsUnit;
          $this->artNotes[$field]['isdown'] = $fieldIsDown;
          $this->artNotes[$field]['trim'] = $trimRules;
          $this->artNotes[$field]['match'] = $matchRule;
          $this->artNotes[$field]['function'] = $userFunction;
      }

      $outer->Clear();
      $inner->Clear();
  }
  /**
   * Download one article URL, extract its fields and optionally store the result.
   *
   * Steps: reset the per-page scratch state, download the page, pre-collect the
   * paginated "unit" field when a pagination rule exists, build the field text
   * with GetPageFields(), and (when $issave is true) write it to `#@__co_htmls`.
   *
   * Pagination is only walked when some field is flagged isunit=1; without such
   * a field there is no match rule to apply to the extra pages, so following
   * them would only cost downloads.
   *
   * @access public
   * @param int $aid record id in `#@__co_htmls` (cast to int before use in SQL)
   * @param string $dourl page address
   * @param string $litpic thumbnail address taken from the list page
   * @param bool $issave whether to persist the result (also passed to
   *                     GetPageFields as its "download resources" flag)
   * @return string the generated field text
   */
  function DownUrl($aid, $dourl, $litpic='', $issave=TRUE)
  {
      $this->tmpLinks = array();
      $this->tmpUnitValue = '';
      $this->breImage = '';
      $this->tmpHtml = $this->DownOnePage($dourl);

      // Pre-process the paginated field, if a pagination rule is configured.
      if (!empty($this->artNotes['sppage'])) {
          $unitField = null;
          foreach ($this->artNotes as $field => $rule) {
              if (is_array($rule) && isset($rule['isunit']) && $rule['isunit'] == 1) {
                  $unitField = $field;
                  break;
              }
          }

          if ($unitField !== null) {
              $this->GetSpPage($dourl, $unitField, $this->tmpHtml);
              // Multi-page content gets a leading sub-title marker, except for "diyrule".
              if (preg_match("/#p#/i", $this->tmpUnitValue) && $this->artNotes["sptype"] != 'diyrule') {
                  $this->tmpUnitValue = '副标题#e#'.$this->tmpUnitValue;
              }
          }
      }

      $body = $this->GetPageFields($dourl, $issave, $litpic);

      if ($issave) {
          $query = " UPDATE `#@__co_htmls` SET dtime='".time()."',result='".addslashes($body)."',isdown='1' WHERE aid='".intval($aid)."' ";
          if (!$this->dsql->ExecuteNoneQuery($query)) {
              echo $this->dsql->GetError();
          }
      }

      return $body;
  }
  /**
   * Split an address into the parts used by the custom pagination rule.
   *
   * Returned keys, in this order:
   *  - query:    everything after the first "?" ('' when absent);
   *  - resource: last path segment when it contains a dot, otherwise '';
   *  - file/ext: only present when resource is not empty; the name is split at
   *              its LAST dot, so "list.2.html" gives file "list.2", ext ".html";
   *  - path:     the address without resource and query, ending in "/"
   *              (left empty for a bare file name that has no directory part);
   *  - url:      the unmodified input.
   *
   * A host-only address such as "http://example.com" is not mistaken for a
   * file name: its resource is '' and its path is "http://example.com/".
   *
   * @param string $uri address to split
   * @return array
   */
  function GetUrl($uri)
  {
      $arr = array();

      // query
      $pieces = explode('?', $uri, 2);
      $base = $pieces[0];
      $arr['query'] = isset($pieces[1]) ? $pieces[1] : '';

      // resource
      $segments = explode('/', $base);
      $last = array_pop($segments);
      $hostOnly = (bool)preg_match('#^[a-z][a-z0-9+.\-]*://[^/]*$#i', $base);
      if ($hostOnly || strpos($last, '.') === false) {
          $arr['resource'] = '';
          $segments[] = $last;
      } else {
          $arr['resource'] = $last;
          $dot = strrpos($last, '.');
          $arr['file'] = substr($last, 0, $dot);
          $arr['ext'] = '.'.substr($last, $dot + 1);
      }

      // path
      $arr['path'] = implode('/', $segments);
      $bareFileName = ($arr['path'] === '' && $arr['resource'] !== '');
      if (!$bareFileName && substr($arr['path'], -1) !== '/') {
          $arr['path'] .= '/';
      }

      // url
      $arr['url'] = $uri;
      return $arr;
  }
  /**
   * Collect the content of a paginated field into $this->tmpUnitValue.
   *
   * Pages are joined with the marker "#p#副标题#e#". Three modes, selected by
   * $this->artNotes['sptype']:
   *  - "full" (or empty): the pager area lists every page; each link except the
   *    current address is downloaded once;
   *  - "diyrule": $this->artNotes['sppage'] is an address template; the parts
   *    returned by GetUrl() are substituted, "{p}" is replaced by each page
   *    number from spNotes['srul'] to spNotes['erul'], and a page whose md5
   *    equals the previously seen page is skipped;
   *  - anything else: prev/next style pager; links are followed recursively
   *    (the recursion stops once $step exceeds 50).
   *
   * Fixes over the previous version:
   *  - rules are handed to GetHtmlArea() as copies, so its by-reference rule
   *    argument can no longer accumulate escaping between pages;
   *  - template parts are substituted in one pass (strtr), so text inside an
   *    already substituted value is not replaced again;
   *  - the entry page is remembered in $this->tmpLinks, so a "previous" link
   *    pointing back to it is not downloaded and appended a second time;
   *  - a followed page without a pager area is no longer appended twice.
   *
   * NOTE(review): the template parts are still matched as bare words ("path",
   * "file", "ext", "url", ...), exactly as before; a template that contains
   * such a word as ordinary text is affected. Confirm the documented
   * placeholder syntax before tightening this.
   *
   * @access public
   * @param string $dourl address of the page held in $html
   * @param string $noteid key of the paginated field in $this->artNotes
   * @param string $html page source
   * @param int $step number of pages followed so far (prev/next mode)
   * @return void
   */
  function GetSpPage($dourl, $noteid, $html, $step=0)
  {
      $sarr = (isset($this->artNotes[$noteid]) && is_array($this->artNotes[$noteid])) ? $this->artNotes[$noteid] : array();
      $matchRule = isset($sarr['match']) ? $sarr['match'] : '';
      $sptype = isset($this->artNotes['sptype']) ? $this->artNotes['sptype'] : '';

      $linkareaHtml = $this->ExtractRuleArea($this->artNotes['sppage'], $html);
      if ($linkareaHtml == '') {
          // No pager area. Only the entry page still needs its content stored;
          // followed pages were already appended by the loop that downloaded them.
          if ($step == 0) {
              $content = $this->ExtractRuleArea($matchRule, $html);
              $this->tmpUnitValue .= ($this->tmpUnitValue == '') ? $content : "#p#副标题#e#".$content;
          }
          if ($sptype != 'diyrule') {
              return;
          }
      }

      // Complete page list
      if ($sptype == 'full' || $sptype == '') {
          $this->tmpUnitValue .= $this->ExtractRuleArea($matchRule, $html);
          $this->cDedeHtml->GetLinkType = "link";
          $this->cDedeHtml->SetSource($linkareaHtml, $dourl, 'link');
          foreach ($this->cDedeHtml->Links as $k => $t) {
              $k = $this->cDedeHtml->FillUrl($k);
              if ($k == $dourl) {
                  continue;
              }
              $this->AppendPageContent($this->DownOnePage($k), $matchRule);
          }
      }
      // Custom address template
      else if ($sptype == 'diyrule') {
          $urlinfo = $this->GetUrl($dourl);
          $testurl = strtr($this->artNotes['sppage'], $urlinfo);
          $testurl = str_ireplace('{p}', '~p~', $testurl);
          $testurl = str_replace(array('{', '}'), '', $testurl);
          $lastchash = md5($html);
          for ($i = $this->spNotes['srul']; $i <= $this->spNotes['erul']; $i++) {
              $tempurl = str_replace('~p~', $i, $testurl);
              $tempurl = $this->cDedeHtml->FillUrl($tempurl);
              $nhtml = $this->DownOnePage($tempurl);
              $newchash = md5($nhtml);
              // Same bytes as the previous page: skip it.
              if ($newchash === $lastchash) {
                  continue;
              }
              $lastchash = $newchash;
              $this->AppendPageContent($nhtml, $matchRule);
          }
      }
      // Prev/next style or incomplete page list
      else {
          if ($step > 50) {
              return;
          }
          if ($step == 0) {
              $this->tmpUnitValue .= $this->ExtractRuleArea($matchRule, $html);
              if (!in_array($dourl, $this->tmpLinks, true)) {
                  $this->tmpLinks[] = $dourl;
              }
          }
          $this->cDedeHtml->GetLinkType = "link";
          $this->cDedeHtml->SetSource($linkareaHtml, $dourl, 'link');
          $hasLink = FALSE;
          $nhtml = '';
          foreach ($this->cDedeHtml->Links as $k => $t) {
              $k = $this->cDedeHtml->FillUrl($k);
              if (in_array($k, $this->tmpLinks, true)) {
                  continue;
              }
              $nhtml = $this->DownOnePage($k);
              $this->AppendPageContent($nhtml, $matchRule);
              $hasLink = TRUE;
              $this->tmpLinks[] = $k;
              $dourl = $k;
              $step++;
          }
          if ($hasLink) {
              $this->GetSpPage($dourl, $noteid, $nhtml, $step);
          }
      }
  }

  /**
   * Run GetHtmlArea() with the '[内容]' marker on private copies of its arguments.
   *
   * @param string $rule area rule containing the '[内容]' marker
   * @param string $html page source
   * @return string
   */
  function ExtractRuleArea($rule, $html)
  {
      return $this->GetHtmlArea('[内容]', $rule, $html);
  }

  /**
   * Append the matched content of one extra page to $this->tmpUnitValue.
   *
   * Nothing is appended when the page is empty or the rule matches nothing.
   *
   * @param string $pageHtml source of the extra page
   * @param string $matchRule match rule of the paginated field
   * @return void
   */
  function AppendPageContent($pageHtml, $matchRule)
  {
      if ($pageHtml == '') {
          return;
      }
      $ct = trim($this->ExtractRuleArea($matchRule, $pageHtml));
      if ($ct != '') {
          $this->tmpUnitValue .= "#p#副标题#e#".$ct;
      }
  }
  /**
   * Extract the part of $html that sits where $sptag appears in $areaRule.
   *
   * $areaRule has the form "<start>$sptag<end>". When
   * $this->noteInfos['matchtype'] is "regex", start and end are used as PCRE
   * fragments (pattern "#start(.*)end#isU") and the capture is trimmed;
   * otherwise they are plain strings located with strpos() and the text
   * between them is returned untrimmed.
   *
   * The rule and the page are accepted by reference for compatibility with
   * existing callers, but neither is modified. (Previously the regex branch
   * wrote the "/"-escaped rule back into the caller's variable, so a rule used
   * for several pages was escaped again on every call and stopped matching.)
   *
   * @access public
   * @param string $sptag marker that separates start and end inside the rule
   * @param string $areaRule area rule
   * @param string $html page source
   * @return string matched text, or '' when the page is empty, the start part is
   *                empty, or nothing matches
   */
  function GetHtmlArea($sptag, &$areaRule, &$html)
  {
      if ($html == '') {
          return '';
      }

      $useRegex = isset($this->noteInfos['matchtype']) && $this->noteInfos['matchtype'] == 'regex';

      // Work on a private copy of the rule.
      $rule = (string)$areaRule;
      if ($useRegex) {
          $rule = str_replace("/", "\\/", $rule);
      }

      // A rule without the marker yields an empty end part instead of an undefined index.
      $areaRules = array_pad(explode($sptag, $rule), 2, '');
      if ($areaRules[0] == '') {
          return '';
      }

      // Regular-expression matching
      if ($useRegex) {
          $arr = array();
          preg_match('#'.$areaRules[0]."(.*)".$areaRules[1]."#isU", $html, $arr);
          // isset/'' check instead of empty(): a captured "0" is real content.
          return (isset($arr[1]) && $arr[1] !== '') ? trim($arr[1]) : '';
      }

      // Plain string matching
      if ($areaRules[1] == '') {
          return '';
      }
      $posstart = strpos($html, $areaRules[0]);
      if ($posstart === FALSE) {
          return '';
      }
      $posstart += strlen($areaRules[0]);
      $posend = strpos($html, $areaRules[1], $posstart);
      if ($posend === FALSE || $posend <= $posstart) {
          return '';
      }
      return substr($html, $posstart, $posend - $posstart);
  }
  /**
   * Download one address and return its source after charset conversion.
   *
   * Uses the shared DedeHttpDown instance (open, read, close) and then passes
   * the text through ChangeCode().
   *
   * @access public
   * @param string $dourl address to download
   * @return string page source
   */
  function DownOnePage($dourl)
  {
      $http = $this->cHttpDown;

      $http->OpenUrl($dourl);
      $pageSource = $http->GetHtml();
      $http->Close();

      $this->ChangeCode($pageSource);

      return $pageSource;
  }
  /**
   * Download one remote resource and store it under the site's base directory.
   *
   * Unless the global $notckpic is non-zero, `#@__co_mediaurls` is consulted
   * first so a resource already stored for this node is reused. For the first
   * non-thumbnail image of a page a "_lit" copy is created, resized and kept
   * in $this->breImage; non-thumbnail images are also passed to WaterImg().
   *
   * Fixes over the previous version:
   *  - the anti-hotlinking branch passes $this->noteInfos['exptime'] to
   *    DownImageKeep(); it used to read the non-existent $this->Item property;
   *  - the UPDATE of an existing record is limited to this node (nid), matching
   *    the SELECT that found it.
   *
   * @access public
   * @param string $dourl resource address
   * @param string $mtype resource type ('img', 'embed', ...)
   * @param bool $islitpic whether the resource is itself a thumbnail
   * @return string site-relative file name, or $dourl when the file is not available
   */
  function DownMedia($dourl, $mtype='img', $islitpic=FALSE)
  {
      global $notckpic;
      if (empty($notckpic)) {
          $notckpic = 0;
      }

      $basedir = $GLOBALS['cfg_basedir'];
      $hash = md5($dourl);
      $nid = $this->noteId;

      // Was this resource stored before?
      $tofile = $filename = '';
      if ($notckpic == 0) {
          $row = $this->dsql->GetOne("SELECT hash,tofile FROM `#@__co_mediaurls` WHERE nid='{$nid}' AND hash='".$hash."' ");
          if (isset($row['tofile'])) {
              $tofile = $filename = $row['tofile'];
          }
      }

      // Not known, or the stored file has disappeared: download it.
      if ($tofile == '' || !file_exists($basedir.$filename)) {
          $filename = $this->GetRndName($dourl, $mtype);
          if (!preg_match("#^\/#", $filename)) {
              $filename = "/".$filename;
          }

          // Anti-hotlinking mode
          if ($this->noteInfos['isref'] == 'yes' && $this->noteInfos['refurl'] != '') {
              if ($this->noteInfos['exptime'] == '') {
                  $this->noteInfos['exptime'] = 10;
              }
              DownImageKeep($dourl, $this->noteInfos['refurl'], $basedir.$filename, '', 0, $this->noteInfos['exptime']);
          }
          // Normal mode
          else {
              $this->cHttpDown->OpenUrl($dourl);
              $this->cHttpDown->SaveToBin($basedir.$filename);
              $this->cHttpDown->Close();
          }

          // Download succeeded: record it.
          if (file_exists($basedir.$filename)) {
              if ($tofile == '') {
                  $query = "INSERT INTO `#@__co_mediaurls`(nid,hash,tofile) VALUES ('".$nid."', '".$hash."', '".addslashes($filename)."');";
              } else {
                  $query = "UPDATE `#@__co_mediaurls` SET tofile='".addslashes($filename)."' WHERE nid='".$nid."' AND hash='".$hash."' ";
              }
              $this->dsql->ExecuteNoneQuery($query);
          }
      }

      // Download failed or file missing: fall back to the remote address.
      if (!file_exists($basedir.$filename)) {
          return $dourl;
      }

      // First body image of the page becomes the thumbnail source.
      if ($mtype == 'img' && !$islitpic && $this->breImage == '') {
          $this->breImage = $filename;
          if (!preg_match("#^http:\/\/#", $this->breImage) && file_exists($basedir.$filename)) {
              $filenames = explode('/', $filename);
              $filenamed = $filenames[count($filenames) - 1];
              $nfilename = str_replace('.', '_lit.', $filenamed);
              $nfilename = str_replace($filenamed, $nfilename, $filename);
              if (@copy($basedir.$filename, $basedir.$nfilename)) {
                  ImageResize($basedir.$nfilename, $GLOBALS['cfg_ddimg_width'], $GLOBALS['cfg_ddimg_height']);
                  $this->breImage = $nfilename;
              }
          }
      }

      if ($mtype == 'img' && !$islitpic) {
          @WaterImg($basedir.$filename, 'collect');
      }

      return $filename;
  }
  /**
   * Build a new site-relative file name for a downloaded resource.
   *
   * The file goes into "<$cfg_image_dir>/c<ymd>/"; both directories are created
   * when missing. The name combines ExecTime(), the optional "threadnum"
   * request parameter, the per-object media counter and a random number.
   *
   * Extension: images default to ".jpg" and become ".gif" / ".png" when the
   * address ends in that extension, with or without a query string (so
   * "a.gif?x=1" is now recognised, as DownMedias already accepts such
   * addresses); "embed" gives ".swf"; any other type gets no extension.
   *
   * @access public
   * @param string $url resource address
   * @param string $v resource type ('img', 'embed', ...)
   * @return string
   */
  function GetRndName($url, $v)
  {
      global $cfg_image_dir, $cfg_dir_purview;

      $this->mediaCount++;
      $mnum = $this->mediaCount;
      $timedir = "c".MyDate("ymd", time());

      // Storage path
      $fullurl = preg_replace("#\/{1,}#", "/", $cfg_image_dir."/");
      if (!is_dir($GLOBALS['cfg_basedir']."/$fullurl")) {
          MkdirAll($GLOBALS['cfg_basedir']."/$fullurl", $cfg_dir_purview);
      }
      $fullurl = $fullurl.$timedir."/";
      if (!is_dir($GLOBALS['cfg_basedir']."/$fullurl")) {
          MkdirAll($GLOBALS['cfg_basedir']."/$fullurl", $cfg_dir_purview);
      }

      // File name
      $timename = str_replace('.', '', ExecTime());
      $threadnum = 0;
      if (isset($_GET['threadnum'])) {
          $threadnum = intval($_GET['threadnum']);
      }
      $filename = dd2char($timename.$threadnum.'-'.$mnum.mt_rand(1000, 9999));

      // Extension
      if ($v == 'img') {
          $shortname = '.jpg';
          if (preg_match("#\.gif(\?.*)?$#i", $url)) {
              $shortname = '.gif';
          } else if (preg_match("#\.png(\?.*)?$#i", $url)) {
              $shortname = '.png';
          }
      } else if ($v == 'embed') {
          $shortname = '.swf';
      } else {
          $shortname = '';
      }

      $fullname = $fullurl.$filename.$shortname;
      return preg_replace("#\/{1,}#", "/", $fullname);
  }
  /**
   * Build the field text for the page held in $this->tmpHtml.
   *
   * The result is a sequence of "{dede:field name='...'}value{/dede:field}"
   * lines: keywords and description taken from the page's meta tags, then one
   * line per configured field, then the thumbnail ("litpic").
   *
   * Per field: the value is the fixed "value" attribute when no usable match
   * rule exists, otherwise the matched area (or the pre-collected paginated
   * content for the first isunit field); trim rules are applied; resources are
   * downloaded ($needDown true) or only made absolute ($needDown false) when
   * isdown is '1'. Fields with a user function are post-processed with
   * RunPHP() and emitted after all plain fields.
   *
   * Robustness fixes: the keyword/description trim rules are only applied when
   * configured and non-empty, a failing trim pattern no longer wipes the value,
   * the second match array is initialised, and the list "listpic" flag is
   * checked with isset().
   *
   * @access public
   * @param string $dourl page address
   * @param bool $needDown whether remote resources should be downloaded
   * @param string $litpic thumbnail address taken from the list page
   * @return string field text ('' when no page source is loaded)
   */
  function GetPageFields($dourl, $needDown, $litpic='')
  {
      global $cfg_auot_description;
      if ($this->tmpHtml == '') {
          return '';
      }
      $artitem = '';
      $isPutUnit = FALSE;
      $tmpLtKeys = array();
      $inarr = array();
      $inarr2 = array();

      // Keywords from the meta tag (either attribute order)
      preg_match("#<meta[\s]+name=['\"]keywords['\"] content=['\"](.*)['\"]#isU", $this->tmpHtml, $inarr);
      preg_match("#<meta[\s]+content=['\"](.*)['\"] name=['\"]keywords['\"]#isU", $this->tmpHtml, $inarr2);
      if (!isset($inarr[1]) && isset($inarr2[1])) {
          $inarr[1] = $inarr2[1];
      }
      if (isset($inarr[1])) {
          $keywords = trim(cn_substr(html2text($inarr[1]), 30));
          $keywordTrim = isset($this->artNotes['keywordtrim']) ? (string)$this->artNotes['keywordtrim'] : '';
          if ($keywordTrim !== '') {
              $cleaned = preg_replace("#".$keywordTrim."#isU", '', $keywords);
              if ($cleaned !== null) {
                  $keywords = $cleaned;
              }
          }
          if (!preg_match("#,#", $keywords)) {
              $keywords = str_replace(' ', ',', $keywords);
          }
          $artitem .= "{dede:field name='keywords'}".$keywords."{/dede:field}\r\n";
      } else {
          $artitem .= "{dede:field name='keywords'}{/dede:field}\r\n";
      }

      // Description from the meta tag (either attribute order)
      preg_match("#<meta[\s]+name=['\"]description['\"][\s]+content=['\"]([^>]*?)['\"]#iU", $this->tmpHtml, $inarr);
      preg_match("#<meta[\s]+content=['\"]([^>]*?)['\"][\s]+name=['\"]description['\"]#iU", $this->tmpHtml, $inarr2);
      if (!isset($inarr[1]) && isset($inarr2[1])) {
          $inarr[1] = $inarr2[1];
      }
      if (isset($inarr[1])) {
          $description = trim(cn_substr(html2text($inarr[1]), $cfg_auot_description));
          $descriptionTrim = isset($this->artNotes['descriptiontrim']) ? (string)$this->artNotes['descriptiontrim'] : '';
          if ($descriptionTrim !== '') {
              $cleaned = preg_replace("/".$descriptionTrim."/isU", '', $description);
              if ($cleaned !== null) {
                  $description = $cleaned;
              }
          }
          $artitem .= "{dede:field name='description'}".$description."{/dede:field}\r\n";
      } else {
          $artitem .= "{dede:field name='description'}{/dede:field}\r\n";
      }

      foreach ($this->artNotes as $k => $sarr) {
          // Skip the pagination entries and anything that is not a field rule.
          if ($k == 'sppage' || $k == 'sptype') {
              continue;
          }
          if (!is_array($sarr)) {
              continue;
          }

          // No usable match rule: use the fixed value.
          if ($sarr['match'] == '' || trim($sarr['match']) == '[内容]') {
              if ($sarr['value'] != '[内容]') {
                  $v = trim($sarr['value']);
              } else {
                  $v = '';
              }
          } else {
              // Paginated content collected beforehand is used once, for the first unit field.
              if ($this->tmpUnitValue != '' && !$isPutUnit && $sarr['isunit'] == 1) {
                  $v = $this->tmpUnitValue;
                  $isPutUnit = TRUE;
              } else {
                  $v = $this->GetHtmlArea('[内容]', $sarr['match'], $this->tmpHtml);
              }

              // Content filter rules
              if (isset($sarr['trim']) && $v != '') {
                  foreach ($sarr['trim'] as $nv) {
                      if ($nv[0] == '') {
                          continue;
                      }
                      $nvs = str_replace("/", "\\/", $nv[0]);
                      $v = preg_replace("#".$nvs."#isU", $nv[1], $v);
                  }
              }

              // Remote resources: download them, or only make their addresses absolute.
              if ($sarr['isdown'] == '1') {
                  if ($needDown) {
                      $v = $this->DownMedias($v, $dourl);
                  } else {
                      $v = $this->MediasReplace($v, $dourl);
                  }
              }
          }
          $v = trim($v);

          // Fields with a user function are processed after the plain ones.
          if ($sarr['function'] != '') {
              $tmpLtKeys[$k]['v'] = $v;
              $tmpLtKeys[$k]['f'] = $sarr['function'];
          } else {
              $v = preg_replace("#( )$#", '', $v);
              $v = preg_replace("#[\r\n\t ]{1,}$#", '', $v);
              $artitem .= "{dede:field name='$k'}$v{/dede:field}\r\n";
          }
      }

      // Fields with a user function
      foreach ($tmpLtKeys as $k => $sarr) {
          $v = $this->RunPHP($sarr['v'], $sarr['f']);
          $v = preg_replace("#( )$#", '', $v);
          $v = preg_replace("#[\r\n\t ]{1,}$#", '', $v);
          $artitem .= "{dede:field name='$k'}$v{/dede:field}\r\n";
      }

      // Thumbnail: the list-page picture when enabled, otherwise the first body image.
      if ($litpic != '' && isset($this->lists['listpic']) && $this->lists['listpic'] == 1) {
          $artitem .= "{dede:field name='litpic'}".$this->DownMedia($litpic, 'img', TRUE)."{/dede:field}\r\n";
      } else {
          $artitem .= "{dede:field name='litpic'}".$this->breImage."{/dede:field}\r\n";
      }
      return $artitem;
  }
  /**
   * Download the resources referenced by a piece of HTML and rewrite their addresses.
   *
   * First pass: every entry of the parser's Medias list (entries of type
   * "embed" only when the address is an .swf file). Second pass: hyperlinks
   * that point at a .jpg/.gif/.png or .swf file. Each address is replaced in
   * $html by whatever DownMedia() returns for it.
   *
   * @access public
   * @param string $html HTML fragment (modified in place)
   * @param string $url address of the page the fragment came from
   * @return string the rewritten fragment
   */
  function DownMedias(&$html, $url)
  {
      $parser = $this->cDedeHtml;
      $parser->SetSource($html,$url,'media');

      // Images and flash referenced by media tags
      foreach ($parser->Medias as $source => $kind) {
          $absolute = $parser->FillUrl($source);
          if ($kind=='embed' && !(preg_match("#\.(swf)\?(.*)$#i", $source) || preg_match("#\.(swf)$#i", $source))) {
              continue;
          }
          $html = str_replace($source, $this->DownMedia($absolute, $kind), $html);
      }

      // Images and flash that are only linked to
      foreach ($parser->Links as $target => $unused) {
          $kind = '';
          if (preg_match("#\.(jpg|gif|png)\?(.*)$#i",$target) || preg_match("#\.(jpg|gif|png)$#i", $target)) {
              $kind = "img";
          } else if (preg_match("#\.(swf)\?(.*)$#i", $target) || preg_match("#\.(swf)$#i", $target)) {
              $kind = "embed";
          }
          if ($kind === '') {
              continue;
          }
          $absolute = $parser->FillUrl($target);
          $html = str_replace($target, $this->DownMedia($absolute, $kind), $html);
      }

      return $html;
  }
  /**
   * Rewrite the media addresses in a piece of HTML to absolute addresses,
   * without downloading anything.
   *
   * @access public
   * @param string $html HTML fragment (modified in place)
   * @param string $dourl address of the page the fragment came from
   * @return string the rewritten fragment
   */
  function MediasReplace(&$html, $dourl)
  {
      $parser = $this->cDedeHtml;
      $parser->SetSource($html, $dourl, 'media');

      foreach ($parser->Medias as $rawAddress => $unused) {
          $needle = trim($rawAddress);
          $absolute = $parser->FillUrl($needle);
          $html = str_replace($needle, $absolute, $html);
      }

      return $html;
  }
  /**
   * Preview the list rule: return the links found on the first list page.
   *
   * For an RSS source the feed links are returned directly. Otherwise the
   * first configured list address is downloaded, optionally narrowed to the
   * area between areastart and areaend, and its links are filtered with the
   * "nothas" / "musthas" patterns (case-insensitive). On failure an empty
   * array is returned and $this->errString describes the problem.
   *
   * The filter patterns are used exactly as configured and the stored
   * configuration is left untouched. Previously "musthas" was escaped and
   * written back on every call, so a second call, or a pattern that already
   * contained "\/", was matched differently from the real collection run in
   * GetSourceUrl().
   *
   * @access public
   * @param string $dourl receives the address that was tested
   * @return array
   */
  function Testlists(&$dourl)
  {
      $links = array();

      // RSS source
      if ($this->lists['sourcetype'] == 'rss') {
          $dourl = $this->lists['rssurl'];
          $links = GetRssLinks($dourl);
          return $links;
      }

      // Regular list page
      if (!isset($this->lists['url'][0][0])) {
          $dourl = '';
          $this->errString = "配置中指定列表的网址错误!\r\n";
          return $links;
      }
      $dourl = $this->lists['url'][0][0];

      $dhtml = new DedeHtml2();
      $html = $this->DownOnePage($dourl);
      if ($html == '') {
          $this->errString = "读取网址: $dourl 时失败!\r\n";
          return $links;
      }

      // Narrow the page to the configured area, when both markers are set.
      $areaStart = isset($this->lists['areastart']) ? $this->lists['areastart'] : '';
      $areaEnd = isset($this->lists['areaend']) ? $this->lists['areaend'] : '';
      if (trim($areaStart) != '' && trim($areaEnd) != '') {
          $areabody = $areaStart.'[var:区域]'.$areaEnd;
          $html = $this->GetHtmlArea('[var:区域]', $areabody, $html);
      }

      $dhtml->SetSource($html, $dourl, 'link');

      $mustHas = isset($this->lists['musthas']) ? $this->lists['musthas'] : '';
      $notHas = isset($this->lists['nothas']) ? $this->lists['nothas'] : '';
      foreach ($dhtml->Links as $s) {
          if ($notHas != '' && preg_match("#".$notHas."#i", $s['link'])) {
              continue;
          }
          if ($mustHas != '' && !preg_match("#".$mustHas."#i", $s['link'])) {
              continue;
          }
          $links[] = $s;
      }
      return $links;
  }
  /**
   * Preview the article rules for one address.
   *
   * Runs DownUrl() with record id 0, no list thumbnail and saving disabled,
   * and returns the generated field text.
   *
   * @access public
   * @param string $dourl article address
   * @return string
   */
  function TestArt($dourl)
  {
      $recordId = 0;
      $listThumbnail = '';
      $persist = FALSE;

      return $this->DownUrl($recordId, $dourl, $listThumbnail, $persist);
  }
  /**
   * Collect the seed (article) addresses of this node into `#@__co_htmls`.
   *
   * On the first batch ($glstart == 0) old data is cleaned up: with
   * $islisten == -1 all stored addresses and pages of the node are deleted,
   * otherwise only pages that were already exported. With $islisten == 1,
   * addresses already recorded in `#@__co_urls` are skipped.
   *
   * RSS sources are read in one go. For list pages, the configured list
   * addresses number $glstart+1 .. $glstart+$pagesize are downloaded and
   * their links filtered with the "nothas" / "musthas" patterns.
   *
   * Fixes over the previous version:
   *  - RSS rows store the current timestamp in `dtime` (the literal text
   *    'dtime' used to be inserted);
   *  - the dead check on the never-defined $mytotal and other unused locals
   *    are gone;
   *  - "nothas" is matched case-insensitively, like "musthas" here and like
   *    both patterns in Testlists(), so the preview and the real run agree;
   *  - the node id and type id are cast to int before use in SQL, and a
   *    missing list-address array is treated as empty.
   *
   * @access public
   * @param int $islisten -1 = re-collect everything, 1 = skip known addresses
   * @param int $glstart number of list addresses already processed
   * @param int $pagesize number of list addresses to process in this call
   * @return int 0 when finished, the number of list addresses still to process,
   *             or -1 when the first batch found no links
   */
  function GetSourceUrl($islisten=0, $glstart=0, $pagesize=10)
  {
      $nid = intval($this->noteId);

      // First batch: clean up earlier data.
      if ($glstart == 0) {
          if ($islisten == -1) {
              // Re-collect everything
              $this->dsql->ExecuteNoneQuery("DELETE FROM `#@__co_urls` WHERE nid='".$nid."'");
              $this->dsql->ExecuteNoneQuery("DELETE FROM `#@__co_htmls` WHERE nid='".$nid."' ");
          } else {
              // Keep pages not exported yet and the node's address history
              $this->dsql->ExecuteNoneQuery("DELETE FROM `#@__co_htmls` WHERE nid='".$nid."' AND isexport=1 ");
          }
      }

      // Seeds from RSS
      if ($this->lists['sourcetype'] == 'rss') {
          $links = GetRssLinks($this->lists['rssurl']);
          if (!is_array($links)) {
              $links = array();
          }
          krsort($links);
          foreach ($links as $v) {
              $this->StoreSeedLink($nid, $v, 0, $islisten);
          }
          return 0;
      }

      // Seeds from list pages
      $listUrls = (isset($this->lists['url']) && is_array($this->lists['url'])) ? $this->lists['url'] : array();
      $mustHas = isset($this->lists['musthas']) ? $this->lists['musthas'] : '';
      $notHas = isset($this->lists['nothas']) ? $this->lists['nothas'] : '';
      $areaStart = isset($this->lists['areastart']) ? $this->lists['areastart'] : '';
      $areaEnd = isset($this->lists['areaend']) ? $this->lists['areaend'] : '';

      $tmplink = array();
      $position = 0;
      $endpos = $glstart + $pagesize;
      $totallen = count($listUrls);

      foreach ($listUrls as $cururls) {
          $position++;
          if ($position > $endpos) {
              break;
          }
          if ($position <= $glstart) {
              continue;
          }

          $cururl = $cururls[0];
          $typeid = empty($cururls[1]) ? 0 : $cururls[1];

          $html = $this->DownOnePage($cururl);
          if (trim($areaStart) != '' && trim($areaEnd) != '') {
              $areabody = $areaStart.'[var:区域]'.$areaEnd;
              $html = $this->GetHtmlArea('[var:区域]', $areabody, $html);
          }

          $this->cDedeHtml->SetSource($html, $cururl, 'link');
          foreach ($this->cDedeHtml->Links as $link) {
              if ($notHas != '' && preg_match("#".$notHas."#i", $link['link'])) {
                  continue;
              }
              if ($mustHas != '' && !preg_match("#".$mustHas."#i", $link['link'])) {
                  continue;
              }
              $tmplink[] = array($link, $typeid);
          }
          $this->cDedeHtml->Clear();
      }

      // Store in reverse order of discovery.
      krsort($tmplink);
      foreach ($tmplink as $vs) {
          $this->StoreSeedLink($nid, $vs[0], $vs[1], $islisten);
      }

      // A first batch without any link is reported as an error; later batches carry on.
      if (count($tmplink) == 0 && $glstart == 0) {
          return -1;
      }
      return ($endpos >= $totallen) ? 0 : ($totallen - $endpos);
  }

  /**
   * Insert one seed link into `#@__co_htmls` and record its hash in `#@__co_urls`.
   *
   * In listen mode ($islisten == 1) a link whose hash is already recorded for
   * the node is skipped.
   *
   * @param int $nid node id
   * @param array $link link data with the keys 'link', 'title' and 'image'
   * @param int $typeid target type id (cast to int)
   * @param int $islisten listen flag as passed to GetSourceUrl()
   * @return void
   */
  function StoreSeedLink($nid, $link, $typeid, $islisten)
  {
      $nid = intval($nid);
      $typeid = intval($typeid);
      $hash = md5($link['link']);

      if ($islisten == 1) {
          $lrow = $this->dsql->GetOne("SELECT * FROM `#@__co_urls` WHERE nid='{$nid}' AND hash='".$hash."' ");
          if (is_array($lrow)) {
              return;
          }
      }

      $inquery = "INSERT INTO `#@__co_htmls` (`nid` ,`typeid`, `title` , `litpic` , `url` , `dtime` , `isdown` , `isexport` , `result`)
      VALUES ('{$nid}' ,'{$typeid}', '".addslashes($link['title'])."' , '".addslashes($link['image'])."' , '".addslashes($link['link'])."' , '".time()."' , '0' , '0' , ''); ";
      $this->dsql->ExecuteNoneQuery($inquery);

      $inquery = "INSERT INTO `#@__co_urls`(hash,nid) VALUES ('".$hash."','{$nid}');";
      $this->dsql->ExecuteNoneQuery($inquery);
  }
  /**
   * Post-process a collected value with the PHP snippet configured for its field.
   *
   * Placeholders in the snippet are turned into local variables before it runs:
   * "@me" (bare or quoted) becomes $DedeMeValue (the collected value),
   * "@body" becomes $DedeBodyValue (the page source) and "@litpic" becomes
   * $DedeLitPicValue (the current thumbnail). Whatever the snippet leaves in
   * $DedeMeValue is returned.
   *
   * WARNING: the snippet is executed with eval(). It must only ever come from
   * trusted configuration; never feed it collected or user-supplied text.
   *
   * @access public
   * @param string $fvalue collected value
   * @param string $phpcode PHP snippet
   * @return string
   */
  function RunPHP($fvalue, $phpcode)
  {
      // No extra locals are introduced here: everything in scope is visible to the snippet.
      $DedeMeValue = $fvalue;
      $phpcode = preg_replace("#'@me'|\"@me\"|@me#isU", '$DedeMeValue', $phpcode);

      // The page source is only exposed when the snippet asks for it.
      if (preg_match("#@body#i", $phpcode) === 1) {
          $DedeBodyValue = $this->tmpHtml;
          $phpcode = preg_replace("#'@body'|\"@body\"|@body#isU", '$DedeBodyValue', $phpcode);
      }

      // Same for the thumbnail.
      if (preg_match("#@litpic#i", $phpcode) === 1) {
          $DedeLitPicValue = $this->breImage;
          $phpcode = preg_replace("#'@litpic'|\"@litpic\"|@litpic#isU", '$DedeLitPicValue', $phpcode);
      }

      eval($phpcode.";");

      return $DedeMeValue;
  }
  /**
   * Convert downloaded text from the source charset to the site charset, in place.
   *
   * The site charset comes from the global $cfg_soft_lang, the source charset
   * from $this->noteInfos['sourcelang']. On a utf-8 site, gb2312 and big5
   * sources are converted to utf-8; on any other site, utf-8 and big5 sources
   * are converted with utf82gb() / big52gb(). Other combinations leave the
   * text untouched. Nothing is returned.
   *
   * @access public
   * @param string $str text to convert (modified in place)
   * @return void
   */
  function ChangeCode(&$str)
  {
      global $cfg_soft_lang;

      if ($cfg_soft_lang=='utf-8') {
          if ($this->noteInfos["sourcelang"]=="gb2312") {
              $str = gb2utf8($str);
          }
          if ($this->noteInfos["sourcelang"]=="big5") {
              $str = gb2utf8(big52gb($str));
          }
          return;
      }

      if ($this->noteInfos["sourcelang"]=="utf-8") {
          $str = utf82gb($str);
      }
      if ($this->noteInfos["sourcelang"]=="big5") {
          $str = big52gb($str);
      }
  }
  1140. }//End Class