国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1019 lines
39KB

  1. <?php if (!defined('DEDEINC')) exit('dedebiz');
  2. /**
  3. * Dede采集类
  4. *
  5. * @version $Id: dedecollection.class.php 1 20:20 2010年7月7日Z tianya $
  6. * @package DedeBIZ.Libraries
  7. * @copyright Copyright (c) 2021, DedeBIZ.COM
  8. * @license https://www.dedebiz.com/license
  9. * @link https://www.dedebiz.com
  10. */
  11. require_once(DEDEINC . "/dedecollection.func.php"); //采集扩展函数
  12. require_once(DEDEINC . "/image.func.php");
  13. require_once(DEDEINC . "/dedehtml2.class.php");
  14. @set_time_limit(0);
  15. /**
  16. * Dede采集类
  17. *
  18. * @package DedeCollection
  19. * @subpackage DedeBIZ.Libraries
  20. * @link https://www.dedebiz.com
  21. */
  22. class DedeCollection
  23. {
  /** @var array Article-page settings: per-field rule arrays plus 'sppage', 'sptype', 'previewurl', 'keywordtrim', 'descriptiontrim' */
  public $artNotes = array();
  /** @var array Page-number bounds ('srul', 'erul') read from the sppage tag */
  public $spNotes = array();
  /** @var array List-page (source) rule settings of the node */
  public $lists = array();
  /** @var array Basic node settings read from the noteinfo tag */
  public $noteInfos = array();
  /** @var mixed Database handle; set from $GLOBALS['dsql'] by the constructor */
  public $dsql = '';
  /** @var mixed ID of the node loaded through LoadNote() */
  public $noteId = '';
  /** @var mixed DedeHtml2 instance used for link/media extraction */
  public $cDedeHtml = '';
  /** @var mixed DedeHttpDown instance used for downloads */
  public $cHttpDown = '';
  /** @var int Counter incremented for every generated media file name */
  public $mediaCount = 0;
  /** @var string Accumulated content of the paged ("unit") field */
  public $tmpUnitValue = '';
  /** @var array Paging links already followed for the current article */
  public $tmpLinks = array();
  /** @var string HTML of the article page currently being processed */
  public $tmpHtml = '';
  /** @var string Path of the thumbnail generated for the current article */
  public $breImage = '';
  /** @var string Last error message produced by Testlists() */
  public $errString = '';
  /**
   * PHP5-style constructor: attaches the global database handle and creates
   * the HTTP downloader and HTML analyser helpers.
   */
  function __construct()
  {
      $this->dsql = $GLOBALS['dsql'];
      // Helpers are created in the same order as before: downloader first, analyser second.
      $this->cHttpDown = new DedeHttpDown();
      $this->cDedeHtml = new DedeHtml2();
  }
  /**
   * Legacy class-named entry point; it simply runs __construct().
   *
   * @return void
   */
  function DedeCollection()
  {
      $this->__construct();
  }
  /**
   * Release resources. Currently a no-op kept for API compatibility.
   *
   * @return void
   */
  function Close()
  {
  }
  /**
   * Load one collection node from the database and parse its list and item
   * configuration.
   *
   * The ID is cast to an integer before it is placed in the SQL statement.
   * When no matching row exists the method returns without touching the
   * parsed configuration.
   *
   * @access public
   * @param int $nid collection node ID
   * @return void
   */
  function LoadNote($nid)
  {
      $nid = intval($nid);
      $this->noteId = $nid;
      $row = $this->dsql->GetOne("SELECT * FROM `#@__co_note` WHERE nid='$nid'");
      if (!is_array($row)) {
          // Unknown node: nothing to parse.
          return;
      }
      $this->LoadListConfig($row['listconfig']);
      $this->LoadItemConfig($row['itemconfig']);
  }
  /**
   * Parse the node's basic information and list-page (index) rules.
   *
   * Reads the "noteinfo" tag into $this->noteInfos and the "listrule" tag
   * (attributes plus its inner addurls/regxrule/areastart/areaend/batchrule
   * tags) into $this->lists, then computes $this->lists['url'].
   *
   * @access public
   * @param string $configString configuration string
   * @return void
   */
  function LoadListConfig($configString)
  {
      $noteInfoAtts = array('notename', 'matchtype', 'channelid', 'refurl', 'sourcelang', 'cosort', 'isref', 'exptime');
      $listAtts = array('sourcetype', 'rssurl', 'regxurl', 'startid', 'endid', 'addv', 'urlrule', 'musthas', 'nothas', 'listpic', 'usemore');
      $innerTags = array('addurls', 'regxrule', 'areastart', 'areaend', 'batchrule');

      $dtp = new DedeTagParse();
      $dtp2 = new DedeTagParse();
      $dtp->LoadString($configString);
      // NOTE(review): the inclusive bound presumably reflects DedeTagParse::Count
      // being the index of the last tag - confirm against the parser.
      for ($i = 0; $i <= $dtp->Count; $i++) {
          $ctag = $dtp->CTags[$i];
          $tagName = $ctag->GetName();
          if ($tagName == "noteinfo") {
              // Basic node information
              foreach ($noteInfoAtts as $att) {
                  $this->noteInfos[$att] = $ctag->GetAtt($att);
              }
          } else if ($tagName == "listrule") {
              // Information about the list pages to collect
              foreach ($listAtts as $att) {
                  $this->lists[$att] = $ctag->GetAtt($att);
              }
              $dtp2->LoadString($ctag->GetInnerText());
              for ($j = 0; $j <= $dtp2->Count; $j++) {
                  $ctag2 = $dtp2->CTags[$j];
                  $tname = $ctag2->GetName();
                  if (in_array($tname, $innerTags, true)) {
                      $this->lists[$tname] = trim($ctag2->GetInnerText());
                  }
              }
              // Inner tags are optional; give the missing ones an empty value so
              // later reads (here and in other methods) never hit an undefined
              // index. Values that are already present are left untouched.
              $this->lists += array_fill_keys($innerTags, '');
              // Work out the list-page URLs
              if ($this->lists['sourcetype'] != 'rss') {
                  $this->lists['url'] = GetUrlFromListRule(
                      $this->lists['regxurl'],
                      $this->lists['addurls'],
                      $this->lists['startid'],
                      $this->lists['endid'],
                      $this->lists['addv'],
                      $this->lists['usemore'],
                      $this->lists['batchrule']
                  );
              } else {
                  $this->lists['url'] = $this->lists['rssurl'];
              }
          }
      } //End Loop
      $dtp->Clear();
      $dtp2->Clear();
  }
  /**
   * Parse the field settings used when collecting an article page.
   *
   * Fills $this->artNotes (paging rule, preview URL, keyword/description trim
   * rules and one rule array per "item" tag) and $this->spNotes (the
   * 'srul'/'erul' attributes of the sppage tag).
   *
   * @access public
   * @param string $configString configuration string
   * @return void
   */
  function LoadItemConfig($configString)
  {
      $outer = new DedeTagParse();
      $inner = new DedeTagParse();
      $outer->LoadString($configString);
      for ($i = 0; $i <= $outer->Count; $i++) {
          $tag = $outer->CTags[$i];
          switch ($tag->GetName()) {
              case 'sppage':
                  $this->artNotes['sppage'] = $tag->GetInnerText();
                  $this->artNotes['sptype'] = $tag->GetAtt('sptype');
                  $this->spNotes['srul'] = $tag->GetAtt('srul');
                  $this->spNotes['erul'] = $tag->GetAtt('erul');
                  break;
              case 'previewurl':
                  $this->artNotes['previewurl'] = $tag->GetInnerText();
                  break;
              case 'keywordtrim':
                  $this->artNotes['keywordtrim'] = $tag->GetInnerText();
                  break;
              case 'descriptiontrim':
                  $this->artNotes['descriptiontrim'] = $tag->GetInnerText();
                  break;
              case 'item':
                  $field = $tag->GetAtt('field');
                  if ($field == '') {
                      // Items without a field name are ignored.
                      break;
                  }
                  $rule = array(
                      'value' => $tag->GetAtt('value'),
                      'isunit' => $tag->GetAtt('isunit'),
                      'isdown' => $tag->GetAtt('isdown'),
                      'trim' => array(),
                      'match' => '',
                      'function' => '',
                  );
                  $inner->LoadString($tag->GetInnerText());
                  for ($k = 0; $k <= $inner->Count; $k++) {
                      $sub = $inner->CTags[$k];
                      $subName = $sub->GetName();
                      if ($subName == 'trim') {
                          // Each trim rule is a [pattern, replacement] pair.
                          $rule['trim'][] = array(
                              str_replace('#n#', '&nbsp;', $sub->GetInnerText()),
                              $sub->GetAtt('replace'),
                          );
                      } else if ($subName == 'match') {
                          $rule['match'] = str_replace('#n#', '&nbsp;', $sub->GetInnerText());
                      } else if ($subName == 'function') {
                          $rule['function'] = $sub->GetInnerText();
                      }
                  }
                  $this->artNotes[$field] = $rule;
                  break;
          }
      } //End Loop
      $outer->Clear();
      $inner->Clear();
  }
  /**
   * Download one article URL, extract its fields and optionally store the
   * result in the `#@__co_htmls` row identified by $aid.
   *
   * @access public
   * @param int $aid document ID
   * @param string $dourl URL to process
   * @param string $litpic thumbnail taken from the list page
   * @param bool $issave whether to save the result to the database
   * @return string the extracted fields as produced by GetPageFields()
   */
  function DownUrl($aid, $dourl, $litpic = '', $issave = TRUE)
  {
      // Reset the per-article state.
      $this->tmpLinks = array();
      $this->tmpUnitValue = '';
      $this->breImage = '';
      $this->tmpHtml = $this->DownOnePage($dourl);
      // When a paging rule exists, pre-collect the content of the paged field.
      if (!empty($this->artNotes['sppage'])) {
          $noteid = '';
          foreach ($this->artNotes as $k => $sarr) {
              if (isset($sarr['isunit']) && $sarr['isunit'] == 1) {
                  $noteid = $k;
                  break;
              }
          }
          // Without a field marked as the paged unit there is nothing to
          // collect, so skip the extra page downloads instead of running the
          // paging logic with an empty field key.
          if ($noteid !== '') {
              $this->GetSpPage($dourl, $noteid, $this->tmpHtml);
          }
          if (preg_match("/#p#/i", $this->tmpUnitValue)) {
              if ($this->artNotes["sptype"] != 'diyrule') {
                  $this->tmpUnitValue = '副标题#e#' . $this->tmpUnitValue;
              }
          }
      }
      // Extract the fields
      $body = $this->GetPageFields($dourl, $issave, $litpic);
      // Save the result to the database
      if ($issave) {
          // The ID is cast to int so it cannot alter the statement.
          $query = " UPDATE `#@__co_htmls` SET dtime='" . time() . "',result='" . addslashes($body) . "',isdown='1' WHERE aid='" . intval($aid) . "' ";
          if (!$this->dsql->ExecuteNoneQuery($query)) {
              echo $this->dsql->GetError();
          }
      }
      return $body;
  }
  /**
   * Split a URL into the pieces used by custom paging rules.
   *
   * Returned keys: 'query' (text after the first '?'), 'resource' (last path
   * segment when it contains a dot, otherwise ''), 'file' and 'ext' (only
   * present when there is a resource; split at the LAST dot so that
   * "show.2021.html" yields file "show.2021" and ext ".html"), 'path'
   * (everything before the resource, always ending in '/') and 'url' (the
   * unmodified input).
   *
   * @param string $uri URL to analyse
   * @return array
   */
  function GetUrl($uri)
  {
      $arr = array();
      // query: split at the first '?' only so a query containing '?' is kept whole
      $x = array_pad(explode('?', $uri, 2), 2, false);
      $arr['query'] = ($x[1]) ? $x[1] : '';
      // resource
      $x = array_pad(explode('/', $x[0]), 2, false);
      $x_last = array_pop($x);
      $dot = strrpos((string)$x_last, '.');
      if ($dot === false) {
          // Last segment is a directory name: it stays part of the path.
          $arr['resource'] = '';
          $x[] = $x_last;
      } else {
          $arr['resource'] = $x_last;
          $arr['file'] = substr($x_last, 0, $dot);
          $arr['ext'] = substr($x_last, $dot);
      }
      // path
      $arr['path'] = implode('/', $x);
      if (substr($arr['path'], -1) !== '/') {
          $arr['path'] .= '/';
      }
      // url
      $arr['url'] = $uri;
      return $arr;
  }
  /**
   * Collect the content of the paged field across all pages of an article
   * and append it to $this->tmpUnitValue, separating pages with
   * "#p#副标题#e#".
   *
   * Three paging modes are supported, selected by artNotes['sptype']:
   * 'full' (or empty) - the paging area lists every page; 'diyrule' - page
   * URLs are generated from a template for the srul..erul range; anything
   * else - follow next-page links recursively (at most 50 steps).
   *
   * @access public
   * @param string $dourl URL of the page being processed
   * @param string $noteid key of the paged field in $this->artNotes
   * @param string $html HTML of the page being processed
   * @param int $step recursion depth used by the next-link mode
   * @return void
   */
  function GetSpPage($dourl, $noteid, $html, $step = 0)
  {
      $fieldRule = $this->artNotes[$noteid];
      $pageSep = "#p#副标题#e#";
      $pagerHtml = $this->GetHtmlArea('[内容]', $this->artNotes['sppage'], $html);
      $sptype = $this->artNotes["sptype"];

      if ($pagerHtml == '') {
          // No paging area on this page: take the field content of this page only.
          $wasEmpty = ($this->tmpUnitValue == '');
          $piece = $this->GetHtmlArea('[内容]', $fieldRule['match'], $html);
          $this->tmpUnitValue .= $wasEmpty ? $piece : $pageSep . $piece;
          if ($sptype != 'diyrule') {
              return;
          }
      }

      // Complete page list
      if ($sptype == 'full' || $sptype == '') {
          $this->tmpUnitValue .= $this->GetHtmlArea('[内容]', $fieldRule['match'], $html);
          $this->cDedeHtml->GetLinkType = "link";
          $this->cDedeHtml->SetSource($pagerHtml, $dourl, 'link');
          foreach ($this->cDedeHtml->Links as $rawLink => $unused) {
              $pageUrl = $this->cDedeHtml->FillUrl($rawLink);
              if ($pageUrl == $dourl) {
                  continue;
              }
              $pageHtml = $this->DownOnePage($pageUrl);
              if ($pageHtml == '') {
                  continue;
              }
              $content = trim($this->GetHtmlArea('[内容]', $fieldRule['match'], $pageHtml));
              if ($content != '') {
                  $this->tmpUnitValue .= $pageSep . $content;
              }
          }
          return;
      }

      // Page URLs built from a custom template
      if ($sptype == 'diyrule') {
          $urlParts = $this->GetUrl($dourl);
          $template = str_replace(array_keys($urlParts), array_values($urlParts), $this->artNotes['sppage']);
          $template = str_ireplace('{p}', '~p~', $template);
          $template = str_replace(array('{', '}'), '', $template);
          // Pages whose HTML hash equals the previously accepted one are skipped.
          $prevHash = md5($html);
          for ($page = $this->spNotes['srul']; $page <= $this->spNotes['erul']; $page++) {
              $pageUrl = $this->cDedeHtml->FillUrl(str_replace('~p~', $page, $template));
              $pageHtml = $this->DownOnePage($pageUrl);
              $hash = md5($pageHtml);
              if ($hash == $prevHash) {
                  continue;
              }
              $prevHash = $hash;
              if ($pageHtml == '') {
                  continue;
              }
              $content = trim($this->GetHtmlArea('[内容]', $fieldRule['match'], $pageHtml));
              if ($content != '') {
                  $this->tmpUnitValue .= $pageSep . $content;
              }
          }
          return;
      }

      // Previous/next style or incomplete page list
      if ($step > 50) {
          return;
      }
      if ($step == 0) {
          $this->tmpUnitValue .= $this->GetHtmlArea('[内容]', $fieldRule['match'], $html);
      }
      $this->cDedeHtml->GetLinkType = "link";
      $this->cDedeHtml->SetSource($pagerHtml, $dourl, 'link');
      $followed = FALSE;
      foreach ($this->cDedeHtml->Links as $rawLink => $unused) {
          $pageUrl = $this->cDedeHtml->FillUrl($rawLink);
          if (in_array($pageUrl, $this->tmpLinks)) {
              // Already visited during this article.
              continue;
          }
          $nhtml = $this->DownOnePage($pageUrl);
          if ($nhtml != '') {
              $content = trim($this->GetHtmlArea('[内容]', $fieldRule['match'], $nhtml));
              if ($content != '') {
                  $this->tmpUnitValue .= $pageSep . $content;
              }
          }
          $followed = TRUE;
          $this->tmpLinks[] = $pageUrl;
          $dourl = $pageUrl;
          $step++;
      }
      if ($followed) {
          // Continue from the last page that was downloaded.
          $this->GetSpPage($dourl, $noteid, $nhtml, $step);
      }
  }
  /**
   * Extract the part of $html that lies between the two halves of a rule.
   *
   * The rule has the form "START{$sptag}END". In regex mode
   * (noteInfos['matchtype'] == 'regex') both halves are used as regular
   * expression fragments; otherwise they are matched as plain strings.
   *
   * The rule is passed by reference for backward compatibility only and is
   * never modified: escaping is done on a local copy, so the same rule can
   * be applied any number of times.
   *
   * @access public
   * @param string $sptag placeholder that separates the start and end markers
   * @param string $areaRule rule of the form START + placeholder + END
   * @param string $html HTML to search
   * @return string the matched content, or '' when nothing matches
   */
  function GetHtmlArea($sptag, &$areaRule, &$html)
  {
      if ($html == '') {
          return '';
      }
      $useRegex = isset($this->noteInfos['matchtype']) && $this->noteInfos['matchtype'] == 'regex';
      $rule = (string)$areaRule;
      if ($useRegex) {
          // Escape on a copy; writing the escaped text back to the caller
          // would double the backslashes on every further call.
          $rule = str_replace("/", "\\/", $rule);
      }
      $areaRules = explode($sptag, $rule);
      // A rule without a start or without an end marker can never delimit content.
      if ($areaRules[0] == '' || !isset($areaRules[1]) || $areaRules[1] == '') {
          return '';
      }
      // Regular-expression matching
      if ($useRegex) {
          $arr = array();
          preg_match('#' . $areaRules[0] . "(.*)" . $areaRules[1] . "#isU", $html, $arr);
          return empty($arr[1]) ? '' : trim($arr[1]);
      }
      // Plain string matching
      $posstart = strpos($html, $areaRules[0]);
      if ($posstart === FALSE) {
          return '';
      }
      $posstart = $posstart + strlen($areaRules[0]);
      $posend = strpos($html, $areaRules[1], $posstart);
      if ($posend !== FALSE && $posend > $posstart) {
          return substr($html, $posstart, $posend - $posstart);
      }
      return '';
  }
  /**
   * Download one URL and return its HTML after ChangeCode() has been applied.
   *
   * @access public
   * @param string $dourl URL to download
   * @return string page HTML
   */
  function DownOnePage($dourl)
  {
      $http = $this->cHttpDown;
      $http->OpenUrl($dourl);
      $pageHtml = $http->GetHtml();
      $http->Close();
      // ChangeCode() converts the text in place (it takes the string by reference).
      $this->ChangeCode($pageHtml);
      return $pageHtml;
  }
  /**
   * Download a remote resource and save it under the site's base directory.
   *
   * A record of each downloaded URL is kept in `#@__co_mediaurls`; unless the
   * global $notckpic is set, an existing record whose file still exists is
   * reused instead of downloading again. For the first content image of an
   * article a "_lit" copy is created and resized as the thumbnail
   * ($this->breImage). Content images are passed to WaterImg().
   *
   * @access public
   * @param string $dourl URL of the resource
   * @param string $mtype resource type ('img', 'embed', ...)
   * @param bool $islitpic whether the resource itself is a thumbnail
   * @return string site-relative path of the saved file, or $dourl when the
   *                download failed
   */
  function DownMedia($dourl, $mtype = 'img', $islitpic = FALSE)
  {
      global $notckpic;
      if (empty($notckpic)) {
          $notckpic = 0;
      }
      $basedir = $GLOBALS['cfg_basedir'];
      $urlHash = md5($dourl);
      // Check whether this file has been downloaded before
      $tofile = $filename = '';
      if ($notckpic == 0) {
          $row = $this->dsql->GetOne("SELECT hash,tofile FROM `#@__co_mediaurls` WHERE nid='{$this->noteId}' AND hash='" . $urlHash . "' ");
          if (isset($row['tofile'])) {
              $tofile = $filename = $row['tofile'];
          }
      }
      // Download when there is no record or the recorded file is gone
      if ($tofile == '' || !file_exists($basedir . $filename)) {
          $filename = $this->GetRndName($dourl, $mtype);
          if (!preg_match("#^\/#", $filename)) {
              $filename = "/" . $filename;
          }
          if ($this->noteInfos['isref'] == 'yes' && $this->noteInfos['refurl'] != '') {
              // Anti-hotlinking mode: fetch with the configured referer
              if ($this->noteInfos['exptime'] == '') {
                  $this->noteInfos['exptime'] = 10;
              }
              // The timeout comes from the node settings ($this->noteInfos);
              // the class has no "Item" property.
              DownImageKeep($dourl, $this->noteInfos['refurl'], $basedir . $filename, '', 0, $this->noteInfos['exptime']);
          } else {
              // Normal mode
              $this->cHttpDown->OpenUrl($dourl);
              $this->cHttpDown->SaveToBin($basedir . $filename);
              $this->cHttpDown->Close();
          }
          // Download succeeded: store the record
          if (file_exists($basedir . $filename)) {
              if ($tofile == '') {
                  $query = "INSERT INTO `#@__co_mediaurls`(nid,hash,tofile) VALUES ('" . $this->noteId . "', '" . $urlHash . "', '" . addslashes($filename) . "');";
              } else {
                  $query = "UPDATE `#@__co_mediaurls` SET tofile='" . addslashes($filename) . "' WHERE hash='" . $urlHash . "' ";
              }
              $this->dsql->ExecuteNoneQuery($query);
          }
      }
      // Download failed or the file does not exist: fall back to the URL
      if (!file_exists($basedir . $filename)) {
          return $dourl;
      }
      // Create the thumbnail from the first content image
      if ($mtype == 'img' && !$islitpic && $this->breImage == '') {
          $this->breImage = $filename;
          if (!preg_match("#^http:\/\/#", $this->breImage) && file_exists($basedir . $filename)) {
              $filenames = explode('/', $filename);
              $filenamed = $filenames[count($filenames) - 1];
              // "name.ext" becomes "name_lit.ext" in the same directory
              $nfilename = str_replace('.', '_lit.', $filenamed);
              $nfilename = str_replace($filenamed, $nfilename, $filename);
              if (@copy($basedir . $filename, $basedir . $nfilename)) {
                  ImageResize($basedir . $nfilename, $GLOBALS['cfg_ddimg_width'], $GLOBALS['cfg_ddimg_height']);
                  $this->breImage = $nfilename;
              }
          }
      }
      if ($mtype == 'img' && !$islitpic) {
          @WaterImg($basedir . $filename, 'collect');
      }
      return $filename;
  }
  /**
   * Build a unique site-relative file name for a downloaded resource and make
   * sure its target directory ("<image dir>/c<ymd>/") exists.
   *
   * Images get ".gif" or ".png" when the URL path ends with that extension
   * (a trailing query string is ignored) and ".jpg" otherwise; 'embed'
   * resources get ".swf"; other types get no extension.
   *
   * @access public
   * @param string $url source URL of the resource
   * @param string $v resource type ('img', 'embed', ...)
   * @return string
   */
  function GetRndName($url, $v)
  {
      global $cfg_image_dir, $cfg_dir_purview;
      $this->mediaCount++;
      $mnum = $this->mediaCount;
      $timedir = "c" . MyDate("ymd", time());
      // Storage path
      $fullurl = preg_replace("#\/{1,}#", "/", $cfg_image_dir . "/");
      if (!is_dir($GLOBALS['cfg_basedir'] . "/$fullurl")) {
          MkdirAll($GLOBALS['cfg_basedir'] . "/$fullurl", $cfg_dir_purview);
      }
      $fullurl = $fullurl . $timedir . "/";
      if (!is_dir($GLOBALS['cfg_basedir'] . "/$fullurl")) {
          MkdirAll($GLOBALS['cfg_basedir'] . "/$fullurl", $cfg_dir_purview);
      }
      // File name
      $timename = str_replace('.', '', ExecTime());
      $threadnum = isset($_GET['threadnum']) ? intval($_GET['threadnum']) : 0;
      $filename = dd2char($timename . $threadnum . '-' . $mnum . mt_rand(1000, 9999));
      // Extension
      if ($v == 'img') {
          $shortname = '.jpg';
          // Accept "x.gif?ver=1" as well as "x.gif"; DownMedias() treats both
          // forms as images, so the saved name should keep the real type.
          if (preg_match("#\.gif(\?.*)?$#i", $url)) {
              $shortname = '.gif';
          } else if (preg_match("#\.png(\?.*)?$#i", $url)) {
              $shortname = '.png';
          }
      } else if ($v == 'embed') {
          $shortname = '.swf';
      } else {
          $shortname = '';
      }
      $fullname = $fullurl . $filename . $shortname;
      return preg_replace("#\/{1,}#", "/", $fullname);
  }
  /**
   * Extract all configured fields from the downloaded page ($this->tmpHtml)
   * and return them as a sequence of {dede:field} blocks.
   *
   * Keywords and description are taken from the page's meta tags; the other
   * fields follow the rules in $this->artNotes. Fields with a user function
   * are emitted after the plain ones, and a 'litpic' field is always
   * appended last.
   *
   * @access public
   * @param string $dourl URL of the page
   * @param string $needDown whether remote resources should be downloaded
   * @param string $litpic thumbnail taken from the list page
   * @return string '' when no page HTML is available
   */
  function GetPageFields($dourl, $needDown, $litpic = '')
  {
      global $cfg_auot_description;
      if ($this->tmpHtml == '') {
          return '';
      }
      $artitem = '';
      $isPutUnit = FALSE;
      $tmpLtKeys = array();
      $inarr = $inarr2 = array();
      // The trim rules are optional in the node configuration.
      $keywordTrim = isset($this->artNotes['keywordtrim']) ? (string)$this->artNotes['keywordtrim'] : '';
      $descriptionTrim = isset($this->artNotes['descriptiontrim']) ? (string)$this->artNotes['descriptiontrim'] : '';
      // Keywords from the meta tag (either attribute order)
      preg_match("#<meta[\s]+name=['\"]keywords['\"] content=['\"](.*)['\"]#isU", $this->tmpHtml, $inarr);
      preg_match("#<meta[\s]+content=['\"](.*)['\"] name=['\"]keywords['\"]#isU", $this->tmpHtml, $inarr2);
      if (!isset($inarr[1]) && isset($inarr2[1])) {
          $inarr[1] = $inarr2[1];
      }
      if (isset($inarr[1])) {
          $keywords = trim(cn_substr(html2text($inarr[1]), 30));
          if ($keywordTrim != '') {
              $keywords = preg_replace("#" . $keywordTrim . "#isU", '', $keywords);
          }
          if (!preg_match("#,#", $keywords)) {
              $keywords = str_replace(' ', ',', $keywords);
          }
          $artitem .= "{dede:field name='keywords'}" . $keywords . "{/dede:field}\r\n";
      } else {
          $artitem .= "{dede:field name='keywords'}{/dede:field}\r\n";
      }
      // Description from the meta tag (either attribute order)
      preg_match("#<meta[\s]+name=['\"]description['\"][\s]+content=['\"]([^>]*?)['\"]#iU", $this->tmpHtml, $inarr);
      preg_match("#<meta[\s]+content=['\"]([^>]*?)['\"][\s]+name=['\"]description['\"]#iU", $this->tmpHtml, $inarr2);
      if (!isset($inarr[1]) && isset($inarr2[1])) {
          $inarr[1] = $inarr2[1];
      }
      if (isset($inarr[1])) {
          $description = trim(cn_substr(html2text($inarr[1]), $cfg_auot_description));
          if ($descriptionTrim != '') {
              // This pattern is '/'-delimited, so a bare '/' in the rule would
              // end it early; escape only slashes that are not escaped already.
              $safeRule = preg_replace('#(?<!\\\\)/#', '\\/', $descriptionTrim);
              $trimmed = preg_replace("/" . $safeRule . "/isU", '', $description);
              // Keep the untrimmed text when the rule is not a valid pattern.
              if ($trimmed !== null) {
                  $description = $trimmed;
              }
          }
          $artitem .= "{dede:field name='description'}" . $description . "{/dede:field}\r\n";
      } else {
          $artitem .= "{dede:field name='description'}{/dede:field}\r\n";
      }
      foreach ($this->artNotes as $k => $sarr) {
          // Skip the paging settings and every other non-field entry
          if ($k == 'sppage' || $k == 'sptype') {
              continue;
          }
          if (!is_array($sarr)) {
              continue;
          }
          // Special rule or no match rule: use the configured fixed value
          if ($sarr['match'] == '' || trim($sarr['match']) == '[内容]') {
              if ($sarr['value'] != '[内容]') {
                  $v = trim($sarr['value']);
              } else {
                  $v = '';
              }
          } else {
              // Multi-page content collected beforehand is used once
              if ($this->tmpUnitValue != '' && !$isPutUnit && $sarr['isunit'] == 1) {
                  $v = $this->tmpUnitValue;
                  $isPutUnit = TRUE;
              } else {
                  $v = $this->GetHtmlArea('[内容]', $sarr['match'], $this->tmpHtml);
              }
              // Apply the content filter rules
              if (isset($sarr['trim']) && $v != '') {
                  foreach ($sarr['trim'] as $nv) {
                      if ($nv[0] == '') {
                          continue;
                      }
                      $nvs = str_replace("/", "\\/", $nv[0]);
                      $v = preg_replace("#" . $nvs . "#isU", $nv[1], $v);
                  }
              }
              // Download remote resources, or only make their URLs absolute
              if ($sarr['isdown'] == '1') {
                  if ($needDown) {
                      $v = $this->DownMedias($v, $dourl);
                  } else {
                      $v = $this->MediasReplace($v, $dourl);
                  }
              }
          }
          $v = trim($v);
          // Fields with a user function are processed after all plain fields
          if ($sarr['function'] != '') {
              $tmpLtKeys[$k]['v'] = $v;
              $tmpLtKeys[$k]['f'] = $sarr['function'];
          } else {
              // NOTE(review): the character inside the group may be a non-ASCII
              // space in the upstream file - confirm the file encoding.
              $v = preg_replace("#( )$#", '', $v);
              $v = preg_replace("#[\r\n\t ]{1,}$#", '', $v);
              $artitem .= "{dede:field name='$k'}$v{/dede:field}\r\n";
          }
      } //End Foreach
      // Fields that carry a user function
      foreach ($tmpLtKeys as $k => $sarr) {
          $v = $this->RunPHP($sarr['v'], $sarr['f']);
          $v = preg_replace("#( )$#", '', $v);
          $v = preg_replace("#[\r\n\t ]{1,}$#", '', $v);
          $artitem .= "{dede:field name='$k'}$v{/dede:field}\r\n";
      }
      if ($litpic != '' && $this->lists['listpic'] == 1) {
          $artitem .= "{dede:field name='litpic'}" . $this->DownMedia($litpic, 'img', TRUE) . "{/dede:field}\r\n";
      } else {
          $artitem .= "{dede:field name='litpic'}" . $this->breImage . "{/dede:field}\r\n";
      }
      return $artitem;
  }
  /**
   * Download the resources referenced by a piece of HTML and replace their
   * URLs with the values returned by DownMedia().
   *
   * Media tags are handled first (embed entries only when they point to a
   * .swf file), then hyperlinks that point to jpg/gif/png/swf files.
   *
   * @access public
   * @param string $html HTML content (modified in place)
   * @param string $url URL of the page the HTML came from
   * @return string the modified HTML
   */
  function DownMedias(&$html, $url)
  {
      $parser = $this->cDedeHtml;
      $parser->SetSource($html, $url, 'media');
      // Images and flash referenced by media tags
      foreach ($parser->Medias as $src => $type) {
          $absolute = $parser->FillUrl($src);
          $isSwf = preg_match("#\.(swf)\?(.*)$#i", $src) || preg_match("#\.(swf)$#i", $src);
          if ($type == 'embed' && !$isSwf) {
              continue;
          }
          $html = str_replace($src, $this->DownMedia($absolute, $type), $html);
      }
      // Images and flash referenced by hyperlinks
      foreach ($parser->Links as $href => $unused) {
          if (preg_match("#\.(jpg|gif|png)\?(.*)$#i", $href) || preg_match("#\.(jpg|gif|png)$#i", $href)) {
              $type = "img";
          } else if (preg_match("#\.(swf)\?(.*)$#i", $href) || preg_match("#\.(swf)$#i", $href)) {
              $type = "embed";
          } else {
              continue;
          }
          $absolute = $parser->FillUrl($href);
          $html = str_replace($href, $this->DownMedia($absolute, $type), $html);
      }
      return $html;
  }
  /**
   * Replace the media URLs in a piece of HTML with the result of FillUrl(),
   * without downloading anything.
   *
   * @access public
   * @param string $html HTML content (modified in place)
   * @param string $dourl URL of the page the HTML came from
   * @return string the modified HTML
   */
  function MediasReplace(&$html, $dourl)
  {
      $parser = $this->cDedeHtml;
      $parser->SetSource($html, $dourl, 'media');
      foreach ($parser->Medias as $src => $unused) {
          $src = trim($src);
          $html = str_replace($src, $parser->FillUrl($src), $html);
      }
      return $html;
  }
  /**
   * Test the list rule: fetch the first list page (or the RSS feed) and
   * return the links that pass the musthas/nothas filters.
   *
   * On failure an empty array is returned and $this->errString is set.
   * The node configuration in $this->lists is not modified.
   *
   * @param string $dourl receives the URL that was tested (by reference)
   * @return array
   */
  function Testlists(&$dourl)
  {
      $links = array();
      // Take the URLs from an RSS feed
      if ($this->lists['sourcetype'] == 'rss') {
          $dourl = $this->lists['rssurl'];
          $links = GetRssLinks($dourl);
          return $links;
      }
      // Normal case
      if (isset($this->lists['url'][0][0])) {
          $dourl = $this->lists['url'][0][0];
      } else {
          $dourl = '';
          $this->errString = "配置中指定列表的网址错误!\r\n";
          return $links;
      }
      $dhtml = new DedeHtml2();
      $html = $this->DownOnePage($dourl);
      if ($html == '') {
          $this->errString = "读取网址: $dourl 时失败!\r\n";
          return $links;
      }
      // Restrict the search to the configured area when both markers are set
      $areastart = isset($this->lists['areastart']) ? $this->lists['areastart'] : '';
      $areaend = isset($this->lists['areaend']) ? $this->lists['areaend'] : '';
      if (trim($areastart) != '' && trim($areaend) != '') {
          $areabody = $areastart . '[var:区域]' . $areaend;
          $html = $this->GetHtmlArea('[var:区域]', $areabody, $html);
      }
      $dhtml->SetSource($html, $dourl, 'link');
      // Escape on a local copy: writing the escaped rule back into
      // $this->lists would add another backslash on every call.
      $musthas = str_replace('/', '\/', $this->lists['musthas']);
      $nothas = $this->lists['nothas'];
      foreach ($dhtml->Links as $s) {
          if ($nothas != '' && preg_match("#" . $nothas . "#i", $s['link'])) {
              continue;
          }
          if ($musthas != '' && !preg_match("#" . $musthas . "#i", $s['link'])) {
              continue;
          }
          $links[] = $s;
      }
      return $links;
  }
  /**
   * Test the article rules against one URL without saving anything.
   *
   * @access public
   * @param string $dourl URL of the article page
   * @return string the extracted fields, as returned by DownUrl()
   */
  function TestArt($dourl)
  {
      // aid 0, no list thumbnail, and $issave = FALSE so nothing is written.
      $fields = $this->DownUrl(0, $dourl, '', FALSE);
      return $fields;
  }
  /**
   * Collect the seed (article) URLs of the node and queue them in
   * `#@__co_htmls`, recording each URL hash in `#@__co_urls`.
   *
   * List pages are processed in batches: list pages number $glstart + 1
   * through $glstart + $pagesize are handled per call. An RSS source is
   * handled in one call.
   *
   * @access public
   * @param int $islisten -1: re-collect everything (clears history);
   *                      1: skip URLs already recorded for this node
   * @param int $glstart number of list pages already processed
   * @param int $pagesize number of list pages to process in this call
   * @return int number of list pages still to process; 0 when finished;
   *             -1 when the first batch found no links
   */
  function GetSourceUrl($islisten = 0, $glstart = 0, $pagesize = 10)
  {
      // Pre-processing happens on the first batch only
      if ($glstart == 0) {
          if ($islisten == -1) {
              // Re-collect everything
              $this->dsql->ExecuteNoneQuery("DELETE FROM `#@__co_urls` WHERE nid='" . $this->noteId . "'");
              $this->dsql->ExecuteNoneQuery("DELETE FROM `#@__co_htmls` WHERE nid='" . $this->noteId . "' ");
          } else {
              // Listen mode: keep content not yet exported and the URL history
              $this->dsql->ExecuteNoneQuery("DELETE FROM `#@__co_htmls` WHERE nid='" . $this->noteId . "' AND isexport=1 ");
          }
      }
      // Seeds from an RSS feed
      if ($this->lists['sourcetype'] == 'rss') {
          $links = GetRssLinks($this->lists['rssurl']);
          krsort($links);
          foreach ($links as $v) {
              if ($islisten == 1 && $this->IsKnownSeedUrl($v['link'])) {
                  continue;
              }
              // dtime stores the queue time, as in the list-page branch below
              $this->QueueSeedUrl($v, 0);
          }
          return 0;
      }
      // Seeds from list pages
      $tmplink = array();
      $arrStart = 0;
      $moviePostion = 0;
      $endpos = $glstart + $pagesize;
      $totallen = count($this->lists['url']);
      foreach ($this->lists['url'] as $cururls) {
          $moviePostion++;
          if ($moviePostion > $endpos) {
              break;
          }
          if ($moviePostion <= $glstart) {
              // Handled by an earlier batch
              continue;
          }
          $cururl = $cururls[0];
          $typeid = (empty($cururls[1]) ? 0 : $cururls[1]);
          $html = $this->DownOnePage($cururl);
          if (trim($this->lists['areastart']) != '' && trim($this->lists['areaend']) != '') {
              $areabody = $this->lists['areastart'] . '[var:区域]' . $this->lists['areaend'];
              $html = $this->GetHtmlArea('[var:区域]', $areabody, $html);
          }
          $this->cDedeHtml->SetSource($html, $cururl, 'link');
          foreach ($this->cDedeHtml->Links as $v) {
              if ($this->lists['nothas'] != '') {
                  if (preg_match("#" . $this->lists['nothas'] . "#", $v['link'])) {
                      continue;
                  }
              }
              if ($this->lists['musthas'] != '') {
                  if (!preg_match("#" . $this->lists['musthas'] . "#i", $v['link'])) {
                      continue;
                  }
              }
              $tmplink[$arrStart][0] = $v;
              $tmplink[$arrStart][1] = $typeid;
              $arrStart++;
          }
          $this->cDedeHtml->Clear();
      }
      krsort($tmplink);
      // An empty first batch is reported as an error; later empty batches
      // do not stop the collection of the remaining pages.
      if (count($tmplink) == 0 && $glstart == 0) {
          return -1;
      }
      foreach ($tmplink as $vs) {
          if ($islisten == 1 && $this->IsKnownSeedUrl($vs[0]['link'])) {
              continue;
          }
          $this->QueueSeedUrl($vs[0], $vs[1]);
      }
      return ($endpos >= $totallen) ? 0 : ($totallen - $endpos);
  }
  /**
   * Tell whether a seed URL is already recorded in `#@__co_urls` for the
   * current node.
   *
   * @access private
   * @param string $link seed URL
   * @return bool
   */
  function IsKnownSeedUrl($link)
  {
      $lrow = $this->dsql->GetOne("SELECT * FROM `#@__co_urls` WHERE nid='{$this->noteId}' AND hash='" . md5($link) . "' ");
      return is_array($lrow);
  }
  /**
   * Queue one seed link in `#@__co_htmls` and record its hash in
   * `#@__co_urls`.
   *
   * @access private
   * @param array $v link entry with 'title', 'image' and 'link' keys
   * @param mixed $typeid category ID to store with the queued document
   * @return void
   */
  function QueueSeedUrl($v, $typeid)
  {
      $inquery = "INSERT INTO `#@__co_htmls` (`nid` ,`typeid`, `title` , `litpic` , `url` , `dtime` , `isdown` , `isexport` , `result`) "
          . "VALUES ('{$this->noteId}' ,'$typeid', '" . addslashes($v['title']) . "' , '" . addslashes($v['image']) . "' , '" . addslashes($v['link']) . "' , '" . time() . "' , '0' , '0' , ''); ";
      $this->dsql->ExecuteNoneQuery($inquery);
      $inquery = "INSERT INTO `#@__co_urls`(hash,nid) VALUES ('" . md5($v['link']) . "','{$this->noteId}');";
      $this->dsql->ExecuteNoneQuery($inquery);
  }
  /**
   * Post-process a collected value with a PHP snippet taken from the node
   * configuration.
   *
   * In the snippet, @me stands for the value being processed, @body for the
   * page HTML ($this->tmpHtml) and @litpic for the thumbnail path
   * ($this->breImage). The snippet is run with eval(); whatever it leaves in
   * @me is returned.
   *
   * @access public
   * @param string $fvalue the collected value
   * @param string $phpcode PHP code to run
   * @return string
   */
  function RunPHP($fvalue, $phpcode)
  {
      // The local variable names below are part of the contract: the
      // placeholders are rewritten to these names before the code is run.
      $DedeMeValue = $fvalue;
      $phpcode = preg_replace("#'@me'|\"@me\"|@me#isU", '$DedeMeValue', $phpcode);
      if (stripos($phpcode, '@body') !== false) {
          $DedeBodyValue = $this->tmpHtml;
          $phpcode = preg_replace("#'@body'|\"@body\"|@body#isU", '$DedeBodyValue', $phpcode);
      }
      if (stripos($phpcode, '@litpic') !== false) {
          $DedeLitPicValue = $this->breImage;
          $phpcode = preg_replace("#'@litpic'|\"@litpic\"|@litpic#isU", '$DedeLitPicValue', $phpcode);
      }
      eval($phpcode . ";");
      return $DedeMeValue;
  }
  /**
   * Convert downloaded text, in place, from the node's source encoding
   * (noteInfos['sourcelang']) to the site encoding ($cfg_soft_lang).
   *
   * When the site is UTF-8, gb2312 and big5 sources are converted to UTF-8;
   * otherwise utf-8 and big5 sources are converted with utf82gb()/big52gb().
   * Any other combination leaves the text untouched.
   *
   * @access public
   * @param string $str text to convert (modified by reference)
   * @return void
   */
  function ChangeCode(&$str)
  {
      global $cfg_soft_lang;
      if ($cfg_soft_lang != 'utf-8') {
          if ($this->noteInfos["sourcelang"] == "utf-8") {
              $str = utf82gb($str);
          }
          if ($this->noteInfos["sourcelang"] == "big5") {
              $str = big52gb($str);
          }
          return;
      }
      if ($this->noteInfos["sourcelang"] == "gb2312") {
          $str = gb2utf8($str);
      }
      if ($this->noteInfos["sourcelang"] == "big5") {
          // big5 goes through gb first, then to UTF-8
          $str = gb2utf8(big52gb($str));
      }
  }
  948. }//End Class