国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

682 lines
18KB

  1. <?php if(!defined('DEDEINC')) exit("Request Error!");
  2. /**
  3. * 织梦HTML解析类V1.6 PHP版
  4. * function c____DedeHtml2();
  5. * 这个类针对于采集程序,主要是获取某区域内的图片、超链接等信息
  6. *
  7. *
  8. * @version $Id: dedehtml2.class.php 1 14:44 2010年7月6日Z tianya $
  9. * @package DedeCMS.Libraries
  10. * @copyright Copyright (c) 2007 - 2019, DesDev, Inc.
  11. * @license http://help.dedecms.com/usersguide/license.html
  12. * @link http://www.dedecms.com
  13. */
  14. // ------------------------------------------------------------------------
  15. /**
  16. * 织梦HTML解析类V1.6 PHP版
  17. *
  18. * @package DedeHtml2
  19. * @subpackage DedeCMS.Libraries
  20. * @link http://www.dedecms.com
  21. */
  22. class DedeHtml2
  23. {
  24. var $CAtt;
  25. var $SourceHtml;
  26. var $Title;
  27. var $Medias;
  28. var $MediaInfos;
  29. var $Links;
  30. var $CharSet;
  31. var $BaseUrl;
  32. var $BaseUrlPath;
  33. var $Scheme;
  34. var $HomeUrl;
  35. var $IsHead;
  36. var $ImgHeight;
  37. var $ImgWidth;
  38. var $GetLinkType;
  39. //构造函数
  40. function __construct()
  41. {
  42. $this->CAtt = '';
  43. $this->SourceHtml = '';
  44. $this->Title = '';
  45. $this->Medias = Array();
  46. $this->MediaInfos = Array();
  47. $this->Links = Array();
  48. $this->BaseUrl = '';
  49. $this->BaseUrlPath = '';
  50. $this->Scheme = 'http://';
  51. $this->HomeUrl = '';
  52. $this->IsHead = false;
  53. $this->ImgHeight = 30;
  54. $this->ImgWidth = 50;
  55. $this->GetLinkType = 'link';
  56. }
  57. function DedeHtml2()
  58. {
  59. $this->__construct();
  60. }
  61. /**
  62. * 设置HTML的内容和来源网址
  63. *
  64. * @access public
  65. * @param string $html html资源
  66. * @param string $url 地址
  67. * @param string $linktype 连接类型
  68. * @return void
  69. */
  70. function SetSource(&$html, $url = '', $linktype='')
  71. {
  72. $this->__construct();
  73. $this->CAtt = new DedeAttribute2();
  74. $url = trim($url);
  75. $this->SourceHtml = $html;
  76. $this->BaseUrl = $url;
  77. //判断文档相对于当前的路径
  78. $urls = @parse_url($url);
  79. $this->Scheme = $urls['scheme'] . '://';
  80. $this->HomeUrl = $urls['host'];
  81. $this->BaseUrlPath = $this->HomeUrl.$urls['path'];
  82. $this->BaseUrlPath = preg_replace("/\/([^\/]*)\.(.*)$/","/",$this->BaseUrlPath);
  83. $this->BaseUrlPath = preg_replace("/\/$/",'',$this->BaseUrlPath);
  84. if($linktype!='')
  85. {
  86. $this->GetLinkType = $linktype;
  87. }
  88. if($html != '')
  89. {
  90. $this->Analyser();
  91. }
  92. }
  93. /**
  94. * 解析HTML
  95. *
  96. * @access private
  97. * @return void
  98. */
  99. function Analyser()
  100. {
  101. $cAtt = new DedeAttribute2();
  102. $cAtt->IsTagName = false;
  103. $c = '';
  104. $i = 0;
  105. $startPos = 0;
  106. $endPos = 0;
  107. $wt = 0;
  108. $ht = 0;
  109. $scriptdd = 0;
  110. $attStr = '';
  111. $tmpValue = '';
  112. $tmpValue2 = '';
  113. $tagName = '';
  114. $hashead = 0;
  115. $slen = strlen($this->SourceHtml);
  116. if($this->GetLinkType=='link' || $this->GetLinkType=='')
  117. {
  118. $needTags = array('a');
  119. }
  120. if($this->GetLinkType=='media')
  121. {
  122. $needTags = array('img','embed','a');
  123. $this->IsHead = true;
  124. }
  125. $tagbreaks = array(' ','<','>',"\r","\n","\t");
  126. for(;isset($this->SourceHtml[$i]);$i++)
  127. {
  128. if($this->SourceHtml[$i]=='<')
  129. {
  130. $tagName = '';
  131. $j = 0;
  132. for($i=$i+1; isset($this->SourceHtml[$i]); $i++)
  133. {
  134. if($j>10)
  135. {
  136. break;
  137. }
  138. $j++;
  139. if( in_array($this->SourceHtml[$i],$tagbreaks) )
  140. {
  141. break;
  142. }
  143. else
  144. {
  145. $tagName .= $this->SourceHtml[$i];
  146. }
  147. }
  148. $tagName = strtolower($tagName);
  149. //标记为注解
  150. if($tagName=='!--')
  151. {
  152. $endPos = strpos($this->SourceHtml,'-->',$i);
  153. if($endPos !== false)
  154. {
  155. $i=$endPos+3;
  156. }
  157. continue;
  158. }
  159. //标记在指定集合内
  160. else if( in_array($tagName,$needTags) )
  161. {
  162. $startPos = $i;
  163. $endPos = strpos($this->SourceHtml,'>',$i+1);
  164. if($endPos===false)
  165. {
  166. break;
  167. }
  168. $attStr = substr($this->SourceHtml,$i+1,$endPos-$startPos-1);
  169. $cAtt->SetSource($attStr);
  170. if($tagName=='img')
  171. {
  172. $this->InsertMedia($cAtt->GetAtt('src'),'img');
  173. }
  174. else if($tagName=='embed')
  175. {
  176. $rurl = $this->InsertMedia($cAtt->GetAtt('src'),'embed');
  177. if($rurl != '')
  178. {
  179. $this->MediaInfos[$rurl][0] = $cAtt->GetAtt('width');
  180. $this->MediaInfos[$rurl][1] = $cAtt->GetAtt('height');
  181. }
  182. }
  183. else if($tagName=='a')
  184. {
  185. $this->InsertLink($this->FillUrl($cAtt->GetAtt('href')),$this->GetInnerText($i,'a'));
  186. }
  187. }
  188. else
  189. {
  190. continue;
  191. }
  192. $i--;
  193. }//End if char
  194. }//End for
  195. if($this->Title == '')
  196. {
  197. $this->Title = $this->BaseUrl;
  198. }
  199. }
  200. /**
  201. * 重置资源
  202. *
  203. * @access private
  204. * @return void
  205. */
  206. function Clear()
  207. {
  208. $this->CAtt = '';
  209. $this->SourceHtml = '';
  210. $this->Title = '';
  211. $this->Links = '';
  212. $this->Medias = '';
  213. $this->BaseUrl = '';
  214. $this->BaseUrlPath = '';
  215. }
  216. /**
  217. * 分析链接
  218. *
  219. * @access public
  220. * @param string $url 地址
  221. * @param string $mtype 媒体类型
  222. * @return string
  223. */
  224. function InsertMedia($url, $mtype)
  225. {
  226. if( preg_match("/^(javascript:|#|'|\")/", $url) )
  227. {
  228. return '';
  229. }
  230. if($url == '')
  231. {
  232. return '';
  233. }
  234. $this->Medias[$url]=$mtype;
  235. return $url;
  236. }
  237. /**
  238. * 分析链接
  239. *
  240. * @access public
  241. * @param string $url 地址
  242. * @param string $atitle 文档
  243. * @return string
  244. */
  245. function InsertLink($url, $atitle)
  246. {
  247. if( preg_match("/^(javascript:|#|'|\")/", $url) )
  248. {
  249. return '';
  250. }
  251. if($url == '')
  252. {
  253. return '';
  254. }
  255. if( preg_match('/^img:/', $atitle) )
  256. {
  257. list($aimg, $atitle) = explode(':txt:', $atitle);
  258. if(!isset($this->Links[$url]))
  259. {
  260. if($atitle != '')
  261. {
  262. $this->Links[$url]['title'] = cn_substr($atitle,50);
  263. }
  264. else
  265. {
  266. $this->Links[$url]['title'] = preg_replace('/img:/', '', $aimg);
  267. }
  268. $this->Links[$url]['link'] = $url;
  269. }
  270. $this->Links[$url]['image'] = preg_replace('/img:/', '', $aimg);
  271. $this->InsertMedia($this->Links[$url]['image'], 'img');
  272. }
  273. else
  274. {
  275. if(!isset($this->Links[$url]))
  276. {
  277. $this->Links[$url]['image'] = '';
  278. $this->Links[$url]['title'] = $atitle;
  279. $this->Links[$url]['link'] = $url;
  280. }
  281. else
  282. {
  283. if(strlen($this->Links[$url]['title']) < strlen($atitle)) $this->Links[$url]['title'] = $atitle;
  284. }
  285. }
  286. return $url;
  287. }
  288. /**
  289. * 分析content-type中的字符类型
  290. *
  291. * @access public
  292. * @param string $att 属性字符串
  293. * @return string
  294. */
  295. function ParCharSet($att)
  296. {
  297. $startdd=0;
  298. $taglen=0;
  299. $startdd = strpos($att,'=');
  300. if($startdd===false)
  301. {
  302. return '';
  303. }
  304. else
  305. {
  306. $taglen = strlen($att)-$startdd-1;
  307. if($taglen<=0)
  308. {
  309. return '';
  310. }
  311. return trim(substr($att, $startdd+1, $taglen));
  312. }
  313. }
  314. /**
  315. * 补全相对网址
  316. *
  317. * @access public
  318. * @param string $surl 地址
  319. * @return string
  320. */
  321. function FillUrl($surl)
  322. {
  323. $i = $pathStep = 0;
  324. $dstr = $pstr = $okurl = '';
  325. $surl = trim($surl);
  326. if($surl == '')
  327. {
  328. return '';
  329. }
  330. $pos = strpos($surl,'#');
  331. if($pos>0)
  332. {
  333. $surl = substr($surl,0,$pos);
  334. }
  335. if($surl[0]=='/')
  336. {
  337. $okurl = $this->HomeUrl.'/'.$surl;
  338. }
  339. else if($surl[0]=='.')
  340. {
  341. if(!isset($surl[2]))
  342. {
  343. return '';
  344. }
  345. else if($surl[0]=='/')
  346. {
  347. $okurl = $this->BaseUrlPath."/".substr($surl,2,strlen($surl)-2);
  348. }
  349. else
  350. {
  351. $urls = explode('/',$surl);
  352. foreach($urls as $u)
  353. {
  354. if($u=='..')
  355. {
  356. $pathStep++;
  357. }
  358. else if($i<count($urls)-1)
  359. {
  360. $dstr .= $urls[$i].'/';
  361. }
  362. else
  363. {
  364. $dstr .= $urls[$i];
  365. }
  366. $i++;
  367. }
  368. $urls = explode('/',$this->BaseUrlPath);
  369. if(count($urls) <= $pathStep)
  370. {
  371. return '';
  372. }
  373. else
  374. {
  375. $pstr = '';
  376. for($i=0;$i<count($urls)-$pathStep;$i++){ $pstr .= $urls[$i].'/'; }
  377. $okurl = $pstr.$dstr;
  378. }
  379. }
  380. }
  381. else
  382. {
  383. if( strlen($surl) < 7 )
  384. {
  385. $okurl = $this->BaseUrlPath.'/'.$surl;
  386. }
  387. else if( strtolower(substr($surl,0,7))=='http://' )
  388. {
  389. $okurl = preg_replace('/^http:\/\//i', '', $surl);
  390. }
  391. else if( strtolower(substr($surl,0,8))=='https://' )
  392. {
  393. $okurl = preg_replace('/^https:\/\//i', '', $surl);
  394. }
  395. else
  396. {
  397. $okurl = $this->BaseUrlPath.'/'.$surl;
  398. }
  399. }
  400. $okurl = preg_replace('/\/{1,}/i', '/', $okurl);
  401. return $this->Scheme . $okurl;
  402. }
  403. /**
  404. * 获得和下一个标记之间的文本内容
  405. *
  406. * @access public
  407. * @param string $pos 位置地址
  408. * @param string $tagname 标签名称
  409. * @return string
  410. */
  411. function GetInnerText(&$pos,$tagname)
  412. {
  413. $startPos=0;
  414. $endPos=0;
  415. $textLen=0;
  416. $str = '';
  417. $startPos = strpos($this->SourceHtml,'>',$pos);
  418. if($tagname=='title')
  419. {
  420. $endPos = strpos($this->SourceHtml,'<',$startPos);
  421. }
  422. else
  423. {
  424. $endPos1 = strpos($this->SourceHtml,'</a',$startPos);
  425. $endPos2 = strpos($this->SourceHtml,'</A',$startPos);
  426. if($endPos1===false)
  427. {
  428. $endPos = $endPos2;
  429. }
  430. else if($endPos2===false)
  431. {
  432. $endPos = $endPos1;
  433. }
  434. else
  435. {
  436. $endPos = ($endPos1 < $endPos2 ? $endPos1 : $endPos2 );
  437. }
  438. }
  439. if($endPos > $startPos)
  440. {
  441. $textLen = $endPos-$startPos;
  442. $str = substr($this->SourceHtml,$startPos+1,$textLen-1);
  443. }
  444. $pos = $startPos + $textLen + strlen("</".$tagname) + 1;
  445. if($tagname=='title')
  446. {
  447. return trim($str);
  448. }
  449. else
  450. {
  451. preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU",$str,$imgs);
  452. if(isset($imgs[2][0]))
  453. {
  454. $txt = trim(Html2Text($str));
  455. $imgs[2][0] = preg_replace("/[\"']/",'',$imgs[2][0]);
  456. return "img:".$this->FillUrl($imgs[2][0]).':txt:'.$txt;
  457. }
  458. else
  459. {
  460. $str = strip_tags($str);
  461. //$str = preg_replace('/<\/(.*)$/i', '', $str);
  462. //$str = trim(preg_replace('/^(.*)>/i','',$str));
  463. return $str;
  464. }
  465. }
  466. }
  467. }//End class
  468. /*******************************
  469. //属性解析器
  470. function c____DedeAttribute2();
  471. ********************************/
  472. class DedeAttribute2
  473. {
  474. var $SourceString = '';
  475. var $SourceMaxSize = 1024;
  476. var $CharToLow = FALSE; //属性值是否不分大小写(属性名统一为小写)
  477. var $IsTagName = TRUE; //是否解析标记名称
  478. var $Count = -1;
  479. var $Items = array(); //属性元素的集合
  480. //设置属性解析器源字符串
  481. function SetSource($str = '')
  482. {
  483. $this->Count = -1;
  484. $this->Items =array();
  485. $strLen = 0;
  486. $this->SourceString = trim(preg_replace("/[ \t\r\n]{1,}/"," ",$str));
  487. $strLen = strlen($this->SourceString);
  488. $this->SourceString .= " "; //增加一个空格结尾,以方便处理没有属性的标记
  489. if($strLen>0&&$strLen<=$this->SourceMaxSize)
  490. {
  491. $this->PrivateAttParse();
  492. }
  493. }
  494. //获得某个属性
  495. function GetAtt($str)
  496. {
  497. if($str == '')
  498. {
  499. return '';
  500. }
  501. $str = strtolower($str);
  502. if(isset($this->Items[$str]))
  503. {
  504. return $this->Items[$str];
  505. }
  506. else
  507. {
  508. return '';
  509. }
  510. }
  511. //判断属性是否存在
  512. function IsAtt($str)
  513. {
  514. if($str == '')
  515. {
  516. return false;
  517. }
  518. $str = strtolower($str);
  519. if(isset($this->Items[$str]))
  520. {
  521. return true;
  522. }
  523. else
  524. {
  525. return false;
  526. }
  527. }
  528. //获得标记名称
  529. function GetTagName()
  530. {
  531. return $this->GetAtt("tagname");
  532. }
  533. // 获得属性个数
  534. function GetCount()
  535. {
  536. return $this->Count+1;
  537. }
  538. //解析属性(仅给SetSource调用)
  539. function PrivateAttParse()
  540. {
  541. $d = '';
  542. $tmpatt = '';
  543. $tmpvalue = '';
  544. $startdd = -1;
  545. $ddtag = '';
  546. $strLen = strlen($this->SourceString);
  547. $j = 0;
  548. //这里是获得标记的名称
  549. if($this->IsTagName)
  550. {
  551. //如果属性是注解,不再解析里面的内容,直接返回
  552. if(isset($this->SourceString[2]))
  553. {
  554. if($this->SourceString[0].$this->SourceString[1].$this->SourceString[2]=='!--')
  555. {
  556. $this->Items['tagname'] = '!--';
  557. return ;
  558. }
  559. }
  560. for($i=0;$i<$strLen;$i++)
  561. {
  562. $d = $this->SourceString[$i];
  563. $j++;
  564. if(preg_match("/[ '\"\r\n\t]/i", $d))
  565. {
  566. $this->Count++;
  567. $this->Items["tagname"]=strtolower(trim($tmpvalue));
  568. $tmpvalue = ''; break;
  569. }
  570. else
  571. {
  572. $tmpvalue .= $d;
  573. }
  574. }
  575. if($j>0)
  576. {
  577. $j = $j-1;
  578. }
  579. }
  580. //遍历源字符串,获得各属性
  581. for($i=$j;$i<$strLen;$i++)
  582. {
  583. $d = $this->SourceString[$i];
  584. //获得属性的键
  585. if($startdd==-1)
  586. {
  587. if($d!='=')
  588. {
  589. $tmpatt .= $d;
  590. }
  591. else
  592. {
  593. $tmpatt = strtolower(trim($tmpatt));
  594. $startdd=0;
  595. }
  596. }
  597. //检测属性值是用什么包围的,允许使用 '' '' 或空白
  598. else if($startdd==0)
  599. {
  600. switch($d)
  601. {
  602. case ' ':
  603. continue;
  604. break;
  605. case '\'':
  606. $ddtag='\'';
  607. $startdd=1;
  608. break;
  609. case '"':
  610. $ddtag='"';
  611. $startdd=1;
  612. break;
  613. default:
  614. $tmpvalue.=$d;
  615. $ddtag=' ';
  616. $startdd=1;
  617. break;
  618. }
  619. }
  620. //获得属性的值
  621. else if($startdd==1)
  622. {
  623. if($d==$ddtag)
  624. {
  625. $this->Count++;
  626. if($this->CharToLow)
  627. {
  628. $this->Items[$tmpatt] = strtolower(trim($tmpvalue));
  629. }
  630. else
  631. {
  632. $this->Items[$tmpatt] = trim($tmpvalue);
  633. }
  634. $tmpatt = '';
  635. $tmpvalue = '';
  636. $startdd=-1;
  637. }
  638. else
  639. {
  640. $tmpvalue.=$d;
  641. }
  642. }
  643. }//End for
  644. //处理没有值的属性(必须放在结尾才有效)如:"input type=radio name=t1 value=aaa checked"
  645. if($tmpatt != '')
  646. {
  647. $this->Items[$tmpatt] = '';
  648. }
  649. }//End Function PrivateAttParse
  650. }//End Class DedeAttribute2
  651. ?>