国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

675 lines
18KB

  1. <?php if(!defined('DEDEINC')) exit("Request Error!"); // Include guard: stop with "Request Error!" unless the DEDEINC constant has been defined.
  2. /**
  3. * 织梦HTML解析类V1.6 PHP版
  4. * function c____DedeHtml2();
  5. * 这个类针对于采集程序,主要是获取某区域内的图片、超链接等信息
  6. *
  7. *
  8. * @version $Id: dedehtml2.class.php 1 14:44 2010年7月6日Z tianya $
  9. * @package DedeCMS.Libraries
  10. * @copyright Copyright (c) 2007 - 2018, DesDev, Inc.
  11. * @copyright Copyright (c) 2020, DedeBIZ.COM
  12. * @license https://www.dedebiz.com/license/v6
  13. * @link https://www.dedebiz.com
  14. */
  15. // ------------------------------------------------------------------------
  /**
   * DedeCMS HTML parser V1.6 (PHP edition).
   *
   * Scans an HTML document and collects the hyperlinks (and, in "media" mode,
   * the img/embed sources) it contains, resolving link targets against the
   * URL the document was fetched from.
   *
   * The class relies on DedeAttribute2 (attribute parser) and, for links that
   * wrap an image, on the cn_substr() and Html2Text() helpers, which are not
   * defined in this class (presumably provided by the DedeCMS helper library).
   *
   * @package          DedeHtml2
   * @subpackage       DedeCMS.Libraries
   * @link             http://www.dedecms.com
   */
  class DedeHtml2
  {
      /** DedeAttribute2 instance created by SetSource() ('' before that). */
      var $CAtt;
      /** Raw HTML being analysed. */
      var $SourceHtml;
      /** Document title; Analyser() falls back to BaseUrl when it is empty. */
      var $Title;
      /** Collected media: array(url => 'img'|'embed'). */
      var $Medias;
      /** Extra data for embeds: array(url => array(0 => width, 1 => height)). */
      var $MediaInfos;
      /** Collected links: array(url => array('title' =>, 'link' =>, 'image' =>)). */
      var $Links;
      /** Declared but never assigned inside this class. */
      var $CharSet;
      /** URL of the document as passed to SetSource() (trimmed). */
      var $BaseUrl;
      /** "host[:port]/directory" of the document, without scheme or trailing slash. */
      var $BaseUrlPath;
      /** Scheme used for relative links: 'https' when the document URL is https, else 'http'. */
      var $BaseScheme;
      /** Host of the document URL (with ":port" appended when the URL carries a port). */
      var $HomeUrl;
      /** Set to true by Analyser() in 'media' mode; not read inside this class. */
      var $IsHead;
      /** Initialised to 30; not read inside this class. */
      var $ImgHeight;
      /** Initialised to 50; not read inside this class. */
      var $ImgWidth;
      /** 'link' (anchors only) or 'media' (img, embed and anchors). */
      var $GetLinkType;

      /**
       * Reset every property to its initial value.
       */
      function __construct()
      {
          $this->CAtt = '';
          $this->SourceHtml = '';
          $this->Title = '';
          $this->Medias = array();
          $this->MediaInfos = array();
          $this->Links = array();
          $this->BaseUrl = '';
          $this->BaseUrlPath = '';
          $this->BaseScheme = 'http';
          $this->HomeUrl = '';
          $this->IsHead = false;
          $this->ImgHeight = 30;
          $this->ImgWidth = 50;
          $this->GetLinkType = 'link';
      }

      /**
       * Legacy (PHP 4 style) initialiser kept for callers that invoke it by name.
       */
      function DedeHtml2()
      {
          $this->__construct();
      }

      /**
       * Set the HTML content and the URL it came from, then analyse it.
       *
       * All previously collected state is discarded first.
       *
       * @access    public
       * @param     string  $html      HTML source
       * @param     string  $url       URL the HTML was fetched from
       * @param     string  $linktype  'link' or 'media'; '' keeps the default ('link')
       * @return    void
       */
      function SetSource(&$html, $url = '', $linktype = '')
      {
          $this->__construct();
          $this->CAtt = new DedeAttribute2();
          $url = trim($url);
          $this->SourceHtml = $html;
          $this->BaseUrl = $url;

          // parse_url() returns false for seriously malformed URLs and omits the
          // keys of missing components, so every component is read defensively.
          $urls = ($url == '') ? false : @parse_url($url);
          if (!is_array($urls)) {
              $urls = array();
          }
          $host = isset($urls['host']) ? $urls['host'] : '';
          $path = isset($urls['path']) ? $urls['path'] : '';
          if (isset($urls['scheme']) && strtolower($urls['scheme']) == 'https') {
              $this->BaseScheme = 'https';
          }
          $this->HomeUrl = $host;
          if ($host != '' && isset($urls['port'])) {
              // Keep a non-default port, otherwise resolved links point at the wrong server.
              $this->HomeUrl .= ':'.$urls['port'];
          }

          // Directory of the document: drop a trailing "file.ext" segment only
          // (dots in earlier directory names must not cut the path short).
          $this->BaseUrlPath = $this->HomeUrl.$path;
          $this->BaseUrlPath = preg_replace("/\/[^\/]*\.[^\/]*$/", "/", $this->BaseUrlPath);
          $this->BaseUrlPath = preg_replace("/\/$/", '', $this->BaseUrlPath);

          if ($linktype != '') {
              $this->GetLinkType = $linktype;
          }
          if ($html != '') {
              $this->Analyser();
          }
      }

      /**
       * Walk through SourceHtml and collect links / media into Links and Medias.
       *
       * @access    private
       * @return    void
       */
      function Analyser()
      {
          $cAtt = new DedeAttribute2();
          $cAtt->IsTagName = false;

          // Any link type other than 'media' collects anchors only (an unknown
          // type used to leave the tag list undefined).
          if ($this->GetLinkType == 'media') {
              $needTags = array('img', 'embed', 'a');
              $this->IsHead = true;
          } else {
              $needTags = array('a');
          }
          $tagbreaks = array(' ', '<', '>', "\r", "\n", "\t");

          for ($i = 0; isset($this->SourceHtml[$i]); $i++) {
              if ($this->SourceHtml[$i] != '<') {
                  continue;
              }

              // Skip comments, whether or not whitespace follows "<!--".
              if (substr($this->SourceHtml, $i, 4) == '<!--') {
                  $endPos = strpos($this->SourceHtml, '-->', $i + 4);
                  if ($endPos === false) {
                      // Unterminated comment: step over the marker and keep scanning.
                      $i += 3;
                  } else {
                      // Land on the final '>' so the loop's $i++ resumes right after "-->".
                      $i = $endPos + 2;
                  }
                  continue;
              }

              // Read the tag name (at most 11 characters are inspected).
              $tagName = '';
              $j = 0;
              for ($i = $i + 1; isset($this->SourceHtml[$i]); $i++) {
                  if ($j > 10) {
                      break;
                  }
                  $j++;
                  if (in_array($this->SourceHtml[$i], $tagbreaks, true)) {
                      break;
                  }
                  $tagName .= $this->SourceHtml[$i];
              }
              $tagName = strtolower($tagName);

              if (!in_array($tagName, $needTags, true)) {
                  // If the name was cut short by another '<', re-examine that
                  // character instead of skipping the tag it opens.
                  if (isset($this->SourceHtml[$i]) && $this->SourceHtml[$i] == '<') {
                      $i--;
                  }
                  continue;
              }

              // Input ended right after the tag name: nothing left to parse.
              if (!isset($this->SourceHtml[$i])) {
                  break;
              }

              // $i is on the character that ended the tag name.
              if ($this->SourceHtml[$i] == '>') {
                  // Tag without attributes, e.g. "<a>".
                  $attStr = '';
              } else {
                  $endPos = strpos($this->SourceHtml, '>', $i + 1);
                  if ($endPos === false) {
                      break;
                  }
                  $attStr = (string)substr($this->SourceHtml, $i + 1, $endPos - $i - 1);
              }
              $cAtt->SetSource($attStr);

              if ($tagName == 'img') {
                  $this->InsertMedia($cAtt->GetAtt('src'), 'img');
              } else if ($tagName == 'embed') {
                  $rurl = $this->InsertMedia($cAtt->GetAtt('src'), 'embed');
                  if ($rurl != '') {
                      $this->MediaInfos[$rurl][0] = $cAtt->GetAtt('width');
                      $this->MediaInfos[$rurl][1] = $cAtt->GetAtt('height');
                  }
              } else if ($tagName == 'a') {
                  // GetInnerText() advances $i past the closing tag.
                  $this->InsertLink($this->FillUrl($cAtt->GetAtt('href')), $this->GetInnerText($i, 'a'));
              }

              // Compensate for the loop's $i++ so the current character is re-examined.
              $i--;
          }

          if ($this->Title == '') {
              $this->Title = $this->BaseUrl;
          }
      }

      /**
       * Drop the parsed document and everything collected from it.
       *
       * @access    private
       * @return    void
       */
      function Clear()
      {
          $this->CAtt = '';
          $this->SourceHtml = '';
          $this->Title = '';
          // Collections must stay arrays: InsertLink()/InsertMedia() index into them.
          $this->Links = array();
          $this->Medias = array();
          $this->MediaInfos = array();
          $this->BaseUrl = '';
          $this->BaseUrlPath = '';
      }

      /**
       * Register a media URL.
       *
       * @access    public
       * @param     string  $url    media URL
       * @param     string  $mtype  media type ('img' or 'embed')
       * @return    string  the URL, or '' when it was rejected
       */
      function InsertMedia($url, $mtype)
      {
          if ($url == '' || preg_match("/^(javascript:|#|'|\")/i", $url)) {
              return '';
          }
          $this->Medias[$url] = $mtype;
          return $url;
      }

      /**
       * Register a link.
       *
       * $atitle is either plain text or, for anchors wrapping an image, the
       * "img:<image url>:txt:<text>" form produced by GetInnerText().
       *
       * @access    public
       * @param     string  $url     link URL
       * @param     string  $atitle  link text (see above)
       * @return    string  the URL, or '' when it was rejected
       */
      function InsertLink($url, $atitle)
      {
          if ($url == '' || preg_match("/^(javascript:|#|'|\")/i", $url)) {
              return '';
          }

          if (preg_match('/^img:/', $atitle)) {
              // Split once only, and tolerate a missing ":txt:" part.
              $parts = explode(':txt:', $atitle, 2);
              $aimg = (string)substr($parts[0], 4); // strip the leading "img:" only
              $atitle = isset($parts[1]) ? $parts[1] : '';

              if (!isset($this->Links[$url])) {
                  if ($atitle != '') {
                      $this->Links[$url]['title'] = cn_substr($atitle, 50);
                  } else {
                      $this->Links[$url]['title'] = $aimg;
                  }
                  $this->Links[$url]['link'] = $url;
              }
              $this->Links[$url]['image'] = $aimg;
              $this->InsertMedia($aimg, 'img');
          } else {
              if (!isset($this->Links[$url])) {
                  $this->Links[$url]['image'] = '';
                  $this->Links[$url]['title'] = $atitle;
                  $this->Links[$url]['link'] = $url;
              } else if (strlen($this->Links[$url]['title']) < strlen($atitle)) {
                  // The same URL seen again: keep the longer of the two titles.
                  $this->Links[$url]['title'] = $atitle;
              }
          }
          return $url;
      }

      /**
       * Return whatever follows the first '=' of a string, trimmed
       * (used to pull the charset out of a content-type value).
       *
       * @access    public
       * @param     string  $att  attribute string
       * @return    string  '' when there is no '=' or nothing after it
       */
      function ParCharSet($att)
      {
          $startdd = strpos($att, '=');
          if ($startdd === false) {
              return '';
          }
          $taglen = strlen($att) - $startdd - 1;
          if ($taglen <= 0) {
              return '';
          }
          return trim(substr($att, $startdd + 1, $taglen));
      }

      /**
       * Turn a link found in the document into an absolute URL.
       *
       * Handles absolute http/https URLs, protocol-relative URLs ("//host/x"),
       * root-relative ("/x"), "./x", "../x" and plain relative paths. Fragments
       * are removed. Links that cannot be fetched over HTTP (javascript:,
       * mailto:, other schemes, pure "#fragment" links) yield ''.
       *
       * @access    public
       * @param     string  $surl  URL as written in the document
       * @return    string  absolute URL, or '' when it cannot be resolved
       */
      function FillUrl($surl)
      {
          $surl = trim($surl);
          if ($surl == '') {
              return '';
          }

          // Fragment handling: a pure in-page anchor is not a link to collect.
          $pos = strpos($surl, '#');
          if ($pos === 0) {
              return '';
          }
          if ($pos !== false) {
              $surl = substr($surl, 0, $pos);
          }

          // Relative links inherit the scheme of the document.
          $scheme = ($this->BaseScheme == 'https') ? 'https' : 'http';

          if (preg_match('/^(https?):\/\/(.*)$/is', $surl, $m)) {
              // Absolute URL: keep its own scheme.
              $scheme = strtolower($m[1]);
              $okurl = $m[2];
          } else if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $surl)) {
              // javascript:, mailto:, ftp:, data: ... - nothing this parser can follow.
              return '';
          } else if (substr($surl, 0, 2) == '//') {
              // Protocol-relative URL.
              $okurl = (string)substr($surl, 2);
          } else if ($surl[0] == '/') {
              $okurl = $this->HomeUrl.'/'.$surl;
          } else if ($surl[0] == '.') {
              if (!isset($surl[2])) {
                  return '';
              } else if ($surl[1] == '/') {
                  // "./path": relative to the document's directory.
                  $okurl = $this->BaseUrlPath.'/'.substr($surl, 2);
              } else {
                  // Count the ".." segments and rebuild the remainder of the path.
                  $pathStep = 0;
                  $dstr = '';
                  $segments = explode('/', $surl);
                  $last = count($segments) - 1;
                  foreach ($segments as $idx => $segment) {
                      if ($segment == '..') {
                          $pathStep++;
                      } else if ($idx < $last) {
                          $dstr .= $segment.'/';
                      } else {
                          $dstr .= $segment;
                      }
                  }
                  // Climb that many levels up from the document's directory.
                  $baseParts = explode('/', $this->BaseUrlPath);
                  $keep = count($baseParts) - $pathStep;
                  if ($keep <= 0) {
                      return '';
                  }
                  $pstr = '';
                  for ($idx = 0; $idx < $keep; $idx++) {
                      $pstr .= $baseParts[$idx].'/';
                  }
                  $okurl = $pstr.$dstr;
              }
          } else {
              $okurl = $this->BaseUrlPath.'/'.$surl;
          }

          if ($okurl == '') {
              return '';
          }

          // Collapse duplicate slashes in the host/path part only; a query string
          // may legitimately contain "//" (e.g. "?u=http://...").
          $query = '';
          $qpos = strpos($okurl, '?');
          if ($qpos !== false) {
              $query = substr($okurl, $qpos);
              $okurl = substr($okurl, 0, $qpos);
          }
          $okurl = preg_replace('/\/{2,}/', '/', $okurl);

          return $scheme.'://'.$okurl.$query;
      }

      /**
       * Get the content between the tag at $pos and its closing counterpart.
       *
       * For 'title' the content runs up to the next '<'; for every other tag
       * name it runs up to the next "</a" (case-insensitive). On return $pos
       * has been moved past the closing tag.
       *
       * For anchors the result is either the text with tags stripped, or, when
       * the anchor contains an <img>, "img:<absolute image url>:txt:<text>".
       *
       * @access    public
       * @param     int     $pos      offset inside SourceHtml, at or before the tag's '>'
       * @param     string  $tagname  tag name ('title' or 'a')
       * @return    string
       */
      function GetInnerText(&$pos, $tagname)
      {
          $textLen = 0;
          $str = '';
          $slen = strlen($this->SourceHtml);

          // strpos() rejects offsets outside the string; treat that like "no tag found".
          $startPos = ($pos < 0 || $pos > $slen) ? false : strpos($this->SourceHtml, '>', $pos);
          if ($startPos === false) {
              $pos = $slen;
              return '';
          }

          if ($tagname == 'title') {
              $endPos = strpos($this->SourceHtml, '<', $startPos);
          } else {
              $endPos = stripos($this->SourceHtml, '</a', $startPos);
          }
          if ($endPos !== false && $endPos > $startPos) {
              $textLen = $endPos - $startPos;
              $str = (string)substr($this->SourceHtml, $startPos + 1, $textLen - 1);
          }
          $pos = $startPos + $textLen + strlen("</".$tagname) + 1;

          if ($tagname == 'title') {
              return trim($str);
          }

          // Anchor wrapping an image: report the image URL together with the text.
          preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $str, $imgs);
          if (isset($imgs[2][0])) {
              $txt = trim(Html2Text($str));
              $imgs[2][0] = preg_replace("/[\"']/", '', $imgs[2][0]);
              return "img:".$this->FillUrl($imgs[2][0]).':txt:'.$txt;
          }
          return strip_tags($str);
      }
  }//End class
  /**
   * Attribute parser.
   *
   * Splits the inside of an HTML tag (optionally starting with the tag name)
   * into a name => value map. Attribute names are lower-cased; values may be
   * single-quoted, double-quoted or unquoted.
   */
  class DedeAttribute2
  {
      /** Whitespace-normalised source with one trailing space appended. */
      var $SourceString = '';
      /** Longer sources (after normalisation) are not parsed at all. */
      var $SourceMaxSize = 1024;
      /** When true, attribute values are lower-cased as well (names always are). */
      var $CharToLow = FALSE;
      /** When true, the first word of the source is taken as the tag name. */
      var $IsTagName = TRUE;
      /** Zero-based counter: tag name plus attributes that were given a value. */
      var $Count = -1;
      /** Parsed attributes: array(name => value); the tag name is stored under 'tagname'. */
      var $Items = array();

      /**
       * Set the source string and parse it.
       *
       * @param     string  $str  tag content
       * @return    void
       */
      function SetSource($str = '')
      {
          $this->Count = -1;
          $this->Items = array();
          // Collapse every whitespace run into a single space.
          $this->SourceString = trim(preg_replace("/[ \t\r\n]{1,}/", " ", $str));
          $strLen = strlen($this->SourceString);
          // The trailing space terminates a last unquoted value or valueless name.
          $this->SourceString .= " ";
          if ($strLen > 0 && $strLen <= $this->SourceMaxSize) {
              $this->PrivateAttParse();
          }
      }

      /**
       * Get the value of an attribute.
       *
       * @param     string  $str  attribute name (case-insensitive)
       * @return    string  '' when the attribute is absent
       */
      function GetAtt($str)
      {
          if ($str == '') {
              return '';
          }
          $str = strtolower($str);
          return isset($this->Items[$str]) ? $this->Items[$str] : '';
      }

      /**
       * Tell whether an attribute is present.
       *
       * @param     string  $str  attribute name (case-insensitive)
       * @return    bool
       */
      function IsAtt($str)
      {
          if ($str == '') {
              return false;
          }
          return isset($this->Items[strtolower($str)]);
      }

      /**
       * Get the tag name (only available when IsTagName was true while parsing).
       *
       * @return    string
       */
      function GetTagName()
      {
          return $this->GetAtt("tagname");
      }

      /**
       * Get the number of counted items: the tag name (if parsed) plus every
       * attribute that was given a value. Valueless attributes are not counted.
       *
       * @return    int
       */
      function GetCount()
      {
          return $this->Count + 1;
      }

      /**
       * Parse SourceString into Items (called by SetSource() only).
       *
       * @return    void
       */
      function PrivateAttParse()
      {
          $tmpatt = '';
          $tmpvalue = '';
          $startdd = -1;   // -1: reading a name, 0: expecting a value, 1: reading a value
          $ddtag = '';     // character that ends the value being read
          $strLen = strlen($this->SourceString);
          $j = 0;

          // Read the tag name first.
          if ($this->IsTagName) {
              // A comment is not parsed any further.
              if (substr($this->SourceString, 0, 3) == '!--') {
                  $this->Items['tagname'] = '!--';
                  return;
              }
              for ($i = 0; $i < $strLen; $i++) {
                  $d = $this->SourceString[$i];
                  $j++;
                  if (strpos(" '\"\r\n\t", $d) !== false) {
                      $this->Count++;
                      $this->Items["tagname"] = strtolower(trim($tmpvalue));
                      $tmpvalue = '';
                      break;
                  }
                  $tmpvalue .= $d;
              }
              // Resume on the character that ended the tag name.
              if ($j > 0) {
                  $j = $j - 1;
              }
          }

          // Walk through the rest of the source collecting attributes.
          for ($i = $j; $i < $strLen; $i++) {
              $d = $this->SourceString[$i];

              if ($startdd == -1) {
                  // Reading an attribute name.
                  if ($d == '=') {
                      $tmpatt = strtolower(trim($tmpatt));
                      $startdd = 0;
                  } else if ($d == ' ') {
                      $name = strtolower(trim($tmpatt));
                      $next = isset($this->SourceString[$i + 1]) ? $this->SourceString[$i + 1] : '';
                      if ($name != '' && $next == '=') {
                          // "name =value": keep waiting for the '='.
                          $tmpatt .= $d;
                      } else {
                          // A name followed by something other than '=' has no
                          // value (e.g. "checked"). Separator whitespace and the
                          // '/' of a self-closing tag are not attributes.
                          if ($name != '' && $name != '/') {
                              $this->Items[$name] = '';
                          }
                          $tmpatt = '';
                      }
                  } else {
                      $tmpatt .= $d;
                  }
              } else if ($startdd == 0) {
                  // Find out how the value is delimited: ' " or whitespace.
                  switch ($d) {
                      case ' ':
                          break;
                      case '\'':
                          $ddtag = '\'';
                          $startdd = 1;
                          break;
                      case '"':
                          $ddtag = '"';
                          $startdd = 1;
                          break;
                      default:
                          $tmpvalue .= $d;
                          $ddtag = ' ';
                          $startdd = 1;
                          break;
                  }
              } else if ($startdd == 1) {
                  // Reading the value.
                  if ($d == $ddtag) {
                      $this->Count++;
                      if ($this->CharToLow) {
                          $this->Items[$tmpatt] = strtolower(trim($tmpvalue));
                      } else {
                          $this->Items[$tmpatt] = trim($tmpvalue);
                      }
                      $tmpatt = '';
                      $tmpvalue = '';
                      $startdd = -1;
                  } else {
                      $tmpvalue .= $d;
                  }
              }
          }

          // An attribute whose value never started or was never terminated
          // (e.g. 'href=' or 'href="abc') is recorded with an empty value.
          $tmpatt = strtolower(trim($tmpatt));
          if ($tmpatt != '' && $tmpatt != '/') {
              $this->Items[$tmpatt] = '';
          }
      }//End Function PrivateAttParse
  }//End Class DedeAttribute2
  645. ?>