国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

674 lines
18KB

  1. <?php if(!defined('DEDEINC')) exit("Request Error!"); // Stop with "Request Error!" unless the DEDEINC constant is defined (presumably set by the application bootstrap - confirm).
  2. /**
  3. * 织梦HTML解析类V1.6 PHP版
  4. * function c____DedeHtml2();
  5. * 这个类针对于采集程序,主要是获取某区域内的图片、超链接等信息
  6. *
  7. *
  8. * @version $Id: dedehtml2.class.php 1 14:44 2010年7月6日Z tianya $
  9. * @package DedeCMS.Libraries
  10. * @copyright Copyright (c) 2020, DedeBIZ.COM
  11. * @license https://www.dedebiz.com/license/v6
  12. * @link https://www.dedebiz.com
  13. */
  14. // ------------------------------------------------------------------------
  /**
   * DedeCMS HTML parser V1.6 (PHP version).
   *
   * Scans an HTML string for <a>, <img> and <embed> tags and collects the
   * hyperlinks and media URLs it finds. Intended for the content-collection
   * (crawler) tooling.
   *
   * @package    DedeHtml2
   * @subpackage DedeCMS.Libraries
   * @link       https://www.dedebiz.com
   */
  class DedeHtml2
  {
      /** @var DedeAttribute2|string Attribute parser instance created by SetSource(); '' otherwise. */
      public $CAtt;
      /** @var string HTML being analysed. */
      public $SourceHtml;
      /** @var string Falls back to BaseUrl at the end of Analyser() when empty. */
      public $Title;
      /** @var array Map of media URL => media type ('img' or 'embed'). */
      public $Medias;
      /** @var array Map of embed URL => array(0 => width attribute, 1 => height attribute). */
      public $MediaInfos;
      /** @var array Map of link URL => array('title' => ..., 'link' => ..., 'image' => ...). */
      public $Links;
      /** @var mixed Declared but never assigned inside this class. */
      public $CharSet;
      /** @var string URL the HTML was fetched from (trimmed). */
      public $BaseUrl;
      /** @var string Host plus directory of BaseUrl, without scheme or trailing slash. */
      public $BaseUrlPath;
      /** @var string Host part of BaseUrl. */
      public $HomeUrl;
      /** @var bool Set to true by Analyser() when GetLinkType is 'media'. */
      public $IsHead;
      /** @var int Initialised to 30; not read inside this class. */
      public $ImgHeight;
      /** @var int Initialised to 50; not read inside this class. */
      public $ImgWidth;
      /** @var string 'link' (collect <a> only) or 'media' (collect <img>, <embed> and <a>). */
      public $GetLinkType;

      /**
       * Initialise every collection and setting to its default value.
       */
      public function __construct()
      {
          $this->CAtt = '';
          $this->SourceHtml = '';
          $this->Title = '';
          $this->Medias = array();
          $this->MediaInfos = array();
          $this->Links = array();
          $this->BaseUrl = '';
          $this->BaseUrlPath = '';
          $this->HomeUrl = '';
          $this->IsHead = false;
          $this->ImgHeight = 30;
          $this->ImgWidth = 50;
          $this->GetLinkType = 'link';
      }

      /**
       * Legacy PHP4-style constructor name, kept so existing callers keep working.
       */
      public function DedeHtml2()
      {
          $this->__construct();
      }

      /**
       * Set the HTML to analyse and the URL it came from, then analyse it.
       *
       * All previously collected state is discarded first.
       *
       * @access public
       * @param  string $html     HTML source (taken by reference, not modified)
       * @param  string $url      address the HTML was loaded from
       * @param  string $linktype 'link' or 'media'; '' keeps the default ('link')
       * @return void
       */
      public function SetSource(&$html, $url = '', $linktype = '')
      {
          $this->__construct();
          $this->CAtt = new DedeAttribute2();
          $url = trim($url);
          $this->SourceHtml = $html;
          $this->BaseUrl = $url;

          // parse_url() returns false for malformed URLs and omits missing
          // components, so read host/path defensively instead of indexing blindly.
          $urls = @parse_url($url);
          $host = (is_array($urls) && isset($urls['host'])) ? $urls['host'] : '';
          $path = (is_array($urls) && isset($urls['path'])) ? $urls['path'] : '';
          $this->HomeUrl = $host;

          // Strip the file name (anything after the last slash that contains a
          // dot) and then the trailing slash, leaving "host/dir".
          $basePath = preg_replace("/\/([^\/]*)\.(.*)$/", "/", $host.$path);
          $this->BaseUrlPath = preg_replace("/\/$/", '', $basePath);

          if ($linktype != '') {
              $this->GetLinkType = $linktype;
          }
          if ($html != '') {
              $this->Analyser();
          }
      }

      /**
       * Walk SourceHtml and collect the tags selected by GetLinkType.
       *
       * Requires the DedeAttribute2 class. An unrecognised GetLinkType collects
       * nothing.
       *
       * @access private
       * @return void
       */
      public function Analyser()
      {
          $cAtt = new DedeAttribute2();
          $cAtt->IsTagName = false;

          // Default to "collect nothing" so an unknown link type cannot leave
          // $needTags undefined.
          $needTags = array();
          if ($this->GetLinkType == 'link' || $this->GetLinkType == '') {
              $needTags = array('a');
          }
          if ($this->GetLinkType == 'media') {
              $needTags = array('img', 'embed', 'a');
              $this->IsHead = true;
          }
          $tagbreaks = array(' ', '<', '>', "\r", "\n", "\t");

          for ($i = 0; isset($this->SourceHtml[$i]); $i++) {
              if ($this->SourceHtml[$i] != '<') {
                  continue;
              }
              $tagStart = $i;

              // Read the tag name: stop at a delimiter or after 11 characters.
              $tagName = '';
              $j = 0;
              for ($i = $i + 1; isset($this->SourceHtml[$i]); $i++) {
                  if ($j > 10) {
                      break;
                  }
                  $j++;
                  if (in_array($this->SourceHtml[$i], $tagbreaks, true)) {
                      break;
                  }
                  $tagName .= $this->SourceHtml[$i];
              }
              $tagName = strtolower($tagName);

              if (strncmp($tagName, '!--', 3) === 0) {
                  // Comment (also when no whitespace follows "<!--"): jump to the
                  // first character after the closing "-->".
                  $endPos = strpos($this->SourceHtml, '-->', $tagStart + 4);
                  if ($endPos !== false) {
                      $i = $endPos + 3;
                  }
              } elseif (in_array($tagName, $needTags, true)) {
                  // $i sits on the delimiter that ended the tag name. Searching
                  // from $i itself keeps "<a>" (no attributes) from running on
                  // to the '>' of a later tag.
                  $endPos = strpos($this->SourceHtml, '>', $i);
                  if ($endPos === false) {
                      break;
                  }
                  $attStr = ($endPos > $i) ? substr($this->SourceHtml, $i + 1, $endPos - $i - 1) : '';
                  $cAtt->SetSource($attStr);

                  if ($tagName == 'img') {
                      $this->InsertMedia($cAtt->GetAtt('src'), 'img');
                  } elseif ($tagName == 'embed') {
                      $rurl = $this->InsertMedia($cAtt->GetAtt('src'), 'embed');
                      if ($rurl != '') {
                          $this->MediaInfos[$rurl][0] = $cAtt->GetAtt('width');
                          $this->MediaInfos[$rurl][1] = $cAtt->GetAtt('height');
                      }
                  } elseif ($tagName == 'a') {
                      $linkUrl = $this->FillUrl($cAtt->GetAtt('href'));
                      // GetInnerText() moves $i past the closing </a>.
                      $linkText = $this->GetInnerText($i, 'a');
                      $this->InsertLink($linkUrl, $linkText);
                  }
              }

              // Step back one so the loop increment lands exactly on $i. This
              // re-examines the delimiter, which may itself be the '<' of the
              // next tag.
              $i--;
          }

          if ($this->Title == '') {
              $this->Title = $this->BaseUrl;
          }
      }

      /**
       * Reset the collected data and the source/base URL fields.
       *
       * The collections are reset to empty arrays so the instance can keep
       * collecting afterwards.
       *
       * @access private
       * @return void
       */
      public function Clear()
      {
          $this->CAtt = '';
          $this->SourceHtml = '';
          $this->Title = '';
          $this->Links = array();
          $this->Medias = array();
          $this->MediaInfos = array();
          $this->BaseUrl = '';
          $this->BaseUrlPath = '';
          $this->HomeUrl = '';
      }

      /**
       * Record a media URL.
       *
       * @access public
       * @param  string $url   media address
       * @param  string $mtype media type, e.g. 'img' or 'embed'
       * @return string the URL when recorded; '' when it is empty or starts
       *                with "javascript:", "#" or a quote character
       */
      public function InsertMedia($url, $mtype)
      {
          if ($url == '' || preg_match("/^(javascript:|#|'|\")/", $url)) {
              return '';
          }
          $this->Medias[$url] = $mtype;
          return $url;
      }

      /**
       * Record a hyperlink and its title.
       *
       * A title of the form "img:<url>:txt:<text>" (as produced by
       * GetInnerText() for links wrapping an image) also records the image.
       * Relies on the project helper cn_substr() for that case.
       *
       * @access public
       * @param  string $url    link address
       * @param  string $atitle link text, or "img:<url>:txt:<text>"
       * @return string the URL when recorded; '' when it was rejected
       */
      public function InsertLink($url, $atitle)
      {
          if ($url == '' || preg_match("/^(javascript:|#|'|\")/", $url)) {
              return '';
          }

          if (preg_match('/^img:/', $atitle)) {
              // Split only at the first ":txt:" and strip only the leading
              // "img:" marker, so text or URLs containing those sequences
              // survive intact.
              $parts = explode(':txt:', $atitle, 2);
              $image = preg_replace('/^img:/', '', $parts[0]);
              $text = isset($parts[1]) ? $parts[1] : '';

              if (!isset($this->Links[$url])) {
                  $this->Links[$url]['title'] = ($text != '') ? cn_substr($text, 50) : $image;
                  $this->Links[$url]['link'] = $url;
              }
              $this->Links[$url]['image'] = $image;
              $this->InsertMedia($image, 'img');
              return $url;
          }

          if (!isset($this->Links[$url])) {
              $this->Links[$url]['image'] = '';
              $this->Links[$url]['title'] = $atitle;
              $this->Links[$url]['link'] = $url;
          } elseif (strlen($this->Links[$url]['title']) < strlen($atitle)) {
              // Seen before: keep the longer of the two titles.
              $this->Links[$url]['title'] = $atitle;
          }
          return $url;
      }

      /**
       * Return the trimmed text after the first '=' of an attribute string
       * (e.g. the charset in "text/html; charset=utf-8").
       *
       * @access public
       * @param  string $att attribute string
       * @return string value after '=', or '' when there is none
       */
      public function ParCharSet($att)
      {
          $eqPos = strpos($att, '=');
          if ($eqPos === false) {
              return '';
          }
          $valueLen = strlen($att) - $eqPos - 1;
          if ($valueLen <= 0) {
              return '';
          }
          return trim(substr($att, $eqPos + 1, $valueLen));
      }

      /**
       * Turn a possibly relative address into an absolute URL, using HomeUrl
       * and BaseUrlPath.
       *
       * Any fragment ("#...") is dropped. Relative results always use the
       * "http://" scheme, because the scheme of the base URL is not stored.
       *
       * @access public
       * @param  string $surl address as found in the document
       * @return string absolute URL, or '' when the address is empty, not
       *                navigable ("javascript:", "#..."), or climbs above the
       *                base path
       */
      public function FillUrl($surl)
      {
          $surl = trim((string) $surl);
          if ($surl == '') {
              return '';
          }
          // Script pseudo-links and bare fragments are not fetchable. Reject
          // them here, because once a host is prefixed the "^javascript:|#"
          // filter in InsertLink() can no longer recognise them.
          if (preg_match('/^(javascript:|#)/i', $surl)) {
              return '';
          }
          $pos = strpos($surl, '#');
          if ($pos > 0) {
              $surl = substr($surl, 0, $pos);
          }

          $scheme = 'http://';
          if (preg_match('/^(https?):\/\//i', $surl, $m)) {
              // Already absolute: keep its own scheme.
              $scheme = strtolower($m[1]).'://';
              $okurl = substr($surl, strlen($m[0]));
          } elseif (substr($surl, 0, 2) == '//') {
              // Protocol-relative: the host is part of the address.
              $okurl = substr($surl, 2);
          } elseif ($surl[0] == '/') {
              $okurl = $this->HomeUrl.'/'.$surl;
          } elseif ($surl[0] == '.') {
              if (!isset($surl[2])) {
                  return '';
              }
              if ($surl[1] == '/') {
                  // "./file" is relative to the current directory.
                  $okurl = $this->BaseUrlPath.'/'.substr($surl, 2);
              } else {
                  // Count ".." segments and keep the remaining segments as-is.
                  $pathStep = 0;
                  $dstr = '';
                  $segments = explode('/', $surl);
                  $lastIdx = count($segments) - 1;
                  foreach ($segments as $idx => $segment) {
                      if ($segment == '..') {
                          $pathStep++;
                      } elseif ($idx < $lastIdx) {
                          $dstr .= $segment.'/';
                      } else {
                          $dstr .= $segment;
                      }
                  }
                  $baseParts = explode('/', $this->BaseUrlPath);
                  $keep = count($baseParts) - $pathStep;
                  if ($keep <= 0) {
                      // Would climb above the host.
                      return '';
                  }
                  $pstr = '';
                  for ($k = 0; $k < $keep; $k++) {
                      $pstr .= $baseParts[$k].'/';
                  }
                  $okurl = $pstr.$dstr;
              }
          } else {
              $okurl = $this->BaseUrlPath.'/'.$surl;
          }

          // Collapse runs of slashes produced by the concatenations above.
          $okurl = preg_replace('/\/{1,}/i', '/', $okurl);
          return $scheme.$okurl;
      }

      /**
       * Return the content between the tag that is open at $pos and its end.
       *
       * For 'title' the content runs to the next '<'. For any other tag name it
       * runs to the next "</a" (case-insensitive). $pos is advanced past the
       * closing tag, or to just after the opening tag's '>' when no end is
       * found. Relies on the project helper Html2Text() when the content
       * contains an <img>.
       *
       * @access public
       * @param  int    $pos     offset inside the opening tag (updated by reference)
       * @param  string $tagname tag name ('title' or 'a')
       * @return string trimmed text for 'title'; "img:<url>:txt:<text>" when
       *                the content contains an <img>; otherwise the content
       *                with tags stripped
       */
      public function GetInnerText(&$pos, $tagname)
      {
          $startPos = false;
          if ($pos <= strlen($this->SourceHtml)) {
              $startPos = strpos($this->SourceHtml, '>', $pos);
          }
          if ($startPos === false) {
              // No end of the opening tag: nothing to read, leave $pos alone.
              return '';
          }

          if ($tagname == 'title') {
              $endPos = strpos($this->SourceHtml, '<', $startPos);
          } else {
              $endPos = stripos($this->SourceHtml, '</a', $startPos);
          }

          $str = '';
          if ($endPos !== false && $endPos > $startPos) {
              $textLen = $endPos - $startPos;
              $str = substr($this->SourceHtml, $startPos + 1, $textLen - 1);
              $pos = $startPos + $textLen + strlen("</".$tagname) + 1;
          } else {
              // Unclosed tag: resume right after the opening tag instead of
              // skipping characters as if a closing tag had been consumed.
              $pos = $startPos + 1;
          }

          if ($tagname == 'title') {
              return trim($str);
          }

          preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $str, $imgs);
          if (isset($imgs[2][0])) {
              $txt = trim(Html2Text($str));
              $imgs[2][0] = preg_replace("/[\"']/", '', $imgs[2][0]);
              return "img:".$this->FillUrl($imgs[2][0]).':txt:'.$txt;
          }
          return strip_tags($str);
      }
  }//End class
  /**
   * Attribute parser.
   *
   * Splits the inside of an HTML tag (optionally starting with the tag name)
   * into a map of lower-cased attribute names to values.
   */
  class DedeAttribute2
  {
      /** @var string Whitespace-normalised source with one trailing space appended. */
      public $SourceString = '';
      /** @var int Sources longer than this many bytes are not parsed. */
      public $SourceMaxSize = 1024;
      /** @var bool When true, attribute values are lower-cased (names always are). */
      public $CharToLow = FALSE;
      /** @var bool When true, the first token of the source is taken as the tag name. */
      public $IsTagName = TRUE;
      /** @var int Incremented for the tag name and for each attribute that has a value; -1 when empty. */
      public $Count = -1;
      /** @var array Map of attribute name => value; the tag name is stored under 'tagname'. */
      public $Items = array();

      /**
       * Set the string to parse and parse it.
       *
       * Runs of whitespace are collapsed to one space. Empty sources and
       * sources longer than SourceMaxSize leave Items empty.
       *
       * @param  string $str inside of a tag, without the angle brackets
       * @return void
       */
      public function SetSource($str = '')
      {
          $this->Count = -1;
          $this->Items = array();
          $this->SourceString = trim(preg_replace("/[ \t\r\n]{1,}/", " ", (string) $str));
          $strLen = strlen($this->SourceString);
          // The trailing space terminates an unquoted value or bare tag name
          // at the end of the source.
          $this->SourceString .= " ";
          if ($strLen > 0 && $strLen <= $this->SourceMaxSize) {
              $this->PrivateAttParse();
          }
      }

      /**
       * Get the value of an attribute.
       *
       * @param  string $str attribute name (case-insensitive)
       * @return string the value, or '' when the attribute is absent
       */
      public function GetAtt($str)
      {
          if ($str == '') {
              return '';
          }
          $str = strtolower($str);
          return isset($this->Items[$str]) ? $this->Items[$str] : '';
      }

      /**
       * Tell whether an attribute is present (with or without a value).
       *
       * @param  string $str attribute name (case-insensitive)
       * @return bool
       */
      public function IsAtt($str)
      {
          if ($str == '') {
              return false;
          }
          return isset($this->Items[strtolower($str)]);
      }

      /**
       * Get the tag name parsed from the source ('' when IsTagName was false).
       *
       * @return string
       */
      public function GetTagName()
      {
          return $this->GetAtt("tagname");
      }

      /**
       * Get the number of counted items (see $Count).
       *
       * @return int
       */
      public function GetCount()
      {
          return $this->Count + 1;
      }

      /**
       * Parse SourceString into Items. Meant to be called by SetSource() only.
       *
       * State machine over $startdd: -1 collects attribute names, 0 looks for
       * the value's opening delimiter, 1 collects the value until $ddtag.
       *
       * @return void
       */
      public function PrivateAttParse()
      {
          $tmpatt = '';
          $tmpvalue = '';
          $startdd = -1;
          $ddtag = '';
          $strLen = strlen($this->SourceString);
          $j = 0;

          if ($this->IsTagName) {
              // A comment is not parsed any further.
              if (isset($this->SourceString[2]) && substr($this->SourceString, 0, 3) == '!--') {
                  $this->Items['tagname'] = '!--';
                  return;
              }
              // The tag name runs up to the first space or quote.
              for ($i = 0; $i < $strLen; $i++) {
                  $d = $this->SourceString[$i];
                  $j++;
                  if (preg_match("/[ '\"\r\n\t]/i", $d)) {
                      $this->Count++;
                      $this->Items["tagname"] = strtolower(trim($tmpvalue));
                      break;
                  }
                  $tmpvalue .= $d;
              }
              $tmpvalue = '';
              // Resume attribute parsing at the delimiter that ended the name.
              if ($j > 0) {
                  $j = $j - 1;
              }
          }

          for ($i = $j; $i < $strLen; $i++) {
              $d = $this->SourceString[$i];

              if ($startdd == -1) {
                  // Collecting attribute name(s) up to '='.
                  if ($d != '=') {
                      $tmpatt .= $d;
                  } else {
                      // Only the last token before '=' is this attribute's name.
                      // Earlier tokens are valueless attributes
                      // (e.g. "ismap src=x").
                      $names = preg_split('/\s+/', trim($tmpatt));
                      $tmpatt = strtolower((string) array_pop($names));
                      $this->addValuelessNames(implode(' ', $names));
                      $startdd = 0;
                  }
              } elseif ($startdd == 0) {
                  // Detect how the value is delimited: ', " or (unquoted) a space.
                  switch ($d) {
                      case ' ':
                          break;
                      case '\'':
                          $ddtag = '\'';
                          $startdd = 1;
                          break;
                      case '"':
                          $ddtag = '"';
                          $startdd = 1;
                          break;
                      default:
                          $tmpvalue .= $d;
                          $ddtag = ' ';
                          $startdd = 1;
                          break;
                  }
              } elseif ($startdd == 1) {
                  if ($d == $ddtag) {
                      $this->Count++;
                      $value = trim($tmpvalue);
                      $this->Items[$tmpatt] = $this->CharToLow ? strtolower($value) : $value;
                      $tmpatt = '';
                      $tmpvalue = '';
                      $startdd = -1;
                  } else {
                      $tmpvalue .= $d;
                  }
              }
          }

          if ($startdd == -1) {
              // Trailing valueless attributes, e.g. the "checked" in
              // "input type=radio name=t1 value=aaa checked".
              $this->addValuelessNames($tmpatt);
          } elseif ($tmpatt != '') {
              // '=' was seen but the value never terminated: keep the name only.
              $this->Items[$tmpatt] = '';
          }
      }

      /**
       * Store each whitespace-separated name in $raw as a valueless attribute.
       *
       * Names are lower-cased. A self-closing "/" and names that already have
       * a value are skipped.
       *
       * @param  string $raw raw text collected while reading attribute names
       * @return void
       */
      private function addValuelessNames($raw)
      {
          foreach (preg_split('/\s+/', trim($raw)) as $name) {
              $name = strtolower($name);
              if ($name == '' || $name == '/') {
                  continue;
              }
              if (!isset($this->Items[$name])) {
                  $this->Items[$name] = '';
              }
          }
      }
  }//End Class DedeAttribute2
  644. ?>