|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681 |
<?php
// Refuse to run when the DEDEINC constant has not been defined by the including script.
if (!defined('DEDEINC')) {
    exit("Request Error!");
}
/**
 * DedeCMS HTML parser class V1.6 (PHP edition).
 * function c____DedeHtml2();
 * This class serves the collection (scraping) program; its main job is to
 * obtain the images, hyperlinks and similar information found in a region
 * of HTML.
 *
 * @version        $Id: dedehtml2.class.php 1 14:44 2010-07-06Z tianya $
 * @package        DedeCMS.Libraries
 * @copyright      Copyright (c) 2007 - 2019, DesDev, Inc.
 * @license        http://help.dedecms.com/usersguide/license.html
 * @link           http://www.dedecms.com
 */
// ------------------------------------------------------------------------
/**
 * DedeCMS HTML parser class V1.6 (PHP edition).
 *
 * Scans an HTML string for <a>, <img> and <embed> tags and collects the
 * links and media URLs it finds, resolving link targets against the URL the
 * document was fetched from.
 *
 * @package          DedeHtml2
 * @subpackage       DedeCMS.Libraries
 * @link             http://www.dedecms.com
 */
class DedeHtml2
{
    var $CAtt;          // attribute parser instance created by SetSource()
    var $SourceHtml;    // HTML being analysed
    var $Title;         // document title; falls back to BaseUrl after Analyser()
    var $Medias;        // map: media URL => media type ('img' | 'embed')
    var $MediaInfos;    // map: embed URL => array(width, height)
    var $Links;         // map: link URL => array('title', 'link', 'image')
    var $CharSet;
    var $BaseUrl;       // URL passed to SetSource()
    var $BaseUrlPath;   // host + directory of BaseUrl, without scheme or trailing slash
    var $Scheme;        // scheme prefix of BaseUrl, e.g. 'http://'
    var $HomeUrl;       // host (and port, when present) of BaseUrl
    var $IsHead;
    var $ImgHeight;
    var $ImgWidth;
    var $GetLinkType;   // 'link' (only <a>) or 'media' (<img>, <embed>, <a>)

    /**
     * Constructor: puts every property into its initial state.
     */
    function __construct()
    {
        $this->CAtt = '';
        $this->SourceHtml = '';
        $this->Title = '';
        $this->Medias = array();
        $this->MediaInfos = array();
        $this->Links = array();
        $this->BaseUrl = '';
        $this->BaseUrlPath = '';
        $this->Scheme = 'http://';
        $this->HomeUrl = '';
        $this->IsHead = false;
        $this->ImgHeight = 30;
        $this->ImgWidth = 50;
        $this->GetLinkType = 'link';
    }

    /**
     * Legacy (PHP 4 style) constructor name, kept so existing callers still work.
     */
    function DedeHtml2()
    {
        $this->__construct();
    }

    /**
     * Set the HTML content and the URL it came from, then analyse it.
     *
     * The object is reset first, so results of an earlier call are discarded.
     *
     * @access    public
     * @param     string  $html      HTML source
     * @param     string  $url       URL of the document (used to resolve relative links)
     * @param     string  $linktype  'link' or 'media'; empty keeps the default 'link'
     * @return    void
     */
    function SetSource(&$html, $url = '', $linktype = '')
    {
        $this->__construct();
        $this->CAtt = new DedeAttribute2();
        $url = trim($url);
        $this->SourceHtml = $html;
        $this->BaseUrl = $url;

        // parse_url() returns false for malformed input and omits missing
        // components, so every component is read defensively.
        $urls = @parse_url($url);
        if (!is_array($urls)) {
            $urls = array();
        }
        if (isset($urls['scheme']) && $urls['scheme'] != '') {
            $this->Scheme = $urls['scheme'] . '://';
        }
        $this->HomeUrl = isset($urls['host']) ? $urls['host'] : '';
        if (isset($urls['port'])) {
            // Keep a non-default port, otherwise resolved links point at the wrong server.
            $this->HomeUrl .= ':' . $urls['port'];
        }

        // Directory of the document relative to the host: drop a trailing
        // "file.ext" segment (only the last path segment is examined), then
        // the trailing slash.
        $this->BaseUrlPath = $this->HomeUrl . (isset($urls['path']) ? $urls['path'] : '');
        $this->BaseUrlPath = preg_replace("/\/[^\/]*\.[^\/]*$/", "/", $this->BaseUrlPath);
        $this->BaseUrlPath = preg_replace("/\/$/", '', $this->BaseUrlPath);

        if ($linktype != '') {
            $this->GetLinkType = $linktype;
        }
        if ($html != '') {
            $this->Analyser();
        }
    }

    /**
     * Parse SourceHtml and fill Links / Medias / MediaInfos.
     *
     * @access    private
     * @return    void
     */
    function Analyser()
    {
        $cAtt = new DedeAttribute2();
        $cAtt->IsTagName = false;

        // Any link type other than 'media' is treated as 'link', so the tag
        // list is always defined.
        if ($this->GetLinkType == 'media') {
            $needTags = array('img', 'embed', 'a');
            $this->IsHead = true;
        } else {
            $needTags = array('a');
        }
        $tagbreaks = array(' ', '<', '>', "\r", "\n", "\t");

        for ($i = 0; isset($this->SourceHtml[$i]); $i++) {
            if ($this->SourceHtml[$i] != '<') {
                continue;
            }

            // Read the tag name: at most 11 characters, up to the first delimiter.
            $tagStart = $i;
            $tagName = '';
            $j = 0;
            for ($i = $i + 1; isset($this->SourceHtml[$i]); $i++) {
                if ($j > 10) {
                    break;
                }
                $j++;
                if (in_array($this->SourceHtml[$i], $tagbreaks, true)) {
                    break;
                }
                $tagName .= $this->SourceHtml[$i];
            }
            $tagName = strtolower($tagName);

            // Comment: recognised by prefix so "<!--[if IE]>" or "<!--x" count too.
            if (strncmp($tagName, '!--', 3) == 0) {
                $endPos = strpos($this->SourceHtml, '-->', $tagStart + 2);
                if ($endPos !== false) {
                    // Land on the '>' of '-->'; the loop increment then moves to
                    // the first character after the comment without skipping it.
                    $i = $endPos + 2;
                    continue;
                }
            } else if (in_array($tagName, $needTags, true)) {
                // $i is on the delimiter that ended the tag name (or past the end).
                $endPos = isset($this->SourceHtml[$i]) ? strpos($this->SourceHtml, '>', $i) : false;
                if ($endPos === false) {
                    break;
                }
                $attStr = ($endPos > $i) ? (string) substr($this->SourceHtml, $i + 1, $endPos - $i - 1) : '';
                $cAtt->SetSource($attStr);

                if ($tagName == 'img') {
                    $this->InsertMedia($cAtt->GetAtt('src'), 'img');
                } else if ($tagName == 'embed') {
                    $rurl = $this->InsertMedia($cAtt->GetAtt('src'), 'embed');
                    if ($rurl != '') {
                        $this->MediaInfos[$rurl][0] = $cAtt->GetAtt('width');
                        $this->MediaInfos[$rurl][1] = $cAtt->GetAtt('height');
                    }
                } else if ($tagName == 'a') {
                    // GetInnerText() advances $i past the closing </a>.
                    $href = $this->FillUrl($cAtt->GetAtt('href'));
                    $text = $this->GetInnerText($i, 'a');
                    $this->InsertLink($href, $text);
                }
            }

            // Step back one so the loop increment re-examines the current
            // character (it may itself be the '<' of the next tag).
            $i--;
        }

        if ($this->Title == '') {
            $this->Title = $this->BaseUrl;
        }
    }

    /**
     * Reset the collected data.
     *
     * Collections are reset to empty arrays (as in the constructor) so they
     * can be written to again afterwards.
     *
     * @access    private
     * @return    void
     */
    function Clear()
    {
        $this->CAtt = '';
        $this->SourceHtml = '';
        $this->Title = '';
        $this->Links = array();
        $this->Medias = array();
        $this->MediaInfos = array();
        $this->BaseUrl = '';
        $this->BaseUrlPath = '';
    }

    /**
     * Register a media URL.
     *
     * @access    public
     * @param     string  $url    media URL
     * @param     string  $mtype  media type, e.g. 'img' or 'embed'
     * @return    string  the URL, or '' when it was rejected
     */
    function InsertMedia($url, $mtype)
    {
        if (preg_match("/^(javascript:|#|'|\")/", $url)) {
            return '';
        }
        if ($url == '') {
            return '';
        }
        $this->Medias[$url] = $mtype;
        return $url;
    }

    /**
     * Register a link.
     *
     * $atitle is either plain link text or the "img:<url>:txt:<text>" form
     * produced by GetInnerText() for links that wrap an image.
     *
     * @access    public
     * @param     string  $url     link URL
     * @param     string  $atitle  link text (see above)
     * @return    string  the URL, or '' when it was rejected
     */
    function InsertLink($url, $atitle)
    {
        if (preg_match("/^(javascript:|#|'|\")/", $url)) {
            return '';
        }
        if ($url == '') {
            return '';
        }

        if (preg_match('/^img:/', $atitle)) {
            // Split only on the first ':txt:' so text containing that marker
            // survives, and strip only the leading 'img:' prefix.
            $parts = explode(':txt:', $atitle, 2);
            $image = (string) substr($parts[0], 4);
            $atitle = isset($parts[1]) ? $parts[1] : '';

            if (!isset($this->Links[$url])) {
                if ($atitle != '') {
                    $this->Links[$url]['title'] = cn_substr($atitle, 50);
                } else {
                    $this->Links[$url]['title'] = $image;
                }
                $this->Links[$url]['link'] = $url;
            }
            $this->Links[$url]['image'] = $image;
            $this->InsertMedia($image, 'img');
        } else {
            if (!isset($this->Links[$url])) {
                $this->Links[$url]['image'] = '';
                $this->Links[$url]['title'] = $atitle;
                $this->Links[$url]['link'] = $url;
            } else if (strlen($this->Links[$url]['title']) < strlen($atitle)) {
                // Same URL seen again: keep the longer title.
                $this->Links[$url]['title'] = $atitle;
            }
        }
        return $url;
    }

    /**
     * Extract the value after the first '=' of an attribute string
     * (e.g. the charset part of a content-type value).
     *
     * @access    public
     * @param     string  $att  attribute string
     * @return    string  trimmed text after the first '=', or '' when there is none
     */
    function ParCharSet($att)
    {
        $eqPos = strpos($att, '=');
        if ($eqPos === false) {
            return '';
        }
        return trim((string) substr($att, $eqPos + 1));
    }

    /**
     * Turn a (possibly relative) URL into an absolute one.
     *
     * Fragments are dropped. Fragment-only and javascript: targets yield ''.
     * Absolute http/https URLs keep their own scheme; everything else gets
     * the scheme of the base document.
     *
     * @access    public
     * @param     string  $surl  URL as found in the document
     * @return    string  absolute URL, or '' when it cannot be resolved
     */
    function FillUrl($surl)
    {
        $surl = trim($surl);
        if ($surl == '') {
            return '';
        }
        // Same rejects as InsertLink()/InsertMedia(); they must be checked
        // before a scheme is prepended, otherwise they can never match.
        if (preg_match("/^(javascript:|#|'|\")/i", $surl)) {
            return '';
        }
        $pos = strpos($surl, '#');
        if ($pos > 0) {
            $surl = substr($surl, 0, $pos);
        }

        $scheme = $this->Scheme;
        $okurl = '';

        if (substr($surl, 0, 2) === '//') {
            // Protocol-relative URL: already carries its own host.
            $okurl = (string) substr($surl, 2);
            if ($okurl == '') {
                return '';
            }
        } else if ($surl[0] == '/') {
            $okurl = $this->HomeUrl . '/' . $surl;
        } else if ($surl[0] == '.') {
            if (!isset($surl[2])) {
                return '';
            } else if ($surl[1] == '/') {
                // "./file" is relative to the current directory.
                $okurl = $this->BaseUrlPath . '/' . substr($surl, 2);
            } else {
                // Count the '..' segments and collect the remaining path.
                $segments = explode('/', $surl);
                $lastIdx = count($segments) - 1;
                $pathStep = 0;
                $dstr = '';
                foreach ($segments as $idx => $seg) {
                    if ($seg === '..') {
                        $pathStep++;
                    } else {
                        $dstr .= ($idx < $lastIdx) ? $seg . '/' : $seg;
                    }
                }
                $baseParts = explode('/', $this->BaseUrlPath);
                if (count($baseParts) <= $pathStep) {
                    // Would climb above the host.
                    return '';
                }
                $keep = count($baseParts) - $pathStep;
                $pstr = '';
                for ($k = 0; $k < $keep; $k++) {
                    $pstr .= $baseParts[$k] . '/';
                }
                $okurl = $pstr . $dstr;
            }
        } else if (preg_match('/^(https?:\/\/)(.*)$/is', $surl, $m)) {
            // Absolute URL: keep its own scheme instead of the page's.
            $scheme = strtolower($m[1]);
            $okurl = $m[2];
        } else {
            $okurl = $this->BaseUrlPath . '/' . $surl;
        }

        // Collapse repeated slashes produced by the concatenations above.
        $okurl = preg_replace('/\/{1,}/i', '/', $okurl);
        return $scheme . $okurl;
    }

    /**
     * Get the text between the tag at $pos and its closing tag.
     *
     * For 'title' the text runs to the next '<'. For any other tag it runs
     * to the closing tag of $tagname, matched case-insensitively. If the
     * inner HTML contains an <img>, the result has the form
     * "img:<absolute src>:txt:<text>".
     *
     * @access    public
     * @param     int     $pos      offset inside the opening tag; on return, the
     *                              offset just after the closing tag (or just after
     *                              the opening tag when no closing tag exists)
     * @param     string  $tagname  tag name
     * @return    string
     */
    function GetInnerText(&$pos, $tagname)
    {
        $str = '';
        $htmlLen = strlen($this->SourceHtml);
        if ($pos > $htmlLen) {
            return '';
        }
        $startPos = strpos($this->SourceHtml, '>', $pos);
        if ($startPos === false) {
            // Opening tag never closes: nothing to read.
            $pos = $htmlLen;
            return '';
        }

        if ($tagname == 'title') {
            $endPos = strpos($this->SourceHtml, '<', $startPos);
        } else {
            // The look-ahead keeps "</abbr>" etc. from being taken for "</a>".
            $endPos = false;
            $closeRe = '/<\/' . preg_quote($tagname, '/') . '(?![a-z0-9])/i';
            if (preg_match($closeRe, $this->SourceHtml, $m, PREG_OFFSET_CAPTURE, $startPos)) {
                $endPos = $m[0][1];
            }
        }

        if ($endPos !== false && $endPos > $startPos) {
            $textLen = $endPos - $startPos;
            $str = (string) substr($this->SourceHtml, $startPos + 1, $textLen - 1);
            $pos = $startPos + $textLen + strlen("</" . $tagname) + 1;
        } else {
            // No closing tag: continue right after the opening tag.
            $pos = $startPos + 1;
        }

        if ($tagname == 'title') {
            return trim($str);
        }

        preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $str, $imgs);
        if (isset($imgs[2][0])) {
            $txt = trim(Html2Text($str));
            $imgs[2][0] = preg_replace("/[\"']/", '', $imgs[2][0]);
            return "img:" . $this->FillUrl($imgs[2][0]) . ':txt:' . $txt;
        }
        return strip_tags($str);
    }
}//End class
-
/*******************************
 * Attribute parser
 * function c____DedeAttribute2();
 *
 * Splits the inside of a tag ("name key=value key2='v' ...") into a map of
 * lower-cased attribute names to values.
 ********************************/
class DedeAttribute2
{
    var $SourceString = '';
    var $SourceMaxSize = 1024;  // longer strings are not parsed at all
    var $CharToLow = FALSE;     // lower-case attribute values too (names always are)
    var $IsTagName = TRUE;      // whether the string starts with the tag name
    var $Count = -1;            // index of the last parsed item; -1 when none
    var $Items = array();       // attribute name => value

    /**
     * Set the string to parse and parse it.
     *
     * Runs of whitespace are collapsed to single spaces first. Empty strings
     * and strings longer than SourceMaxSize leave Items empty.
     *
     * @param     string  $str  tag content without the surrounding '<' and '>'
     * @return    void
     */
    function SetSource($str = '')
    {
        $this->Count = -1;
        $this->Items = array();
        $this->SourceString = trim((string) preg_replace("/[ \t\r\n]{1,}/", " ", $str));
        $strLen = strlen($this->SourceString);
        // A trailing space terminates an unquoted last value and a bare tag name.
        $this->SourceString .= " ";
        if ($strLen > 0 && $strLen <= $this->SourceMaxSize) {
            $this->PrivateAttParse();
        }
    }

    /**
     * Get the value of an attribute.
     *
     * @param     string  $str  attribute name (case-insensitive)
     * @return    string  the value, or '' when the attribute is absent
     */
    function GetAtt($str)
    {
        if ($str == '') {
            return '';
        }
        $str = strtolower($str);
        return isset($this->Items[$str]) ? $this->Items[$str] : '';
    }

    /**
     * Tell whether an attribute exists.
     *
     * @param     string  $str  attribute name (case-insensitive)
     * @return    bool
     */
    function IsAtt($str)
    {
        if ($str == '') {
            return false;
        }
        return isset($this->Items[strtolower($str)]);
    }

    /**
     * Get the tag name (only available when IsTagName was true while parsing).
     *
     * @return    string
     */
    function GetTagName()
    {
        return $this->GetAtt("tagname");
    }

    /**
     * Get the number of parsed items (Count + 1).
     *
     * @return    int
     */
    function GetCount()
    {
        return $this->Count + 1;
    }

    /**
     * Parse SourceString into Items (called by SetSource() only).
     *
     * Limitation inherited from the original design: an attribute without a
     * value is only recognised when it is the last thing in the string, e.g.
     * "input type=radio name=t1 value=aaa checked".
     *
     * @return    void
     */
    function PrivateAttParse()
    {
        $tmpatt = '';
        $tmpvalue = '';
        $startdd = -1;  // -1: reading a name, 0: expecting a value, 1: reading a value
        $ddtag = '';    // character that terminates the current value
        $strLen = strlen($this->SourceString);
        $j = 0;

        // Read the tag name first.
        if ($this->IsTagName) {
            // A comment is not parsed any further.
            if (isset($this->SourceString[2]) && substr($this->SourceString, 0, 3) === '!--') {
                $this->Items['tagname'] = '!--';
                return;
            }
            for ($i = 0; $i < $strLen; $i++) {
                $d = $this->SourceString[$i];
                $j++;
                if (preg_match("/[ '\"\r\n\t]/i", $d)) {
                    $this->Count++;
                    $this->Items["tagname"] = strtolower(trim($tmpvalue));
                    $tmpvalue = '';
                    break;
                } else {
                    $tmpvalue .= $d;
                }
            }
            // Resume the attribute scan on the delimiter that ended the name.
            if ($j > 0) {
                $j = $j - 1;
            }
        }

        // Walk the rest of the string and collect the attributes.
        for ($i = $j; $i < $strLen; $i++) {
            $d = $this->SourceString[$i];

            if ($startdd == -1) {
                // Attribute name: everything up to '='.
                if ($d != '=') {
                    $tmpatt .= $d;
                } else {
                    $tmpatt = strtolower(trim($tmpatt));
                    $startdd = 0;
                }
            } else if ($startdd == 0) {
                // Find out how the value is delimited: ' or " or whitespace.
                switch ($d) {
                    case ' ':
                        // Skip spaces between '=' and the value. ("break", not
                        // "continue": inside a switch they behave the same, but
                        // "continue" raises a warning on PHP 7.3+.)
                        break;
                    case '\'':
                        $ddtag = '\'';
                        $startdd = 1;
                        break;
                    case '"':
                        $ddtag = '"';
                        $startdd = 1;
                        break;
                    default:
                        $tmpvalue .= $d;
                        $ddtag = ' ';
                        $startdd = 1;
                        break;
                }
            } else if ($startdd == 1) {
                // Attribute value: everything up to the delimiter.
                if ($d == $ddtag) {
                    $this->Count++;
                    if ($this->CharToLow) {
                        $this->Items[$tmpatt] = strtolower(trim($tmpvalue));
                    } else {
                        $this->Items[$tmpatt] = trim($tmpvalue);
                    }
                    $tmpatt = '';
                    $tmpvalue = '';
                    $startdd = -1;
                } else {
                    $tmpvalue .= $d;
                }
            }
        }//End for

        // A trailing attribute without a value. The name is normalised like
        // every other name so GetAtt()/IsAtt() can find it, and leftover
        // whitespace does not create an entry.
        $tmpatt = strtolower(trim($tmpatt));
        if ($tmpatt !== '') {
            $this->Items[$tmpatt] = '';
        }
    }//End Function PrivateAttParse

}//End Class DedeAttribute2
-
- ?>
|