|
- <?php if(!defined('DEDEINC')) exit('dedecms');
- /**
- * 采集小助手
- *
- * @version $Id: charset.helper.php 1 2010-07-05 11:43:09Z tianya $
- * @package DedeCMS.Helpers
- * @copyright Copyright (c) 2020, DedeBIZ.COM
- * @license https://www.dedebiz.com/license
- * @link https://www.dedebiz.com
- */
-
- require_once(DEDEINC."/dedehttpdown.class.php");
- require_once(DEDEINC."/dedetag.class.php");
- require_once(DEDEINC."/charset.func.php");
-
- /**
- * 下载图片
- *
- * @access public
- * @param string $gurl 地址
- * @param string $rfurl 来源地址
- * @param string $filename 文件名
- * @param string $gcookie 调整cookie
- * @param string $JumpCount 跳转计数
- * @param string $maxtime 最大次数
- * @return string
- */
- function DownImageKeep($gurl, $rfurl, $filename, $gcookie="", $JumpCount=0, $maxtime=30)
- {
- $urlinfos = GetHostInfo($gurl);
- $ghost = trim($urlinfos['host']);
- if($ghost=='')
- {
- return FALSE;
- }
- $gquery = $urlinfos['query'];
- if($gcookie=="" && !empty($rfurl))
- {
- $gcookie = RefurlCookie($rfurl);
- }
- $sessionQuery = "GET $gquery HTTP/1.1\r\n";
- $sessionQuery .= "Host: $ghost\r\n";
- $sessionQuery .= "Referer: $rfurl\r\n";
- $sessionQuery .= "Accept: */*\r\n";
- $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
- if($gcookie!="" && !preg_match("/[\r\n]/", $gcookie))
- {
- $sessionQuery .= $gcookie."\r\n";
- }
- $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";
- $errno = "";
- $errstr = "";
- $m_fp = fsockopen($ghost, 80, $errno, $errstr,10);
- fwrite($m_fp,$sessionQuery);
- $lnum = 0;
-
- //获取详细应答头
- $m_httphead = Array();
- $httpstas = explode(" ",fgets($m_fp,256));
- $m_httphead["http-edition"] = trim($httpstas[0]);
- $m_httphead["http-state"] = trim($httpstas[1]);
- while(!feof($m_fp))
- {
- $line = trim(fgets($m_fp,256));
- if($line == "" || $lnum>100)
- {
- break;
- }
- $hkey = "";
- $hvalue = "";
- $v = 0;
- for($i=0; $i<strlen($line); $i++)
- {
- if($v==1)
- {
- $hvalue .= $line[$i];
- }
- if($line[$i]==":")
- {
- $v = 1;
- }
- if($v==0)
- {
- $hkey .= $line[$i];
- }
- }
- $hkey = trim($hkey);
- if($hkey!="")
- {
- $m_httphead[strtolower($hkey)] = trim($hvalue);
- }
- }
-
- //分析返回记录
- if(preg_match("/^3/", $m_httphead["http-state"]))
- {
- if(isset($m_httphead["location"]) && $JumpCount<3)
- {
- $JumpCount++;
- DownImageKeep($gurl,$rfurl,$filename,$gcookie,$JumpCount);
- }
- else
- {
- return FALSE;
- }
- }
- if(!preg_match("/^2/", $m_httphead["http-state"]))
- {
- return FALSE;
- }
- if(!isset($m_httphead))
- {
- return FALSE;
- }
- $contentLength = $m_httphead['content-length'];
-
- //保存文件
- $fp = fopen($filename,"w") or die("写入文件:{$filename} 失败!");
- $i=0;
- $okdata = "";
- $starttime = time();
- while(!feof($m_fp))
- {
- $okdata .= fgetc($m_fp);
- $i++;
-
- //超时结束
- if(time()-$starttime>$maxtime)
- {
- break;
- }
-
- //到达指定大小结束
- if($i >= $contentLength)
- {
- break;
- }
- }
- if($okdata!="")
- {
- fwrite($fp,$okdata);
- }
- fclose($fp);
- if($okdata=="")
- {
- @unlink($filename);
- fclose($m_fp);
- return FALSE;
- }
- fclose($m_fp);
- return TRUE;
- }
-
- /**
- * 获得某页面返回的Cookie信息
- *
- * @access public
- * @param string $gurl 调整地址
- * @return string
- */
- function RefurlCookie($gurl)
- {
- global $gcookie,$lastRfurl;
- $gurl = trim($gurl);
- if(!empty($gcookie) && $lastRfurl==$gurl)
- {
- return $gcookie;
- }
- else
- {
- $lastRfurl=$gurl;
- }
- if(trim($gurl)=='')
- {
- return '';
- }
- $urlinfos = GetHostInfo($gurl);
- $ghost = $urlinfos['host'];
- $gquery = $urlinfos['query'];
- $sessionQuery = "GET $gquery HTTP/1.1\r\n";
- $sessionQuery .= "Host: $ghost\r\n";
- $sessionQuery .= "Accept: */*\r\n";
- $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
- $sessionQuery .= "Connection: Close\r\n\r\n";
- $errno = "";
- $errstr = "";
- $m_fp = fsockopen($ghost, 80, $errno, $errstr,10) or die($ghost.'<br />');
- fwrite($m_fp,$sessionQuery);
- $lnum = 0;
-
- //获取详细应答头
- $gcookie = "";
- while(!feof($m_fp))
- {
- $line = trim(fgets($m_fp,256));
- if($line == "" || $lnum>100)
- {
- break;
- }
- else
- {
- if(preg_match("/^cookie/i", $line))
- {
- $gcookie = $line;
- break;
- }
- }
- }
- fclose($m_fp);
- return $gcookie;
- }
-
- /**
- * 获得网址的host和query部份
- *
- * @access public
- * @param string $gurl 调整地址
- * @return string
- */
- function GetHostInfo($gurl)
- {
- $gurl = preg_replace("/^http:\/\//i", "", trim($gurl));
- $garr['host'] = preg_replace("/\/(.*)$/i", "", $gurl);
- $garr['query'] = "/".preg_replace("/^([^\/]*)\//i", "", $gurl);
- return $garr;
- }
-
- /**
- * HTML里的图片转DEDE格式
- *
- * @access public
- * @param string $body 文章内容
- * @return string
- */
- function TurnImageTag(&$body)
- {
- global $cfg_album_width,$cfg_ddimg_width;
- if(empty($cfg_album_width))
- {
- $cfg_album_width = 800;
- }
- if(empty($cfg_ddimg_width))
- {
- $cfg_ddimg_width = 150;
- }
- $patten = "/<\\s*img\\s.*?src\\s*=\\s*([\"\\'])?(?(1)(.*?)\\1|([^\\s\\>\"\\']+))/isx";
- preg_match_all($patten,$body,$images);
- $returnArray1 = $images[2];
- $returnArray2 = $images[3];
- foreach ( $returnArray1 as $key => $value )
- {
- if ($value)
- {
- $ttx .= "{dede:img ddimg='$litpicname' text='图 ".($key+1)."'}".$value."{/dede:img}"."\r\n";
- }
- else
- {
- $ttx .= "{dede:img ddimg='$litpicname' text='图 ".($key+1)."'}".$returnArray2[$key]."{/dede:img}"."\r\n";
- }
- }
- $ttx = "\r\n{dede:pagestyle maxwidth='{$cfg_album_width}' ddmaxwidth='{$cfg_ddimg_width}' row='3' col='3' value='2'/}\r\n{dede:comments}图集类型会采集时生成此配置是正常的,不过如果后面没有跟着img标记则表示规则无效{/dede:comments}\r\n".$ttx;
- return $ttx;
- }
-
- /**
- * HTML里的网址格式转换
- *
- * @access public
- * @param string $body 文章内容
- * @return string
- */
- function TurnLinkTag(&$body)
- {
- $ttx = '';
- $handid = '服务器';
- preg_match_all("/<a href=['\"](.+?)['\"]([^>]+?)>(.+?)<\/a>/is",$body,$match);
- if(is_array($match[1]) && count($match[1])>0)
- {
- for($i=0;isset($match[1][$i]);$i++)
- {
- $servername = (isset($match[3][$i]) ? str_replace("'","`",$match[3][$i]) : $handid.($i+1));
- if(preg_match("/[<>]/", $servername) || strlen($servername)>40)
- {
- $servername = $handid.($i+1);
- }
- $ttx .= "{dede:link text='$servername'} {$match[1][$i]} {/dede:link}\r\n";
- }
- }
- return $ttx;
- }
-
- /**
- * 替换XML的CDATA
- *
- * @access public
- * @param string $str 字符串
- * @return string
- */
- function RpCdata($str)
- {
- $str = str_replace('<![CDATA[', '', $str);
- $str = str_replace(']]>', '', $str);
- return $str;
- }
-
- /**
- * 分析RSS里的链接
- *
- * @access public
- * @param string $rssurl rss地址
- * @return string
- */
- function GetRssLinks($rssurl)
- {
- global $cfg_soft_lang;
- $dhd = new DedeHttpDown();
- $dhd->OpenUrl($rssurl);
- $rsshtml = $dhd->GetHtml();
-
- //分析编码
- preg_match("/encoding=[\"']([^\"']*)[\"']/is",$rsshtml,$infos);
- if(isset($infos[1]))
- {
- $pcode = strtolower(trim($infos[1]));
- }
- else
- {
- $pcode = strtolower($cfg_soft_lang);
- }
- if($cfg_soft_lang=='gb2312')
- {
- if($pcode=='utf-8')
- {
- $rsshtml = utf82gb($rsshtml);
- }
- else if($pcode=='big5')
- {
- $rsshtml = big52gb($rsshtml);
- }
- }
- else if($cfg_soft_lang=='utf-8')
- {
- if($pcode=='gbk'||$pcode=='gb2312')
- {
- $rsshtml = gb2utf8($rsshtml);
- }
- else if($pcode=='big5')
- {
- $rsshtml = gb2utf8(big52gb($rsshtml));
- }
- }
- $rsarr = array();
- preg_match_all("/<item(.*)<title>(.*)<\/title>/isU",$rsshtml,$titles);
- preg_match_all("/<item(.*)<link>(.*)<\/link>/isU",$rsshtml,$links);
- preg_match_all("/<item(.*)<description>(.*)<\/description>/isU",$rsshtml,$descriptions);
- if(!isset($links[2]))
- {
- return '';
- }
- foreach($links[2] as $k=>$v)
- {
- $rsarr[$k]['link'] = RpCdata($v);
-
- if(isset($titles[2][$k]))
- {
- $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
- }
- else
- {
- $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", RpCdata($titles[2][$k]));
- }
- if(isset($descriptions[2][$k]))
- {
- $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k],$rssurl);
- }
- else
- {
- $rsarr[$k]['image'] = '';
- }
- }
- return $rsarr;
- }
-
- /**
- * 从RSS摘要获取图片信息
- *
- * @access public
- * @param string $descriptions 描述
- * @param string $refurl 来源地址
- * @return string
- */
- function GetddImgFromRss($descriptions,$refurl)
- {
- if($descriptions=='')
- {
- return '';
- }
- preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU",$descriptions,$imgs);
- if(isset($imgs[2][0]))
- {
- $imgs[2][0] = preg_replace("/[\"']/", '', $imgs[2][0]);
- $imgs[2][0] = preg_replace("/\/{1,}/", '/', $imgs[2][0]);
- return FillUrl($refurl,$imgs[2][0]);
- }
- else
- {
- return '';
- }
- }
-
- /**
- * 补全网址
- *
- * @access public
- * @param string $refurl 来源地址
- * @param string $surl 站点地址
- * @return string
- */
- function FillUrl($refurl,$surl)
- {
- $i = $pathStep = 0;
- $dstr = $pstr = $okurl = '';
- $refurl = trim($refurl);
- $surl = trim($surl);
- $urls = @parse_url($refurl);
- $basehost = ( (!isset($urls['port']) || $urls['port']=='80') ? $urls['host'] : $urls['host'].':'.$urls['port']);
-
- //$basepath = $basehost.(!isset($urls['path']) ? '' : '/'.$urls['path']);
- //由于直接获得的path在处理 http://xxxx/nnn/aaa?fdsafd 这种情况时会有错误,因此用其它方式处理
- $basepath = $basehost;
- $paths = explode('/',preg_replace("/^http:\/\//i", "", $refurl));
- $n = count($paths);
- for($i=1;$i < ($n-1);$i++)
- {
- if(!preg_match("/[\?]/", $paths[$i])) $basepath .= '/'.$paths[$i];
- }
- if(!preg_match("/[\?\.]/", $paths[$n-1]))
- {
- $basepath .= '/'.$paths[$n-1];
- }
- if($surl=='')
- {
- return $basepath;
- }
- $pos = strpos($surl, "#");
- if($pos>0)
- {
- $surl = substr($surl, 0, $pos);
- }
-
- //用 '/' 表示网站根的网址
- if($surl[0]=='/')
- {
- $okurl = $basehost.$surl;
- }
- else if($surl[0]=='.')
- {
- if(strlen($surl)<=2)
- {
- return '';
- }
- else if($surl[1]=='/')
- {
- $okurl = $basepath.preg_replace('/^./', '', $surl);
- }
- else
- {
- $okurl = $basepath.'/'.$surl;
- }
- }
- else
- {
- if( strlen($surl) < 7 )
- {
- $okurl = $basepath.'/'.$surl;
- }
- else if( preg_match("/^http:\/\//i",$surl) )
- {
- $okurl = $surl;
- }
- else
- {
- $okurl = $basepath.'/'.$surl;
- }
- }
- $okurl = preg_replace("/^http:\/\//i", '', $okurl);
- $okurl = 'http://'.preg_replace("/\/{1,}/", '/', $okurl);
- return $okurl;
- }
-
- /**
- * 从匹配规则中获取列表网址
- *
- * @access public
- * @param string $regxurl 正则地址
- * @param string $handurl 操作地址
- * @param string $startid 开始ID
- * @param string $endid 结束ID
- * @param string $addv 增值
- * @param string $usemore 使用更多
- * @param string $batchrule 列表规则
- * @return string
- */
- function GetUrlFromListRule($regxurl='',$handurl='',$startid=0,$endid=0,$addv=1,$usemore=0,$batchrule='')
- {
- global $dsql,$islisten;
-
- $lists = array();
- $n = 0;
- $islisten = (empty($islisten) ? 0 : $islisten);
- if($handurl!='')
- {
- $handurls = explode("\n",$handurl);
- foreach($handurls as $handurl)
- {
- $handurl = trim($handurl);
- if(preg_match("/^http:\/\//i", $handurl))
- {
- $lists[$n][0] = $handurl;
- $lists[$n][1] = 0;
- $n++;
- if($islisten==1)
- {
- break;
- }
- }
- }
- }
- if($regxurl!='')
- {
- //没指定(#)和(*)
- if(!preg_match("/\(\*\)/i", $regxurl) && !preg_match("/\(#\)/", $regxurl))
- {
- $lists[$n][0] = $regxurl;
- $lists[$n][1] = 0;
- $n++;
- }
- else
- {
- if($addv <= 0)
- {
- $addv = 1;
- }
-
- //没指定多栏目匹配规则
- if($usemore==0)
- {
- while($startid <= $endid)
- {
- $lists[$n][0] = str_replace("(*)",sprintf('%0'.strlen($startid).'d',$startid),$regxurl);
- $lists[$n][1] = 0;
- $startid = sprintf('%0'.strlen($startid).'d',$startid + $addv);
- $n++;
- if($n>2000 || $islisten==1)
- {
- break;
- }
- }
- }
-
- //匹配多个栏目
- //规则表达式 [(#)=>(#)匹配的网址; (*)=>(*)的范围,如:1-20; typeid=>栏目id; addurl=>附加的网址(用|分开多个)]
- else
- {
- $nrules = explode(']',trim($batchrule));
- foreach($nrules as $nrule)
- {
- $nrule = trim($nrule);
- $nrule = preg_replace("/^\[|\]$/", '', $nrule);
- $nrules = explode(';',$nrule);
- if(count($nrules)<3)
- {
- continue;
- }
- $brtag = '';
- $startid = 0;
- $endid = 0;
- $typeid = 0;
- $addurls = array();
- foreach($nrules as $nrule)
- {
- $nrule = trim($nrule);
- list($k,$v) = explode('=>',$nrule);
- if(trim($k)=='(#)')
- {
- $brtag = trim($v);
- }
- else if(trim($k)=='typeid')
- {
- $typeid = trim($v);
- }
- else if(trim($k)=='addurl')
- {
- $addurl = trim($v);
- $addurls = explode('|',$addurl);
- }
- else if(trim($k)=='(*)')
- {
- $v = preg_replace("/[ \r\n\t]/", '', trim($v));
- list($startid,$endid) = explode('-',$v);
- }
- }
-
- //如果栏目用栏目名称
- if(preg_match('/[^0-9]/', $typeid))
- {
- $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
- if(is_array($arr))
- {
- $typeid = $arr['id'];
- }
- else
- {
- $typeid = 0;
- }
- }
-
- //附加网址优先
- $mjj = 0;
- if(isset($addurls[0]))
- {
- foreach($addurls as $addurl)
- {
- $addurl = trim($addurl);
- if($addurl=='')
- {
- continue;
- }
- $lists[$n][0] = $addurl;
- $lists[$n][1] = $typeid;
- $n++;
- $mjj++;
- if($islisten==1)
- {
- break;
- }
- }
- }
-
- //如果为非监听模式或监听模式没手工指定的附加网址
- if($islisten!=1 || $mjj==0 )
- {
- //匹配规则里的网址,注:(#)的网址是是允许使用(*)的
- while($startid <= $endid)
- {
- $lists[$n][0] = str_replace("(#)",$brtag,$regxurl);
- $lists[$n][0] = str_replace("(*)",sprintf('%0'.strlen($startid).'d',$startid),$lists[$n][0]);
- $lists[$n][1] = $typeid;
- $startid = sprintf('%0'.strlen($startid).'d',$startid + $addv);
- $n++;
- if($islisten==1)
- {
- break;
- }
- if($n>20000)
- {
- break;
- }
- }
- }
- }
- } //End 匹配多栏目
-
- } //End使用规则匹配的情况
-
- }
-
- return $lists;
- }//End
|