| 
                        123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494 | 
                        - <?php
 - if (!defined('DEDEINC')) exit('dedebiz');
 - /**
 -  * 采集助手
 -  *
 -  * @version        $id:charset.helper.php 2010-07-05 11:43:09 tianya $
 -  * @package        DedeBIZ.Helpers
 -  * @copyright      Copyright (c) 2022 DedeBIZ.COM
 -  * @license        GNU GPL v2 (https://www.dedebiz.com/license)
 -  * @link           https://www.dedebiz.com
 -  */
 - require_once(DEDEINC."/libraries/dedehttpdown.class.php");
 - require_once(DEDEINC."/dedetag.class.php");
 - require_once(DEDEINC."/charset.func.php");
 - /**
 -  *  下载图片
 -  *
 -  * @access    public
 -  * @param     string  $gurl  地址
 -  * @param     string  $rfurl  来源地址
 -  * @param     string  $filename  文件名
 -  * @param     string  $gcookie  调整cookie
 -  * @param     string  $JumpCount  跳转计数
 -  * @param     string  $maxtime  最大次数
 -  * @return    string
 -  */
 - function DownImageKeep($gurl, $rfurl, $filename, $gcookie = "", $JumpCount = 0, $maxtime = 30)
 - {
 -     $urlinfos = GetHostInfo($gurl);
 -     $ghost = trim($urlinfos['host']);
 -     if ($ghost == '') {
 -         return FALSE;
 -     }
 -     $gquery = $urlinfos['query'];
 -     if ($gcookie == "" && !empty($rfurl)) {
 -         $gcookie = RefurlCookie($rfurl);
 -     }
 -     $sessionQuery = "GET $gquery HTTP/1.1\r\n";
 -     $sessionQuery .= "Host: $ghost\r\n";
 -     $sessionQuery .= "Referer: $rfurl\r\n";
 -     $sessionQuery .= "Accept: */*\r\n";
 -     $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
 -     if ($gcookie != "" && !preg_match("/[\r\n]/", $gcookie)) {
 -         $sessionQuery .= $gcookie."\r\n";
 -     }
 -     $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";
 -     $errno = '';
 -     $errstr = '';
 -     $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10);
 -     fwrite($m_fp, $sessionQuery);
 -     $lnum = 0;
 -     //获取详细应答头
 -     $m_httphead = array();
 -     $httpstas = explode(" ", fgets($m_fp, 256));
 -     $m_httphead["http-edition"] = trim($httpstas[0]);
 -     $m_httphead["http-state"] = trim($httpstas[1]);
 -     while (!feof($m_fp)) {
 -         $line = trim(fgets($m_fp, 256));
 -         if ($line == "" || $lnum > 100) {
 -             break;
 -         }
 -         $hkey = '';
 -         $hvalue = '';
 -         $v = 0;
 -         for ($i = 0; $i < strlen($line); $i++) {
 -             if ($v == 1) {
 -                 $hvalue .= $line[$i];
 -             }
 -             if ($line[$i] == ":") {
 -                 $v = 1;
 -             }
 -             if ($v == 0) {
 -                 $hkey .= $line[$i];
 -             }
 -         }
 -         $hkey = trim($hkey);
 -         if ($hkey != "") {
 -             $m_httphead[strtolower($hkey)] = trim($hvalue);
 -         }
 -     }
 -     //分析返回记录
 -     if (preg_match("/^3/", $m_httphead["http-state"])) {
 -         if (isset($m_httphead["location"]) && $JumpCount < 3) {
 -             $JumpCount++;
 -             DownImageKeep($gurl, $rfurl, $filename, $gcookie, $JumpCount);
 -         } else {
 -             return FALSE;
 -         }
 -     }
 -     if (!preg_match("/^2/", $m_httphead["http-state"])) {
 -         return FALSE;
 -     }
 -     if (!isset($m_httphead)) {
 -         return FALSE;
 -     }
 -     $contentLength = $m_httphead['content-length'];
 -     //保存文件
 -     $fp = fopen($filename, "w") or die("写入文件:{$filename} 失败");
 -     $i = 0;
 -     $okdata = '';
 -     $starttime = time();
 -     while (!feof($m_fp)) {
 -         $okdata .= fgetc($m_fp);
 -         $i++;
 -         //超时结束
 -         if (time() - $starttime > $maxtime) {
 -             break;
 -         }
 -         //到达指定大小结束
 -         if ($i >= $contentLength) {
 -             break;
 -         }
 -     }
 -     if ($okdata != "") {
 -         fwrite($fp, $okdata);
 -     }
 -     fclose($fp);
 -     if ($okdata == "") {
 -         @unlink($filename);
 -         fclose($m_fp);
 -         return FALSE;
 -     }
 -     fclose($m_fp);
 -     return TRUE;
 - }
 - /**
 -  *  获得某页面返回的Cookie信息
 -  *
 -  * @access    public
 -  * @param     string  $gurl  调整地址
 -  * @return    string
 -  */
 - function RefurlCookie($gurl)
 - {
 -     global $gcookie, $lastRfurl;
 -     $gurl = trim($gurl);
 -     if (!empty($gcookie) && $lastRfurl == $gurl) {
 -         return $gcookie;
 -     } else {
 -         $lastRfurl = $gurl;
 -     }
 -     if (trim($gurl) == '') {
 -         return '';
 -     }
 -     $urlinfos = GetHostInfo($gurl);
 -     $ghost = $urlinfos['host'];
 -     $gquery = $urlinfos['query'];
 -     $sessionQuery = "GET $gquery HTTP/1.1\r\n";
 -     $sessionQuery .= "Host: $ghost\r\n";
 -     $sessionQuery .= "Accept: */*\r\n";
 -     $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
 -     $sessionQuery .= "Connection: Close\r\n\r\n";
 -     $errno = '';
 -     $errstr = '';
 -     $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10) or die($ghost.'<br>');
 -     fwrite($m_fp, $sessionQuery);
 -     $lnum = 0;
 -     //获取详细应答头
 -     $gcookie = '';
 -     while (!feof($m_fp)) {
 -         $line = trim(fgets($m_fp, 256));
 -         if ($line == "" || $lnum > 100) {
 -             break;
 -         } else {
 -             if (preg_match("/^cookie/i", $line)) {
 -                 $gcookie = $line;
 -                 break;
 -             }
 -         }
 -     }
 -     fclose($m_fp);
 -     return $gcookie;
 - }
 - /**
 -  *  获得网址的host和query部份
 -  *
 -  * @access    public
 -  * @param     string  $gurl  调整地址
 -  * @return    string
 -  */
 - function GetHostInfo($gurl)
 - {
 -     $gurl = preg_replace("/^http:\/\//i", "", trim($gurl));
 -     $garr['host'] = preg_replace("/\/(.*)$/i", "", $gurl);
 -     $garr['query'] = "/".preg_replace("/^([^\/]*)\//i", "", $gurl);
 -     return $garr;
 - }
 - 
 - /**
 -  *  HTML里的网址格式转换
 -  *
 -  * @access    public
 -  * @param     string  $body  文档
 -  * @return    string
 -  */
 - function TurnLinkTag(&$body)
 - {
 -     $ttx = '';
 -     $handid = '服务器';
 -     preg_match_all("/<a href=['\"](.+?)['\"]([^>]+?)>(.+?)<\/a>/is", $body, $match);
 -     if (is_array($match[1]) && count($match[1]) > 0) {
 -         for ($i = 0; isset($match[1][$i]); $i++) {
 -             $servername = (isset($match[3][$i]) ? str_replace("'", "`", $match[3][$i]) : $handid.($i + 1));
 -             if (preg_match("/[<>]/", $servername) || strlen($servername) > 40) {
 -                 $servername = $handid.($i + 1);
 -             }
 -             $ttx .= "{dede:link text='$servername'} {$match[1][$i]} {/dede:link}\r\n";
 -         }
 -     }
 -     return $ttx;
 - }
 - /**
 -  *  替换XML的CDATA
 -  *
 -  * @access    public
 -  * @param     string  $str  字符串
 -  * @return    string
 -  */
 - function RpCdata($str)
 - {
 -     $str = str_replace('<![CDATA[', '', $str);
 -     $str = str_replace(']]>', '', $str);
 -     return  $str;
 - }
 - /**
 -  *  分析RSS里的链接
 -  *
 -  * @access    public
 -  * @param     string  $rssurl  rss地址
 -  * @return    string
 -  */
 - function GetRssLinks($rssurl)
 - {
 -     global $cfg_soft_lang;
 -     $dhd = new DedeHttpDown();
 -     $dhd->OpenUrl($rssurl);
 -     $rsshtml = $dhd->GetHtml();
 -     //分析编码
 -     preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos);
 -     if (isset($infos[1])) {
 -         $pcode = strtolower(trim($infos[1]));
 -     } else {
 -         $pcode = strtolower($cfg_soft_lang);
 -     }
 -     if ($cfg_soft_lang == 'gb2312') {
 -         if ($pcode == 'utf-8') {
 -             $rsshtml = utf82gb($rsshtml);
 -         } else if ($pcode == 'big5') {
 -             $rsshtml = big52gb($rsshtml);
 -         }
 -     } else if ($cfg_soft_lang == 'utf-8') {
 -         if ($pcode == 'gbk' || $pcode == 'gb2312') {
 -             $rsshtml = gb2utf8($rsshtml);
 -         } else if ($pcode == 'big5') {
 -             $rsshtml = gb2utf8(big52gb($rsshtml));
 -         }
 -     }
 -     $rsarr = array();
 -     preg_match_all("/<item(.*)<title>(.*)<\/title>/isU", $rsshtml, $titles);
 -     preg_match_all("/<item(.*)<link>(.*)<\/link>/isU", $rsshtml, $links);
 -     preg_match_all("/<item(.*)<description>(.*)<\/description>/isU", $rsshtml, $descriptions);
 -     if (!isset($links[2])) {
 -         return '';
 -     }
 -     foreach ($links[2] as $k => $v) {
 -         $rsarr[$k]['link'] = RpCdata($v);
 - 
 -         if (isset($titles[2][$k])) {
 -             $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
 -         } else {
 -             $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", RpCdata($titles[2][$k]));
 -         }
 -         if (isset($descriptions[2][$k])) {
 -             $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k], $rssurl);
 -         } else {
 -             $rsarr[$k]['image'] = '';
 -         }
 -     }
 -     return $rsarr;
 - }
 - /**
 -  *  从RSS摘要获取图片信息
 -  *
 -  * @access    public
 -  * @param     string  $descriptions  描述
 -  * @param     string  $refurl  来源地址
 -  * @return    string
 -  */
 - function GetddImgFromRss($descriptions, $refurl)
 - {
 -     if ($descriptions == '') {
 -         return '';
 -     }
 -     preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $descriptions, $imgs);
 -     if (isset($imgs[2][0])) {
 -         $imgs[2][0] = preg_replace("/[\"']/", '', $imgs[2][0]);
 -         $imgs[2][0] = preg_replace("/\/{1,}/", '/', $imgs[2][0]);
 -         return FillUrl($refurl, $imgs[2][0]);
 -     } else {
 -         return '';
 -     }
 - }
 - /**
 -  *  补全网址
 -  *
 -  * @access    public
 -  * @param     string  $refurl  来源地址
 -  * @param     string  $surl  站点地址
 -  * @return    string
 -  */
 - function FillUrl($refurl, $surl)
 - {
 -     $i = $pathStep = 0;
 -     $dstr = $pstr = $okurl = '';
 -     $refurl = trim($refurl);
 -     $surl = trim($surl);
 -     $urls = @parse_url($refurl);
 -     $basehost = ((!isset($urls['port']) || $urls['port'] == '80') ? $urls['host'] : $urls['host'].':'.$urls['port']);
 -     //由于直接获得的path在处理 http://xxxx/nnn/aaa?fdsafd 这种情况时会有错误,因此用其它方式处理
 -     $basepath = $basehost;
 -     $paths = explode('/', preg_replace("/^http:\/\//i", "", $refurl));
 -     $n = count($paths);
 -     for ($i = 1; $i < ($n - 1); $i++) {
 -         if (!preg_match("/[\?]/", $paths[$i])) $basepath .= '/'.$paths[$i];
 -     }
 -     if (!preg_match("/[\?\.]/", $paths[$n - 1])) {
 -         $basepath .= '/'.$paths[$n - 1];
 -     }
 -     if ($surl == '') {
 -         return $basepath;
 -     }
 -     $pos = strpos($surl, "#");
 -     if ($pos > 0) {
 -         $surl = substr($surl, 0, $pos);
 -     }
 -     //用 '/' 表示网站根的网址
 -     if ($surl[0] == '/') {
 -         $okurl = $basehost.$surl;
 -     } else if ($surl[0] == '.') {
 -         if (strlen($surl) <= 2) {
 -             return '';
 -         } else if ($surl[1] == '/') {
 -             $okurl = $basepath.preg_replace('/^./', '', $surl);
 -         } else {
 -             $okurl = $basepath.'/'.$surl;
 -         }
 -     } else {
 -         if (strlen($surl) < 7) {
 -             $okurl = $basepath.'/'.$surl;
 -         } else if (preg_match("/^http:\/\//i", $surl)) {
 -             $okurl = $surl;
 -         } else {
 -             $okurl = $basepath.'/'.$surl;
 -         }
 -     }
 -     $okurl = preg_replace("/^http:\/\//i", '', $okurl);
 -     $okurl = 'http://'.preg_replace("/\/{1,}/", '/', $okurl);
 -     return $okurl;
 - }
 - /**
 -  *  从匹配规则中获取列表网址
 -  *
 -  * @access    public
 -  * @param     string  $regxurl  正则地址
 -  * @param     string  $handurl  操作地址
 -  * @param     string  $startid  开始id
 -  * @param     string  $endid  结束id
 -  * @param     string  $addv  增值
 -  * @param     string  $usemore  使用更多
 -  * @param     string  $batchrule  列表规则
 -  * @return    string
 -  */
 - function GetUrlFromListRule($regxurl = '', $handurl = '', $startid = 0, $endid = 0, $addv = 1, $usemore = 0, $batchrule = '')
 - {
 -     global $dsql, $islisten;
 -     $lists = array();
 -     $n = 0;
 -     $islisten = (empty($islisten) ? 0 : $islisten);
 -     if ($handurl != '') {
 -         $handurls = explode("\n", $handurl);
 -         foreach ($handurls as $handurl) {
 -             $handurl = trim($handurl);
 -             if (preg_match("/^http:\/\//i", $handurl)) {
 -                 $lists[$n][0] = $handurl;
 -                 $lists[$n][1] = 0;
 -                 $n++;
 -                 if ($islisten == 1) {
 -                     break;
 -                 }
 -             }
 -         }
 -     }
 -     if ($regxurl != '') {
 -         //没指定(#)和(*)
 -         if (!preg_match("/\(\*\)/i", $regxurl) && !preg_match("/\(#\)/", $regxurl)) {
 -             $lists[$n][0] = $regxurl;
 -             $lists[$n][1] = 0;
 -             $n++;
 -         } else {
 -             if ($addv <= 0) {
 -                 $addv = 1;
 -             }
 -             //没指定多栏目匹配规则
 -             if ($usemore == 0) {
 -                 while ($startid <= $endid) {
 -                     $lists[$n][0] = str_replace("(*)", sprintf('%0'.strlen($startid).'d', $startid), $regxurl);
 -                     $lists[$n][1] = 0;
 -                     $startid = sprintf('%0'.strlen($startid).'d', $startid + $addv);
 -                     $n++;
 -                     if ($n > 2000 || $islisten == 1) {
 -                         break;
 -                     }
 -                 }
 -             }
 -             //匹配多个栏目,规则表达式[(#)=>(#)匹配的网址; (*)=>(*)的范围,如:1-20;typeid=>栏目id;addurl=>附加的网址(用|分开多个)]
 -             else {
 -                 $nrules = explode(']', trim($batchrule));
 -                 foreach ($nrules as $nrule) {
 -                     $nrule = trim($nrule);
 -                     $nrule = preg_replace("/^\[|\]$/", '', $nrule);
 -                     $nrules  = explode(';', $nrule);
 -                     if (count($nrules) < 3) {
 -                         continue;
 -                     }
 -                     $brtag = '';
 -                     $startid = 0;
 -                     $endid = 0;
 -                     $typeid = 0;
 -                     $addurls = array();
 -                     foreach ($nrules as $nrule) {
 -                         $nrule = trim($nrule);
 -                         list($k, $v) = explode('=>', $nrule);
 -                         if (trim($k) == '(#)') {
 -                             $brtag = trim($v);
 -                         } else if (trim($k) == 'typeid') {
 -                             $typeid = trim($v);
 -                         } else if (trim($k) == 'addurl') {
 -                             $addurl = trim($v);
 -                             $addurls = explode('|', $addurl);
 -                         } else if (trim($k) == '(*)') {
 -                             $v = preg_replace("/[ \r\n\t]/", '', trim($v));
 -                             list($startid, $endid) = explode('-', $v);
 -                         }
 -                     }
 -                     //如果栏目用栏目名称
 -                     if (preg_match('/[^0-9]/', $typeid)) {
 -                         $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
 -                         if (is_array($arr)) {
 -                             $typeid = $arr['id'];
 -                         } else {
 -                             $typeid = 0;
 -                         }
 -                     }
 -                     //附加网址优先
 -                     $mjj = 0;
 -                     if (isset($addurls[0])) {
 -                         foreach ($addurls as $addurl) {
 -                             $addurl = trim($addurl);
 -                             if ($addurl == '') {
 -                                 continue;
 -                             }
 -                             $lists[$n][0] = $addurl;
 -                             $lists[$n][1] = $typeid;
 -                             $n++;
 -                             $mjj++;
 -                             if ($islisten == 1) {
 -                                 break;
 -                             }
 -                         }
 -                     }
 -                     //如果为非监听模式或监听模式没手工指定的附加网址
 -                     if ($islisten != 1 || $mjj == 0) {
 -                         //匹配规则里的网址,注:(#)的网址是是允许使用(*)的
 -                         while ($startid <= $endid) {
 -                             $lists[$n][0] = str_replace("(#)", $brtag, $regxurl);
 -                             $lists[$n][0] = str_replace("(*)", sprintf('%0'.strlen($startid).'d', $startid), $lists[$n][0]);
 -                             $lists[$n][1] = $typeid;
 -                             $startid = sprintf('%0'.strlen($startid).'d', $startid + $addv);
 -                             $n++;
 -                             if ($islisten == 1) {
 -                                 break;
 -                             }
 -                             if ($n > 20000) {
 -                                 break;
 -                             }
 -                         }
 -                     }
 -                 }
 -             }
 -         }
 -     }
 -     return $lists;
 - }
 - ?>
 
 
  |