DedeBIZ
/
DedeV6


			
				
					
						
						
							
							<?php if (!defined('DEDEINC')) exit('dedebiz');
/**
 * Collection (content harvesting) helpers
 *
 * @version        $Id: charset.helper.php 1 2010-07-05 11:43:09Z tianya $
 * @package        DedeBIZ.Helpers
 * @copyright      Copyright (c) 2020, DedeBIZ.COM
 * @license        https://www.dedebiz.com/license
 * @link           https://www.dedebiz.com
 */

//DedeHttpDown is instantiated by GetRssLinks() in this file
require_once(DEDEINC . "/dedehttpdown.class.php");
require_once(DEDEINC . "/dedetag.class.php");
//Presumably provides utf82gb()/big52gb()/gb2utf8() used by GetRssLinks() - TODO confirm
require_once(DEDEINC . "/charset.func.php");

/**
 *  Download a remote image over plain HTTP and save it to a local file
 *
 *  Sends a hand-built GET request through a raw socket, follows up to three
 *  HTTP redirects and writes the response body to $filename. Only "http://"
 *  targets are supported; chunked responses are rejected.
 *
 * @access    public
 * @param     string  $gurl  URL of the image to download
 * @param     string  $rfurl  referer URL sent with the request
 * @param     string  $filename  local path the body is written to
 * @param     string  $gcookie  complete "Cookie: ..." header line; looked up through RefurlCookie($rfurl) when empty
 * @param     int     $JumpCount  number of redirects already followed (recursion counter)
 * @param     int     $maxtime  maximum number of seconds spent reading the body
 * @return    bool    TRUE when data was written to $filename, FALSE on any failure
 */
function DownImageKeep($gurl, $rfurl, $filename, $gcookie = "", $JumpCount = 0, $maxtime = 30)
{
    $urlinfos = GetHostInfo($gurl);
    $ghost = trim($urlinfos['host']);
    if ($ghost == '') {
        return FALSE;
    }
    $gquery = $urlinfos['query'];

    //A CR/LF inside the host or path would inject extra request headers
    if (preg_match("/[\r\n]/", $ghost . $gquery)) {
        return FALSE;
    }
    if ($gcookie == "" && !empty($rfurl)) {
        $gcookie = RefurlCookie($rfurl);
    }

    //Split an explicit "host:port" authority; plain HTTP defaults to port 80
    $connectHost = $ghost;
    $connectPort = 80;
    if (preg_match('/^(.+):(\d+)$/', $ghost, $hostParts)) {
        $connectHost = $hostParts[1];
        $connectPort = intval($hostParts[2]);
    }

    $sessionQuery = "GET $gquery HTTP/1.1\r\n";
    $sessionQuery .= "Host: $ghost\r\n";
    $sessionQuery .= "Referer: " . preg_replace("/[\r\n]+/", '', (string) $rfurl) . "\r\n";
    $sessionQuery .= "Accept: */*\r\n";
    $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
    if ($gcookie != "" && !preg_match("/[\r\n]/", $gcookie)) {
        $sessionQuery .= $gcookie . "\r\n";
    }
    $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";

    $errno = 0;
    $errstr = "";
    $m_fp = fsockopen($connectHost, $connectPort, $errno, $errstr, 10);
    if (!$m_fp) {
        //Connection failed: nothing to read from
        return FALSE;
    }

    //Keep a stalled server from blocking a single read longer than the overall budget
    stream_set_timeout($m_fp, max(1, intval($maxtime)));
    fwrite($m_fp, $sessionQuery);

    //Read the status line
    $statusLine = fgets($m_fp, 8192);
    if ($statusLine === FALSE) {
        fclose($m_fp);
        return FALSE;
    }
    $httpstas = explode(" ", trim($statusLine));
    $m_httphead = array();
    $m_httphead["http-edition"] = trim($httpstas[0]);
    $m_httphead["http-state"] = isset($httpstas[1]) ? trim($httpstas[1]) : '';

    //Read the response headers (lower-cased name => value), at most ~100 lines
    $lnum = 0;
    while (!feof($m_fp)) {
        $line = trim((string) fgets($m_fp, 8192));
        if ($line == "" || $lnum > 100) {
            break;
        }
        $lnum++;
        $colon = strpos($line, ":");
        if ($colon === FALSE) {
            $hkey = $line;
            $hvalue = "";
        } else {
            $hkey = substr($line, 0, $colon);
            $hvalue = (string) substr($line, $colon + 1);
        }
        $hkey = trim($hkey);
        if ($hkey != "") {
            $m_httphead[strtolower($hkey)] = trim($hvalue);
        }
    }

    //Redirect: request the Location target and hand its result back to the caller
    if (preg_match("/^3/", $m_httphead["http-state"])) {
        fclose($m_fp);
        if (!isset($m_httphead["location"]) || $m_httphead["location"] == '' || $JumpCount >= 3) {
            return FALSE;
        }
        $location = $m_httphead["location"];

        //This function only speaks plain HTTP, so an https target cannot be fetched
        if (preg_match('/^https:/i', $location)) {
            return FALSE;
        }
        return DownImageKeep(FillUrl($gurl, $location), $rfurl, $filename, $gcookie, $JumpCount + 1, $maxtime);
    }
    if (!preg_match("/^2/", $m_httphead["http-state"])) {
        fclose($m_fp);
        return FALSE;
    }

    //A chunked body would be stored with its framing bytes; refuse instead of saving a corrupt file
    if (isset($m_httphead['transfer-encoding']) && stripos($m_httphead['transfer-encoding'], 'chunked') !== FALSE) {
        fclose($m_fp);
        return FALSE;
    }

    //NULL means "length unknown": read until EOF or until the time budget is used up
    $contentLength = NULL;
    if (isset($m_httphead['content-length']) && is_numeric($m_httphead['content-length'])) {
        $contentLength = intval($m_httphead['content-length']);
    }

    //Save the body
    $fp = fopen($filename, "w");
    if (!$fp) {
        fclose($m_fp);
        die("写入文件：{$filename} 失败！");
    }
    $received = 0;
    $starttime = time();
    while (!feof($m_fp)) {
        //Stop on timeout
        if (time() - $starttime > $maxtime) {
            break;
        }

        //Stop once the announced size has been received
        $want = 8192;
        if ($contentLength !== NULL) {
            $want = min($want, $contentLength - $received);
            if ($want <= 0) {
                break;
            }
        }
        $chunk = fread($m_fp, $want);
        if ($chunk === FALSE || $chunk === '') {
            break;
        }
        fwrite($fp, $chunk);
        $received += strlen($chunk);
    }
    fclose($fp);
    fclose($m_fp);
    if ($received == 0) {
        //Nothing was downloaded: do not leave an empty file behind
        @unlink($filename);
        return FALSE;
    }
    return TRUE;
}

/**
 *  Fetch the cookies a page hands out and return them as a request header line
 *
 *  The result is cached in the globals $gcookie / $lastRfurl, so repeated calls
 *  for the same URL do not hit the network again once a cookie was obtained.
 *
 * @access    public
 * @param     string  $gurl  URL of the page to request
 * @return    string  a "Cookie: name=value; ..." header line, or '' when no cookie was received
 */
function RefurlCookie($gurl)
{
    global $gcookie, $lastRfurl;
    $gurl = trim($gurl);
    if (!empty($gcookie) && $lastRfurl == $gurl) {
        return $gcookie;
    }
    $lastRfurl = $gurl;
    if ($gurl == '') {
        return '';
    }
    $urlinfos = GetHostInfo($gurl);
    $ghost = $urlinfos['host'];
    $gquery = $urlinfos['query'];

    //A CR/LF inside the host or path would inject extra request headers
    if (preg_match("/[\r\n]/", $ghost . $gquery)) {
        $gcookie = "";
        return '';
    }

    //Split an explicit "host:port" authority; plain HTTP defaults to port 80
    $connectHost = $ghost;
    $connectPort = 80;
    if (preg_match('/^(.+):(\d+)$/', $ghost, $hostParts)) {
        $connectHost = $hostParts[1];
        $connectPort = intval($hostParts[2]);
    }

    $sessionQuery = "GET $gquery HTTP/1.1\r\n";
    $sessionQuery .= "Host: $ghost\r\n";
    $sessionQuery .= "Accept: */*\r\n";
    $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
    $sessionQuery .= "Connection: Close\r\n\r\n";
    $errno = 0;
    $errstr = "";
    $m_fp = fsockopen($connectHost, $connectPort, $errno, $errstr, 10) or die($ghost . '<br />');
    fwrite($m_fp, $sessionQuery);

    //Scan the response headers (at most ~100 lines) and collect the cookies
    $pairs = array();
    $lnum = 0;
    while (!feof($m_fp)) {
        $line = trim((string) fgets($m_fp, 8192));
        if ($line == "" || $lnum > 100) {
            break;
        }
        $lnum++;
        if (preg_match('/^set-cookie:\s*([^;\s][^;]*)/i', $line, $found)) {
            //Servers announce cookies as "Set-Cookie: name=value; attributes"; only name=value is sent back
            $pairs[] = trim($found[1]);
        } else if (preg_match('/^cookie:\s*(.+)$/i', $line, $found)) {
            //A literal "Cookie:" line is kept whole, as the previous implementation accepted it
            $pairs[] = trim($found[1]);
        }
    }
    fclose($m_fp);
    $gcookie = (count($pairs) > 0 ? 'Cookie: ' . implode('; ', $pairs) : "");
    return $gcookie;
}

/**
 *  Split a URL into its host and request-target parts
 *
 *  A leading "http://" is removed. Everything up to the first "/" or "?" is the
 *  host (including an optional ":port"); the rest is the request target, which
 *  always starts with "/".
 *
 * @access    public
 * @param     string  $gurl  URL to split
 * @return    array   array('host' => ..., 'query' => ...)
 */
function GetHostInfo($gurl)
{
    $garr = array('host' => '', 'query' => '/');
    $gurl = preg_replace("/^http:\/\//i", "", trim($gurl));

    //Length of the leading part that contains neither "/" nor "?"
    $cut = strcspn($gurl, '/?');
    $garr['host'] = (string) substr($gurl, 0, $cut);
    $rest = (string) substr($gurl, $cut);
    if ($rest == '') {
        //No path at all (e.g. "http://example.com"): request the site root
        $garr['query'] = '/';
    } else if ($rest[0] == '/') {
        $garr['query'] = $rest;
    } else {
        //Query string directly after the host (e.g. "example.com?x=1")
        $garr['query'] = '/' . $rest;
    }
    return $garr;
}

/**
 *  Convert the <img> tags found in an HTML fragment to DEDE album markup
 *
 *  Produces a {dede:pagestyle} header followed by one {dede:img} tag per image
 *  source found in $body. The globals $cfg_album_width and $cfg_ddimg_width are
 *  given defaults (800 / 150) when they are empty.
 *
 * @access    public
 * @param     string  $body  article HTML (passed by reference, not modified)
 * @return    string  the generated DEDE markup
 */
function TurnImageTag(&$body)
{
    global $cfg_album_width, $cfg_ddimg_width;
    if (empty($cfg_album_width)) {
        $cfg_album_width = 800;
    }
    if (empty($cfg_ddimg_width)) {
        $cfg_ddimg_width = 150;
    }

    //Both were read before being defined; the emitted ddimg attribute has always been empty
    $ttx = '';
    $litpicname = '';

    //Group 2 holds a quoted src value, group 3 an unquoted one
    $patten = "/<\\s*img\\s.*?src\\s*=\\s*([\"\\'])?(?(1)(.*?)\\1|([^\\s\\>\"\\']+))/isx";
    preg_match_all($patten, $body, $images);
    foreach ($images[2] as $key => $value) {
        $src = ($value ? $value : $images[3][$key]);
        $ttx .= "{dede:img ddimg='$litpicname' text='图 " . ($key + 1) . "'}" . $src . "{/dede:img}" . "\r\n";
    }
    $ttx = "\r\n{dede:pagestyle maxwidth='{$cfg_album_width}' ddmaxwidth='{$cfg_ddimg_width}' row='3' col='3' value='2'/}\r\n{dede:comments}图集类型会采集时生成此配置是正常的，不过如果后面没有跟着img标记则表示规则无效{/dede:comments}\r\n" . $ttx;
    return $ttx;
}

/**
 *  Convert the <a href> links found in an HTML fragment to DEDE link markup
 *
 *  Each link becomes "{dede:link text='LABEL'} URL {/dede:link}". The label is
 *  the anchor text with single quotes replaced by backticks; when it contains
 *  markup or is longer than 40 bytes a numbered default label is used instead.
 *
 * @access    public
 * @param     string  $body  article HTML (passed by reference, not modified)
 * @return    string  the generated DEDE markup, '' when no link was found
 */
function TurnLinkTag(&$body)
{
    $ttx = '';
    $handid = '服务器';

    //Attributes after href are optional, so a bare <a href="x">text</a> matches as well
    preg_match_all("/<a href=['\"](.+?)['\"]([^>]*?)>(.+?)<\/a>/is", $body, $match);
    if (is_array($match[1]) && count($match[1]) > 0) {
        for ($i = 0; isset($match[1][$i]); $i++) {
            $servername = (isset($match[3][$i]) ? str_replace("'", "`", $match[3][$i]) : $handid . ($i + 1));
            if (preg_match("/[<>]/", $servername) || strlen($servername) > 40) {
                //Unusable anchor text: fall back to the numbered default label
                $servername = $handid . ($i + 1);
            }
            $ttx .= "{dede:link text='$servername'} {$match[1][$i]} {/dede:link}\r\n";
        }
    }
    return $ttx;
}

/**
 *  Strip the XML CDATA wrappers from a string
 *
 *  Removes every "<![CDATA[" opener first and then every "]]>" terminator,
 *  leaving the wrapped text in place.
 *
 * @access    public
 * @param     string  $str  text that may contain CDATA sections
 * @return    string  the text without CDATA markers
 */
function RpCdata($str)
{
    //str_replace() applies the search terms one after another, in array order
    $markers = array('<![CDATA[', ']]>');
    return str_replace($markers, '', $str);
}

/**
 *  Download an RSS feed and extract link, title and image of its items
 *
 *  The feed is converted to the site charset ($cfg_soft_lang) based on the
 *  encoding declared in the document before it is parsed.
 *
 * @access    public
 * @param     string  $rssurl  URL of the RSS feed
 * @return    array   list of array('link' => ..., 'title' => ..., 'image' => ...), one entry per item link
 */
function GetRssLinks($rssurl)
{
    global $cfg_soft_lang;
    $dhd = new DedeHttpDown();
    $dhd->OpenUrl($rssurl);
    $rsshtml = $dhd->GetHtml();

    //Detect the declared encoding; assume the site charset when none is declared
    preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos);
    if (isset($infos[1])) {
        $pcode = strtolower(trim($infos[1]));
    } else {
        $pcode = strtolower($cfg_soft_lang);
    }

    //Convert the feed to the site charset
    if ($cfg_soft_lang == 'gb2312') {
        if ($pcode == 'utf-8') {
            $rsshtml = utf82gb($rsshtml);
        } else if ($pcode == 'big5') {
            $rsshtml = big52gb($rsshtml);
        }
    } else if ($cfg_soft_lang == 'utf-8') {
        if ($pcode == 'gbk' || $pcode == 'gb2312') {
            $rsshtml = gb2utf8($rsshtml);
        } else if ($pcode == 'big5') {
            $rsshtml = gb2utf8(big52gb($rsshtml));
        }
    }

    //Collect the first title / link / description following each <item
    $rsarr = array();
    preg_match_all("/<item(.*)<title>(.*)<\/title>/isU", $rsshtml, $titles);
    preg_match_all("/<item(.*)<link>(.*)<\/link>/isU", $rsshtml, $links);
    preg_match_all("/<item(.*)<description>(.*)<\/description>/isU", $rsshtml, $descriptions);
    if (!isset($links[2])) {
        return '';
    }
    foreach ($links[2] as $k => $v) {
        $rsarr[$k]['link'] = RpCdata($v);

        if (isset($titles[2][$k])) {
            $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
        } else {
            //No title for this item: use the last path segment of its link instead
            $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", RpCdata($v));
        }
        if (isset($descriptions[2][$k])) {
            $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k], $rssurl);
        } else {
            $rsarr[$k]['image'] = '';
        }
    }
    return $rsarr;
}

/**
 *  Extract the first image URL from an RSS item description
 *
 *  The src of the first <img> tag is stripped of quotes, repeated slashes are
 *  collapsed and the result is resolved against $refurl through FillUrl().
 *
 * @access    public
 * @param     string  $descriptions  item description (HTML)
 * @param     string  $refurl  URL the description was loaded from, used to resolve relative paths
 * @return    string  absolute image URL, '' when the description contains no image
 */
function GetddImgFromRss($descriptions, $refurl)
{
    if ($descriptions == '') {
        return '';
    }
    preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $descriptions, $imgs);
    if (!isset($imgs[2][0])) {
        return '';
    }
    $src = preg_replace("/[\"']/", '', $imgs[2][0]);

    //Collapse repeated slashes, but keep the "//" that follows the scheme:
    //turning "http://" into "http:/" made FillUrl() treat absolute URLs as relative paths
    $src = preg_replace("/(?<!:)\/{2,}/", '/', $src);
    return FillUrl($refurl, $src);
}

/**
 *  Resolve a (possibly relative) URL against the URL of the page it was found on
 *
 *  Handles absolute http/https URLs, site-root paths ("/x"), "./x" paths and
 *  plain relative paths. A "#fragment" is dropped and repeated slashes in the
 *  result are collapsed. The scheme of the result is the scheme of $surl when
 *  it is absolute, otherwise the scheme of $refurl (http when it has none).
 *
 * @access    public
 * @param     string  $refurl  URL of the referring page
 * @param     string  $surl  URL or path to resolve
 * @return    string  the absolute URL; the scheme-less base path of $refurl when $surl is empty; '' for a bare "." / ".." / "./"
 */
function FillUrl($refurl, $surl)
{
    $refurl = trim($refurl);
    $surl = trim($surl);

    //Relative results inherit the scheme of the referring page
    $scheme = (preg_match("/^https:\/\//i", $refurl) ? 'https' : 'http');
    $defaultPort = ($scheme == 'https' ? 443 : 80);

    //parse_url() may fail or find no host; fall back to an empty host instead of reading missing keys
    $urls = @parse_url($refurl);
    $basehost = ((is_array($urls) && isset($urls['host'])) ? $urls['host'] : '');
    if (is_array($urls) && isset($urls['port']) && intval($urls['port']) != $defaultPort) {
        $basehost .= ':' . $urls['port'];
    }

    //The directory part is rebuilt by hand because the path reported by parse_url()
    //is not reliable for URLs such as http://xxxx/nnn/aaa?fdsafd
    $basepath = $basehost;
    $paths = explode('/', preg_replace("/^https?:\/\//i", "", $refurl));
    $n = count($paths);
    for ($i = 1; $i < ($n - 1); $i++) {
        if (!preg_match("/[\?]/", $paths[$i])) {
            $basepath .= '/' . $paths[$i];
        }
    }

    //The last segment is a directory only when it looks like neither a file nor a query;
    //with a single segment it is the host itself and must not be appended again
    if ($n > 1 && !preg_match("/[\?\.]/", $paths[$n - 1])) {
        $basepath .= '/' . $paths[$n - 1];
    }
    if ($surl == '') {
        return $basepath;
    }

    //Drop the fragment
    $pos = strpos($surl, "#");
    if ($pos > 0) {
        $surl = substr($surl, 0, $pos);
    }

    if (preg_match("/^(https?):\/\//i", $surl, $absolute)) {
        //Already absolute: keep its own scheme
        $scheme = strtolower($absolute[1]);
        $okurl = preg_replace("/^https?:\/\//i", '', $surl);
    } else if ($surl[0] == '/') {
        //A leading '/' refers to the site root
        $okurl = $basehost . $surl;
    } else if ($surl[0] == '.') {
        if (strlen($surl) <= 2) {
            return '';
        } else if ($surl[1] == '/') {
            $okurl = $basepath . substr($surl, 1);
        } else {
            $okurl = $basepath . '/' . $surl;
        }
    } else {
        $okurl = $basepath . '/' . $surl;
    }
    return $scheme . '://' . preg_replace("/\/{1,}/", '/', $okurl);
}

/**
 *  Build the list of list-page URLs described by a collection rule
 *
 *  Sources, in this order: the manually entered URLs in $handurl, then either
 *  $regxurl itself (no placeholder), the "(*)" number range $startid..$endid,
 *  or, when $usemore is set, the per-category rules in $batchrule.
 *
 *  A batch rule looks like
 *  [(#)=>text for (#); (*)=>1-20; typeid=>category id or name; addurl=>url1|url2]
 *
 *  When the global $islisten is 1 only the first URL of each source is produced.
 *
 * @access    public
 * @param     string  $regxurl  URL pattern, may contain the placeholders (*) and (#)
 * @param     string  $handurl  manually entered URLs, one per line; only http:// lines are used
 * @param     int     $startid  first number substituted for (*); leading zeros are preserved
 * @param     int     $endid  last number substituted for (*)
 * @param     int     $addv  increment between two numbers; values <= 0 are treated as 1
 * @param     int     $usemore  0 = single range, otherwise use $batchrule
 * @param     string  $batchrule  per-category rules, each enclosed in [ ]
 * @return    array   list of array(0 => url, 1 => category id)
 */
function GetUrlFromListRule($regxurl = '', $handurl = '', $startid = 0, $endid = 0, $addv = 1, $usemore = 0, $batchrule = '')
{
    global $dsql, $islisten;

    $lists = array();
    $n = 0;
    $islisten = (empty($islisten) ? 0 : $islisten);

    //Manually entered URLs
    if ($handurl != '') {
        foreach (explode("\n", $handurl) as $manualUrl) {
            $manualUrl = trim($manualUrl);
            if (preg_match("/^http:\/\//i", $manualUrl)) {
                $lists[$n][0] = $manualUrl;
                $lists[$n][1] = 0;
                $n++;
                if ($islisten == 1) {
                    break;
                }
            }
        }
    }
    if ($regxurl == '') {
        return $lists;
    }

    //Neither (#) nor (*) is present: the pattern is a single plain URL
    if (!preg_match("/\(\*\)/i", $regxurl) && !preg_match("/\(#\)/", $regxurl)) {
        $lists[$n][0] = $regxurl;
        $lists[$n][1] = 0;
        return $lists;
    }
    if ($addv <= 0) {
        $addv = 1;
    }

    //Single range, no per-category rules
    if ($usemore == 0) {
        while ($startid <= $endid) {
            //The width of $startid is kept so that "08" is followed by "09", "10", ...
            $lists[$n][0] = str_replace("(*)", sprintf('%0' . strlen($startid) . 'd', $startid), $regxurl);
            $lists[$n][1] = 0;
            $startid = sprintf('%0' . strlen($startid) . 'd', $startid + $addv);
            $n++;
            if ($n > 2000 || $islisten == 1) {
                break;
            }
        }
        return $lists;
    }

    //Per-category rules
    foreach (explode(']', trim($batchrule)) as $ruleText) {
        $ruleText = preg_replace("/^\[|\]$/", '', trim($ruleText));
        $fields = explode(';', $ruleText);
        if (count($fields) < 3) {
            continue;
        }
        $brtag = '';
        $startid = 0;
        $endid = 0;
        $typeid = 0;
        $addurls = array();
        foreach ($fields as $field) {
            $pair = explode('=>', trim($field));
            if (count($pair) < 2) {
                //Empty or malformed segment (e.g. after a trailing ';')
                continue;
            }
            $k = trim($pair[0]);
            $v = trim($pair[1]);
            if ($k == '(#)') {
                $brtag = $v;
            } else if ($k == 'typeid') {
                $typeid = $v;
            } else if ($k == 'addurl') {
                $addurls = explode('|', $v);
            } else if ($k == '(*)') {
                $range = explode('-', preg_replace("/[ \r\n\t]/", '', $v));
                if (count($range) >= 2 && preg_match('/^\d+$/', $range[0]) && preg_match('/^\d+$/', $range[1])) {
                    $startid = $range[0];
                    $endid = $range[1];
                } else {
                    //Unusable range: produce no pattern URLs for this rule
                    $startid = 1;
                    $endid = 0;
                }
            }
        }

        //The category may be given by name instead of by id
        //NOTE(review): $typeid is interpolated into the SQL text as-is - confirm it is escaped upstream
        if (preg_match('/[^0-9]/', (string) $typeid)) {
            $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
            if (is_array($arr)) {
                $typeid = $arr['id'];
            } else {
                $typeid = 0;
            }
        }

        //Additional URLs come first
        $mjj = 0;
        if (isset($addurls[0])) {
            foreach ($addurls as $addurl) {
                $addurl = trim($addurl);
                if ($addurl == '') {
                    continue;
                }
                $lists[$n][0] = $addurl;
                $lists[$n][1] = $typeid;
                $n++;
                $mjj++;
                if ($islisten == 1) {
                    break;
                }
            }
        }

        //Pattern URLs are skipped in listen mode when an additional URL was already produced
        if ($islisten != 1 || $mjj == 0) {
            //The pattern may contain (*) in addition to (#)
            while ($startid <= $endid) {
                $lists[$n][0] = str_replace("(#)", $brtag, $regxurl);
                $lists[$n][0] = str_replace("(*)", sprintf('%0' . strlen($startid) . 'd', $startid), $lists[$n][0]);
                $lists[$n][1] = $typeid;
                $startid = sprintf('%0' . strlen($startid) . 'd', $startid + $addv);
                $n++;
                if ($islisten == 1) {
                    break;
                }
                if ($n > 20000) {
                    break;
                }
            }
        }
    }

    return $lists;
}//End