国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

552 lines
18KB

  1. <?php if (!defined('DEDEINC')) exit('dedebiz');
  2. /**
  3. * 采集小助手
  4. *
  5. * @version $Id: charset.helper.php 1 2010-07-05 11:43:09Z tianya $
  6. * @package DedeBIZ.Helpers
  7. * @copyright Copyright (c) 2020, DedeBIZ.COM
  8. * @license https://www.dedebiz.com/license
  9. * @link https://www.dedebiz.com
  10. */
  11. require_once(DEDEINC . "/dedehttpdown.class.php");
  12. require_once(DEDEINC . "/dedetag.class.php");
  13. require_once(DEDEINC . "/charset.func.php");
  /**
   * Download a remote image over plain HTTP and save it to a local file.
   *
   * The request is sent on a raw socket, so only http:// URLs are supported.
   * Redirects (3xx with a Location header) are followed up to three times.
   *
   * @access public
   * @param string $gurl      URL of the image
   * @param string $rfurl     referer URL sent with the request (also used to prefetch a cookie)
   * @param string $filename  local path the image is written to
   * @param string $gcookie   complete "Cookie: ..." header line, or "" to fetch one from $rfurl
   * @param int    $JumpCount number of redirects already followed
   * @param int    $maxtime   maximum number of seconds spent reading the body
   * @return bool TRUE when data was written to $filename, FALSE otherwise
   */
  function DownImageKeep($gurl, $rfurl, $filename, $gcookie = "", $JumpCount = 0, $maxtime = 30)
  {
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      if ($ghost == '') {
          return FALSE;
      }
      $gquery = $urlinfos['query'];
      if ($gcookie == "" && !empty($rfurl)) {
          $gcookie = RefurlCookie($rfurl);
      }
      //GetHostInfo leaves an explicit port inside the host string; the socket needs it separately
      $connhost = $ghost;
      $connport = 80;
      if (preg_match("/^(.+):([0-9]+)$/", $ghost, $hostparts)) {
          $connhost = $hostparts[1];
          $connport = intval($hostparts[2]);
      }
      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      //Strip line breaks so the referer cannot inject extra request headers
      $sessionQuery .= "Referer: " . preg_replace("/[\r\n]/", '', (string)$rfurl) . "\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      if ($gcookie != "" && !preg_match("/[\r\n]/", $gcookie)) {
          $sessionQuery .= $gcookie . "\r\n";
      }
      //The socket is used for one request only, so ask the server to close it after the body;
      //this makes end-of-file a reliable end marker when no Content-Length is sent
      $sessionQuery .= "Connection: Close\r\n\r\n";
      $errno = 0;
      $errstr = "";
      $m_fp = fsockopen($connhost, $connport, $errno, $errstr, 10);
      if (!$m_fp) {
          return FALSE;
      }
      //Bound every blocking read so a stalled server cannot hang the collector
      stream_set_timeout($m_fp, max(1, intval($maxtime)));
      fwrite($m_fp, $sessionQuery);
      //Status line, e.g. "HTTP/1.1 200 OK"
      $statusline = fgets($m_fp, 8192);
      if ($statusline === false) {
          fclose($m_fp);
          return FALSE;
      }
      $httpstas = explode(" ", $statusline);
      $m_httphead = array();
      $m_httphead["http-edition"] = trim($httpstas[0]);
      $m_httphead["http-state"] = (isset($httpstas[1]) ? trim($httpstas[1]) : '');
      //Response headers, keyed by lower-cased name; at most 100 lines are read
      $lnum = 0;
      while (!feof($m_fp)) {
          $line = fgets($m_fp, 8192);
          if ($line === false) {
              break;
          }
          $line = trim($line);
          if ($line == "" || ++$lnum > 100) {
              break;
          }
          //Split on the first colon only, the value itself may contain colons
          $parts = explode(":", $line, 2);
          $hkey = trim($parts[0]);
          if ($hkey != "") {
              $m_httphead[strtolower($hkey)] = (isset($parts[1]) ? trim($parts[1]) : "");
          }
      }
      //Redirect: request the Location target and hand its result back to the caller
      if (preg_match("/^3/", $m_httphead["http-state"])) {
          fclose($m_fp);
          if (empty($m_httphead["location"]) || $JumpCount >= 3) {
              return FALSE;
          }
          //A raw socket cannot speak TLS, so an https target cannot be fetched here
          if (preg_match("/^https:\/\//i", $m_httphead["location"])) {
              return FALSE;
          }
          $nexturl = FillUrl($gurl, $m_httphead["location"]);
          return DownImageKeep($nexturl, $rfurl, $filename, $gcookie, $JumpCount + 1, $maxtime);
      }
      if (!preg_match("/^2/", $m_httphead["http-state"])) {
          fclose($m_fp);
          return FALSE;
      }
      //Chunked bodies carry size markers between the data; they are not decoded here,
      //so refuse them instead of saving a corrupt file
      if (isset($m_httphead['transfer-encoding']) && stripos($m_httphead['transfer-encoding'], 'chunked') !== false) {
          fclose($m_fp);
          return FALSE;
      }
      //-1 means the length is unknown: read until the server closes the connection
      $contentLength = (isset($m_httphead['content-length']) ? intval($m_httphead['content-length']) : -1);
      $okdata = "";
      $starttime = time();
      while (!feof($m_fp)) {
          $want = 8192;
          if ($contentLength >= 0) {
              $remaining = $contentLength - strlen($okdata);
              //Stop once the announced size has been reached
              if ($remaining <= 0) {
                  break;
              }
              $want = min($want, $remaining);
          }
          $piece = fread($m_fp, $want);
          if ($piece === false) {
              break;
          }
          $okdata .= $piece;
          //Stop on timeout
          if (time() - $starttime > $maxtime) {
              break;
          }
      }
      fclose($m_fp);
      if ($okdata == "") {
          //Nothing was received: make sure no file is left at the target path
          if (is_file($filename)) {
              @unlink($filename);
          }
          return FALSE;
      }
      //Save the file
      $fp = fopen($filename, "w") or die("写入文件:{$filename} 失败!");
      fwrite($fp, $okdata);
      fclose($fp);
      return TRUE;
  }
  /**
   * Request a page and return the cookies it sets as a ready-to-send request header line.
   *
   * The result is cached in the globals $gcookie / $lastRfurl, so asking for the
   * same URL again returns the cached value without another request.
   *
   * @access public
   * @param string $gurl URL of the page to request (http:// only)
   * @return string a "Cookie: name=value; ..." header line, or '' when no cookie was obtained
   */
  function RefurlCookie($gurl)
  {
      global $gcookie, $lastRfurl;
      $gurl = trim($gurl);
      if (!empty($gcookie) && $lastRfurl == $gurl) {
          return $gcookie;
      }
      $lastRfurl = $gurl;
      if ($gurl == '') {
          return '';
      }
      $gcookie = "";
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      $gquery = $urlinfos['query'];
      if ($ghost == '') {
          return '';
      }
      //GetHostInfo leaves an explicit port inside the host string; the socket needs it separately
      $connhost = $ghost;
      $connport = 80;
      if (preg_match("/^(.+):([0-9]+)$/", $ghost, $hostparts)) {
          $connhost = $hostparts[1];
          $connport = intval($hostparts[2]);
      }
      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      $sessionQuery .= "Connection: Close\r\n\r\n";
      $errno = 0;
      $errstr = "";
      $m_fp = fsockopen($connhost, $connport, $errno, $errstr, 10);
      //Fetching a cookie is best effort: an unreachable referer must not abort the whole run
      if (!$m_fp) {
          return '';
      }
      fwrite($m_fp, $sessionQuery);
      //Scan the response headers (at most 100 lines) for Set-Cookie entries
      $lnum = 0;
      $pairs = array();
      while (!feof($m_fp)) {
          $line = fgets($m_fp, 8192);
          if ($line === false) {
              break;
          }
          $line = trim($line);
          if ($line == "" || ++$lnum > 100) {
              break;
          }
          //Servers announce cookies as "Set-Cookie: name=value; attributes"; only the
          //leading name=value pair is sent back in the request "Cookie:" header
          if (preg_match("/^set-cookie:\s*([^;]+)/i", $line, $found)) {
              $pairs[] = trim($found[1]);
          }
      }
      fclose($m_fp);
      if (count($pairs) > 0) {
          $gcookie = "Cookie: " . implode("; ", $pairs);
      }
      return $gcookie;
  }
  /**
   * Split a URL into its host part and its request path (path plus query string).
   *
   * A leading "http://" is removed. Everything before the first "/" is the host
   * (including an explicit port, if any); everything from that "/" on is the query.
   * A URL without any path yields the query "/".
   *
   * @access public
   * @param string $gurl URL to split
   * @return array array('host' => string, 'query' => string)
   */
  function GetHostInfo($gurl)
  {
      $gurl = preg_replace("/^http:\/\//i", "", trim($gurl));
      $garr = array();
      $slash = strpos($gurl, '/');
      if ($slash === false) {
          //No path at all, e.g. "http://example.com": request the site root
          $garr['host'] = $gurl;
          $garr['query'] = '/';
      } else {
          $garr['host'] = substr($gurl, 0, $slash);
          $garr['query'] = substr($gurl, $slash);
      }
      return $garr;
  }
  /**
   * Convert the <img> tags found in an HTML fragment into DEDE album markup.
   *
   * Every image source becomes one {dede:img} tag; the list is preceded by a
   * {dede:pagestyle} tag built from $cfg_album_width / $cfg_ddimg_width.
   * When those globals are empty they are set to 800 and 150.
   *
   * @access public
   * @param string $body article HTML (taken by reference, not modified)
   * @return string the generated DEDE markup
   */
  function TurnImageTag(&$body)
  {
      global $cfg_album_width, $cfg_ddimg_width;
      if (empty($cfg_album_width)) {
          $cfg_album_width = 800;
      }
      if (empty($cfg_ddimg_width)) {
          $cfg_ddimg_width = 150;
      }
      //No thumbnail name is known at this point, so the ddimg attribute is emitted empty
      $litpicname = '';
      $ttx = '';
      $patten = "/<\\s*img\\s.*?src\\s*=\\s*([\"\\'])?(?(1)(.*?)\\1|([^\\s\\>\"\\']+))/isx";
      preg_match_all($patten, $body, $images);
      //Group 2 holds quoted src values, group 3 unquoted ones
      foreach ($images[2] as $key => $quoted) {
          $src = ($quoted ? $quoted : (isset($images[3][$key]) ? $images[3][$key] : ''));
          $ttx .= "{dede:img ddimg='$litpicname' text='图 " . ($key + 1) . "'}" . $src . "{/dede:img}" . "\r\n";
      }
      $ttx = "\r\n{dede:pagestyle maxwidth='{$cfg_album_width}' ddmaxwidth='{$cfg_ddimg_width}' row='3' col='3' value='2'/}\r\n{dede:comments}图集类型会采集时生成此配置是正常的,不过如果后面没有跟着img标记则表示规则无效{/dede:comments}\r\n" . $ttx;
      return $ttx;
  }
  /**
   * Convert the links found in an HTML fragment into {dede:link} tags.
   *
   * Only anchors written as <a href="..." ...>text</a> with at least one further
   * character after the href value are recognised. The link text becomes the
   * label; when it contains markup or is longer than 40 bytes a numbered
   * default label is used instead.
   *
   * @access public
   * @param string $body article HTML (taken by reference, not modified)
   * @return string one {dede:link} line per link, '' when nothing matched
   */
  function TurnLinkTag(&$body)
  {
      $fallbackLabel = '服务器';
      preg_match_all("/<a href=['\"](.+?)['\"]([^>]+?)>(.+?)<\/a>/is", $body, $found);
      if (!is_array($found[1]) || count($found[1]) == 0) {
          return '';
      }
      $output = '';
      foreach ($found[1] as $idx => $href) {
          //Start from the numbered default and replace it only by a usable link text
          $label = $fallbackLabel . ($idx + 1);
          if (isset($found[3][$idx])) {
              $candidate = str_replace("'", "`", $found[3][$idx]);
              if (!preg_match("/[<>]/", $candidate) && strlen($candidate) <= 40) {
                  $label = $candidate;
              }
          }
          $output .= "{dede:link text='$label'} {$href} {/dede:link}\r\n";
      }
      return $output;
  }
  /**
   * Remove the XML CDATA wrappers from a string, keeping the wrapped content.
   *
   * @access public
   * @param string $str text that may contain "<![CDATA[" and "]]>" markers
   * @return string the text with both markers removed
   */
  function RpCdata($str)
  {
      //The markers are removed in order: first the opening one, then the closing one
      return str_replace(array('<![CDATA[', ']]>'), '', $str);
  }
  /**
   * Download an RSS feed and extract link, title and first image of every item.
   *
   * The feed is converted to the site charset ($cfg_soft_lang) when its declared
   * encoding differs and a conversion is available.
   *
   * @access public
   * @param string $rssurl URL of the RSS feed
   * @return array|string list of array('link' => ..., 'title' => ..., 'image' => ...),
   *                      or '' when the link matches are unavailable
   */
  function GetRssLinks($rssurl)
  {
      global $cfg_soft_lang;
      $dhd = new DedeHttpDown();
      $dhd->OpenUrl($rssurl);
      $rsshtml = $dhd->GetHtml();
      //Detect the encoding declared by the feed, defaulting to the site charset
      preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos);
      if (isset($infos[1])) {
          $pcode = strtolower(trim($infos[1]));
      } else {
          $pcode = strtolower($cfg_soft_lang);
      }
      if ($cfg_soft_lang == 'gb2312') {
          if ($pcode == 'utf-8') {
              $rsshtml = utf82gb($rsshtml);
          } else if ($pcode == 'big5') {
              $rsshtml = big52gb($rsshtml);
          }
      } else if ($cfg_soft_lang == 'utf-8') {
          if ($pcode == 'gbk' || $pcode == 'gb2312') {
              $rsshtml = gb2utf8($rsshtml);
          } else if ($pcode == 'big5') {
              $rsshtml = gb2utf8(big52gb($rsshtml));
          }
      }
      $rsarr = array();
      preg_match_all("/<item(.*)<title>(.*)<\/title>/isU", $rsshtml, $titles);
      preg_match_all("/<item(.*)<link>(.*)<\/link>/isU", $rsshtml, $links);
      preg_match_all("/<item(.*)<description>(.*)<\/description>/isU", $rsshtml, $descriptions);
      if (!isset($links[2])) {
          return '';
      }
      foreach ($links[2] as $k => $v) {
          $link = RpCdata($v);
          $rsarr[$k]['link'] = $link;
          if (isset($titles[2][$k])) {
              $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
          } else {
              //No title for this item: fall back to the last path segment of its link
              $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", $link);
          }
          if (isset($descriptions[2][$k])) {
              $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k], $rssurl);
          } else {
              $rsarr[$k]['image'] = '';
          }
      }
      return $rsarr;
  }
  /**
   * Extract the first image of an RSS item description as an absolute URL.
   *
   * @access public
   * @param string $descriptions HTML of the item description
   * @param string $refurl       URL the description came from, used to resolve relative image paths
   * @return string absolute image URL, or '' when the description contains no usable <img> tag
   */
  function GetddImgFromRss($descriptions, $refurl)
  {
      if ($descriptions == '') {
          return '';
      }
      //The src value may be double-quoted, single-quoted or unquoted (groups 1, 2 and 3)
      $pattern = "/<img\\s+(?:[^>]*?\\s)?src\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>\"']+))/is";
      if (!preg_match($pattern, $descriptions, $found)) {
          return '';
      }
      $src = '';
      for ($g = 1; $g <= 3; $g++) {
          if (isset($found[$g]) && $found[$g] !== '') {
              $src = trim($found[$g]);
              break;
          }
      }
      if ($src == '') {
          return '';
      }
      //The value is passed on untouched: collapsing slashes here would turn "http://" into "http:/"
      //and make FillUrl treat an absolute address as a relative path
      return FillUrl($refurl, $src);
  }
  /**
   * Resolve a possibly relative URL against the URL of the page it was found on.
   *
   * Handles absolute http/https URLs, protocol-relative URLs ("//host/path"),
   * root-relative paths ("/path"), "./path" and plain relative paths.
   * ".." segments are kept as they are. A fragment ("#...") is dropped.
   *
   * @access public
   * @param string $refurl URL of the referring page
   * @param string $surl   URL or path to resolve
   * @return string the absolute URL; when $surl is empty, the base path of $refurl
   *                without scheme; '' for a "." path of two characters or fewer
   */
  function FillUrl($refurl, $surl)
  {
      $okurl = '';
      $refurl = trim($refurl);
      $surl = trim($surl);
      $urls = @parse_url($refurl);
      if (!is_array($urls)) {
          $urls = array();
      }
      //Relative results inherit the scheme of the referring page
      $scheme = ((isset($urls['scheme']) && strtolower($urls['scheme']) == 'https') ? 'https' : 'http');
      $paths = explode('/', preg_replace("/^https?:\/\//i", "", $refurl));
      //parse_url reports no host when the scheme is missing; the first segment is the host then
      $host = (isset($urls['host']) ? $urls['host'] : $paths[0]);
      $basehost = ((!isset($urls['port']) || $urls['port'] == '80') ? $host : $host . ':' . $urls['port']);
      //The path reported by parse_url is wrong for URLs like http://xxxx/nnn/aaa?fdsafd,
      //so the base directory is assembled from the path segments instead
      $basepath = $basehost;
      $n = count($paths);
      for ($i = 1; $i < ($n - 1); $i++) {
          if (!preg_match("/[\?]/", $paths[$i])) $basepath .= '/' . $paths[$i];
      }
      //The last segment is a directory only when it has no query and no extension;
      //with a single segment it is the host itself and must not be appended again
      if ($n > 1 && !preg_match("/[\?\.]/", $paths[$n - 1])) {
          $basepath .= '/' . $paths[$n - 1];
      }
      if ($surl == '') {
          return $basepath;
      }
      $pos = strpos($surl, "#");
      if ($pos > 0) {
          $surl = substr($surl, 0, $pos);
      }
      if (preg_match("/^(https?):\/\//i", $surl, $abs)) {
          //Already absolute: keep it, including its own scheme
          $scheme = strtolower($abs[1]);
          $okurl = $surl;
      } else if (substr($surl, 0, 2) == '//') {
          //Protocol-relative: the host is part of $surl, only the scheme is inherited
          $okurl = ltrim($surl, '/');
      } else if ($surl[0] == '/') {
          //A leading '/' addresses the site root
          $okurl = $basehost . $surl;
      } else if ($surl[0] == '.') {
          if (strlen($surl) <= 2) {
              return '';
          } else if ($surl[1] == '/') {
              $okurl = $basepath . preg_replace('/^./', '', $surl);
          } else {
              $okurl = $basepath . '/' . $surl;
          }
      } else {
          $okurl = $basepath . '/' . $surl;
      }
      //Collapse repeated slashes in host and path, then put the scheme back
      $okurl = preg_replace("/^https?:\/\//i", '', $okurl);
      $okurl = $scheme . '://' . preg_replace("/\/{1,}/", '/', $okurl);
      return $okurl;
  }
  /**
   * Build the list of list-page URLs to collect from a URL rule.
   *
   * Sources, in this order:
   *  - $handurl: manually entered URLs, one per line (only http:// lines are used);
   *  - $regxurl without placeholders: used as is;
   *  - $regxurl with (*): one URL per number from $startid to $endid in steps of $addv,
   *    zero-padded to the width of $startid;
   *  - with $usemore: one URL set per batch rule. A batch rule looks like
   *    [(#)=>text replacing (#); (*)=>range such as 1-20; typeid=>category id or name; addurl=>extra URLs separated by |]
   *
   * In listen mode (global $islisten == 1) only the first URL of each source is produced.
   *
   * @access public
   * @param string     $regxurl   URL rule, may contain the placeholders (*) and (#)
   * @param string     $handurl   manually entered URLs, one per line
   * @param int|string $startid   first number substituted for (*)
   * @param int|string $endid     last number substituted for (*)
   * @param int        $addv      step between two numbers, values <= 0 are treated as 1
   * @param int        $usemore   non-zero to use the multi-category batch rules
   * @param string     $batchrule batch rules, each enclosed in [ ]
   * @return array list of array(0 => url, 1 => category id)
   */
  function GetUrlFromListRule($regxurl = '', $handurl = '', $startid = 0, $endid = 0, $addv = 1, $usemore = 0, $batchrule = '')
  {
      global $dsql, $islisten;
      $lists = array();
      $n = 0;
      $islisten = (empty($islisten) ? 0 : $islisten);
      //Manually entered URLs come first
      if ($handurl != '') {
          foreach (explode("\n", $handurl) as $manualurl) {
              $manualurl = trim($manualurl);
              if (preg_match("/^http:\/\//i", $manualurl)) {
                  $lists[$n][0] = $manualurl;
                  $lists[$n][1] = 0;
                  $n++;
                  if ($islisten == 1) {
                      break;
                  }
              }
          }
      }
      if ($regxurl == '') {
          return $lists;
      }
      //Neither (#) nor (*) given: the rule is a plain URL
      if (!preg_match("/\(\*\)/i", $regxurl) && !preg_match("/\(#\)/", $regxurl)) {
          $lists[$n][0] = $regxurl;
          $lists[$n][1] = 0;
          return $lists;
      }
      if ($addv <= 0) {
          $addv = 1;
      }
      //No multi-category rules: expand (*) over startid..endid
      if ($usemore == 0) {
          while ($startid <= $endid) {
              //Pad to the width of the current number so "01" counts 01, 02, 03 ...
              $width = strlen((string)$startid);
              $lists[$n][0] = str_replace("(*)", sprintf('%0' . $width . 'd', $startid), $regxurl);
              $lists[$n][1] = 0;
              $startid = sprintf('%0' . $width . 'd', $startid + $addv);
              $n++;
              if ($n > 2000 || $islisten == 1) {
                  break;
              }
          }
          return $lists;
      }
      //Multi-category rules, one per [ ... ] group
      foreach (explode(']', trim($batchrule)) as $ruletext) {
          $ruletext = preg_replace("/^\[|\]$/", '', trim($ruletext));
          $pairs = explode(';', $ruletext);
          if (count($pairs) < 3) {
              continue;
          }
          $brtag = '';
          $startid = 0;
          $endid = 0;
          $typeid = 0;
          $addurls = array();
          foreach ($pairs as $pair) {
              //Split on the first "=>" only; pieces without it (such as the empty
              //piece after a trailing ';') carry no setting and are skipped
              $kv = explode('=>', trim($pair), 2);
              if (count($kv) < 2) {
                  continue;
              }
              $k = trim($kv[0]);
              $v = trim($kv[1]);
              if ($k == '(#)') {
                  $brtag = $v;
              } else if ($k == 'typeid') {
                  $typeid = $v;
              } else if ($k == 'addurl') {
                  $addurls = explode('|', $v);
              } else if ($k == '(*)') {
                  $range = explode('-', preg_replace("/[ \r\n\t]/", '', $v));
                  $from = $range[0];
                  //A single number stands for a range of exactly that page
                  $to = (isset($range[1]) ? $range[1] : $range[0]);
                  //Only purely numeric bounds are accepted, anything else keeps the default 0-0
                  if (preg_match('/^[0-9]+$/', $from) && preg_match('/^[0-9]+$/', $to)) {
                      $startid = $from;
                      $endid = $to;
                  }
              }
          }
          //The category may be given by name instead of by id
          if (preg_match('/[^0-9]/', (string)$typeid)) {
              //NOTE(review): $typeid is interpolated into the SQL as it appears in the batch rule;
              //confirm the rule text is escaped or trusted before it reaches this point
              $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
              if (is_array($arr)) {
                  $typeid = $arr['id'];
              } else {
                  $typeid = 0;
              }
          }
          //Additional URLs take precedence
          $mjj = 0;
          if (isset($addurls[0])) {
              foreach ($addurls as $addurl) {
                  $addurl = trim($addurl);
                  if ($addurl == '') {
                      continue;
                  }
                  $lists[$n][0] = $addurl;
                  $lists[$n][1] = $typeid;
                  $n++;
                  $mjj++;
                  if ($islisten == 1) {
                      break;
                  }
              }
          }
          //Outside listen mode, or in listen mode without a manually given additional URL
          if ($islisten != 1 || $mjj == 0) {
              //Expand the rule itself; the (#) replacement may in turn contain (*)
              while ($startid <= $endid) {
                  $width = strlen((string)$startid);
                  $lists[$n][0] = str_replace("(#)", $brtag, $regxurl);
                  $lists[$n][0] = str_replace("(*)", sprintf('%0' . $width . 'd', $startid), $lists[$n][0]);
                  $lists[$n][1] = $typeid;
                  $startid = sprintf('%0' . $width . 'd', $startid + $addv);
                  $n++;
                  if ($islisten == 1) {
                      break;
                  }
                  if ($n > 20000) {
                      break;
                  }
              }
          }
      }
      return $lists;
  }