国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

496 lines
16KB

  1. <?php
  2. if (!defined('DEDEINC')) exit('dedebiz');
  3. /**
  4. * 采集小助手
  5. *
  6. * @version $id:charset.helper.php 2010-07-05 11:43:09 tianya $
  7. * @package DedeBIZ.Helpers
  8. * @copyright Copyright (c) 2022 DedeBIZ.COM
  9. * @license https://www.dedebiz.com/license
  10. * @link https://www.dedebiz.com
  11. */
  12. require_once(DEDEINC."/libraries/dedehttpdown.class.php");
  13. require_once(DEDEINC."/dedetag.class.php");
  14. require_once(DEDEINC."/charset.func.php");
  /**
   * Download a remote file (typically an image) over plain HTTP and save it locally.
   *
   * The request is sent through a raw socket. Up to three redirects are followed.
   * The body is read until Content-Length bytes have arrived, the peer closes the
   * connection, or $maxtime seconds have elapsed, whichever comes first.
   *
   * @access public
   * @param string $gurl     URL to download (http:// only; no TLS support)
   * @param string $rfurl    referring page, sent as the Referer header
   * @param string $filename local path the body is written to
   * @param string $gcookie  complete "Cookie: ..." header line; fetched from $rfurl when empty
   * @param int    $JumpCount number of redirects already followed
   * @param int    $maxtime  maximum number of seconds spent reading the body
   * @return bool true when a non-empty body was saved, false otherwise
   */
  function DownImageKeep($gurl, $rfurl, $filename, $gcookie = "", $JumpCount = 0, $maxtime = 30)
  {
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      if ($ghost == '') {
          return false;
      }
      $gquery = $urlinfos['query'];
      //a raw CR/LF inside the target would let a scraped URL inject request headers
      if (preg_match("/[\r\n]/", $ghost.$gquery)) {
          return false;
      }
      //GetHostInfo keeps an explicit port inside the host part; split it off for the socket
      $connectHost = $ghost;
      $gport = 80;
      if (preg_match('/^([^:]+):(\d+)$/', $ghost, $hostParts)) {
          $connectHost = $hostParts[1];
          $gport = (int)$hostParts[2];
      }
      if ($gcookie == "" && !empty($rfurl)) {
          $gcookie = RefurlCookie($rfurl);
      }
      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Referer: ".preg_replace("/[\r\n]+/", "", (string)$rfurl)."\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      if ($gcookie != "" && !preg_match("/[\r\n]/", $gcookie)) {
          $sessionQuery .= $gcookie."\r\n";
      }
      $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";
      $errno = "";
      $errstr = "";
      $m_fp = fsockopen($connectHost, $gport, $errno, $errstr, 10);
      if (!$m_fp) {
          return false;
      }
      //bound every blocking read so a silent keep-alive peer cannot stall the script
      stream_set_timeout($m_fp, max(1, (int)$maxtime));
      fwrite($m_fp, $sessionQuery);
      //status line: "HTTP/1.1 200 OK"
      $m_httphead = array();
      $httpstas = explode(" ", trim((string)fgets($m_fp, 256)));
      if (count($httpstas) < 2) {
          fclose($m_fp);
          return false;
      }
      $m_httphead["http-edition"] = trim($httpstas[0]);
      $m_httphead["http-state"] = trim($httpstas[1]);
      //response headers, keyed by lower-cased name; stop at the blank line or after 100 headers
      $lnum = 0;
      while (!feof($m_fp)) {
          $line = trim((string)fgets($m_fp, 8192));
          if ($line == "" || ++$lnum > 100) {
              break;
          }
          $parts = explode(":", $line, 2);
          $hkey = strtolower(trim($parts[0]));
          if ($hkey != "") {
              $m_httphead[$hkey] = (isset($parts[1]) ? trim($parts[1]) : "");
          }
      }
      //redirect: request the URL named by Location and hand back that result
      if (preg_match("/^3/", $m_httphead["http-state"])) {
          fclose($m_fp);
          if (empty($m_httphead["location"]) || $JumpCount >= 3) {
              return false;
          }
          $location = $m_httphead["location"];
          if (preg_match("/^https:\/\//i", $location)) {
              //this plain-socket client cannot speak TLS
              return false;
          }
          if (!preg_match("/^http:\/\//i", $location)) {
              $location = FillUrl($gurl, $location);
          }
          return DownImageKeep($location, $rfurl, $filename, $gcookie, $JumpCount + 1, $maxtime);
      }
      if (!preg_match("/^2/", $m_httphead["http-state"])) {
          fclose($m_fp);
          return false;
      }
      //a chunked body would be saved with the chunk framing mixed into the data
      if (isset($m_httphead['transfer-encoding']) && stripos($m_httphead['transfer-encoding'], 'chunked') !== false) {
          fclose($m_fp);
          return false;
      }
      $hasLength = (isset($m_httphead['content-length']) && preg_match('/^\d+$/', $m_httphead['content-length']));
      $contentLength = ($hasLength ? (int)$m_httphead['content-length'] : 0);
      //open the target file
      $fp = fopen($filename, "w");
      if (!$fp) {
          fclose($m_fp);
          die("写入文件:{$filename} 失败");
      }
      $okdata = "";
      $received = 0;
      $starttime = time();
      while (!feof($m_fp)) {
          $want = 8192;
          if ($hasLength) {
              //never read past the announced length: on keep-alive the peer will not close
              $want = min($want, $contentLength - $received);
              if ($want <= 0) {
                  break;
              }
          }
          $chunk = fread($m_fp, $want);
          if ($chunk === false || $chunk === '') {
              //EOF or read timeout
              break;
          }
          $okdata .= $chunk;
          $received += strlen($chunk);
          //overall time budget
          if (time() - $starttime > $maxtime) {
              break;
          }
      }
      fclose($m_fp);
      if ($okdata !== "") {
          fwrite($fp, $okdata);
      }
      fclose($fp);
      if ($okdata === "") {
          @unlink($filename);
          return false;
      }
      return true;
  }
  /**
   * Fetch a page and turn the cookies it sets into a request header line.
   *
   * The name=value part of every Set-Cookie response header is collected and
   * joined into a single "Cookie: a=1; b=2" line. The result is cached in the
   * globals $gcookie / $lastRfurl, so repeated calls for the same URL do not
   * hit the network again.
   *
   * @access public
   * @param string $gurl page URL (http:// only)
   * @return string complete "Cookie: ..." header line, or '' when none was obtained
   */
  function RefurlCookie($gurl)
  {
      global $gcookie, $lastRfurl;
      $gurl = trim($gurl);
      if (!empty($gcookie) && $lastRfurl == $gurl) {
          return $gcookie;
      }
      $lastRfurl = $gurl;
      if ($gurl == '') {
          return '';
      }
      $urlinfos = GetHostInfo($gurl);
      $ghost = $urlinfos['host'];
      $gquery = $urlinfos['query'];
      //from here on the cached value belongs to $gurl; drop whatever was stored before
      $gcookie = "";
      if ($ghost == '') {
          return $gcookie;
      }
      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      $sessionQuery .= "Connection: Close\r\n\r\n";
      $errno = "";
      $errstr = "";
      $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10);
      if (!$m_fp) {
          //an unreachable referrer only means "no cookie"; it must not abort the caller
          return $gcookie;
      }
      fwrite($m_fp, $sessionQuery);
      //scan the response headers; stop at the blank line or after 100 lines
      $pairs = array();
      $lnum = 0;
      while (!feof($m_fp)) {
          $line = trim((string)fgets($m_fp, 8192));
          if ($line == "" || ++$lnum > 100) {
              break;
          }
          if (preg_match("/^set-cookie:\s*([^;]+)/i", $line, $found)) {
              //keep only name=value; attributes such as path/expires are not sent back
              $pairs[] = trim($found[1]);
          } else if (preg_match("/^cookie/i", $line)) {
              //legacy behavior: a line already shaped like a Cookie header is used verbatim
              $gcookie = $line;
              break;
          }
      }
      fclose($m_fp);
      if ($gcookie == "" && count($pairs) > 0) {
          $gcookie = "Cookie: ".implode('; ', $pairs);
      }
      return $gcookie;
  }
  /**
   * Split a URL into its host part and its request target.
   *
   * Only a leading "http://" is stripped. The host part is everything up to
   * the first "/" (an explicit port stays attached to it); the query part is
   * the remainder, always starting with "/". A URL without any path yields "/".
   *
   * @access public
   * @param string $gurl URL to split
   * @return array array('host' => string, 'query' => string)
   */
  function GetHostInfo($gurl)
  {
      $gurl = preg_replace("/^http:\/\//i", "", trim($gurl));
      $garr = array();
      $slash = strpos($gurl, '/');
      if ($slash === false) {
          //no path at all: request the site root instead of "/<host>"
          $garr['host'] = $gurl;
          $garr['query'] = '/';
      } else {
          $garr['host'] = substr($gurl, 0, $slash);
          $garr['query'] = substr($gurl, $slash);
      }
      return $garr;
  }
  /**
   * Convert the HTML links found in a document into {dede:link} tags.
   *
   * Every <a href="...">text</a> becomes one line of the form
   * {dede:link text='text'} url {/dede:link}. When the link text contains
   * markup or is longer than 40 bytes, a generated name ("服务器" plus the
   * 1-based position of the link) is used instead.
   *
   * @access public
   * @param string $body HTML document (taken by reference, not modified)
   * @return string generated tags, one per line, or '' when no link was found
   */
  function TurnLinkTag(&$body)
  {
      $ttx = '';
      $handid = '服务器';
      //[^>]*? (not +?) so that links without extra attributes after href also match
      preg_match_all("/<a href=['\"](.+?)['\"]([^>]*?)>(.+?)<\/a>/is", $body, $match);
      if (empty($match[1])) {
          return $ttx;
      }
      foreach ($match[1] as $i => $href) {
          //single quotes would terminate the text='...' attribute
          $servername = str_replace("'", "`", $match[3][$i]);
          if (preg_match("/[<>]/", $servername) || strlen($servername) > 40) {
              $servername = $handid.($i + 1);
          }
          $ttx .= "{dede:link text='$servername'} {$href} {/dede:link}\r\n";
      }
      return $ttx;
  }
  /**
   * Strip the XML CDATA wrapper markers from a string.
   *
   * Every "<![CDATA[" is removed first, then every "]]>"; the text between
   * them is left untouched.
   *
   * @access public
   * @param string $str text that may contain CDATA markers
   * @return string the text without the markers
   */
  function RpCdata($str)
  {
      //str_replace applies the needles one after another, in array order
      $markers = array('<![CDATA[', ']]>');
      return str_replace($markers, '', $str);
  }
  /**
   * Download an RSS feed and extract link, title and first image of every item.
   *
   * The feed is converted from the encoding it declares to the site encoding
   * ($cfg_soft_lang) before parsing. Titles, links and descriptions are
   * collected by three independent pattern matches and paired by position.
   *
   * @access public
   * @param string $rssurl feed URL
   * @return array list of array('link' => string, 'title' => string, 'image' => string)
   */
  function GetRssLinks($rssurl)
  {
      global $cfg_soft_lang;
      $dhd = new DedeHttpDown();
      $dhd->OpenUrl($rssurl);
      $rsshtml = $dhd->GetHtml();
      //encoding declared by the feed; fall back to the site encoding
      preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos);
      if (isset($infos[1])) {
          $pcode = strtolower(trim($infos[1]));
      } else {
          $pcode = strtolower($cfg_soft_lang);
      }
      if ($cfg_soft_lang == 'gb2312') {
          if ($pcode == 'utf-8') {
              $rsshtml = utf82gb($rsshtml);
          } else if ($pcode == 'big5') {
              $rsshtml = big52gb($rsshtml);
          }
      } else if ($cfg_soft_lang == 'utf-8') {
          if ($pcode == 'gbk' || $pcode == 'gb2312') {
              $rsshtml = gb2utf8($rsshtml);
          } else if ($pcode == 'big5') {
              $rsshtml = gb2utf8(big52gb($rsshtml));
          }
      }
      $rsarr = array();
      preg_match_all("/<item(.*)<title>(.*)<\/title>/isU", $rsshtml, $titles);
      preg_match_all("/<item(.*)<link>(.*)<\/link>/isU", $rsshtml, $links);
      preg_match_all("/<item(.*)<description>(.*)<\/description>/isU", $rsshtml, $descriptions);
      if (!isset($links[2])) {
          return '';
      }
      foreach ($links[2] as $k => $v) {
          $link = RpCdata($v);
          $rsarr[$k]['link'] = $link;
          if (isset($titles[2][$k])) {
              $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
          } else {
              //no title for this item: use the last path segment of its link
              $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", $link);
          }
          if (isset($descriptions[2][$k])) {
              $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k], $rssurl);
          } else {
              $rsarr[$k]['image'] = '';
          }
      }
      return $rsarr;
  }
  /**
   * Extract the first image of an RSS item description as an absolute URL.
   *
   * The src attribute of the first <img> tag is read (double-quoted,
   * single-quoted or unquoted) and resolved against $refurl with FillUrl(),
   * which also normalizes repeated slashes in the path.
   *
   * @access public
   * @param string $descriptions HTML of the item description
   * @param string $refurl URL the description was loaded from
   * @return string absolute image URL, or '' when the description has no usable image
   */
  function GetddImgFromRss($descriptions, $refurl)
  {
      if ($descriptions == '') {
          return '';
      }
      $pattern = "/<img[^>]*?\ssrc\s*=\s*(?:\"([^\"]*)\"|'([^']*)'|([^\s\"'>]+))/is";
      if (!preg_match($pattern, $descriptions, $img)) {
          return '';
      }
      //exactly one of the three alternatives captured the value
      $src = '';
      for ($g = 1; $g <= 3; $g++) {
          if (isset($img[$g]) && $img[$g] !== '') {
              $src = trim($img[$g]);
              break;
          }
      }
      if ($src == '') {
          return '';
      }
      //slashes are deliberately not collapsed here: that would turn "http://" into "http:/"
      //and make FillUrl treat an absolute URL as a relative path
      return FillUrl($refurl, $src);
  }
  /**
   * Resolve a possibly relative URL against the page it was found on.
   *
   * Handles absolute http:// and https:// URLs, protocol-relative URLs
   * ("//host/x"), root-relative URLs ("/x"), "./x" and plain relative paths.
   * The result uses the scheme of $surl when it has one, otherwise the scheme
   * of $refurl (http when $refurl has none); repeated slashes in the path are
   * collapsed. A fragment ("#...") is dropped from $surl.
   *
   * @access public
   * @param string $refurl URL of the referring page
   * @param string $surl URL to complete
   * @return string absolute URL; the base path WITHOUT scheme when $surl is empty;
   *                '' when $surl is just "." / ".." / "./"
   */
  function FillUrl($refurl, $surl)
  {
      $schemeRegex = "/^(https?):\/\//i";
      $refurl = trim($refurl);
      $surl = trim($surl);
      $scheme = (preg_match($schemeRegex, $refurl, $m) ? strtolower($m[1]) : 'http');
      $paths = explode('/', preg_replace($schemeRegex, '', $refurl));
      $urls = @parse_url($refurl);
      if (isset($urls['host'])) {
          $basehost = $urls['host'];
          $defaultPort = ($scheme == 'https' ? 443 : 80);
          if (isset($urls['port']) && $urls['port'] != $defaultPort) {
              $basehost .= ':'.$urls['port'];
          }
      } else {
          //referrer without scheme (or unparsable): its first segment is the host
          $basehost = $paths[0];
      }
      //The path reported by parse_url is not used: for http://xxxx/nnn/aaa?fdsafd the last
      //segment must be dropped, so the directory is rebuilt segment by segment instead
      $basepath = $basehost;
      $n = count($paths);
      for ($i = 1; $i < ($n - 1); $i++) {
          if (strpos($paths[$i], '?') === false) {
              $basepath .= '/'.$paths[$i];
          }
      }
      //the last segment counts as a directory only when it has neither "?" nor "."
      //($n > 1 keeps a bare dotless host such as "localhost" from being appended to itself)
      if ($n > 1 && !preg_match("/[\?\.]/", $paths[$n - 1])) {
          $basepath .= '/'.$paths[$n - 1];
      }
      if ($surl == '') {
          return $basepath;
      }
      $pos = strpos($surl, "#");
      if ($pos > 0) {
          $surl = substr($surl, 0, $pos);
      }
      if (preg_match($schemeRegex, $surl, $m)) {
          //already absolute: keep its own scheme
          $scheme = strtolower($m[1]);
          $okurl = $surl;
      } else if (substr($surl, 0, 2) == '//') {
          //protocol-relative: own host, scheme of the referrer
          $okurl = substr($surl, 2);
      } else if ($surl[0] == '/') {
          //relative to the site root
          $okurl = $basehost.$surl;
      } else if ($surl[0] == '.') {
          if (strlen($surl) <= 2) {
              return '';
          } else if ($surl[1] == '/') {
              $okurl = $basepath.substr($surl, 1);
          } else {
              $okurl = $basepath.'/'.$surl;
          }
      } else {
          $okurl = $basepath.'/'.$surl;
      }
      $okurl = preg_replace($schemeRegex, '', $okurl);
      return $scheme.'://'.preg_replace("/\/{1,}/", '/', $okurl);
  }
  /**
   * Build the list of list-page URLs to collect from.
   *
   * Sources, in order:
   *  1. $handurl   - manually entered URLs, one per line (http:// or https://);
   *  2. $regxurl   - a URL pattern. "(*)" is replaced by a counter running from
   *                  $startid to $endid in steps of $addv (zero padding of the
   *                  start value is kept); with $usemore set, "(#)" and the
   *                  counter range come from $batchrule instead.
   *
   * $batchrule holds one rule per "[...]" group, fields separated by ";":
   *   [(#)=>text for (#); (*)=>1-20; typeid=>column id or name; addurl=>url1|url2]
   * A rule needs at least three fields. When the global $islisten is 1 only the
   * first URL of each source is produced.
   *
   * @access public
   * @param string $regxurl URL pattern containing (*) and/or (#), or a plain URL
   * @param string $handurl manually specified URLs, newline separated
   * @param int|string $startid first counter value
   * @param int|string $endid last counter value
   * @param int $addv counter step (values <= 0 are treated as 1)
   * @param int $usemore non-zero to use the multi-column rules in $batchrule
   * @param string $batchrule multi-column rule text
   * @return array list of array(0 => url, 1 => column id)
   */
  function GetUrlFromListRule($regxurl = '', $handurl = '', $startid = 0, $endid = 0, $addv = 1, $usemore = 0, $batchrule = '')
  {
      global $dsql, $islisten;
      $lists = array();
      $n = 0;
      $islisten = (empty($islisten) ? 0 : $islisten);
      //manually specified URLs
      if ($handurl != '') {
          foreach (explode("\n", $handurl) as $manualUrl) {
              $manualUrl = trim($manualUrl);
              if (!preg_match("/^https?:\/\//i", $manualUrl)) {
                  continue;
              }
              $lists[$n][0] = $manualUrl;
              $lists[$n][1] = 0;
              $n++;
              if ($islisten == 1) {
                  break;
              }
          }
      }
      if ($regxurl == '') {
          return $lists;
      }
      //neither (#) nor (*): the pattern is a plain URL
      if (strpos($regxurl, '(*)') === false && strpos($regxurl, '(#)') === false) {
          $lists[$n][0] = $regxurl;
          $lists[$n][1] = 0;
          return $lists;
      }
      if ($addv <= 0) {
          $addv = 1;
      }
      //single pattern, counter range taken from the arguments
      if ($usemore == 0) {
          while ($startid <= $endid) {
              $lists[$n][0] = str_replace("(*)", sprintf('%0'.strlen($startid).'d', $startid), $regxurl);
              $lists[$n][1] = 0;
              //re-format with the current width so zero padding ("01", "02", ...) survives
              $startid = sprintf('%0'.strlen($startid).'d', $startid + $addv);
              $n++;
              if ($n > 2000 || $islisten == 1) {
                  break;
              }
          }
          return $lists;
      }
      //multi-column rules
      foreach (explode(']', trim($batchrule)) as $ruleText) {
          $ruleText = preg_replace("/^\[|\]$/", '', trim($ruleText));
          $fields = explode(';', $ruleText);
          if (count($fields) < 3) {
              continue;
          }
          $brtag = '';
          $startid = 0;
          $endid = 0;
          $typeid = 0;
          $addurls = array();
          foreach ($fields as $field) {
              $pair = explode('=>', trim($field));
              if (count($pair) < 2) {
                  //blank or malformed field
                  continue;
              }
              $k = trim($pair[0]);
              $v = trim($pair[1]);
              if ($k == '(#)') {
                  $brtag = $v;
              } else if ($k == 'typeid') {
                  $typeid = $v;
              } else if ($k == 'addurl') {
                  $addurls = explode('|', $v);
              } else if ($k == '(*)') {
                  $range = explode('-', preg_replace("/[ \r\n\t]/", '', $v));
                  if (count($range) >= 2 && preg_match('/^\d+$/', $range[0]) && preg_match('/^\d+$/', $range[1])) {
                      $startid = $range[0];
                      $endid = $range[1];
                  } else {
                      //unusable range: an empty interval makes the pattern loop below emit nothing
                      $startid = 1;
                      $endid = 0;
                  }
              }
          }
          //the column may be given by name instead of id
          if (preg_match('/[^0-9]/', $typeid)) {
              //NOTE(review): $typeid comes from the rule text and is placed into the SQL string
              //unescaped - confirm the rule text is escaped before it reaches this function
              $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
              if (is_array($arr)) {
                  $typeid = $arr['id'];
              } else {
                  $typeid = 0;
              }
          }
          //additional URLs come first
          $mjj = 0;
          foreach ($addurls as $addurl) {
              $addurl = trim($addurl);
              if ($addurl == '') {
                  continue;
              }
              $lists[$n][0] = $addurl;
              $lists[$n][1] = $typeid;
              $n++;
              $mjj++;
              if ($islisten == 1) {
                  break;
              }
          }
          //in listen mode the pattern is only used when no additional URL was given
          if ($islisten != 1 || $mjj == 0) {
              //the (#) text may itself contain (*), so (#) is substituted first
              while ($startid <= $endid) {
                  $lists[$n][0] = str_replace("(#)", $brtag, $regxurl);
                  $lists[$n][0] = str_replace("(*)", sprintf('%0'.strlen($startid).'d', $startid), $lists[$n][0]);
                  $lists[$n][1] = $typeid;
                  $startid = sprintf('%0'.strlen($startid).'d', $startid + $addv);
                  $n++;
                  if ($islisten == 1) {
                      break;
                  }
                  if ($n > 20000) {
                      break;
                  }
              }
          }
      }
      return $lists;
  }
  494. ?>