国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.

494 satır
16KB

  1. <?php
  2. if (!defined('DEDEINC')) exit ('dedebiz');
  3. /**
  4. * 采集助手
  5. *
  6. * @version $id:charset.helper.php 2010-07-05 11:43:09 tianya $
  7. * @package DedeBIZ.Helpers
  8. * @copyright Copyright (c) 2022 DedeBIZ.COM
  9. * @license GNU GPL v2 (https://www.dedebiz.com/license)
  10. * @link https://www.dedebiz.com
  11. */
  12. require_once(DEDEINC."/libraries/dedehttpdown.class.php");
  13. require_once(DEDEINC."/dedetag.class.php");
  14. require_once(DEDEINC."/charset.func.php");
  /**
   * Download a remote image over a raw HTTP socket and save it to a local file.
   *
   * The request is sent as plain HTTP to port 80 of the host extracted by
   * GetHostInfo(). Up to three redirects are followed, provided the target is
   * again a plain http:// URL. The target file is created (truncated) as soon as
   * a 2xx response arrives and is removed again when no body data was received.
   *
   * @access public
   * @param string $gurl      URL of the image to download
   * @param string $rfurl     referer URL; also used to fetch a cookie when $gcookie is empty
   * @param string $filename  local path the response body is written to
   * @param string $gcookie   complete "Cookie: ..." request header line, or "" for none
   * @param int    $JumpCount number of redirects already followed
   * @param int    $maxtime   maximum number of seconds spent reading the body
   * @return bool TRUE when a non-empty body was saved, FALSE otherwise
   */
  function DownImageKeep($gurl, $rfurl, $filename, $gcookie = "", $JumpCount = 0, $maxtime = 30)
  {
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      if ($ghost == '') {
          return FALSE;
      }
      $gquery = $urlinfos['query'];
      $referer = trim((string)$rfurl);
      // Never build a request from values that could smuggle in extra header lines
      if (preg_match("/[\r\n]/", $ghost.$gquery.$referer)) {
          return FALSE;
      }
      if ($gcookie == "" && !empty($rfurl)) {
          $gcookie = RefurlCookie($rfurl);
      }
      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Referer: $referer\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      if ($gcookie != "" && !preg_match("/[\r\n]/", $gcookie)) {
          $sessionQuery .= $gcookie."\r\n";
      }
      $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";
      $errno = '';
      $errstr = '';
      $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10);
      if (!$m_fp) {
          return FALSE;
      }
      // Keep a stalled server from blocking reads far beyond the caller's time budget
      stream_set_timeout($m_fp, max(1, (int)$maxtime));
      fwrite($m_fp, $sessionQuery);
      // Status line: "<http-edition> <http-state> <reason>"
      $httpstas = explode(" ", (string)fgets($m_fp, 8192));
      if (count($httpstas) < 2) {
          fclose($m_fp);
          return FALSE;
      }
      $m_httphead = array();
      $m_httphead["http-edition"] = trim($httpstas[0]);
      $m_httphead["http-state"] = trim($httpstas[1]);
      // Response headers, keyed by lower-cased name, up to the first blank line
      $lnum = 0;
      while (!feof($m_fp)) {
          $line = trim((string)fgets($m_fp, 8192));
          if ($line == "") {
              break;
          }
          if (++$lnum > 100) {
              // An absurd number of header lines: give up instead of reading headers as body
              fclose($m_fp);
              return FALSE;
          }
          $parts = explode(':', $line, 2);
          $hkey = trim($parts[0]);
          if ($hkey != "") {
              $m_httphead[strtolower($hkey)] = (isset($parts[1]) ? trim($parts[1]) : '');
          }
      }
      // Redirect: follow the Location header instead of requesting the same URL again
      if (preg_match("/^3/", $m_httphead["http-state"])) {
          fclose($m_fp);
          if (empty($m_httphead["location"]) || $JumpCount >= 3) {
              return FALSE;
          }
          $location = $m_httphead["location"];
          if (!preg_match("/^[a-z][a-z0-9+.\-]*:\/\//i", $location)) {
              // Relative target: resolve it against the URL that was just requested
              $location = FillUrl('http://'.$ghost.$gquery, $location);
          }
          if (!preg_match("/^http:\/\//i", $location)) {
              // This function only speaks plain HTTP on port 80
              return FALSE;
          }
          return DownImageKeep($location, $rfurl, $filename, $gcookie, $JumpCount + 1, $maxtime);
      }
      if (!preg_match("/^2/", $m_httphead["http-state"])) {
          fclose($m_fp);
          return FALSE;
      }
      if (isset($m_httphead['transfer-encoding']) && stripos($m_httphead['transfer-encoding'], 'chunked') !== false) {
          // Chunk framing is not decoded here; saving it verbatim would produce a corrupt file
          fclose($m_fp);
          return FALSE;
      }
      // -1 means the length is unknown: read until the server closes the connection or time runs out
      $contentLength = (isset($m_httphead['content-length']) ? (int)$m_httphead['content-length'] : -1);
      // Save the file
      $fp = fopen($filename, "w");
      if (!$fp) {
          fclose($m_fp);
          die("写入文件:{$filename} 失败");
      }
      $received = 0;
      $starttime = time();
      while (!feof($m_fp)) {
          // Stop once the announced size has been reached
          if ($contentLength >= 0 && $received >= $contentLength) {
              break;
          }
          // Stop on timeout
          if (time() - $starttime > $maxtime) {
              break;
          }
          $want = 8192;
          if ($contentLength >= 0) {
              $want = min($want, $contentLength - $received);
          }
          $chunk = fread($m_fp, $want);
          if ($chunk === false || $chunk === '') {
              break;
          }
          fwrite($fp, $chunk);
          $received += strlen($chunk);
      }
      fclose($fp);
      fclose($m_fp);
      if ($received == 0) {
          @unlink($filename);
          return FALSE;
      }
      return TRUE;
  }
  /**
   * Fetch a page and turn the cookies it sets into a "Cookie: ..." request header line.
   *
   * The result is cached in the globals $gcookie / $lastRfurl, so asking for the
   * same URL again returns the cached header without another request.
   *
   * @access public
   * @param string $gurl URL of the page to request (plain HTTP, port 80)
   * @return string complete "Cookie: name=value; ..." header line, or '' when
   *                the URL is empty, the host is unreachable or no cookie was set
   */
  function RefurlCookie($gurl)
  {
      global $gcookie, $lastRfurl;
      $gurl = trim($gurl);
      if (!empty($gcookie) && $lastRfurl == $gurl) {
          return $gcookie;
      }
      $lastRfurl = $gurl;
      if ($gurl == '') {
          return '';
      }
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      $gquery = $urlinfos['query'];
      $gcookie = '';
      // Nothing to connect to, or the URL would inject extra header lines
      if ($ghost == '' || preg_match("/[\r\n]/", $ghost.$gquery)) {
          return $gcookie;
      }
      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      $sessionQuery .= "Connection: Close\r\n\r\n";
      $errno = '';
      $errstr = '';
      $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10);
      if (!$m_fp) {
          // A failed cookie prefetch is not fatal: the caller simply sends no cookie
          return $gcookie;
      }
      fwrite($m_fp, $sessionQuery);
      // Scan the response headers (up to the first blank line) for cookies
      $pairs = array();
      $lnum = 0;
      while (!feof($m_fp)) {
          $line = trim((string)fgets($m_fp, 8192));
          if ($line == "" || ++$lnum > 100) {
              break;
          }
          if (preg_match("/^set-cookie\s*:\s*([^;]+)/i", $line, $found)) {
              // Only the leading name=value pair is sent back; attributes such as path/expires are dropped
              $pair = trim($found[1]);
              if ($pair != '') {
                  $pairs[] = $pair;
              }
          } else if (preg_match("/^cookie\s*:\s*(.+)$/i", $line, $found)) {
              $pairs[] = trim($found[1]);
          }
      }
      fclose($m_fp);
      if (count($pairs) > 0) {
          $gcookie = 'Cookie: '.implode('; ', $pairs);
      }
      return $gcookie;
  }
  /**
   * Split a URL into its host part and its request target (path plus query string).
   *
   * A leading http:// or https:// is removed first. The host is everything up to
   * the first "/" or "?"; the request target is the remainder and always starts
   * with "/", so a URL without any path yields "/".
   *
   * @access public
   * @param string $gurl URL to split
   * @return array array('host' => string, 'query' => string)
   */
  function GetHostInfo($gurl)
  {
      $gurl = preg_replace("/^https?:\/\//i", "", trim($gurl));
      // Length of the leading run that contains neither "/" nor "?" = length of the host
      $hostLength = strcspn($gurl, '/?');
      $target = (string)substr($gurl, $hostLength);
      if ($target === '' || $target[0] != '/') {
          // No path at all ("a.com" or "a.com?x=1"): request the site root
          $target = '/'.$target;
      }
      $garr = array();
      $garr['host'] = (string)substr($gurl, 0, $hostLength);
      $garr['query'] = $target;
      return $garr;
  }
  /**
   * Convert the HTML links found in a document into {dede:link} tags.
   *
   * Every <a href="...">text</a> becomes one line of the form
   * {dede:link text='text'} url {/dede:link}. Single quotes in the link text are
   * replaced by backticks. When the text contains markup or is longer than
   * 40 bytes, a generated name (prefix plus 1-based position) is used instead.
   *
   * @access public
   * @param string $body document HTML (taken by reference, not modified)
   * @return string the generated tags, one per line, or '' when no link was found
   */
  function TurnLinkTag(&$body)
  {
      $ttx = '';
      $handid = '服务器';
      // [^>]*? (not +?) so that links without any attribute after href are matched as well
      if (!preg_match_all("/<a href=['\"](.+?)['\"]([^>]*?)>(.+?)<\/a>/is", $body, $match)) {
          return $ttx;
      }
      foreach ($match[1] as $i => $href) {
          $servername = str_replace("'", "`", $match[3][$i]);
          if (preg_match("/[<>]/", $servername) || strlen($servername) > 40) {
              $servername = $handid.($i + 1);
          }
          $ttx .= "{dede:link text='$servername'} {$href} {/dede:link}\r\n";
      }
      return $ttx;
  }
  /**
   * Strip XML CDATA wrappers from a string.
   *
   * Removes every "<![CDATA[" opening marker and every "]]>" closing marker
   * and leaves the enclosed text in place.
   *
   * @access public
   * @param string $str text that may contain CDATA sections
   * @return string the text without CDATA markers
   */
  function RpCdata($str)
  {
      // Markers are removed in this order: all openers first, then all closers
      $markers = array('<![CDATA[', ']]>');
      return str_replace($markers, '', $str);
  }
  /**
   * Download an RSS feed and extract link, title and first image of its items.
   *
   * The feed is converted to the site charset ($cfg_soft_lang) when its declared
   * encoding differs. Items are matched with regular expressions, not an XML parser.
   *
   * @access public
   * @param string $rssurl URL of the RSS feed
   * @return array list of array('link' => string, 'title' => string, 'image' => string)
   */
  function GetRssLinks($rssurl)
  {
      global $cfg_soft_lang;
      $dhd = new DedeHttpDown();
      $dhd->OpenUrl($rssurl);
      $rsshtml = $dhd->GetHtml();
      // Work out the feed encoding; fall back to the site charset when none is declared
      $sitecode = strtolower((string)$cfg_soft_lang);
      preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos);
      if (isset($infos[1])) {
          $pcode = strtolower(trim($infos[1]));
      } else {
          $pcode = $sitecode;
      }
      if ($sitecode == 'gb2312') {
          if ($pcode == 'utf-8') {
              $rsshtml = utf82gb($rsshtml);
          } else if ($pcode == 'big5') {
              $rsshtml = big52gb($rsshtml);
          }
      } else if ($sitecode == 'utf-8') {
          if ($pcode == 'gbk' || $pcode == 'gb2312') {
              $rsshtml = gb2utf8($rsshtml);
          } else if ($pcode == 'big5') {
              $rsshtml = gb2utf8(big52gb($rsshtml));
          }
      }
      $rsarr = array();
      // NOTE(review): the three result lists are paired up by position, which assumes
      // every item carries a title, a link and a description - confirm for the feeds in use
      preg_match_all("/<item(.*)<title>(.*)<\/title>/isU", $rsshtml, $titles);
      preg_match_all("/<item(.*)<link>(.*)<\/link>/isU", $rsshtml, $links);
      preg_match_all("/<item(.*)<description>(.*)<\/description>/isU", $rsshtml, $descriptions);
      if (!isset($links[2])) {
          return '';
      }
      foreach ($links[2] as $k => $v) {
          $link = RpCdata($v);
          $rsarr[$k]['link'] = $link;
          if (isset($titles[2][$k])) {
              $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
          } else {
              // No title for this item: use the last path segment of its link instead
              $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", $link);
          }
          if (isset($descriptions[2][$k])) {
              $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k], $rssurl);
          } else {
              $rsarr[$k]['image'] = '';
          }
      }
      return $rsarr;
  }
  /**
   * Extract the URL of the first image found in an RSS item description.
   *
   * @access public
   * @param string $descriptions HTML of the item description
   * @param string $refurl       URL the image address is resolved against
   * @return string the image URL as completed by FillUrl(), or '' when there is no image
   */
  function GetddImgFromRss($descriptions, $refurl)
  {
      if ($descriptions == '') {
          return '';
      }
      preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $descriptions, $imgs);
      if (!isset($imgs[2][0])) {
          return '';
      }
      // The ungreedy pattern can leave a quote attached to the captured address
      $src = preg_replace("/[\"']/", '', $imgs[2][0]);
      // Slashes must not be collapsed here: that would turn "http://host/x" into
      // "http:/host/x" and make FillUrl() treat an absolute URL as a relative path.
      // FillUrl() normalises repeated slashes itself.
      return FillUrl($refurl, $src);
  }
  /**
   * Complete a (possibly relative) URL against the URL of the page it was found on.
   *
   * Handles absolute http/https URLs, protocol-relative URLs ("//host/x"),
   * root-relative URLs ("/x"), "./x" and plain relative paths. "../" segments
   * are kept as they are. Repeated slashes in the result are collapsed to one.
   *
   * @access public
   * @param string $refurl URL of the referring page
   * @param string $surl   URL to complete
   * @return string the completed URL including its scheme; the base path WITHOUT
   *                scheme when $surl is empty; '' when $surl is just "." or ".."
   */
  function FillUrl($refurl, $surl)
  {
      $refurl = trim($refurl);
      $surl = trim($surl);
      // Relative results inherit the scheme of the referring page
      $scheme = (preg_match("/^https:\/\//i", $refurl) ? 'https' : 'http');
      $bare = preg_replace("/^https?:\/\//i", "", $refurl);
      $urls = parse_url($refurl);
      if (is_array($urls) && isset($urls['host'])) {
          $basehost = $urls['host'];
          $defaultPort = ($scheme == 'https' ? '443' : '80');
          if (isset($urls['port']) && $urls['port'] != $defaultPort) {
              $basehost .= ':'.$urls['port'];
          }
      } else {
          // Referer without a scheme (or unparsable): take everything before the path as host
          $basehost = preg_replace("/[\/\?#].*$/s", "", $bare);
      }
      // The path reported by parse_url is wrong for URLs like http://xxxx/nnn/aaa?fdsafd,
      // so the base directory is assembled from the slash-separated segments instead
      $basepath = $basehost;
      $paths = explode('/', $bare);
      $last = count($paths) - 1;
      // Segment 0 is the host itself and is never treated as part of the path
      for ($i = 1; $i <= $last; $i++) {
          // The final segment counts as a file (and is dropped) when it has a dot or a query
          $skipPattern = ($i == $last ? "/[\?\.]/" : "/[\?]/");
          if (!preg_match($skipPattern, $paths[$i])) {
              $basepath .= '/'.$paths[$i];
          }
      }
      if ($surl == '') {
          return $basepath;
      }
      $pos = strpos($surl, "#");
      if ($pos > 0) {
          $surl = substr($surl, 0, $pos);
      }
      $outScheme = $scheme;
      if (preg_match("/^(https?):\/\//i", $surl, $found)) {
          // Already absolute: keep its own scheme
          $outScheme = strtolower($found[1]);
          $okurl = $surl;
      } else if (strlen($surl) > 2 && substr($surl, 0, 2) == '//') {
          // Protocol-relative: the host comes from $surl, the scheme from the referer
          $okurl = substr($surl, 2);
      } else if ($surl[0] == '/') {
          // A leading '/' addresses the site root
          $okurl = $basehost.$surl;
      } else if ($surl[0] == '.') {
          if (strlen($surl) <= 2) {
              return '';
          } else if ($surl[1] == '/') {
              $okurl = $basepath.substr($surl, 1);
          } else {
              $okurl = $basepath.'/'.$surl;
          }
      } else {
          $okurl = $basepath.'/'.$surl;
      }
      $okurl = preg_replace("/^https?:\/\//i", '', $okurl);
      return $outScheme.'://'.preg_replace("/\/{1,}/", '/', $okurl);
  }
  /**
   * Build the list of list-page URLs to collect from, based on the matching rules.
   *
   * Sources, in this order: manually entered URLs, then either the rule URL as it
   * is (no placeholder), a numeric range substituted for "(*)", or - in
   * multi-category mode - one URL set per batch rule. A batch rule has the form
   * [(#)=>text for (#); (*)=>range such as 1-20; typeid=>category id or name; addurl=>extra urls separated by |]
   * When the global $islisten is 1, each source stops after its first URL.
   *
   * @access public
   * @param string     $regxurl   rule URL, may contain the placeholders (*) and (#)
   * @param string     $handurl   manually entered URLs, one per line
   * @param int|string $startid   first value for (*); leading zeros set the padding width
   * @param int|string $endid     last value for (*)
   * @param int        $addv      increment applied to (*) each step (values <= 0 become 1)
   * @param int        $usemore   0 = single range, otherwise use $batchrule
   * @param string     $batchrule batch rules for multi-category mode
   * @return array list of array(0 => url, 1 => category id)
   */
  function GetUrlFromListRule($regxurl = '', $handurl = '', $startid = 0, $endid = 0, $addv = 1, $usemore = 0, $batchrule = '')
  {
      global $dsql, $islisten;
      $lists = array();
      $n = 0;
      $islisten = (empty($islisten) ? 0 : $islisten);
      // Manually entered URLs come first
      if ($handurl != '') {
          foreach (explode("\n", $handurl) as $manualUrl) {
              $manualUrl = trim($manualUrl);
              if (preg_match("/^https?:\/\//i", $manualUrl)) {
                  $lists[$n][0] = $manualUrl;
                  $lists[$n][1] = 0;
                  $n++;
                  if ($islisten == 1) {
                      break;
                  }
              }
          }
      }
      if ($regxurl == '') {
          return $lists;
      }
      // Neither (#) nor (*) present: the rule is a literal URL
      if (!preg_match("/\(\*\)/i", $regxurl) && !preg_match("/\(#\)/", $regxurl)) {
          $lists[$n][0] = $regxurl;
          $lists[$n][1] = 0;
          return $lists;
      }
      if ($addv <= 0) {
          $addv = 1;
      }
      // No multi-category rules: expand one numeric range
      if ($usemore == 0) {
          while ($startid <= $endid) {
              // The width of the current value is kept so that "01" counts 01, 02, ...
              $lists[$n][0] = str_replace("(*)", sprintf('%0'.strlen($startid).'d', $startid), $regxurl);
              $lists[$n][1] = 0;
              $startid = sprintf('%0'.strlen($startid).'d', $startid + $addv);
              $n++;
              if ($n > 2000 || $islisten == 1) {
                  break;
              }
          }
          return $lists;
      }
      // Multi-category mode: one bracketed rule per category
      foreach (explode(']', trim($batchrule)) as $ruleText) {
          $ruleText = preg_replace("/^\[|\]$/", '', trim($ruleText));
          $fields = explode(';', $ruleText);
          if (count($fields) < 3) {
              continue;
          }
          $brtag = '';
          $startid = 0;
          $endid = 0;
          $typeid = 0;
          $addurls = array();
          foreach ($fields as $field) {
              $pair = explode('=>', trim($field));
              if (count($pair) < 2) {
                  // Not a key=>value fragment
                  continue;
              }
              $k = trim($pair[0]);
              $v = trim($pair[1]);
              if ($k == '(#)') {
                  $brtag = $v;
              } else if ($k == 'typeid') {
                  $typeid = $v;
              } else if ($k == 'addurl') {
                  $addurls = explode('|', $v);
              } else if ($k == '(*)') {
                  $range = explode('-', preg_replace("/[ \r\n\t]/", '', $v));
                  $startid = $range[0];
                  // A single value such as "7" stands for the range 7-7
                  $endid = (isset($range[1]) ? $range[1] : $range[0]);
              }
          }
          // The category may be given by name instead of by id
          if (preg_match('/[^0-9]/', $typeid)) {
              // NOTE(review): $typeid is placed into the SQL text as-is; confirm the caller
              // escapes the rule text before it reaches this function
              $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
              if (is_array($arr)) {
                  $typeid = $arr['id'];
              } else {
                  $typeid = 0;
              }
          }
          // Extra URLs take precedence
          $mjj = 0;
          foreach ($addurls as $addurl) {
              $addurl = trim($addurl);
              if ($addurl == '') {
                  continue;
              }
              $lists[$n][0] = $addurl;
              $lists[$n][1] = $typeid;
              $n++;
              $mjj++;
              if ($islisten == 1) {
                  break;
              }
          }
          // In listen mode the range is only used when no extra URL was given
          if ($islisten == 1 && $mjj != 0) {
              continue;
          }
          // Expand the rule URL; the text substituted for (#) may itself contain (*)
          while ($startid <= $endid) {
              $url = str_replace("(#)", $brtag, $regxurl);
              $lists[$n][0] = str_replace("(*)", sprintf('%0'.strlen($startid).'d', $startid), $url);
              $lists[$n][1] = $typeid;
              $startid = sprintf('%0'.strlen($startid).'d', $startid + $addv);
              $n++;
              if ($islisten == 1) {
                  break;
              }
              if ($n > 20000) {
                  break;
              }
          }
      }
      return $lists;
  }
  492. ?>