国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

553 lines
18KB

  1. <?php
  2. if (!defined('DEDEINC')) exit('dedebiz');
  3. /**
  4. * 采集小助手
  5. *
  6. * @version $Id: charset.helper.php 1 2010-07-05 11:43:09Z tianya $
  7. * @package DedeBIZ.Helpers
  8. * @copyright Copyright (c) 2022, DedeBIZ.COM
  9. * @license https://www.dedebiz.com/license
  10. * @link https://www.dedebiz.com
  11. */
  12. require_once(DEDEINC."/dedehttpdown.class.php");
  13. require_once(DEDEINC."/dedetag.class.php");
  14. require_once(DEDEINC."/charset.func.php");
  /**
   * Download a remote file (normally an image) over plain HTTP and save it locally.
   *
   * The request is written to a raw socket opened to the host that GetHostInfo()
   * extracts from $gurl (port 80 unless the host carries an explicit ":port").
   * When no cookie is supplied and a referer is given, the cookie line is
   * obtained from RefurlCookie(). At most three redirects are followed.
   *
   * Limitations: only the http scheme is supported, and responses that use
   * chunked transfer encoding are rejected because the body is not de-chunked.
   *
   * @access public
   * @param string $gurl address to download
   * @param string $rfurl referer address
   * @param string $filename local path the response body is written to
   * @param string $gcookie complete "Cookie: ..." header line, or "" to derive it from $rfurl
   * @param int $JumpCount number of redirects already followed
   * @param int $maxtime maximum number of seconds spent reading the body
   * @return bool TRUE when a non-empty body was saved, FALSE on any failure
   */
  function DownImageKeep($gurl, $rfurl, $filename, $gcookie = "", $JumpCount = 0, $maxtime = 30)
  {
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      if ($ghost == '') {
          return FALSE;
      }
      $gquery = $urlinfos['query'];
      //A line break inside the host or path would corrupt the request head
      if (preg_match("/[\r\n]/", $ghost.$gquery)) {
          return FALSE;
      }
      if ($gcookie == "" && !empty($rfurl)) {
          $gcookie = RefurlCookie($rfurl);
      }
      //Line breaks are removed so the referer cannot inject extra header lines
      $referer = preg_replace("/[\r\n]+/", '', (string)$rfurl);
      //An explicit "host:port" is honoured, otherwise port 80 is used
      $connectHost = $ghost;
      $connectPort = 80;
      if (preg_match("/^(.+):(\d+)$/", $ghost, $hostParts)) {
          $connectHost = $hostParts[1];
          $connectPort = (int)$hostParts[2];
      }
      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Referer: $referer\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      if ($gcookie != "" && !preg_match("/[\r\n]/", $gcookie)) {
          $sessionQuery .= $gcookie."\r\n";
      }
      $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";
      $errno = 0;
      $errstr = "";
      $m_fp = fsockopen($connectHost, $connectPort, $errno, $errstr, 10);
      if ($m_fp === false) {
          return FALSE;
      }
      //Bound every blocking read so a silent server cannot stall the caller
      stream_set_timeout($m_fp, max(1, (int)$maxtime));
      fwrite($m_fp, $sessionQuery);
      //Read the status line
      $statusLine = fgets($m_fp, 4096);
      if ($statusLine === false) {
          fclose($m_fp);
          return FALSE;
      }
      $httpstas = explode(" ", $statusLine);
      $m_httphead = array();
      $m_httphead["http-edition"] = trim($httpstas[0]);
      $m_httphead["http-state"] = isset($httpstas[1]) ? trim($httpstas[1]) : '';
      //Read the response headers (at most about 100 lines), keyed by lower-cased name
      $lnum = 0;
      while (!feof($m_fp)) {
          $line = trim((string)fgets($m_fp, 4096));
          if ($line == "" || $lnum > 100) {
              break;
          }
          $lnum++;
          $colon = strpos($line, ":");
          if ($colon === false) {
              $hkey = $line;
              $hvalue = "";
          } else {
              $hkey = substr($line, 0, $colon);
              $hvalue = (string)substr($line, $colon + 1);
          }
          $hkey = trim($hkey);
          if ($hkey != "") {
              $m_httphead[strtolower($hkey)] = trim($hvalue);
          }
      }
      $state = $m_httphead["http-state"];
      //Redirect: request the address named by Location and return that result
      if (preg_match("/^3/", $state)) {
          fclose($m_fp);
          if (!isset($m_httphead["location"]) || $JumpCount >= 3) {
              return FALSE;
          }
          $location = trim($m_httphead["location"]);
          if ($location == '') {
              return FALSE;
          }
          if (!preg_match("/^http:\/\//i", $location)) {
              if (substr($location, 0, 2) == '//') {
                  $location = 'http:'.$location;
              } else if (preg_match("/^[a-z][a-z0-9+.\-]*:/i", $location)) {
                  //Any other scheme (https, ftp and so on) cannot be fetched by this function
                  return FALSE;
              } else {
                  $location = FillUrl('http://'.$ghost.$gquery, $location);
              }
          }
          return DownImageKeep($location, $rfurl, $filename, $gcookie, $JumpCount + 1, $maxtime);
      }
      if (!preg_match("/^2/", $state)) {
          fclose($m_fp);
          return FALSE;
      }
      //A chunked body would be stored together with its chunk-size lines, so refuse it
      if (isset($m_httphead['transfer-encoding']) && stripos($m_httphead['transfer-encoding'], 'chunked') !== false) {
          fclose($m_fp);
          return FALSE;
      }
      //-1 means the length is unknown and the body runs until the connection closes
      $contentLength = isset($m_httphead['content-length']) ? (int)$m_httphead['content-length'] : -1;
      $okdata = "";
      $starttime = time();
      while (!feof($m_fp)) {
          $want = 8192;
          if ($contentLength >= 0) {
              $want = min($want, $contentLength - strlen($okdata));
              //Stop once the announced size has been received
              if ($want <= 0) {
                  break;
              }
          }
          $chunk = fread($m_fp, $want);
          //An empty read means the connection closed or the read timed out
          if ($chunk === false || $chunk === '') {
              break;
          }
          $okdata .= $chunk;
          //Stop when the overall time budget is used up
          if (time() - $starttime > $maxtime) {
              break;
          }
      }
      fclose($m_fp);
      if ($okdata === "") {
          //As before, no file is left behind when nothing was received
          if (file_exists($filename)) {
              @unlink($filename);
          }
          return FALSE;
      }
      //Save the file; a failure is reported to the caller instead of ending the script
      $fp = fopen($filename, "w");
      if ($fp === false) {
          return FALSE;
      }
      $written = fwrite($fp, $okdata);
      fclose($fp);
      if ($written === false || $written < strlen($okdata)) {
          @unlink($filename);
          return FALSE;
      }
      return TRUE;
  }
  /**
   * Fetch a page and return the cookies it sets, as a request header line.
   *
   * The result is cached in the globals $gcookie / $lastRfurl: asking again for
   * the same address returns the cached, non-empty value without a new request.
   * Every "Set-Cookie:" response header contributes its leading name=value pair
   * and the pairs are joined into one "Cookie: a=1; b=2" line. A response line
   * that itself starts with "cookie" is still returned verbatim, as before.
   *
   * @access public
   * @param string $gurl address of the page to request
   * @return string a "Cookie: ..." header line, or "" when none could be obtained
   */
  function RefurlCookie($gurl)
  {
      global $gcookie, $lastRfurl;
      $gurl = trim($gurl);
      if (!empty($gcookie) && $lastRfurl == $gurl) {
          return $gcookie;
      }
      $lastRfurl = $gurl;
      if ($gurl == '') {
          return '';
      }
      $urlinfos = GetHostInfo($gurl);
      $ghost = trim($urlinfos['host']);
      $gquery = $urlinfos['query'];
      //Without a usable host, or with line breaks in the request line, nothing can be requested
      if ($ghost == '' || preg_match("/[\r\n]/", $ghost.$gquery)) {
          $gcookie = "";
          return $gcookie;
      }
      //An explicit "host:port" is honoured, otherwise port 80 is used
      $connectHost = $ghost;
      $connectPort = 80;
      if (preg_match("/^(.+):(\d+)$/", $ghost, $hostParts)) {
          $connectHost = $hostParts[1];
          $connectPort = (int)$hostParts[2];
      }
      $sessionQuery = "GET $gquery HTTP/1.1\r\n";
      $sessionQuery .= "Host: $ghost\r\n";
      $sessionQuery .= "Accept: */*\r\n";
      $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
      $sessionQuery .= "Connection: Close\r\n\r\n";
      $errno = 0;
      $errstr = "";
      $m_fp = fsockopen($connectHost, $connectPort, $errno, $errstr, 10);
      //A failed connection yields "no cookie" instead of terminating the script
      if ($m_fp === false) {
          $gcookie = "";
          return $gcookie;
      }
      fwrite($m_fp, $sessionQuery);
      //Scan the response head (at most about 100 lines) for cookies
      $pairs = array();
      $legacyLine = "";
      $lnum = 0;
      while (!feof($m_fp)) {
          $line = trim((string)fgets($m_fp, 4096));
          if ($line == "" || $lnum > 100) {
              break;
          }
          $lnum++;
          if (preg_match("/^set-cookie:\s*([^;]+)/i", $line, $found)) {
              //Only name=value is kept; attributes such as path or expires are not sent back
              $pairs[] = trim($found[1]);
          } else if (preg_match("/^cookie/i", $line)) {
              $legacyLine = $line;
              break;
          }
      }
      fclose($m_fp);
      if (count($pairs) > 0) {
          $gcookie = "Cookie: ".implode("; ", $pairs);
      } else {
          $gcookie = $legacyLine;
      }
      return $gcookie;
  }
  /**
   * Split an address into its host part and its request path (path plus query).
   *
   * A leading "http://" is removed first. Everything before the first "/" is
   * the host; the remainder, including that slash, is the request path. An
   * address without any path yields "/" as the request path.
   *
   * @access public
   * @param string $gurl address to split
   * @return array array with the keys 'host' and 'query'
   */
  function GetHostInfo($gurl)
  {
      $gurl = preg_replace("/^http:\/\//i", "", trim($gurl));
      $slash = strpos($gurl, '/');
      if ($slash === false) {
          //No path at all: request the site root instead of repeating the host as the path
          return array('host' => $gurl, 'query' => '/');
      }
      return array(
          'host' => (string)substr($gurl, 0, $slash),
          'query' => substr($gurl, $slash),
      );
  }
  /**
   * Convert the images found in an HTML fragment into DEDE album markup.
   *
   * Every <img> whose src can be read (quoted or unquoted) becomes one
   * {dede:img} tag, numbered from 1 in document order. The result always
   * starts with a {dede:pagestyle} tag built from the globals
   * $cfg_album_width and $cfg_ddimg_width (defaulted to 800 and 150 when
   * empty) followed by a {dede:comments} note. $body itself is not modified.
   *
   * @access public
   * @param string $body article content
   * @return string album markup
   */
  function TurnImageTag(&$body)
  {
      global $cfg_album_width, $cfg_ddimg_width;
      if (empty($cfg_album_width)) {
          $cfg_album_width = 800;
      }
      if (empty($cfg_ddimg_width)) {
          $cfg_ddimg_width = 150;
      }
      $patten = "/<\\s*img\\s.*?src\\s*=\\s*([\"\\'])?(?(1)(.*?)\\1|([^\\s\\>\"\\']+))/isx";
      preg_match_all($patten, $body, $images);
      //Group 2 holds quoted src values, group 3 unquoted ones
      $quoted = isset($images[2]) ? $images[2] : array();
      $unquoted = isset($images[3]) ? $images[3] : array();
      $ttx = '';
      foreach ($quoted as $key => $value) {
          $src = $value ? $value : (isset($unquoted[$key]) ? $unquoted[$key] : '');
          //No thumbnail name is known at this point, so ddimg is written empty
          $ttx .= "{dede:img ddimg='' text='图 ".($key + 1)."'}".$src."{/dede:img}"."\r\n";
      }
      $ttx = "\r\n{dede:pagestyle maxwidth='{$cfg_album_width}' ddmaxwidth='{$cfg_ddimg_width}' row='3' col='3' value='2'/}\r\n{dede:comments}图集类型会采集时生成此配置是正常的,不过如果后面没有跟着img标记则表示规则无效{/dede:comments}\r\n".$ttx;
      return $ttx;
  }
  /**
   * Convert the links found in an HTML fragment into {dede:link} tags.
   *
   * Each <a href="..."> ... </a> becomes one line of the form
   * {dede:link text='NAME'} URL {/dede:link}. NAME is the link text with
   * single quotes turned into backticks; when the text contains markup or is
   * longer than 40 bytes, a generated name (prefix plus position) is used.
   * Links with no attribute after href are matched as well.
   *
   * @access public
   * @param string $body article content
   * @return string one {dede:link} line per link, "" when there is none
   */
  function TurnLinkTag(&$body)
  {
      $ttx = '';
      $handid = '服务器';
      //[^>]*? (not +?) so that <a href="x">text</a> without further attributes also matches
      preg_match_all("/<a href=['\"](.+?)['\"]([^>]*?)>(.+?)<\/a>/is", $body, $match);
      if (empty($match[1]) || !is_array($match[1])) {
          return $ttx;
      }
      foreach ($match[1] as $i => $href) {
          $fallback = $handid.($i + 1);
          $servername = isset($match[3][$i]) ? str_replace("'", "`", $match[3][$i]) : $fallback;
          if (preg_match("/[<>]/", $servername) || strlen($servername) > 40) {
              $servername = $fallback;
          }
          $ttx .= "{dede:link text='$servername'} {$href} {/dede:link}\r\n";
      }
      return $ttx;
  }
  /**
   * Strip the XML CDATA wrapper markers from a string.
   *
   * All occurrences of the opening marker and then of the closing marker are
   * removed; the text between them is kept.
   *
   * @access public
   * @param string $str text that may contain CDATA markers
   * @return string the text without the markers
   */
  function RpCdata($str)
  {
      $markers = array('<![CDATA[', ']]>');
      return str_replace($markers, '', $str);
  }
  /**
   * Download an RSS feed and extract link, title and image of its items.
   *
   * The feed is fetched with DedeHttpDown and converted to the system
   * encoding ($cfg_soft_lang) when its declared encoding differs. Items are
   * located with regular expressions, not an XML parser. When an item has no
   * title at its position, the last path segment of its link is used instead.
   *
   * @access public
   * @param string $rssurl address of the feed
   * @return array|string list of arrays with the keys 'link', 'title' and 'image';
   *                      '' when the link pattern could not be evaluated
   */
  function GetRssLinks($rssurl)
  {
      global $cfg_soft_lang;
      $dhd = new DedeHttpDown();
      $dhd->OpenUrl($rssurl);
      $rsshtml = $dhd->GetHtml();
      //Detect the encoding declared by the feed; fall back to the system encoding
      $syslang = strtolower(trim((string)$cfg_soft_lang));
      preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos);
      $pcode = isset($infos[1]) ? strtolower(trim($infos[1])) : $syslang;
      if ($syslang == 'gb2312') {
          if ($pcode == 'utf-8') {
              $rsshtml = utf82gb($rsshtml);
          } else if ($pcode == 'big5') {
              $rsshtml = big52gb($rsshtml);
          }
      } else if ($syslang == 'utf-8') {
          if ($pcode == 'gbk' || $pcode == 'gb2312') {
              $rsshtml = gb2utf8($rsshtml);
          } else if ($pcode == 'big5') {
              $rsshtml = gb2utf8(big52gb($rsshtml));
          }
      }
      $rsarr = array();
      preg_match_all("/<item(.*)<title>(.*)<\/title>/isU", $rsshtml, $titles);
      preg_match_all("/<item(.*)<link>(.*)<\/link>/isU", $rsshtml, $links);
      preg_match_all("/<item(.*)<description>(.*)<\/description>/isU", $rsshtml, $descriptions);
      if (!isset($links[2])) {
          return '';
      }
      foreach ($links[2] as $k => $v) {
          $link = RpCdata($v);
          $rsarr[$k]['link'] = $link;
          if (isset($titles[2][$k])) {
              $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
          } else {
              //No title at this position: name the entry after the last path segment of its link
              $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", $link);
          }
          if (isset($descriptions[2][$k])) {
              $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k], $rssurl);
          } else {
              $rsarr[$k]['image'] = '';
          }
      }
      return $rsarr;
  }
  /**
   * Extract the first image address from an RSS item description.
   *
   * The src of the first <img> is taken, quotes are removed, repeated slashes
   * are collapsed (the "//" that follows a scheme such as "http:" is kept) and
   * the result is completed against $refurl with FillUrl().
   *
   * @access public
   * @param string $descriptions item description (HTML)
   * @param string $refurl address the description came from, used to complete relative addresses
   * @return string absolute image address, or '' when there is no image
   */
  function GetddImgFromRss($descriptions, $refurl)
  {
      if ($descriptions == '') {
          return '';
      }
      preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $descriptions, $imgs);
      if (!isset($imgs[2][0])) {
          return '';
      }
      $src = preg_replace("/[\"']/", '', $imgs[2][0]);
      //Collapse duplicated slashes, but not the pair directly after "scheme:"
      $src = preg_replace("#(?<!:)/{2,}#", '/', $src);
      return FillUrl($refurl, $src);
  }
  /**
   * Complete a (possibly relative) address against the address of the page it was found on.
   *
   * The base directory is rebuilt from the segments of $refurl: segments that
   * contain "?" are skipped, and the last segment only counts as a directory
   * when it contains neither "?" nor ".". Supported forms of $surl:
   *   - absolute http:// or https:// addresses (returned normalised),
   *   - protocol-relative addresses ("//host/path"),
   *   - root-relative addresses ("/path"),
   *   - "./path" and other relative paths (".." segments are not resolved).
   * Repeated slashes in the result are collapsed and a "#fragment" is dropped.
   *
   * @access public
   * @param string $refurl address of the referring page
   * @param string $surl address to complete
   * @return string the completed address; the base path WITHOUT scheme when $surl is empty;
   *                '' for "." style addresses of two characters or less
   */
  function FillUrl($refurl, $surl)
  {
      $refurl = trim($refurl);
      $surl = trim($surl);
      //Relative results keep the scheme of the referring page (http when it has none)
      $scheme = 'http';
      if (preg_match("/^(https?):\/\//i", $refurl, $found)) {
          $scheme = strtolower($found[1]);
      }
      $paths = explode('/', preg_replace("/^https?:\/\//i", "", $refurl));
      $n = count($paths);
      $urls = parse_url($refurl);
      if (is_array($urls) && isset($urls['host'])) {
          $defaultPort = ($scheme == 'https') ? '443' : '80';
          $basehost = (!isset($urls['port']) || $urls['port'] == $defaultPort) ? $urls['host'] : $urls['host'].':'.$urls['port'];
      } else {
          //parse_url() finds no host when the scheme is missing; the first segment is the host then
          $basehost = $paths[0];
      }
      //parse_url()'s path is wrong for addresses like http://xxxx/nnn/aaa?fdsafd,
      //so the directory is assembled from the individual segments instead
      $basepath = $basehost;
      for ($i = 1; $i < ($n - 1); $i++) {
          if (!preg_match("/[\?]/", $paths[$i])) {
              $basepath .= '/'.$paths[$i];
          }
      }
      //With a single segment the "last segment" is the host itself and must not be appended again
      if ($n > 1 && !preg_match("/[\?\.]/", $paths[$n - 1])) {
          $basepath .= '/'.$paths[$n - 1];
      }
      if ($surl == '') {
          return $basepath;
      }
      $pos = strpos($surl, "#");
      if ($pos > 0) {
          $surl = substr($surl, 0, $pos);
      }
      if (preg_match("/^(https?):\/\/(.*)$/is", $surl, $found)) {
          //Already absolute: keep its own scheme
          $scheme = strtolower($found[1]);
          $okurl = $found[2];
      } else if (substr($surl, 0, 2) == '//') {
          //Protocol-relative: the host is part of $surl
          $okurl = substr($surl, 2);
      } else if ($surl[0] == '/') {
          //A leading '/' addresses the site root
          $okurl = $basehost.$surl;
      } else if ($surl[0] == '.') {
          if (strlen($surl) <= 2) {
              return '';
          } else if ($surl[1] == '/') {
              $okurl = $basepath.substr($surl, 1);
          } else {
              $okurl = $basepath.'/'.$surl;
          }
      } else {
          $okurl = $basepath.'/'.$surl;
      }
      return $scheme.'://'.preg_replace("/\/{1,}/", '/', $okurl);
  }
  /**
   * Build the list of list-page addresses described by a collection rule.
   *
   * Sources, in this order:
   *   1. $handurl: addresses given by hand, one per line (only http:// lines are used);
   *   2. $regxurl: a template that may contain (*) for a running number and
   *      (#) for a per-column fragment.
   * With $usemore == 0 the number runs from $startid to $endid in steps of
   * $addv, zero-padded to the width of the current start value. Otherwise
   * $batchrule holds one bracketed rule per column, for example
   *   [(#)=>news; (*)=>1-20; typeid=>5; addurl=>http://a/|http://b/]
   * where (#) replaces (#) in the template, (*) is the number range, typeid is
   * a column id or column name (names are looked up in the arctype table) and
   * addurl lists extra addresses separated by "|". A (*) value without "-" is
   * treated as a single page. Rule parts without "=>" are ignored.
   * In listen mode (global $islisten == 1) each source contributes one address only.
   *
   * @access public
   * @param string $regxurl address template
   * @param string $handurl hand-written addresses, one per line
   * @param string $startid first number
   * @param string $endid last number
   * @param string $addv step between numbers (values below 1 are treated as 1)
   * @param string $usemore non-zero to use the per-column rules in $batchrule
   * @param string $batchrule per-column rules
   * @return array list of array(0 => address, 1 => column id)
   */
  function GetUrlFromListRule($regxurl = '', $handurl = '', $startid = 0, $endid = 0, $addv = 1, $usemore = 0, $batchrule = '')
  {
      global $dsql, $islisten;
      $lists = array();
      $n = 0;
      $islisten = (empty($islisten) ? 0 : $islisten);
      if ($handurl != '') {
          foreach (explode("\n", $handurl) as $oneurl) {
              $oneurl = trim($oneurl);
              if (preg_match("/^http:\/\//i", $oneurl)) {
                  $lists[$n][0] = $oneurl;
                  $lists[$n][1] = 0;
                  $n++;
                  if ($islisten == 1) {
                      break;
                  }
              }
          }
      }
      if ($regxurl == '') {
          return $lists;
      }
      //Neither (#) nor (*) present: the template is a plain address
      if (!preg_match("/\(\*\)/i", $regxurl) && !preg_match("/\(#\)/", $regxurl)) {
          $lists[$n][0] = $regxurl;
          $lists[$n][1] = 0;
          return $lists;
      }
      $addv = (int)$addv;
      if ($addv <= 0) {
          $addv = 1;
      }
      //No per-column rules: one numeric range for the whole template
      if ($usemore == 0) {
          while ($startid <= $endid) {
              $width = strlen((string)$startid);
              $lists[$n][0] = str_replace("(*)", sprintf('%0'.$width.'d', $startid), $regxurl);
              $lists[$n][1] = 0;
              $startid = sprintf('%0'.$width.'d', $startid + $addv);
              $n++;
              if ($n > 2000 || $islisten == 1) {
                  break;
              }
          }
          return $lists;
      }
      //Per-column rules
      //Rule format: [(#)=>value for (#); (*)=>range for (*), e.g. 1-20; typeid=>column id; addurl=>extra addresses separated by |]
      foreach (explode(']', trim($batchrule)) as $ruleText) {
          $ruleText = preg_replace("/^\[|\]$/", '', trim($ruleText));
          $parts = explode(';', $ruleText);
          if (count($parts) < 3) {
              continue;
          }
          $brtag = '';
          $startid = 0;
          $endid = 0;
          $typeid = 0;
          $addurls = array();
          foreach ($parts as $part) {
              $pair = explode('=>', trim($part), 2);
              if (count($pair) < 2) {
                  continue;
              }
              $k = trim($pair[0]);
              $v = trim($pair[1]);
              if ($k == '(#)') {
                  $brtag = $v;
              } else if ($k == 'typeid') {
                  $typeid = $v;
              } else if ($k == 'addurl') {
                  $addurls = explode('|', $v);
              } else if ($k == '(*)') {
                  $range = explode('-', preg_replace("/[ \r\n\t]/", '', $v));
                  $startid = $range[0];
                  $endid = isset($range[1]) ? $range[1] : $range[0];
              }
          }
          //The column may be given by name instead of id
          if (preg_match('/[^0-9]/', $typeid)) {
              //NOTE(review): $typeid is placed into the SQL text unescaped; confirm how $batchrule is sanitised upstream
              $arr = $dsql->GetOne("SELECT id FROM `#@__arctype` WHERE typename LIKE '$typeid' ");
              if (is_array($arr)) {
                  $typeid = $arr['id'];
              } else {
                  $typeid = 0;
              }
          }
          //Extra addresses come first
          $mjj = 0;
          foreach ($addurls as $addurl) {
              $addurl = trim($addurl);
              if ($addurl == '') {
                  continue;
              }
              $lists[$n][0] = $addurl;
              $lists[$n][1] = $typeid;
              $n++;
              $mjj++;
              if ($islisten == 1) {
                  break;
              }
          }
          //Outside listen mode, or in listen mode without any extra address
          if ($islisten != 1 || $mjj == 0) {
              //Addresses from the template; the (#) value may itself contain (*)
              while ($startid <= $endid) {
                  $width = strlen((string)$startid);
                  $oneurl = str_replace("(#)", $brtag, $regxurl);
                  $lists[$n][0] = str_replace("(*)", sprintf('%0'.$width.'d', $startid), $oneurl);
                  $lists[$n][1] = $typeid;
                  $startid = sprintf('%0'.$width.'d', $startid + $addv);
                  $n++;
                  if ($islisten == 1) {
                      break;
                  }
                  if ($n > 20000) {
                      break;
                  }
              }
          }
      }
      return $lists;
  }