国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1135 lines
38KB

  1. <?php
  2. if (!defined('DEDEINC')) exit('dedebiz');
  3. /**
  4. * Unicode编码词典的php分词器
  5. *
  6. * 1、只适用于php5,必要函数 iconv
  7. * 2、本程序是使用RMM逆向匹配算法进行分词的,词库需要特别编译,本类里提供了 MakeDict() 方法
  8. * 3、简单操作流程:SetSource -> StartAnalysis -> Get***Result
  9. * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
  10. *
  11. * @version $Id: splitword.class.php 2 11:45 2011-2-14 itplato $
  12. * @package DedeBIZ.Libraries
  13. * @copyright Copyright (c) 2022, DedeBIZ.COM
  14. * @license https://www.dedebiz.com/license
  15. * @link https://www.dedebiz.com
  16. */
  17. //常量定义
  18. define('_SP_', chr(0xFF).chr(0xFE));
  19. define('UCS2', 'ucs-2be');
  20. class SplitWord
  21. {
  22. //hash算法选项
  23. var $mask_value = 0xFFFF;
  24. //输入和输出的字符编码(只允许 utf-8、gbk/gb2312/gb18030、big5 三种类型)
  25. var $sourceCharSet = 'utf-8';
  26. var $targetCharSet = 'utf-8';
  27. //生成的分词结果数据类型 1 为全部, 2为 词典词汇及单个中日韩简繁字符及英文, 3 为词典词汇及英文
  28. var $resultType = 1;
  29. //句子长度小于这个数值时不拆分,notSplitLen = n(个汉字) * 2 + 1
  30. var $notSplitLen = 5;
  31. //把英文单词全部转小写
  32. var $toLower = FALSE;
  33. //使用最大切分模式对二元词进行消岐
  34. var $differMax = FALSE;
  35. //尝试合并单字
  36. var $unitWord = TRUE;
  37. //初始化类时直接加载词典
  38. var $loadInit = TRUE;
  39. //使用热门词优先模式进行消岐
  40. var $differFreq = FALSE;
  41. //被转换为unicode的源字符串
  42. var $sourceString = '';
  43. //附加词典
  44. var $addonDic = array();
  45. var $addonDicFile = 'data/words_addons.dic';
  46. //主词典
  47. var $dicStr = '';
  48. var $mainDic = array();
  49. var $mainDicHand = FALSE;
  50. var $mainDicInfos = array();
  51. var $mainDicFile = 'data/base_dic_full.dic';
  52. //是否直接载入词典(选是载入速度较慢,但解析较快;选否载入较快,但解析较慢,需要时才会载入特定的词条)
  53. var $mainDicFileZip = 'data/base_dic_full.zip';
  54. var $isLoadAll = FALSE;
  55. var $isUnpacked = FALSE;
  56. //主词典词语最大长度 x / 2
  57. var $dicWordMax = 14;
  58. //粗分后的数组(通常是截取句子等用途)
  59. var $simpleResult = array();
  60. //最终结果(用空格分开的词汇列表)
  61. var $finallyResult = '';
  62. //是否已经载入词典
  63. var $isLoadDic = FALSE;
  64. //系统识别或合并的新词
  65. var $newWords = array();
  66. var $foundWordStr = '';
  67. //词库载入时间
  68. var $loadTime = 0;
  69. /**
  70. * 构造函数
  71. * @param $source_charset
  72. * @param $target_charset
  73. * @param $load_alldic
  74. * @param $source
  75. *
  76. * @return void
  77. */
  78. function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  79. {
  80. $this->SetSource( $source, $source_charset, $target_charset );
  81. $this->isLoadAll = $load_all;
  82. if(file_exists(DEDEINC.'/'.$this->mainDicFile)) $this->isUnpacked = TRUE;
  83. if($this->loadInit) $this->LoadDict();
  84. }
  85. function SplitWord($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  86. {
  87. $this->__construct($source_charset, $target_charset, $load_all, $source);
  88. }
  89. /**
  90. * 析构函数
  91. */
  92. function __destruct()
  93. {
  94. if( $this->mainDicHand !== FALSE )
  95. {
  96. @fclose( $this->mainDicHand );
  97. }
  98. }
  99. /**
  100. * 根据字符串计算key索引
  101. * @param $key
  102. * @return short int
  103. */
  104. function _get_index( $key )
  105. {
  106. $l = strlen($key);
  107. $h = 0x238f13af;
  108. while ($l--)
  109. {
  110. $h += ($h << 5);
  111. $h ^= ord($key[$l]);
  112. $h &= 0x7fffffff;
  113. }
  114. return ($h % $this->mask_value);
  115. }
  116. /**
  117. * 从文件获得词
  118. * @param $key
  119. * @param $type (类型 word 或 key_groups)
  120. * @return short int
  121. */
  122. function GetWordInfos( $key, $type='word' )
  123. {
  124. if( !$this->mainDicHand )
  125. {
  126. $this->mainDicHand = fopen($this->mainDicFile, 'r');
  127. }
  128. $p = 0;
  129. $keynum = $this->_get_index( $key );
  130. if( isset($this->mainDicInfos[ $keynum ]) )
  131. {
  132. $data = $this->mainDicInfos[ $keynum ];
  133. }
  134. else
  135. {
  136. //rewind( $this->mainDicHand );
  137. $move_pos = $keynum * 8;
  138. fseek($this->mainDicHand, $move_pos, SEEK_SET);
  139. $dat = fread($this->mainDicHand, 8);
  140. $arr = unpack('I1s/n1l/n1c', $dat);
  141. if( $arr['l'] == 0 )
  142. {
  143. return FALSE;
  144. }
  145. fseek($this->mainDicHand, $arr['s'], SEEK_SET);
  146. $data = @unserialize(fread($this->mainDicHand, $arr['l']));
  147. $this->mainDicInfos[ $keynum ] = $data;
  148. }
  149. if( !is_array($data) || !isset($data[$key]) )
  150. {
  151. return FALSE;
  152. }
  153. return ($type=='word' ? $data[$key] : $data);
  154. }
  155. /**
  156. * 设置源字符串
  157. * @param $source
  158. * @param $source_charset
  159. * @param $target_charset
  160. *
  161. * @return bool
  162. */
  163. function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
  164. {
  165. $this->sourceCharSet = strtolower($source_charset);
  166. $this->targetCharSet = strtolower($target_charset);
  167. $this->simpleResult = array();
  168. $this->finallyResult = array();
  169. $this->finallyIndex = array();
  170. if( $source != '' )
  171. {
  172. $rs = TRUE;
  173. if( preg_match("/^utf/", $source_charset) ) {
  174. $this->sourceString = @iconv('utf-8', UCS2, $source);
  175. }
  176. else if( preg_match("/^gb/", $source_charset) ) {
  177. $this->sourceString = @iconv('utf-8', UCS2, iconv('gb18030', 'utf-8', $source));
  178. }
  179. else if( preg_match("/^big/", $source_charset) ) {
  180. $this->sourceString = @iconv('utf-8', UCS2, iconv('big5', 'utf-8', $source));
  181. }
  182. else {
  183. $rs = FALSE;
  184. }
  185. }
  186. else
  187. {
  188. $rs = FALSE;
  189. }
  190. return $rs;
  191. }
  192. /**
  193. * 设置结果类型(只在获取finallyResult才有效)
  194. * @param $rstype 1 为全部, 2去除特殊符号
  195. *
  196. * @return void
  197. */
  198. function SetResultType( $rstype )
  199. {
  200. $this->resultType = $rstype;
  201. }
  202. /**
  203. * 载入词典
  204. *
  205. * @return void
  206. */
  207. function LoadDict( $maindic='' )
  208. {
  209. $this->addonDicFile = DEDEINC.'/'.$this->addonDicFile;
  210. $this->mainDicFile = DEDEINC.'/'.$this->mainDicFile;
  211. $this->mainDicFileZip = DEDEINC.'/'.$this->mainDicFileZip;
  212. $startt = microtime(TRUE);
  213. //正常读取文件
  214. $dicAddon = $this->addonDicFile;
  215. if($maindic=='' || !file_exists($maindic) )
  216. {
  217. $dicWords = $this->mainDicFile ;
  218. }
  219. else
  220. {
  221. $dicWords = $maindic;
  222. $this->mainDicFile = $maindic;
  223. }
  224. //加载主词典(只打开)
  225. if($this->isUnpacked){
  226. $this->mainDicHand = fopen($dicWords, 'r');
  227. }else{
  228. $this->InportDict($this->mainDicFileZip);
  229. }
  230. //载入副词典
  231. $hw = '';
  232. $ds = file($dicAddon);
  233. foreach($ds as $d)
  234. {
  235. $d = trim($d);
  236. if($d=='') continue;
  237. $estr = substr($d, 1, 1);
  238. if( $estr==':' ) {
  239. $hw = substr($d, 0, 1);
  240. }
  241. else
  242. {
  243. $spstr = _SP_;
  244. $spstr = iconv(UCS2, 'utf-8', $spstr);
  245. $ws = explode(',', $d);
  246. $wall = iconv('utf-8', UCS2, join($spstr, $ws));
  247. $ws = explode(_SP_, $wall);
  248. foreach($ws as $estr)
  249. {
  250. $this->addonDic[$hw][$estr] = strlen($estr);
  251. }
  252. }
  253. }
  254. $this->loadTime = microtime(TRUE) - $startt;
  255. $this->isLoadDic = TRUE;
  256. }
  257. /**
  258. * 检测某个词是否存在
  259. */
  260. function IsWord( $word )
  261. {
  262. $winfos = $this->GetWordInfos( $word );
  263. return ($winfos !== FALSE);
  264. }
  265. /**
  266. * 获得某个词的词性及词频信息
  267. * @parem $word unicode编码的词
  268. * @return void
  269. */
  270. function GetWordProperty($word)
  271. {
  272. if( strlen($word)<4 )
  273. {
  274. return '/s';
  275. }
  276. $infos = $this->GetWordInfos($word);
  277. return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s";
  278. }
  279. /**
  280. * 指定某词的词性信息(通常是新词)
  281. * @parem $word unicode编码的词
  282. * @parem $infos array('c' => 词频, 'm' => 词性);
  283. * @return void;
  284. */
  285. function SetWordInfos($word, $infos)
  286. {
  287. if( strlen($word)<4 )
  288. {
  289. return ;
  290. }
  291. if( isset($this->mainDicInfos[$word]) )
  292. {
  293. $this->newWords[$word]++;
  294. $this->mainDicInfos[$word]['c']++;
  295. }
  296. else
  297. {
  298. $this->newWords[$word] = 1;
  299. $this->mainDicInfos[$word] = $infos;
  300. }
  301. }
  302. /**
  303. * 开始执行分析
  304. * @parem bool optimize 是否对结果进行优化
  305. * @return bool
  306. */
  307. function StartAnalysis($optimize=TRUE)
  308. {
  309. if( !$this->isLoadDic )
  310. {
  311. $this->LoadDict();
  312. }
  313. $this->simpleResult = $this->finallyResult = array();
  314. $this->sourceString .= chr(0).chr(32);
  315. $slen = strlen($this->sourceString);
  316. $sbcArr = array();
  317. $j = 0;
  318. //全角与半角字符对照表
  319. for($i=0xFF00; $i < 0xFF5F; $i++)
  320. {
  321. $scb = 0x20 + $j;
  322. $j++;
  323. $sbcArr[$i] = $scb;
  324. }
  325. //对字符串进行粗分
  326. $onstr = '';
  327. $lastc = 1; //1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符
  328. $s = 0;
  329. $ansiWordMatch = "[0-9a-z@#%\+\.-]";
  330. $notNumberMatch = "[a-z@#%\+]";
  331. for($i=0; $i < $slen; $i++)
  332. {
  333. $c = $this->sourceString[$i].$this->sourceString[++$i];
  334. $cn = hexdec(bin2hex($c));
  335. $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
  336. //ANSI字符
  337. if($cn < 0x80)
  338. {
  339. if( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) )
  340. {
  341. if( $lastc != 2 && $onstr != '') {
  342. $this->simpleResult[$s]['w'] = $onstr;
  343. $this->simpleResult[$s]['t'] = $lastc;
  344. $this->_deep_analysis($onstr, $lastc, $s, $optimize);
  345. $s++;
  346. $onstr = '';
  347. }
  348. $lastc = 2;
  349. $onstr .= chr(0).chr($cn);
  350. }
  351. else
  352. {
  353. if( $onstr != '' )
  354. {
  355. $this->simpleResult[$s]['w'] = $onstr;
  356. if( $lastc==2 )
  357. {
  358. if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
  359. }
  360. $this->simpleResult[$s]['t'] = $lastc;
  361. if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
  362. $s++;
  363. }
  364. $onstr = '';
  365. $lastc = 3;
  366. if($cn < 31)
  367. {
  368. continue;
  369. }
  370. else
  371. {
  372. $this->simpleResult[$s]['w'] = chr(0).chr($cn);
  373. $this->simpleResult[$s]['t'] = 3;
  374. $s++;
  375. }
  376. }
  377. }
  378. //普通字符
  379. else
  380. {
  381. //正常文字
  382. if( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)
  383. || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) )
  384. {
  385. if( $lastc != 1 && $onstr != '')
  386. {
  387. $this->simpleResult[$s]['w'] = $onstr;
  388. if( $lastc==2 )
  389. {
  390. if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
  391. }
  392. $this->simpleResult[$s]['t'] = $lastc;
  393. if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
  394. $s++;
  395. $onstr = '';
  396. }
  397. $lastc = 1;
  398. $onstr .= $c;
  399. }
  400. //特殊符号
  401. else
  402. {
  403. if( $onstr != '' )
  404. {
  405. $this->simpleResult[$s]['w'] = $onstr;
  406. if( $lastc==2 )
  407. {
  408. if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
  409. }
  410. $this->simpleResult[$s]['t'] = $lastc;
  411. if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
  412. $s++;
  413. }
  414. //检测书名
  415. if( $cn == 0x300A )
  416. {
  417. $tmpw = '';
  418. $n = 1;
  419. $isok = FALSE;
  420. $ew = chr(0x30).chr(0x0B);
  421. while(TRUE)
  422. {
  423. if(!isset($this->sourceString[$i+$n]) && !isset($this->sourceString[$i+$n+1]))
  424. break;
  425. $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];
  426. if( $w == $ew )
  427. {
  428. $this->simpleResult[$s]['w'] = $c;
  429. $this->simpleResult[$s]['t'] = 5;
  430. $s++;
  431. $this->simpleResult[$s]['w'] = $tmpw;
  432. $this->newWords[$tmpw] = 1;
  433. if( !isset($this->newWords[$tmpw]) )
  434. {
  435. $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
  436. $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
  437. }
  438. $this->simpleResult[$s]['t'] = 13;
  439. $s++;
  440. //最大切分模式对书名继续分词
  441. if( $this->differMax )
  442. {
  443. $this->simpleResult[$s]['w'] = $tmpw;
  444. $this->simpleResult[$s]['t'] = 21;
  445. $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
  446. $s++;
  447. }
  448. $this->simpleResult[$s]['w'] = $ew;
  449. $this->simpleResult[$s]['t'] = 5;
  450. $s++;
  451. $i = $i + $n + 1;
  452. $isok = TRUE;
  453. $onstr = '';
  454. $lastc = 5;
  455. break;
  456. }
  457. else
  458. {
  459. $n = $n+2;
  460. $tmpw .= $w;
  461. if( strlen($tmpw) > 60 )
  462. {
  463. break;
  464. }
  465. }
  466. }//while
  467. if( !$isok )
  468. {
  469. $this->simpleResult[$s]['w'] = $c;
  470. $this->simpleResult[$s]['t'] = 5;
  471. $s++;
  472. $onstr = '';
  473. $lastc = 5;
  474. }
  475. continue;
  476. }
  477. $onstr = '';
  478. $lastc = 5;
  479. if( $cn==0x3000 )
  480. {
  481. continue;
  482. }
  483. else
  484. {
  485. $this->simpleResult[$s]['w'] = $c;
  486. $this->simpleResult[$s]['t'] = 5;
  487. $s++;
  488. }
  489. }//2byte symbol
  490. }//end 2byte char
  491. }//end for
  492. //处理分词后的结果
  493. $this->_sort_finally_result();
  494. }
  495. /**
  496. * 深入分词
  497. * @parem $str
  498. * @parem $ctype (2 英文类, 3 中/韩/日文类)
  499. * @parem $spos 当前粗分结果游标
  500. * @return bool
  501. */
  502. function _deep_analysis( &$str, $ctype, $spos, $optimize=TRUE )
  503. {
  504. //中文句子
  505. if( $ctype==1 )
  506. {
  507. $slen = strlen($str);
  508. //小于系统配置分词要求长度的句子
  509. if( $slen < $this->notSplitLen )
  510. {
  511. $tmpstr = '';
  512. $lastType = 0;
  513. if( $spos > 0 ) $lastType = $this->simpleResult[$spos-1]['t'];
  514. if($slen < 5)
  515. {
  516. //echo iconv(UCS2, 'utf-8', $str).'<br/>';
  517. if( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) )
  518. {
  519. $str2 = '';
  520. if( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) )
  521. {
  522. $str2 = substr($str, 2, 2);
  523. $str = substr($str, 0, 2);
  524. }
  525. $ww = $this->simpleResult[$spos - 1]['w'].$str;
  526. $this->simpleResult[$spos - 1]['w'] = $ww;
  527. $this->simpleResult[$spos - 1]['t'] = 4;
  528. if( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) )
  529. {
  530. $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';
  531. $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
  532. }
  533. $this->simpleResult[$spos]['w'] = '';
  534. if( $str2 != '' )
  535. {
  536. $this->finallyResult[$spos-1][] = $ww;
  537. $this->finallyResult[$spos-1][] = $str2;
  538. }
  539. }
  540. else {
  541. $this->finallyResult[$spos][] = $str;
  542. }
  543. }
  544. else
  545. {
  546. $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
  547. }
  548. }
  549. //正常长度的句子,循环进行分词处理
  550. else
  551. {
  552. $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
  553. }
  554. }
  555. //英文句子,转为小写
  556. else
  557. {
  558. if( $this->toLower ) {
  559. $this->finallyResult[$spos][] = strtolower($str);
  560. }
  561. else {
  562. $this->finallyResult[$spos][] = $str;
  563. }
  564. }
  565. }
  566. /**
  567. * 中文的深入分词
  568. * @parem $str
  569. * @return void
  570. */
  571. function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=TRUE )
  572. {
  573. $quote1 = chr(0x20).chr(0x1C);
  574. $tmparr = array();
  575. $hasw = 0;
  576. //如果前一个词为 “ , 并且字符串小于3个字符当成一个词处理。
  577. if( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 )
  578. {
  579. $tmparr[] = $str;
  580. if( !isset($this->newWords[$str]) )
  581. {
  582. $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
  583. $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
  584. }
  585. if( !$this->differMax )
  586. {
  587. $this->finallyResult[$spos][] = $str;
  588. return ;
  589. }
  590. }
  591. //进行切分
  592. for($i=$slen-1; $i > 0; $i -= 2)
  593. {
  594. //单个词
  595. $nc = $str[$i-1].$str[$i];
  596. //是否已经到最后两个字
  597. if( $i <= 2 )
  598. {
  599. $tmparr[] = $nc;
  600. $i = 0;
  601. break;
  602. }
  603. $isok = FALSE;
  604. $i = $i + 1;
  605. for($k=$this->dicWordMax; $k>1; $k=$k-2)
  606. {
  607. if($i < $k) continue;
  608. $w = substr($str, $i-$k, $k);
  609. if( strlen($w) <= 2 )
  610. {
  611. $i = $i - 1;
  612. break;
  613. }
  614. if( $this->IsWord( $w ) )
  615. {
  616. $tmparr[] = $w;
  617. $i = $i - $k + 1;
  618. $isok = TRUE;
  619. break;
  620. }
  621. }
  622. //echo '<hr />';
  623. //没适合词
  624. if(!$isok) $tmparr[] = $nc;
  625. }
  626. $wcount = count($tmparr);
  627. if( $wcount==0 ) return ;
  628. $this->finallyResult[$spos] = array_reverse($tmparr);
  629. //优化结果(岐义处理、新词、数词、人名识别等)
  630. if( $optimize )
  631. {
  632. $this->_optimize_result( $this->finallyResult[$spos], $spos );
  633. }
  634. }
  635. /**
  636. * 对最终分词结果进行优化(把simpleresult结果合并,并尝试新词识别、数词合并等)
  637. * @parem $optimize 是否优化合并的结果
  638. * @return bool
  639. */
  640. //t = 1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符
  641. function _optimize_result( &$smarr, $spos )
  642. {
  643. $newarr = array();
  644. $prePos = $spos - 1;
  645. $arlen = count($smarr);
  646. $i = $j = 0;
  647. //检测数量词
  648. if( $prePos > -1 && !isset($this->finallyResult[$prePos]) )
  649. {
  650. $lastw = $this->simpleResult[$prePos]['w'];
  651. $lastt = $this->simpleResult[$prePos]['t'];
  652. if( ($lastt==4 || isset( $this->addonDic['c'][$lastw] )) && isset( $this->addonDic['u'][$smarr[0]] ) )
  653. {
  654. $this->simpleResult[$prePos]['w'] = $lastw.$smarr[0];
  655. $this->simpleResult[$prePos]['t'] = 4;
  656. if( !isset($this->newWords[ $this->simpleResult[$prePos]['w'] ]) )
  657. {
  658. $this->foundWordStr .= $this->_out_string_encoding( $this->simpleResult[$prePos]['w'] ).'/mu, ';
  659. $this->SetWordInfos($this->simpleResult[$prePos]['w'], array('c'=>1, 'm'=>'mu'));
  660. }
  661. $smarr[0] = '';
  662. $i++;
  663. }
  664. }
  665. for(; $i < $arlen; $i++)
  666. {
  667. if( !isset( $smarr[$i+1] ) )
  668. {
  669. $newarr[$j] = $smarr[$i];
  670. break;
  671. }
  672. $cw = $smarr[$i];
  673. $nw = $smarr[$i+1];
  674. $ischeck = FALSE;
  675. //检测数量词
  676. if( isset( $this->addonDic['c'][$cw] ) && isset( $this->addonDic['u'][$nw] ) )
  677. {
  678. //最大切分时保留合并前的词
  679. if($this->differMax)
  680. {
  681. $newarr[$j] = chr(0).chr(0x28);
  682. $j++;
  683. $newarr[$j] = $cw;
  684. $j++;
  685. $newarr[$j] = $nw;
  686. $j++;
  687. $newarr[$j] = chr(0).chr(0x29);
  688. $j++;
  689. }
  690. $newarr[$j] = $cw.$nw;
  691. if( !isset($this->newWords[$newarr[$j]]) )
  692. {
  693. $this->foundWordStr .= $this->_out_string_encoding( $newarr[$j] ).'/mu, ';
  694. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu'));
  695. }
  696. $j++; $i++; $ischeck = TRUE;
  697. }
  698. //检测前导词(通常是姓)
  699. else if( isset( $this->addonDic['n'][ $smarr[$i] ] ) )
  700. {
  701. $is_rs = FALSE;
  702. //词语是副词或介词或频率很高的词不作为人名
  703. if( strlen($nw)==4 )
  704. {
  705. $winfos = $this->GetWordInfos($nw);
  706. if(isset($winfos['m']) && ($winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
  707. {
  708. $is_rs = TRUE;
  709. }
  710. }
  711. if( !isset($this->addonDic['s'][$nw]) && strlen($nw)<5 && !$is_rs )
  712. {
  713. $newarr[$j] = $cw.$nw;
  714. //echo iconv(UCS2, 'utf-8', $newarr[$j])."<br />";
  715. //尝试检测第三个词
  716. if( strlen($nw)==2 && isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && !isset( $this->addonDic['s'][$smarr[$i+2]] ) )
  717. {
  718. $newarr[$j] .= $smarr[$i+2];
  719. $i++;
  720. }
  721. if( !isset($this->newWords[$newarr[$j]]) )
  722. {
  723. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr'));
  724. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, ';
  725. }
  726. //为了防止错误,保留合并前的姓名
  727. if(strlen($nw)==4)
  728. {
  729. $j++;
  730. $newarr[$j] = chr(0).chr(0x28);
  731. $j++;
  732. $newarr[$j] = $cw;
  733. $j++;
  734. $newarr[$j] = $nw;
  735. $j++;
  736. $newarr[$j] = chr(0).chr(0x29);
  737. }
  738. $j++; $i++; $ischeck = TRUE;
  739. }
  740. }
  741. //检测后缀词(地名等)
  742. else if( isset($this->addonDic['a'][$nw]) )
  743. {
  744. $is_rs = FALSE;
  745. //词语是副词或介词不作为前缀
  746. if( strlen($cw)>2 )
  747. {
  748. $winfos = $this->GetWordInfos($cw);
  749. if(isset($winfos['m']) && ($winfos['m']=='a' || $winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
  750. {
  751. $is_rs = TRUE;
  752. }
  753. }
  754. if( !isset($this->addonDic['s'][$cw]) && !$is_rs )
  755. {
  756. $newarr[$j] = $cw.$nw;
  757. if( !isset($this->newWords[$newarr[$j]]) )
  758. {
  759. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, ';
  760. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na'));
  761. }
  762. $i++; $j++; $ischeck = TRUE;
  763. }
  764. }
  765. //新词识别(暂无规则)
  766. else if($this->unitWord)
  767. {
  768. if(strlen($cw)==2 && strlen($nw)==2
  769. && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw])
  770. && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw]))
  771. {
  772. $newarr[$j] = $cw.$nw;
  773. //尝试检测第三个词
  774. if( isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && (isset( $this->addonDic['a'][$smarr[$i+2]] ) || isset( $this->addonDic['u'][$smarr[$i+2]] )) )
  775. {
  776. $newarr[$j] .= $smarr[$i+2];
  777. $i++;
  778. }
  779. if( !isset($this->newWords[$newarr[$j]]) )
  780. {
  781. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, ';
  782. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms'));
  783. }
  784. $i++; $j++; $ischeck = TRUE;
  785. }
  786. }
  787. //不符合规则
  788. if( !$ischeck )
  789. {
  790. $newarr[$j] = $cw;
  791. //二元消岐处理——最大切分模式
  792. if( $this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7)
  793. {
  794. $slen = strlen($nw);
  795. $hasDiff = FALSE;
  796. for($y=2; $y <= $slen-2; $y=$y+2)
  797. {
  798. $nhead = substr($nw, $y-2, 2);
  799. $nfont = $cw.substr($nw, 0, $y-2);
  800. if( $this->IsWord( $nfont.$nhead ) )
  801. {
  802. if( strlen($cw) > 2 ) $j++;
  803. $hasDiff = TRUE;
  804. $newarr[$j] = $nfont.$nhead;
  805. }
  806. }
  807. }
  808. $j++;
  809. }
  810. }//end for
  811. $smarr = $newarr;
  812. }
  813. /**
  814. * 转换最终分词结果到 finallyResult 数组
  815. * @return void
  816. */
  817. function _sort_finally_result()
  818. {
  819. $newarr = array();
  820. $i = 0;
  821. foreach($this->simpleResult as $k=>$v)
  822. {
  823. if( empty($v['w']) ) continue;
  824. if( isset($this->finallyResult[$k]) && count($this->finallyResult[$k]) > 0 )
  825. {
  826. foreach($this->finallyResult[$k] as $w)
  827. {
  828. if(!empty($w))
  829. {
  830. $newarr[$i]['w'] = $w;
  831. $newarr[$i]['t'] = 20;
  832. $i++;
  833. }
  834. }
  835. }
  836. else if($v['t'] != 21)
  837. {
  838. $newarr[$i]['w'] = $v['w'];
  839. $newarr[$i]['t'] = $v['t'];
  840. $i++;
  841. }
  842. }
  843. $this->finallyResult = $newarr;
  844. $newarr = '';
  845. }
  846. /**
  847. * 把uncode字符串转换为输出字符串
  848. * @parem str
  849. * return string
  850. */
  851. function _out_string_encoding( &$str )
  852. {
  853. $rsc = $this->_source_result_charset();
  854. if( $rsc==1 ) {
  855. $rsstr = iconv(UCS2, 'utf-8', $str);
  856. }
  857. else if( $rsc==2 ) {
  858. $rsstr = iconv('utf-8', 'gb18030', iconv(UCS2, 'utf-8', $str) );
  859. }
  860. else{
  861. $rsstr = iconv('utf-8', 'big5', iconv(UCS2, 'utf-8', $str) );
  862. }
  863. return $rsstr;
  864. }
  865. /**
  866. * 获取最终结果字符串(用空格分开后的分词结果)
  867. * @return string
  868. */
  869. function GetFinallyResult($spword=' ', $word_meanings=FALSE)
  870. {
  871. $rsstr = '';
  872. foreach($this->finallyResult as $v)
  873. {
  874. if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) )
  875. {
  876. continue;
  877. }
  878. $m = '';
  879. if( $word_meanings )
  880. {
  881. $m = $this->GetWordProperty($v['w']);
  882. }
  883. $w = $this->_out_string_encoding($v['w']);
  884. if( $w != ' ' )
  885. {
  886. if($word_meanings) {
  887. $rsstr .= $spword.$w.$m;
  888. }
  889. else {
  890. $rsstr .= $spword.$w;
  891. }
  892. }
  893. }
  894. return $rsstr;
  895. }
  896. /**
  897. * 获取粗分结果,不包含粗分属性
  898. * @return array()
  899. */
  900. function GetSimpleResult()
  901. {
  902. $rearr = array();
  903. foreach($this->simpleResult as $k=>$v)
  904. {
  905. if( empty($v['w']) ) continue;
  906. $w = $this->_out_string_encoding($v['w']);
  907. if( $w != ' ' ) $rearr[] = $w;
  908. }
  909. return $rearr;
  910. }
  911. /**
  912. * 获取粗分结果,包含粗分属性(1中文词句、2 ANSI词汇(包括全角),3 ANSI标点符号(包括全角),4数字(包括全角),5 中文标点或无法识别字符)
  913. * @return array()
  914. */
  915. function GetSimpleResultAll()
  916. {
  917. $rearr = array();
  918. foreach($this->simpleResult as $k=>$v)
  919. {
  920. $w = $this->_out_string_encoding($v['w']);
  921. if( $w != ' ' )
  922. {
  923. $rearr[$k]['w'] = $w;
  924. $rearr[$k]['t'] = $v['t'];
  925. }
  926. }
  927. return $rearr;
  928. }
  929. /**
  930. * 获取索引hash数组
  931. * @return array('word'=>count,...)
  932. */
  933. function GetFinallyIndex()
  934. {
  935. $rearr = array();
  936. foreach($this->finallyResult as $v)
  937. {
  938. if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) )
  939. {
  940. continue;
  941. }
  942. $w = $this->_out_string_encoding($v['w']);
  943. if( $w == ' ' )
  944. {
  945. continue;
  946. }
  947. if( isset($rearr[$w]) )
  948. {
  949. $rearr[$w]++;
  950. }
  951. else
  952. {
  953. $rearr[$w] = 1;
  954. }
  955. }
  956. return $rearr;
  957. }
  958. /**
  959. * 获得保存目标编码
  960. * @return int
  961. */
  962. function _source_result_charset()
  963. {
  964. if( preg_match("/^utf/", $this->targetCharSet) ) {
  965. $rs = 1;
  966. }
  967. else if( preg_match("/^gb/", $this->targetCharSet) ) {
  968. $rs = 2;
  969. }
  970. else if( preg_match("/^big/", $this->targetCharSet) ) {
  971. $rs = 3;
  972. }
  973. else {
  974. $rs = 4;
  975. }
  976. return $rs;
  977. }
  978. /**
  979. * 编译词典
  980. * @parem $sourcefile utf-8编码的文本词典数据文件<参见范例dict/not-build/base_dic_full.txt>
  981. * 注意, 需要PHP开放足够的内存才能完成操作
  982. * @return void
  983. */
  984. function MakeDict( $source_file, $target_file='' )
  985. {
  986. $target_file = ($target_file=='' ? $this->mainDicFile : $target_file);
  987. $allk = array();
  988. $fp = fopen($source_file, 'r');
  989. while( $line = fgets($fp, 512) )
  990. {
  991. if( $line[0]=='@' ) continue;
  992. list($w, $r, $a) = explode(',', $line);
  993. $a = trim( $a );
  994. $w = iconv('utf-8', UCS2, $w);
  995. $k = $this->_get_index( $w );
  996. if( isset($allk[ $k ]) )
  997. $allk[ $k ][ $w ] = array($r, $a);
  998. else
  999. $allk[ $k ][ $w ] = array($r, $a);
  1000. }
  1001. fclose( $fp );
  1002. $fp = fopen($target_file, 'w');
  1003. $heade_rarr = array();
  1004. $alldat = '';
  1005. $start_pos = $this->mask_value * 8;
  1006. foreach( $allk as $k => $v )
  1007. {
  1008. $dat = serialize( $v );
  1009. $dlen = strlen($dat);
  1010. $alldat .= $dat;
  1011. $heade_rarr[ $k ][0] = $start_pos;
  1012. $heade_rarr[ $k ][1] = $dlen;
  1013. $heade_rarr[ $k ][2] = count( $v );
  1014. $start_pos += $dlen;
  1015. }
  1016. unset( $allk );
  1017. for($i=0; $i < $this->mask_value; $i++)
  1018. {
  1019. if( !isset($heade_rarr[$i]) )
  1020. {
  1021. $heade_rarr[$i] = array(0, 0, 0);
  1022. }
  1023. fwrite($fp, pack("Inn", $heade_rarr[$i][0], $heade_rarr[$i][1], $heade_rarr[$i][2]));
  1024. }
  1025. fwrite( $fp, $alldat);
  1026. fclose( $fp );
  1027. }
  1028. /**
  1029. * 导出词典的词条
  1030. * @parem $targetfile 保存位置
  1031. * @return void
  1032. */
  1033. function ExportDict( $targetfile )
  1034. {
  1035. if( !$this->mainDicHand )
  1036. {
  1037. $this->mainDicHand = fopen($this->mainDicFile, 'rw');
  1038. }
  1039. $fp = fopen($targetfile, 'w');
  1040. for($i=0; $i <= $this->mask_value; $i++)
  1041. {
  1042. $move_pos = $i * 8;
  1043. fseek($this->mainDicHand, $move_pos, SEEK_SET);
  1044. $dat = fread($this->mainDicHand, 8);
  1045. $arr = unpack('I1s/n1l/n1c', $dat);
  1046. if( $arr['l'] == 0 )
  1047. {
  1048. continue;
  1049. }
  1050. fseek($this->mainDicHand, $arr['s'], SEEK_SET);
  1051. $data = @unserialize(fread($this->mainDicHand, $arr['l']));
  1052. if( !is_array($data) ) continue;
  1053. foreach($data as $k => $v)
  1054. {
  1055. $w = iconv(UCS2, 'utf-8', $k);
  1056. fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
  1057. }
  1058. }
  1059. fclose( $fp );
  1060. return TRUE;
  1061. }
  1062. function InportDict( $targetfile )
  1063. {
  1064. if(!ini_set('memory_limit', '128M'))
  1065. exit('设置内存错误,请到dede官网下载解压版的base_dic_full.dic!');
  1066. require_once(DEDEINC.'/zip.class.php');
  1067. $zip = new zip();
  1068. //echo $targetfile;
  1069. $unpackagefile = array_keys($zip->Extract($targetfile,DEDEINC.'/data/'));
  1070. //exit();
  1071. $this->MakeDict(DEDEINC.'/data/'.$unpackagefile[0]);
  1072. unlink(DEDEINC.'/data/'.$unpackagefile[0]);
  1073. return true;
  1074. }
  1075. }