国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1034 lines
37KB

  1. <?php
  2. if (!defined('DEDEINC')) exit('dedebiz');
  3. /**
  4. * PHP word segmenter backed by a Unicode-encoded dictionary
  5. *
  6. * 1. PHP 5 only; requires the iconv function
  7. * 2. Segmentation uses the RMM (reverse maximum matching) algorithm; the dictionary must be specially compiled, and this class provides MakeDict() for that
  8. * 3. Basic workflow: SetSource -> StartAnalysis -> Get***Result
  9. * 4. The main dictionary is stored in a special encoded format, so it does not have to be loaded into memory
  10. *
  11. * @version $id:splitword.class.php 2 11:45 2011-2-14 itplato $
  12. * @package DedeBIZ.Libraries
  13. * @copyright Copyright (c) 2022 DedeBIZ.COM
  14. * @license https://www.dedebiz.com/license
  15. * @link https://www.dedebiz.com
  16. */
  17. //Constant definitions
  18. define('_SP_', chr(0xFF).chr(0xFE)); //two-byte separator used by LoadDict() to split the add-on dictionary words after conversion to UCS2
  19. define('UCS2', 'ucs-2be'); //iconv encoding name used for every internal string of this class
  20. class SplitWord
  21. {
  22. //Hash algorithm option (used as the modulus in _get_index)
  23. var $mask_value = 0xFFFF;
  24. //Input and output character encodings (only utf-8, gbk/gb2312/gb18030 and big5 are allowed)
  25. var $sourceCharSet = 'utf-8';
  26. var $targetCharSet = 'utf-8';
  27. //Result type: 1 = everything, 2 = dictionary words plus single CJK (simplified/traditional) characters and English, 3 = dictionary words and English
  28. var $resultType = 1;
  29. //Sentences shorter than this value are not split; notSplitLen = n (Chinese characters) * 2 + 1
  30. var $notSplitLen = 5;
  31. //Convert all English words to lowercase
  32. var $toLower = FALSE;
  33. //Use maximum-segmentation mode to disambiguate two-word combinations
  34. var $differMax = FALSE;
  35. //Try to merge single characters
  36. var $unitWord = TRUE;
  37. //Load the dictionary as soon as the class is constructed
  38. var $loadInit = TRUE;
  39. //Disambiguate by preferring high-frequency words
  40. var $differFreq = FALSE;
  41. //Source string after conversion to unicode
  42. var $sourceString = '';
  43. //Add-on dictionary
  44. var $addonDic = array();
  45. var $addonDicFile = 'data/words_addons.dic';
  46. //Main dictionary
  47. var $dicStr = '';
  48. var $mainDic = array();
  49. var $mainDicHand = FALSE;
  50. var $mainDicInfos = array();
  51. var $mainDicFile = 'data/base_dic_full.dic';
  52. //Whether to load the whole dictionary up front (yes: slower load but faster parsing; no: faster load but slower parsing, entries are loaded only when needed)
  53. var $mainDicFileZip = 'data/base_dic_full.zip';
  54. var $isLoadAll = FALSE;
  55. var $isUnpacked = FALSE;
  56. //Maximum length of a main-dictionary word, x / 2
  57. var $dicWordMax = 14;
  58. //Array produced by the coarse split (typically used for cutting sentences and similar purposes)
  59. var $simpleResult = array();
  60. //Final result (space-separated word list)
  61. var $finallyResult = array();
  62. //Whether the dictionary has already been loaded
  63. var $isLoadDic = FALSE;
  64. //New words recognised or merged by the system
  65. var $newWords = array();
  66. var $foundWordStr = '';
  67. //Dictionary load time
  68. var $loadTime = 0;
  /**
   * Build a segmenter, remember the text to analyse and, when $loadInit is set, load the dictionaries.
   *
   * @param $source_charset encoding of $source, forwarded to SetSource()
   * @param $target_charset encoding used for output, forwarded to SetSource()
   * @param $load_all       stored in $this->isLoadAll
   * @param $source         text to analyse; may be empty and supplied later through SetSource()
   *
   * @return void
   */
  function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  {
      $this->isLoadAll = $load_all;
      $this->SetSource($source, $source_charset, $target_charset);
      //The main dictionary counts as unpacked when the compiled file exists below DEDEINC
      $compiledDict = DEDEINC.'/'.$this->mainDicFile;
      if (file_exists($compiledDict)) {
          $this->isUnpacked = TRUE;
      }
      if ($this->loadInit) {
          $this->LoadDict();
      }
  }
  /**
   * Method carrying the class name (PHP 4 constructor convention); it only forwards its arguments to __construct().
   *
   * @param $source_charset see __construct()
   * @param $target_charset see __construct()
   * @param $load_all       see __construct()
   * @param $source         see __construct()
   *
   * @return void
   */
  function SplitWord($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  {
      $this->__construct(
          $source_charset,
          $target_charset,
          $load_all,
          $source
      );
  }
  /**
   * Destructor: closes the main dictionary handle if one was opened.
   */
  function __destruct()
  {
      if ( $this->mainDicHand === FALSE )
      {
          return;
      }
      //Errors from closing are deliberately suppressed
      @fclose( $this->mainDicHand );
  }
  /**
   * Hash a key string into a dictionary bucket index.
   *
   * The bytes of $key are consumed from the last one to the first one.
   *
   * @param $key string to hash
   * @return int the hash modulo $this->mask_value
   */
  function _get_index( $key )
  {
      $hash = 0x238f13af;
      for ($pos = strlen($key) - 1; $pos >= 0; $pos--)
      {
          $hash += ($hash << 5);
          $hash ^= ord($key[$pos]);
          //Keep the running value within 31 bits
          $hash &= 0x7fffffff;
      }
      return ($hash % $this->mask_value);
  }
  /**
   * Look a word up in the compiled main dictionary file.
   *
   * The file starts with a table of 8-byte header records, one per bucket index
   * (offset, byte length, entry count), followed by the serialized buckets.
   * Buckets that were read once are cached in $this->mainDicInfos.
   *
   * @param $key  word to look up, in the encoding named by UCS2
   * @param $type 'word' returns the entry of $key, any other value returns the whole bucket
   * @return mixed the entry or bucket, or FALSE when the dictionary cannot be read
   *               or does not contain $key
   */
  function GetWordInfos( $key, $type='word' )
  {
      $keynum = (int)$this->_get_index( $key );
      if ( isset($this->mainDicInfos[ $keynum ]) )
      {
          $data = $this->mainDicInfos[ $keynum ];
      }
      else
      {
          //Open the dictionary lazily; a missing or unreadable file means "word not found"
          //instead of calling fseek()/fread() on FALSE
          if ( !$this->mainDicHand )
          {
              $this->mainDicHand = is_readable($this->mainDicFile) ? fopen($this->mainDicFile, 'r') : FALSE;
              if ( !$this->mainDicHand )
              {
                  $this->mainDicHand = FALSE;
                  return FALSE;
              }
          }
          fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
          $dat = fread($this->mainDicHand, 8);
          //A truncated header record cannot be unpacked
          if ( $dat === FALSE || strlen($dat) < 8 )
          {
              return FALSE;
          }
          $arr = unpack('I1s/n1l/n1c', $dat);
          if ( $arr['l'] == 0 )
          {
              return FALSE;
          }
          fseek($this->mainDicHand, $arr['s'], SEEK_SET);
          //NOTE(review): unserialize() must only ever see a dictionary file produced by MakeDict(); never point mainDicFile at untrusted data
          $data = @unserialize(fread($this->mainDicHand, $arr['l']));
          $this->mainDicInfos[ $keynum ] = $data;
      }
      if ( !is_array($data) || !isset($data[$key]) )
      {
          return FALSE;
      }
      return ($type=='word' ? $data[$key] : $data);
  }
  /**
   * Set the text to analyse and the input/output encodings, and clear earlier results.
   *
   * The text is converted to the encoding named by UCS2 and stored in $this->sourceString.
   * Charset names are matched case-insensitively by prefix: utf*, gb*, big*.
   *
   * @param $source         text to analyse
   * @param $source_charset encoding of $source
   * @param $target_charset encoding used by the Get*Result methods
   *
   * @return bool TRUE when the text was converted and stored; FALSE when $source is empty,
   *              the source charset is not supported or the conversion failed
   */
  function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
  {
      $this->sourceCharSet = strtolower($source_charset);
      $this->targetCharSet = strtolower($target_charset);
      $this->simpleResult = array();
      $this->finallyResult = array();
      //Kept for compatibility; this property is not among the declarations visible in this file
      $this->finallyIndex = array();
      if ( $source == '' )
      {
          return FALSE;
      }
      //Match against the lower-cased name so that 'UTF-8', 'GBK' or 'Big5' are accepted too
      if ( preg_match("/^utf/", $this->sourceCharSet) ) {
          $utf8 = $source;
      }
      else if ( preg_match("/^gb/", $this->sourceCharSet) ) {
          $utf8 = @iconv('gb18030', 'utf-8', $source);
      }
      else if ( preg_match("/^big/", $this->sourceCharSet) ) {
          $utf8 = @iconv('big5', 'utf-8', $source);
      } else {
          return FALSE;
      }
      $converted = ($utf8 === FALSE) ? FALSE : @iconv('utf-8', UCS2, $utf8);
      if ( $converted === FALSE )
      {
          //Do not leave FALSE (or the previous text) behind as the string to analyse
          $this->sourceString = '';
          return FALSE;
      }
      $this->sourceString = $converted;
      return TRUE;
  }
  /**
   * Choose the result type (it only takes effect when the final result is fetched).
   *
   * @param $rstype 1 = everything, 2 = special symbols removed
   *
   * @return void
   */
  function SetResultType( $rstype )
  {
      $this->resultType = $rstype;
  }
  /**
   * Load the dictionaries: open the main dictionary file and read the add-on dictionary into memory.
   *
   * Add-on file format as read here: a line whose second character is ':' starts a section and its
   * first character becomes the section key; every other non-empty line is a comma-separated word
   * list belonging to the current section. Words are stored as $this->addonDic[section][word] = byte length.
   *
   * @param $maindic optional path of an alternative main dictionary; ignored when empty or missing
   *
   * @return void
   */
  function LoadDict( $maindic='' )
  {
      //The default paths are relative to DEDEINC; resolve them on the first call only,
      //otherwise a second call would prepend DEDEINC again and break every path
      if ( !$this->isLoadDic )
      {
          $this->addonDicFile = DEDEINC.'/'.$this->addonDicFile;
          $this->mainDicFile = DEDEINC.'/'.$this->mainDicFile;
          $this->mainDicFileZip = DEDEINC.'/'.$this->mainDicFileZip;
      }
      $startt = microtime(TRUE);
      if ( $maindic != '' && file_exists($maindic) )
      {
          $this->mainDicFile = $maindic;
      }
      //Main dictionary: only opened here, entries are read on demand
      if ( $this->isUnpacked )
      {
          //Do not leak a handle left over from an earlier call
          if ( is_resource($this->mainDicHand) )
          {
              fclose($this->mainDicHand);
          }
          $this->mainDicHand = fopen($this->mainDicFile, 'r');
      }
      //Add-on dictionary; an unreadable file is treated as an empty one
      $ds = file($this->addonDicFile);
      if ( $ds === FALSE )
      {
          $ds = array();
      }
      //UTF-8 form of the separator, identical for every line
      $spstr = iconv(UCS2, 'utf-8', _SP_);
      $hw = '';
      foreach($ds as $d)
      {
          $d = trim($d);
          if ( $d=='' ) continue;
          if ( substr($d, 1, 1)==':' )
          {
              $hw = substr($d, 0, 1);
              continue;
          }
          //Join the words with the separator, convert the whole line once, then split again
          $wall = iconv('utf-8', UCS2, join($spstr, explode(',', $d)));
          foreach(explode(_SP_, $wall) as $estr)
          {
              $this->addonDic[$hw][$estr] = strlen($estr);
          }
      }
      $this->loadTime = microtime(TRUE) - $startt;
      $this->isLoadDic = TRUE;
  }
  /**
   * Tell whether a word exists in the main dictionary.
   *
   * @param $word word in the encoding named by UCS2
   * @return bool TRUE when GetWordInfos() finds an entry for $word
   */
  function IsWord( $word )
  {
      return ( $this->GetWordInfos( $word ) !== FALSE );
  }
  /**
   * Build the "/<attribute><count>" suffix for a word from its dictionary entry.
   *
   * @param $word word in the encoding named by UCS2
   * @return string "/" followed by entry element 1 and element 0, or "/s" when the word is
   *                shorter than 4 bytes or has no usable entry
   */
  function GetWordProperty($word)
  {
      if ( strlen($word) >= 4 )
      {
          $infos = $this->GetWordInfos($word);
          if ( isset($infos[1]) )
          {
              return "/{$infos[1]}{$infos[0]}";
          }
      }
      return '/s';
  }
  /**
   * Record the attributes of a word (normally a newly recognised one).
   *
   * Words shorter than 4 bytes are ignored. A word seen for the first time is stored with $infos;
   * a word already stored has its counters incremented instead.
   *
   * @param $word  word in the encoding named by UCS2
   * @param $infos array('c' => frequency, 'm' => attribute)
   * @return void
   */
  function SetWordInfos($word, $infos)
  {
      if ( strlen($word) < 4 )
      {
          return ;
      }
      if ( !isset($this->mainDicInfos[$word]) )
      {
          $this->newWords[$word] = 1;
          $this->mainDicInfos[$word] = $infos;
          return ;
      }
      $this->newWords[$word]++;
      $this->mainDicInfos[$word]['c']++;
  }
  /**
   * Run the analysis on the string stored by SetSource().
   *
   * The string is walked two bytes (one UCS-2 unit) at a time and cut into runs that are stored in
   * $this->simpleResult as array('w' => text, 't' => type):
   *   1 Chinese/Korean/Japanese text, 2 English/digits/symbols ('.', '@', '#', '+'), 3 ANSI symbol,
   *   4 pure number, 5 non-ANSI symbol or unsupported character,
   *   13 text found between the book-title marks U+300A and U+300B,
   *   21 copy of such a title that is segmented further when $differMax is on.
   * Runs are handed to _deep_analysis() for word-level segmentation and the outcome is finally
   * flattened by _sort_finally_result().
   *
   * @param $optimize whether the word lists should be optimised (passed on to _deep_analysis())
   * @return void (no value is returned)
   */
  function StartAnalysis($optimize=TRUE)
  {
      if ( !$this->isLoadDic )
      {
          $this->LoadDict();
      }
      $this->simpleResult = $this->finallyResult = array();
      //A trailing UCS-2 space guarantees that the last pending run gets flushed
      $this->sourceString .= chr(0).chr(32);
      $slen = strlen($this->sourceString);
      $onstr = '';   //run collected so far
      $lastc = 1;    //type of the run collected so far (see the list above)
      $s = 0;        //next free index in simpleResult
      $ansiWordMatch = "[0-9a-z@#%\+\.-]";
      $ew = chr(0x30).chr(0x0B);   //closing book-title mark U+300B
      for($i=0; $i < $slen; $i++)
      {
          $c = $this->sourceString[$i].$this->sourceString[++$i];
          $cn = hexdec(bin2hex($c));
          //Full-width forms 0xFF00..0xFF5E are treated as their half-width counterparts 0x20..0x7E
          if ( $cn >= 0xFF00 && $cn < 0xFF5F )
          {
              $cn = $cn - 0xFF00 + 0x20;
          }
          //ANSI characters
          if ( $cn < 0x80 )
          {
              if ( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) )
              {
                  if ( $lastc != 2 )
                  {
                      $this->_flush_pending_run($onstr, $lastc, $s, $optimize);
                  }
                  $lastc = 2;
                  $onstr .= chr(0).chr($cn);
              }
              else
              {
                  $this->_flush_pending_run($onstr, $lastc, $s, $optimize);
                  $lastc = 3;
                  //Code points below 31 are dropped, any other ANSI symbol becomes its own entry
                  if ( $cn >= 31 )
                  {
                      $this->simpleResult[$s]['w'] = chr(0).chr($cn);
                      $this->simpleResult[$s]['t'] = 3;
                      $s++;
                  }
              }
              continue;
          }
          //Regular text
          if ( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)
              || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) )
          {
              if ( $lastc != 1 )
              {
                  $this->_flush_pending_run($onstr, $lastc, $s, $optimize);
              }
              $lastc = 1;
              $onstr .= $c;
              continue;
          }
          //Special symbols
          $this->_flush_pending_run($onstr, $lastc, $s, $optimize);
          if ( $cn != 0x300A )
          {
              $lastc = 5;
              //U+3000 is dropped, any other symbol becomes its own entry
              if ( $cn != 0x3000 )
              {
                  $this->simpleResult[$s]['w'] = $c;
                  $this->simpleResult[$s]['t'] = 5;
                  $s++;
              }
              continue;
          }
          //Book title: look ahead for the closing mark, giving up once more than 60 bytes were collected
          $tmpw = '';
          $n = 1;
          $isok = FALSE;
          while(TRUE)
          {
              if ( !isset($this->sourceString[$i+$n]) || !isset($this->sourceString[$i+$n+1]) )
              {
                  break;
              }
              $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];
              //Strict comparison is required: the mark is the bytes "0\x0B", which PHP 8 regards as a
              //numeric string, so "==" would also accept other "0" + whitespace byte pairs
              if ( $w === $ew )
              {
                  $this->simpleResult[$s]['w'] = $c;
                  $this->simpleResult[$s]['t'] = 5;
                  $s++;
                  $this->simpleResult[$s]['w'] = $tmpw;
                  //Record the title as a new word the first time it is seen; the check has to run
                  //before newWords is written, otherwise it can never succeed
                  if ( $tmpw !== '' && !isset($this->newWords[$tmpw]) )
                  {
                      $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
                      $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
                  }
                  $this->newWords[$tmpw] = 1;
                  $this->simpleResult[$s]['t'] = 13;
                  $s++;
                  //In maximum-segmentation mode the title is segmented further
                  if ( $this->differMax )
                  {
                      $this->simpleResult[$s]['w'] = $tmpw;
                      $this->simpleResult[$s]['t'] = 21;
                      //NOTE(review): $lastc is still the type of the run preceding the title here - confirm that this is the intended type argument
                      $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                      $s++;
                  }
                  $this->simpleResult[$s]['w'] = $ew;
                  $this->simpleResult[$s]['t'] = 5;
                  $s++;
                  //Continue after the closing mark
                  $i = $i + $n + 1;
                  $isok = TRUE;
                  break;
              }
              $n = $n+2;
              $tmpw .= $w;
              if ( strlen($tmpw) > 60 )
              {
                  break;
              }
          }
          //No closing mark found: the opening mark is an ordinary symbol
          if ( !$isok )
          {
              $this->simpleResult[$s]['w'] = $c;
              $this->simpleResult[$s]['t'] = 5;
              $s++;
          }
          $onstr = '';
          $lastc = 5;
      }
      //Post-process the segmentation result
      $this->_sort_finally_result();
  }

  /**
   * Helper of StartAnalysis(): store the pending run as the next simpleResult entry and segment it.
   *
   * A type 2 run without any letter or one of @ # % + is re-typed as 4 (pure number) and is not
   * passed to _deep_analysis(). Nothing happens when no run is pending.
   *
   * @param $onstr    pending run, emptied afterwards
   * @param $lastc    type of the pending run, may be changed from 2 to 4
   * @param $s        next free index in simpleResult, incremented when an entry is stored
   * @param $optimize passed on to _deep_analysis()
   * @return void
   */
  function _flush_pending_run( &$onstr, &$lastc, &$s, $optimize )
  {
      if ( $onstr == '' )
      {
          return ;
      }
      $this->simpleResult[$s]['w'] = $onstr;
      if ( $lastc==2 && !preg_match('/[a-z@#%\+]/i', iconv(UCS2, 'utf-8', $onstr)) )
      {
          $lastc = 4;
      }
      $this->simpleResult[$s]['t'] = $lastc;
      if ( $lastc != 4 )
      {
          $this->_deep_analysis($onstr, $lastc, $s, $optimize);
      }
      $s++;
      $onstr = '';
  }
  /**
   * Word-level segmentation of one coarse run.
   *
   * Runs whose type is not 1 are stored as a single word (lower-cased when $toLower is on).
   * Type 1 runs shorter than both $notSplitLen and 5 bytes are either merged into a preceding
   * number (when they are, or start with, a word of add-on section 'u') or stored as they are.
   * Every other type 1 run is handed to _deep_analysis_cn().
   *
   * @param $str      the run, passed by reference; it may be shortened when its tail is split off
   * @param $ctype    type of the run (1 is handled as Chinese/Korean/Japanese text)
   * @param $spos     index of the run in the coarse result
   * @param $optimize passed on to _deep_analysis_cn()
   * @return void (no value is returned)
   */
  function _deep_analysis( &$str, $ctype, $spos, $optimize=TRUE )
  {
      //Anything but type 1 is kept as one word
      if ( $ctype != 1 )
      {
          $this->finallyResult[$spos][] = $this->toLower ? strtolower($str) : $str;
          return ;
      }
      $slen = strlen($str);
      //Runs of normal length go through the dictionary matching
      if ( $slen >= $this->notSplitLen || $slen >= 5 )
      {
          $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
          return ;
      }
      $lastType = ( $spos > 0 ) ? $this->simpleResult[$spos-1]['t'] : 0;
      $wholeIsUnit = isset($this->addonDic['u'][$str]);
      $headIsUnit = isset($this->addonDic['u'][substr($str, 0, 2)]);
      //Not a number followed by an add-on 'u' word: keep the run unchanged
      if ( $lastType != 4 || !($wholeIsUnit || $headIsUnit) )
      {
          $this->finallyResult[$spos][] = $str;
          return ;
      }
      //Only the first character matched: split the second one off when add-on section 's' knows it
      $str2 = '';
      if ( !$wholeIsUnit && isset($this->addonDic['s'][substr($str, 2, 2)]) )
      {
          $str2 = substr($str, 2, 2);
          $str = substr($str, 0, 2);
      }
      //Merge into the preceding number entry
      $ww = $this->simpleResult[$spos - 1]['w'].$str;
      $this->simpleResult[$spos - 1]['w'] = $ww;
      $this->simpleResult[$spos - 1]['t'] = 4;
      if ( !isset($this->newWords[$ww]) )
      {
          $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';
          $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
      }
      //The current entry has been consumed
      $this->simpleResult[$spos]['w'] = '';
      if ( $str2 != '' )
      {
          $this->finallyResult[$spos-1][] = $ww;
          $this->finallyResult[$spos-1][] = $str2;
      }
  }
  /**
   * Dictionary segmentation of a type 1 run, matching backwards from the end of the run.
   *
   * At every position the longest candidate (at most $dicWordMax bytes) that IsWord() accepts is
   * taken; when nothing matches, the single two-byte character is taken. The words end up in
   * $this->finallyResult[$spos] in reading order and are optionally passed to _optimize_result().
   *
   * @param $str      the run, in the encoding named by UCS2
   * @param $lastec   type of the run (not used by this method)
   * @param $spos     index of the run in the coarse result
   * @param $slen     byte length of $str
   * @param $optimize whether to call _optimize_result() on the word list
   * @return void
   */
  function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=TRUE )
  {
      $openQuote = chr(0x20).chr(0x1C);
      $pieces = array();
      //A run shorter than 11 bytes that directly follows the opening quote U+201C is taken as one word
      $followsQuote = ( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$openQuote );
      if ( $followsQuote )
      {
          $pieces[] = $str;
          if ( !isset($this->newWords[$str]) )
          {
              $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
              $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
          }
          if ( !$this->differMax )
          {
              $this->finallyResult[$spos][] = $str;
              return ;
          }
      }
      //Segment; $pos is the index of the last byte of the character being looked at
      for($pos=$slen-1; $pos > 0; $pos -= 2)
      {
          //The single character at this position
          $single = $str[$pos-1].$str[$pos];
          //First character of the run reached
          if ( $pos <= 2 )
          {
              $pieces[] = $single;
              $pos = 0;
              break;
          }
          $matched = FALSE;
          //From here on $pos is the exclusive end of the candidate words
          $pos = $pos + 1;
          for($len=$this->dicWordMax; $len>1; $len=$len-2)
          {
              if ($pos < $len) continue;
              $candidate = substr($str, $pos-$len, $len);
              //Down to a single character: restore $pos and give up
              if ( strlen($candidate) <= 2 )
              {
                  $pos = $pos - 1;
                  break;
              }
              if ( $this->IsWord( $candidate ) )
              {
                  $pieces[] = $candidate;
                  $pos = $pos - $len + 1;
                  $matched = TRUE;
                  break;
              }
          }
          //No suitable word
          if (!$matched) $pieces[] = $single;
      }
      if ( count($pieces)==0 ) return ;
      //Words were collected back to front
      $this->finallyResult[$spos] = array_reverse($pieces);
      //Optimise the result (disambiguation, new words, numerals, personal names, ...)
      if ( $optimize )
      {
          $this->_optimize_result( $this->finallyResult[$spos], $spos );
      }
  }
  608. /**
  609. * Optimise the word list of one run in place (merge words and attempt new-word recognition, numeral merging, etc.)
  610. * @param $smarr word list of the current run, passed by reference and replaced by the optimised list; $spos is the index of the run in simpleResult / finallyResult
  611. * @return void (no value is returned)
  612. */
  613. //t = 1 Chinese/Korean/Japanese, 2 English/digits/symbols ('.', '@', '#', '+'), 3 ANSI symbol, 4 pure number, 5 non-ANSI symbol or unsupported character
  614. function _optimize_result( &$smarr, $spos )
  615. {
  616. $newarr = array();
  617. $prePos = $spos - 1;
  618. $arlen = count($smarr);
  619. $i = $j = 0;
  620. //Detect numeral + measure word across the run boundary: merge the first word into the previous coarse entry
  621. if ( $prePos > -1 && !isset($this->finallyResult[$prePos]) )
  622. {
  623. $lastw = $this->simpleResult[$prePos]['w'];
  624. $lastt = $this->simpleResult[$prePos]['t'];
  625. if ( ($lastt==4 || isset( $this->addonDic['c'][$lastw] )) && isset( $this->addonDic['u'][$smarr[0]] ) )
  626. {
  627. $this->simpleResult[$prePos]['w'] = $lastw.$smarr[0];
  628. $this->simpleResult[$prePos]['t'] = 4;
  629. if ( !isset($this->newWords[ $this->simpleResult[$prePos]['w'] ]) )
  630. {
  631. $this->foundWordStr .= $this->_out_string_encoding( $this->simpleResult[$prePos]['w'] ).'/mu, ';
  632. $this->SetWordInfos($this->simpleResult[$prePos]['w'], array('c'=>1, 'm'=>'mu'));
  633. }
  634. $smarr[0] = '';
  635. $i++;
  636. }
  637. }
  638. for(; $i < $arlen; $i++)
  639. {
  640. if ( !isset( $smarr[$i+1] ) )
  641. {
  642. $newarr[$j] = $smarr[$i];
  643. break;
  644. }
  645. $cw = $smarr[$i];
  646. $nw = $smarr[$i+1];
  647. $ischeck = FALSE;
  648. //Detect numeral + measure word
  649. if ( isset( $this->addonDic['c'][$cw] ) && isset( $this->addonDic['u'][$nw] ) )
  650. {
  651. //In maximum-segmentation mode keep the words as they were before merging
  652. if ($this->differMax)
  653. {
  654. $newarr[$j] = chr(0).chr(0x28);
  655. $j++;
  656. $newarr[$j] = $cw;
  657. $j++;
  658. $newarr[$j] = $nw;
  659. $j++;
  660. $newarr[$j] = chr(0).chr(0x29);
  661. $j++;
  662. }
  663. $newarr[$j] = $cw.$nw;
  664. if ( !isset($this->newWords[$newarr[$j]]) )
  665. {
  666. $this->foundWordStr .= $this->_out_string_encoding( $newarr[$j] ).'/mu, ';
  667. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu'));
  668. }
  669. $j++; $i++; $ischeck = TRUE;
  670. }
  671. //Detect leading words (usually surnames)
  672. else if ( isset( $this->addonDic['n'][ $smarr[$i] ] ) )
  673. {
  674. $is_rs = FALSE;
  675. //Adverbs, prepositions or very frequent words are not treated as part of a personal name
  //NOTE(review): MakeDict() stores entries as array($r, $a) with numeric keys and GetWordProperty() reads $infos[0]/$infos[1]; confirm that entries ever carry 'm'/'c' keys, otherwise this check never fires
  676. if ( strlen($nw)==4 )
  677. {
  678. $winfos = $this->GetWordInfos($nw);
  679. if (isset($winfos['m']) && ($winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
  680. {
  681. $is_rs = TRUE;
  682. }
  683. }
  684. if ( !isset($this->addonDic['s'][$nw]) && strlen($nw)<5 && !$is_rs )
  685. {
  686. $newarr[$j] = $cw.$nw;
  687. //echo iconv(UCS2, 'utf-8', $newarr[$j])."<br>";
  688. //Try to take in a third word
  689. if ( strlen($nw)==2 && isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && !isset( $this->addonDic['s'][$smarr[$i+2]] ) )
  690. {
  691. $newarr[$j] .= $smarr[$i+2];
  692. $i++;
  693. }
  694. if ( !isset($this->newWords[$newarr[$j]]) )
  695. {
  696. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr'));
  697. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, ';
  698. }
  699. //To guard against mistakes, keep the name parts as they were before merging (wrapped in parentheses)
  700. if (strlen($nw)==4)
  701. {
  702. $j++;
  703. $newarr[$j] = chr(0).chr(0x28);
  704. $j++;
  705. $newarr[$j] = $cw;
  706. $j++;
  707. $newarr[$j] = $nw;
  708. $j++;
  709. $newarr[$j] = chr(0).chr(0x29);
  710. }
  711. $j++; $i++; $ischeck = TRUE;
  712. }
  713. }
  714. //Detect suffix words (place names, etc.)
  715. else if ( isset($this->addonDic['a'][$nw]) )
  716. {
  717. $is_rs = FALSE;
  718. //Adverbs or prepositions are not used as the prefix
  //NOTE(review): same 'm'/'c' key question as in the surname branch above
  719. if ( strlen($cw)>2 )
  720. {
  721. $winfos = $this->GetWordInfos($cw);
  722. if (isset($winfos['m']) && ($winfos['m']=='a' || $winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
  723. {
  724. $is_rs = TRUE;
  725. }
  726. }
  727. if ( !isset($this->addonDic['s'][$cw]) && !$is_rs )
  728. {
  729. $newarr[$j] = $cw.$nw;
  730. if ( !isset($this->newWords[$newarr[$j]]) )
  731. {
  732. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, ';
  733. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na'));
  734. }
  735. $i++; $j++; $ischeck = TRUE;
  736. }
  737. }
  738. //New-word recognition (no rules yet): merge two adjacent single characters
  739. else if ($this->unitWord)
  740. {
  741. if (strlen($cw)==2 && strlen($nw)==2
  742. && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw])
  743. && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw]))
  744. {
  745. $newarr[$j] = $cw.$nw;
  746. //Try to take in a third word
  747. if ( isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && (isset( $this->addonDic['a'][$smarr[$i+2]] ) || isset( $this->addonDic['u'][$smarr[$i+2]] )) )
  748. {
  749. $newarr[$j] .= $smarr[$i+2];
  750. $i++;
  751. }
  752. if ( !isset($this->newWords[$newarr[$j]]) )
  753. {
  754. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, ';
  755. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms'));
  756. }
  757. $i++; $j++; $ischeck = TRUE;
  758. }
  759. }
  760. //No rule matched: keep the current word
  761. if ( !$ischeck )
  762. {
  763. $newarr[$j] = $cw;
  764. //Two-word disambiguation - maximum-segmentation mode
  765. if ( $this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7)
  766. {
  767. $slen = strlen($nw);
  768. $hasDiff = FALSE;
  769. for($y=2; $y <= $slen-2; $y=$y+2)
  770. {
  771. $nhead = substr($nw, $y-2, 2);
  772. $nfont = $cw.substr($nw, 0, $y-2);
  773. if ( $this->IsWord( $nfont.$nhead ) )
  774. {
  775. if ( strlen($cw) > 2 ) $j++;
  776. $hasDiff = TRUE;
  777. $newarr[$j] = $nfont.$nhead;
  778. }
  779. }
  780. }
  781. $j++;
  782. }
  783. }//end for
  784. $smarr = $newarr;
  785. }
  /**
   * Flatten the coarse entries and their word lists into $this->finallyResult.
   *
   * Every element of the new list is array('w' => word, 't' => type). Words coming from a word list
   * get type 20; coarse entries without a word list keep their own type, except type 21 entries,
   * which are dropped. Entries and words that are empty() are skipped.
   *
   * @return void
   */
  function _sort_finally_result()
  {
      $flat = array();
      foreach($this->simpleResult as $idx => $entry)
      {
          if ( empty($entry['w']) ) continue;
          $hasWords = isset($this->finallyResult[$idx]) && count($this->finallyResult[$idx]) > 0;
          if ( $hasWords )
          {
              foreach($this->finallyResult[$idx] as $word)
              {
                  if ( empty($word) ) continue;
                  $flat[] = array('w' => $word, 't' => 20);
              }
              continue;
          }
          if ( $entry['t'] != 21 )
          {
              $flat[] = array('w' => $entry['w'], 't' => $entry['t']);
          }
      }
      $this->finallyResult = $flat;
  }
  /**
   * Convert an internal (UCS2) string into the output encoding.
   *
   * The target is chosen through _source_result_charset(): 1 gives utf-8, 2 gives gb18030 and
   * every other value gives big5.
   *
   * @param $str string in the encoding named by UCS2
   * @return string the converted string
   */
  function _out_string_encoding( &$str )
  {
      $rsc = $this->_source_result_charset();
      $utf8 = iconv(UCS2, 'utf-8', $str);
      if ( $rsc==1 )
      {
          return $utf8;
      }
      $target = ( $rsc==2 ) ? 'gb18030' : 'big5';
      return iconv('utf-8', $target, $utf8);
  }
  /**
   * Build the final result string: every word is preceded by the separator.
   *
   * With result type 2, entries of type 3 and 5 are left out. A word whose converted form is a
   * single space is left out as well.
   *
   * @param $spword        separator written in front of every word
   * @param $word_meanings when TRUE the value of GetWordProperty() is appended to every word
   * @return string
   */
  function GetFinallyResult($spword=' ', $word_meanings=FALSE)
  {
      $out = '';
      foreach($this->finallyResult as $item)
      {
          $isSymbol = ( $item['t']==3 || $item['t']==5 );
          if ( $this->resultType==2 && $isSymbol )
          {
              continue;
          }
          $suffix = $word_meanings ? $this->GetWordProperty($item['w']) : '';
          $word = $this->_out_string_encoding($item['w']);
          if ( $word == ' ' )
          {
              continue;
          }
          $out .= $spword.$word.$suffix;
      }
      return $out;
  }
  /**
   * Get the coarse result without the type information.
   *
   * @return array list of converted strings; empty entries and single spaces are left out
   */
  function GetSimpleResult()
  {
      $list = array();
      foreach($this->simpleResult as $entry)
      {
          if ( empty($entry['w']) )
          {
              continue;
          }
          $text = $this->_out_string_encoding($entry['w']);
          if ( $text == ' ' )
          {
              continue;
          }
          $list[] = $text;
      }
      return $list;
  }
  /**
   * Get the coarse result including the type of every entry (1 Chinese text, 2 ANSI words
   * including full-width ones, 3 ANSI punctuation including full-width, 4 numbers including
   * full-width, 5 Chinese punctuation or unrecognised characters).
   *
   * @return array array(index => array('w' => converted string, 't' => type)), indexed like
   *               $this->simpleResult; entries that convert to a single space are left out
   */
  function GetSimpleResultAll()
  {
      $list = array();
      foreach($this->simpleResult as $idx => $entry)
      {
          $text = $this->_out_string_encoding($entry['w']);
          if ( $text == ' ' )
          {
              continue;
          }
          $list[$idx]['w'] = $text;
          $list[$idx]['t'] = $entry['t'];
      }
      return $list;
  }
  /**
   * Count how often every word occurs in the final result.
   *
   * With result type 2, entries of type 3 and 5 are left out; single spaces are never counted.
   *
   * @return array array('word' => count, ...) keyed by the converted word
   */
  function GetFinallyIndex()
  {
      $counts = array();
      foreach($this->finallyResult as $item)
      {
          if ( $this->resultType==2 && ($item['t']==3 || $item['t']==5) )
          {
              continue;
          }
          $word = $this->_out_string_encoding($item['w']);
          if ( $word == ' ' )
          {
              continue;
          }
          $counts[$word] = isset($counts[$word]) ? $counts[$word] + 1 : 1;
      }
      return $counts;
  }
  /**
   * Map the target charset name to a numeric code.
   *
   * @return int 1 when the name starts with "utf", 2 for "gb", 3 for "big", 4 otherwise
   */
  function _source_result_charset()
  {
      if ( preg_match("/^utf/", $this->targetCharSet) )
      {
          return 1;
      }
      if ( preg_match("/^gb/", $this->targetCharSet) )
      {
          return 2;
      }
      if ( preg_match("/^big/", $this->targetCharSet) )
      {
          return 3;
      }
      return 4;
  }
  /**
   * Compile a text dictionary into the binary format read by GetWordInfos().
   *
   * Source format: one "word,value,value" entry per line in utf-8; lines starting with '@' are
   * skipped. Output format: $mask_value header records of 8 bytes (offset, byte length, entry
   * count), followed by the serialized buckets.
   * PHP needs enough memory for this, because the whole dictionary is built in memory first.
   *
   * @param $source_file utf-8 text dictionary
   * @param $target_file compiled file to write; defaults to $this->mainDicFile
   * @return bool TRUE on success, FALSE when one of the files cannot be opened
   */
  function MakeDict( $source_file, $target_file='' )
  {
      $target_file = ($target_file=='' ? $this->mainDicFile : $target_file);
      $fp = fopen($source_file, 'r');
      if ( !$fp )
      {
          return FALSE;
      }
      $allk = array();
      //Compare with FALSE so that a line consisting of "0" does not end the loop early
      while( ($line = fgets($fp, 512)) !== FALSE )
      {
          if ( $line[0]=='@' ) continue;
          $parts = explode(',', $line);
          //Blank or malformed lines would otherwise create an entry for an empty word
          if ( count($parts) < 3 ) continue;
          list($w, $r, $a) = $parts;
          $a = trim( $a );
          $w = iconv('utf-8', UCS2, $w);
          if ( $w === FALSE || $w === '' ) continue;
          $k = $this->_get_index( $w );
          $allk[ $k ][ $w ] = array($r, $a);
      }
      fclose( $fp );
      $fp = fopen($target_file, 'w');
      if ( !$fp )
      {
          return FALSE;
      }
      $heade_rarr = array();
      $alldat = '';
      //The bucket data starts right after the header table
      $start_pos = $this->mask_value * 8;
      foreach( $allk as $k => $v )
      {
          $dat = serialize( $v );
          $dlen = strlen($dat);
          $alldat .= $dat;
          //NOTE(review): length and count are packed as 16-bit values below, so a bucket larger than 65535 bytes cannot be represented
          $heade_rarr[ $k ] = array($start_pos, $dlen, count( $v ));
          $start_pos += $dlen;
      }
      unset( $allk );
      for($i=0; $i < $this->mask_value; $i++)
      {
          if ( !isset($heade_rarr[$i]) )
          {
              $heade_rarr[$i] = array(0, 0, 0);
          }
          fwrite($fp, pack("Inn", $heade_rarr[$i][0], $heade_rarr[$i][1], $heade_rarr[$i][2]));
      }
      fwrite( $fp, $alldat);
      fclose( $fp );
      return TRUE;
  }
  /**
   * Export the entries of the compiled main dictionary as "word,value,value" text lines.
   *
   * @param $targetfile file to write
   * @return bool TRUE on success, FALSE when the dictionary or the target file cannot be opened
   */
  function ExportDict( $targetfile )
  {
      if ( !$this->mainDicHand )
      {
          //Plain read mode, as in GetWordInfos(); 'rw' is not a valid fopen() mode
          $this->mainDicHand = is_readable($this->mainDicFile) ? fopen($this->mainDicFile, 'r') : FALSE;
          if ( !$this->mainDicHand )
          {
              $this->mainDicHand = FALSE;
              return FALSE;
          }
      }
      $fp = fopen($targetfile, 'w');
      if ( !$fp )
      {
          return FALSE;
      }
      //MakeDict() writes exactly $mask_value header records (indexes 0 .. mask_value - 1);
      //index mask_value would already be bucket data
      for($i=0; $i < $this->mask_value; $i++)
      {
          fseek($this->mainDicHand, $i * 8, SEEK_SET);
          $dat = fread($this->mainDicHand, 8);
          //A truncated header record ends the table
          if ( $dat === FALSE || strlen($dat) < 8 )
          {
              break;
          }
          $arr = unpack('I1s/n1l/n1c', $dat);
          if ( $arr['l'] == 0 )
          {
              continue;
          }
          fseek($this->mainDicHand, $arr['s'], SEEK_SET);
          $data = @unserialize(fread($this->mainDicHand, $arr['l']));
          if ( !is_array($data) ) continue;
          foreach($data as $k => $v)
          {
              $w = iconv(UCS2, 'utf-8', $k);
              fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
          }
      }
      fclose( $fp );
      return TRUE;
  }
  1031. }