国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1134 lines
38KB

  1. <?php
  2. /**
  3. * Unicode编码词典的php分词器
  4. *
  5. * 1、只适用于php5,必要函数 iconv
  6. * 2、本程序是使用RMM逆向匹配算法进行分词的,词库需要特别编译,本类里提供了 MakeDict() 方法
  7. * 3、简单操作流程: SetSource -> StartAnalysis -> Get***Result
  8. * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
  9. *
  10. * @version $Id: splitword.class.php 2 11:45 2011-2-14 itplato $
  11. * @package DedeBIZ.Libraries
  12. * @copyright Copyright (c) 2020, DedeBIZ.COM
  13. * @license https://www.dedebiz.com/license
  14. * @link https://www.dedebiz.com
  15. */
  // Shared constants.
  // UCS2 : iconv name of the internal working encoding (big-endian UCS-2).
  // _SP_ : two-byte marker (0xFF 0xFE) used to join and re-split add-on dictionary words.
  define('UCS2', 'ucs-2be');
  define('_SP_', "\xFF\xFE");
  19. class SplitWord
  20. {
  // Modulus used by _get_index(); MakeDict() writes this many 8-byte header slots.
  public $mask_value = 0xFFFF;
  // Input and output character sets (utf-8, gbk/gb2312/gb18030 and big5 are handled).
  public $sourceCharSet = 'utf-8';
  public $targetCharSet = 'utf-8';
  // Result filter, per the original author: 1 = everything, 2 = dictionary words plus single
  // CJK characters and English, 3 = dictionary words and English.
  public $resultType = 1;
  // CJK runs shorter than this many bytes are not split (n characters * 2 + 1).
  public $notSplitLen = 5;
  // Convert English tokens to lower case.
  public $toLower = FALSE;
  // Maximum-segmentation mode: also keep the un-merged pieces when words are combined.
  public $differMax = FALSE;
  // Try to merge adjacent single characters into new words.
  public $unitWord = TRUE;
  // Load the dictionaries from the constructor.
  public $loadInit = TRUE;
  // Prefer high-frequency words when disambiguating (original author's description).
  public $differFreq = FALSE;
  // Source text converted to big-endian UCS-2.
  public $sourceString = '';
  // Add-on dictionary: array(group letter => array(UCS-2 word => byte length)), and its file.
  public $addonDic = array();
  public $addonDicFile = 'data/words_addons.dic';
  // Main dictionary state.
  public $dicStr = '';
  public $mainDic = array();
  // Open handle of the compiled dictionary, FALSE until it is opened.
  public $mainDicHand = FALSE;
  // Cache of hash buckets read from the dictionary file, plus words registered at run time.
  public $mainDicInfos = array();
  public $mainDicFile = 'data/base_dic_full.dic';
  public $mainDicFileZip = 'data/base_dic_full.zip';
  // Whether to load the whole dictionary up front (slower start, faster analysis) instead of
  // reading entries on demand. Only stored by the constructor in this class.
  public $isLoadAll = FALSE;
  // TRUE when the compiled dictionary file already exists on disk.
  public $isUnpacked = FALSE;
  // Longest dictionary word tried during matching, in bytes (characters * 2).
  public $dicWordMax = 14;
  // Coarse split result: list of array('w' => text, 't' => type).
  public $simpleResult = array();
  // Final result; reset to an array by SetSource() and StartAnalysis().
  public $finallyResult = '';
  // Reset by SetSource(); declared here so that assignment does not create a dynamic property.
  public $finallyIndex = array();
  // TRUE once LoadDict() has run.
  public $isLoadDic = FALSE;
  // New words recognised or merged during analysis: array(UCS-2 word => count).
  public $newWords = array();
  // Human-readable log of the new words, in the target charset.
  public $foundWordStr = '';
  // Seconds spent in LoadDict().
  public $loadTime = 0;
  /**
   * Create the analyzer: remember the charsets and source text, detect whether the
   * compiled dictionary is already on disk and, when $loadInit is set, load the dictionaries.
   *
   * @param string $source_charset charset of $source
   * @param string $target_charset charset used for returned results
   * @param bool   $load_all       stored in $isLoadAll
   * @param string $source         text to analyse (may be empty and set later)
   *
   * @return void
   */
  function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  {
    $this->SetSource($source, $source_charset, $target_charset);
    $this->isLoadAll = $load_all;

    $compiledDict = DEDEINC.'/'.$this->mainDicFile;
    if (file_exists($compiledDict)) {
      $this->isUnpacked = true;
    }

    if (!$this->loadInit) {
      return;
    }
    $this->LoadDict();
  }
  /**
   * PHP 4 style constructor; forwards every argument to __construct().
   */
  function SplitWord($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  {
    $this->__construct(
      $source_charset,
      $target_charset,
      $load_all,
      $source
    );
  }
  /**
   * Destructor: close the main dictionary handle if one was opened.
   */
  function __destruct()
  {
    if ($this->mainDicHand === false) {
      return;
    }
    @fclose($this->mainDicHand);
  }
  /**
   * Hash a key (raw bytes) to its bucket number in the main dictionary.
   *
   * The bytes are consumed from the last to the first; the running hash is kept
   * within 31 bits and finally reduced modulo $mask_value.
   *
   * @param string $key
   * @return int bucket number in the range 0 .. $mask_value - 1
   */
  function _get_index( $key )
  {
    $hash = 0x238f13af;
    for ($pos = strlen($key) - 1; $pos >= 0; $pos--) {
      $hash += ($hash << 5);
      $hash ^= ord($key[$pos]);
      $hash &= 0x7fffffff;
    }
    return $hash % $this->mask_value;
  }
  /**
   * Look a word up in the compiled main dictionary.
   *
   * The file starts with one 8-byte header slot per hash bucket (data offset, data length,
   * word count) followed by the serialized buckets. Buckets are cached in $mainDicInfos
   * after the first read.
   *
   * @param string $key  word in UCS-2 (big-endian)
   * @param string $type 'word' returns the entry of $key, anything else returns the whole bucket
   * @return mixed the entry (or bucket) on success, FALSE when the word is unknown or the
   *               dictionary cannot be read
   */
  function GetWordInfos( $key, $type='word' )
  {
    if (!$this->mainDicHand) {
      $this->mainDicHand = fopen($this->mainDicFile, 'r');
      if (!$this->mainDicHand) {
        // Dictionary missing or unreadable: report "not a word" instead of seeking on FALSE.
        return FALSE;
      }
    }
    $keynum = $this->_get_index($key);
    if (isset($this->mainDicInfos[$keynum])) {
      $data = $this->mainDicInfos[$keynum];
    } else {
      fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
      $dat = fread($this->mainDicHand, 8);
      if ($dat === FALSE || strlen($dat) < 8) {
        // Truncated header: unpack() would fail on a short string.
        return FALSE;
      }
      $arr = unpack('I1s/n1l/n1c', $dat);
      if ($arr['l'] == 0) {
        return FALSE;
      }
      fseek($this->mainDicHand, $arr['s'], SEEK_SET);
      // NOTE(review): unserialize() trusts the dictionary file; it must never be user-supplied.
      $data = @unserialize(fread($this->mainDicHand, $arr['l']));
      $this->mainDicInfos[$keynum] = $data;
    }
    if (!is_array($data) || !isset($data[$key])) {
      return FALSE;
    }
    return ($type=='word' ? $data[$key] : $data);
  }
  /**
   * Set the text to analyse and the charsets, and reset previous results.
   *
   * The text is converted to big-endian UCS-2 and stored in $sourceString. Charset names
   * are matched case-insensitively by prefix: utf*, gb* (read as gb18030) and big* (big5).
   * When the text is empty, the charset is unsupported or the conversion fails,
   * $sourceString is cleared so that a later StartAnalysis() does not re-analyse old text.
   *
   * @param string $source
   * @param string $source_charset
   * @param string $target_charset
   *
   * @return bool TRUE when $sourceString now holds the converted text, FALSE otherwise
   */
  function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
  {
    $this->sourceCharSet = strtolower($source_charset);
    $this->targetCharSet = strtolower($target_charset);
    $this->simpleResult = array();
    $this->finallyResult = array();
    $this->finallyIndex = array();
    $this->sourceString = '';
    if ($source == '') {
      return FALSE;
    }
    // Match on the lower-cased name so that e.g. 'UTF-8' or 'GBK' are accepted.
    $charset = $this->sourceCharSet;
    if (preg_match("/^utf/", $charset)) {
      $utf8 = $source;
    } else if (preg_match("/^gb/", $charset)) {
      $utf8 = @iconv('gb18030', 'utf-8', $source);
    } else if (preg_match("/^big/", $charset)) {
      $utf8 = @iconv('big5', 'utf-8', $source);
    } else {
      return FALSE;
    }
    $converted = ($utf8 === FALSE) ? FALSE : @iconv('utf-8', UCS2, $utf8);
    if ($converted === FALSE) {
      return FALSE;
    }
    $this->sourceString = $converted;
    return TRUE;
  }
  /**
   * Choose which tokens the Get*Result methods return (see $resultType).
   * Only the final-result getters look at this value.
   *
   * @param int $rstype 1 = everything, 2 = drop symbol tokens
   *
   * @return void
   */
  function SetResultType( $rstype )
  {
    $this->resultType = $rstype;
  }
  /**
   * Open the main dictionary and load the add-on dictionary into memory.
   *
   * The dictionary paths are made absolute (DEDEINC prefix) only once, so the method can
   * be called again, e.g. to switch to another main dictionary. When the compiled
   * dictionary does not exist yet it is built from the zip package through InportDict().
   *
   * Add-on file format as parsed here: a line whose second character is ':' starts a
   * group named by its first character; every other non-empty line is a comma-separated
   * list of UTF-8 words belonging to the current group.
   *
   * @param string $maindic optional path of another compiled main dictionary
   * @return void
   */
  function LoadDict( $maindic='' )
  {
    // Prefix each path once; the original prefixed on every call and broke reloads.
    $base = DEDEINC.'/';
    foreach (array('addonDicFile', 'mainDicFile', 'mainDicFileZip') as $prop) {
      if (strpos($this->$prop, $base) !== 0) {
        $this->$prop = $base.$this->$prop;
      }
    }
    $startt = microtime(TRUE);
    if ($maindic != '' && file_exists($maindic)) {
      $this->mainDicFile = $maindic;
    }
    // Main dictionary: only opened here, entries are read on demand by GetWordInfos().
    if ($this->isUnpacked) {
      if (is_resource($this->mainDicHand)) {
        fclose($this->mainDicHand);
      }
      $this->mainDicHand = fopen($this->mainDicFile, 'r');
    } else {
      $this->InportDict($this->mainDicFileZip);
    }
    // Add-on dictionary.
    $ds = file($this->addonDicFile);
    if ($ds === FALSE) {
      $ds = array();
    }
    // UTF-8 form of the separator, so one iconv() call converts a whole line.
    $spstr = iconv(UCS2, 'utf-8', _SP_);
    $hw = '';
    foreach ($ds as $d) {
      $d = trim($d);
      if ($d == '') {
        continue;
      }
      if (substr($d, 1, 1) == ':') {
        $hw = substr($d, 0, 1);
        continue;
      }
      $wall = iconv('utf-8', UCS2, join($spstr, explode(',', $d)));
      foreach (explode(_SP_, $wall) as $word) {
        $this->addonDic[$hw][$word] = strlen($word);
      }
    }
    $this->loadTime = microtime(TRUE) - $startt;
    $this->isLoadDic = TRUE;
  }
  /**
   * Tell whether a word (UCS-2) has an entry in the main dictionary.
   *
   * @param string $word
   * @return bool
   */
  function IsWord( $word )
  {
    return $this->GetWordInfos($word) !== false;
  }
  /**
   * Build the "/<entry[1]><entry[0]>" suffix of a word from its dictionary entry.
   * Words shorter than 4 bytes and unknown words get "/s".
   *
   * @param string $word word in UCS-2
   * @return string
   */
  function GetWordProperty($word)
  {
    if (strlen($word) >= 4) {
      $entry = $this->GetWordInfos($word);
      if (isset($entry[1])) {
        return '/'.$entry[1].$entry[0];
      }
    }
    return '/s';
  }
  /**
   * Register information for a word, normally a new word found during analysis.
   *
   * The first registration stores $infos and sets the word's counter in $newWords to 1;
   * later registrations increment that counter and the stored 'c' value. Words shorter
   * than 4 bytes are ignored.
   *
   * @param string $word  word in UCS-2
   * @param array  $infos array('c' => frequency, 'm' => part of speech)
   * @return void
   */
  function SetWordInfos($word, $infos)
  {
    if (strlen($word) < 4) {
      return;
    }
    if (!isset($this->mainDicInfos[$word])) {
      $this->newWords[$word] = 1;
      $this->mainDicInfos[$word] = $infos;
      return;
    }
    // Initialise missing counters explicitly instead of incrementing undefined keys.
    $this->newWords[$word] = isset($this->newWords[$word]) ? $this->newWords[$word] + 1 : 1;
    if (is_array($this->mainDicInfos[$word])) {
      $this->mainDicInfos[$word]['c'] = isset($this->mainDicInfos[$word]['c'])
        ? $this->mainDicInfos[$word]['c'] + 1
        : 1;
    }
  }
  /**
   * Segment the current source string.
   *
   * Coarse pass: walks $sourceString two bytes at a time and groups consecutive characters
   * of the same class into $simpleResult entries array('w' => text, 't' => type) with
   * type 1 = CJK text, 2 = ANSI word (letters, digits, @ # % + . -), 3 = ANSI symbol,
   * 4 = pure number, 5 = other symbol, 13 = book title found between U+300A and U+300B,
   * 21 = copy of a book title that is segmented further in maximum-segmentation mode.
   * CJK and ANSI-word runs are passed to _deep_analysis(); at the end
   * _sort_finally_result() flattens everything into $finallyResult.
   *
   * @param bool $optimize passed through to _deep_analysis()
   * @return void
   */
  function StartAnalysis($optimize=TRUE)
  {
    if (!$this->isLoadDic) {
      $this->LoadDict();
    }
    $this->simpleResult = $this->finallyResult = array();
    // A trailing UCS-2 space makes the loop flush the last pending run.
    $this->sourceString .= chr(0).chr(32);
    $slen = strlen($this->sourceString);
    // Full-width to half-width table: code points 0xFF00..0xFF5E map to 0x20..0x7E.
    $sbcArr = array();
    $j = 0;
    for ($i = 0xFF00; $i < 0xFF5F; $i++) {
      $sbcArr[$i] = 0x20 + $j;
      $j++;
    }
    // Coarse split.
    $onstr = '';   // run being collected
    $lastc = 1;    // type of the previous character (see the list above)
    $s = 0;        // next free index in $simpleResult
    $ansiWordMatch = "[0-9a-z@#%\+\.-]";
    $notNumberMatch = "[a-z@#%\+]";
    for ($i = 0; $i < $slen; $i++) {
      $c = substr($this->sourceString, $i, 2);
      $i++;
      if (strlen($c) < 2) {
        // Odd trailing byte: not a complete UCS-2 character.
        break;
      }
      $cn = hexdec(bin2hex($c));
      $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
      // ANSI characters
      if ($cn < 0x80) {
        if (preg_match('/'.$ansiWordMatch.'/i', chr($cn))) {
          if ($lastc != 2 && $onstr != '') {
            $this->simpleResult[$s]['w'] = $onstr;
            $this->simpleResult[$s]['t'] = $lastc;
            $this->_deep_analysis($onstr, $lastc, $s, $optimize);
            $s++;
            $onstr = '';
          }
          $lastc = 2;
          $onstr .= chr(0).chr($cn);
        } else {
          if ($onstr != '') {
            $this->simpleResult[$s]['w'] = $onstr;
            if ($lastc == 2) {
              // An ANSI run without letters or @ # % + is a pure number.
              if (!preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr))) $lastc = 4;
            }
            $this->simpleResult[$s]['t'] = $lastc;
            if ($lastc != 4) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
            $s++;
          }
          $onstr = '';
          $lastc = 3;
          if ($cn < 31) {
            continue;
          } else {
            $this->simpleResult[$s]['w'] = chr(0).chr($cn);
            $this->simpleResult[$s]['t'] = 3;
            $s++;
          }
        }
      }
      // Two-byte characters
      else {
        // Regular text (the code point ranges accepted as CJK / kana / hangul)
        if (($cn > 0x3FFF && $cn < 0x9FA6) || ($cn > 0xF8FF && $cn < 0xFA2D)
          || ($cn > 0xABFF && $cn < 0xD7A4) || ($cn > 0x3040 && $cn < 0x312B)) {
          if ($lastc != 1 && $onstr != '') {
            $this->simpleResult[$s]['w'] = $onstr;
            if ($lastc == 2) {
              if (!preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr))) $lastc = 4;
            }
            $this->simpleResult[$s]['t'] = $lastc;
            if ($lastc != 4) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
            $s++;
            $onstr = '';
          }
          $lastc = 1;
          $onstr .= $c;
        }
        // Special symbols
        else {
          if ($onstr != '') {
            $this->simpleResult[$s]['w'] = $onstr;
            if ($lastc == 2) {
              if (!preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr))) $lastc = 4;
            }
            $this->simpleResult[$s]['t'] = $lastc;
            if ($lastc != 4) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
            $s++;
          }
          // Book title: text between U+300A and U+300B, at most about 30 characters.
          if ($cn == 0x300A) {
            $tmpw = '';
            $n = 1;
            $isok = FALSE;
            $ew = chr(0x30).chr(0x0B);
            while (TRUE) {
              // Stop as soon as either byte of the next character is missing.
              if (!isset($this->sourceString[$i+$n]) || !isset($this->sourceString[$i+$n+1])) {
                break;
              }
              $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];
              if ($w == $ew) {
                $this->simpleResult[$s]['w'] = $c;
                $this->simpleResult[$s]['t'] = 5;
                $s++;
                $this->simpleResult[$s]['w'] = $tmpw;
                // Record the title as a new word the first time it is seen. The original
                // marked it as seen before testing, so this branch could never run.
                if ($tmpw != '' && !isset($this->newWords[$tmpw])) {
                  $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
                  $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
                }
                if (!isset($this->newWords[$tmpw])) {
                  // SetWordInfos() skips words shorter than 4 bytes; keep the marker anyway.
                  $this->newWords[$tmpw] = 1;
                }
                $this->simpleResult[$s]['t'] = 13;
                $s++;
                // Maximum-segmentation mode: segment the title itself as well.
                if ($this->differMax) {
                  $this->simpleResult[$s]['w'] = $tmpw;
                  $this->simpleResult[$s]['t'] = 21;
                  $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                  $s++;
                }
                $this->simpleResult[$s]['w'] = $ew;
                $this->simpleResult[$s]['t'] = 5;
                $s++;
                $i = $i + $n + 1;
                $isok = TRUE;
                $onstr = '';
                $lastc = 5;
                break;
              } else {
                $n = $n + 2;
                $tmpw .= $w;
                if (strlen($tmpw) > 60) {
                  break;
                }
              }
            }
            if (!$isok) {
              // No closing mark found: keep the opening mark as an ordinary symbol.
              $this->simpleResult[$s]['w'] = $c;
              $this->simpleResult[$s]['t'] = 5;
              $s++;
              $onstr = '';
              $lastc = 5;
            }
            continue;
          }
          $onstr = '';
          $lastc = 5;
          if ($cn == 0x3000) {
            // Ideographic space is dropped.
            continue;
          } else {
            $this->simpleResult[$s]['w'] = $c;
            $this->simpleResult[$s]['t'] = 5;
            $s++;
          }
        }
      }
    }
    // Merge the per-run results into the final list.
    $this->_sort_finally_result();
  }
  /**
   * Second-level analysis of one coarse run.
   *
   * Runs whose type is not 1 are stored as a single token (lower-cased when $toLower is set).
   * Type 1 runs of at least 5 bytes, or at least $notSplitLen bytes, go to
   * _deep_analysis_cn(). Shorter type 1 runs are stored whole, unless they follow a number
   * and start with a unit word from add-on group 'u': then the unit is appended to the
   * number in $simpleResult and this run is blanked out.
   *
   * @param string $str      run text in UCS-2, passed by reference and possibly shortened
   * @param int    $ctype    run type (1 is handled as CJK text)
   * @param int    $spos     index of the run in $simpleResult
   * @param bool   $optimize passed through to _deep_analysis_cn()
   * @return void
   */
  function _deep_analysis( &$str, $ctype, $spos, $optimize=TRUE )
  {
    // Anything that is not CJK text stays one token.
    if ($ctype != 1) {
      $this->finallyResult[$spos][] = $this->toLower ? strtolower($str) : $str;
      return;
    }

    $byteLen = strlen($str);
    if ($byteLen >= $this->notSplitLen || $byteLen >= 5) {
      $this->_deep_analysis_cn($str, $ctype, $spos, $byteLen, $optimize);
      return;
    }

    // Short run: at most two characters.
    $prevType = ($spos > 0) ? $this->simpleResult[$spos-1]['t'] : 0;
    $wholeIsUnit = isset($this->addonDic['u'][$str]);
    $headIsUnit = isset($this->addonDic['u'][substr($str, 0, 2)]);
    if ($prevType != 4 || (!$wholeIsUnit && !$headIsUnit)) {
      $this->finallyResult[$spos][] = $str;
      return;
    }

    // Number followed by a unit: when only the first character is the unit and the second
    // one is in group 's', split the second character off.
    $tail = '';
    if (!$wholeIsUnit && isset($this->addonDic['s'][substr($str, 2, 2)])) {
      $tail = substr($str, 2, 2);
      $str = substr($str, 0, 2);
    }
    $merged = $this->simpleResult[$spos-1]['w'].$str;
    $this->simpleResult[$spos-1]['w'] = $merged;
    $this->simpleResult[$spos-1]['t'] = 4;
    if (!isset($this->newWords[$merged])) {
      $this->foundWordStr .= $this->_out_string_encoding($merged).'/mu, ';
      $this->SetWordInfos($merged, array('c'=>1, 'm'=>'mu'));
    }
    // The text of this run now lives in the previous entry.
    $this->simpleResult[$spos]['w'] = '';
    if ($tail != '') {
      $this->finallyResult[$spos-1][] = $merged;
      $this->finallyResult[$spos-1][] = $tail;
    }
  }
  565. /**
  566. * 中文的深入分词
  567. * @parem $str
  568. * @return void
  569. */
  // Segment one CJK run by reverse maximum matching against the main dictionary.
  //   $str      run text in UCS-2 (two bytes per character), passed by reference
  //   $lastec   run type; not read inside this method
  //   $spos     index of the run in $simpleResult / $finallyResult
  //   $slen     byte length of $str
  //   $optimize when true the pieces are post-processed by _optimize_result()
  // The pieces are stored, in reading order, in $finallyResult[$spos].
  570. function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=TRUE )
  571. {
  // Bytes 0x20 0x1C: U+201C (left double quotation mark) in big-endian UCS-2.
  572. $quote1 = chr(0x20).chr(0x1C);
  573. $tmparr = array();
  // NOTE(review): $hasw is never read in this method.
  574. $hasw = 0;
  575. //如果前一个词为 “ , 并且字符串小于3个字符当成一个词处理。
  // A run directly after an opening quotation mark and shorter than 11 bytes is treated
  // as one (new) word. NOTE(review): the original comment says "fewer than 3 characters",
  // the condition allows up to 5.
  576. if( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 )
  577. {
  578. $tmparr[] = $str;
  579. if( !isset($this->newWords[$str]) )
  580. {
  581. $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
  582. $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
  583. }
  // Outside maximum-segmentation mode the quoted run is the only result.
  584. if( !$this->differMax )
  585. {
  586. $this->finallyResult[$spos][] = $str;
  587. return ;
  588. }
  589. }
  590. //进行切分
  // Matching loop: $i starts on the last byte of the run and normally moves two bytes
  // (one character) to the left per pass.
  591. for($i=$slen-1; $i > 0; $i -= 2)
  592. {
  593. //单个词
  // The single character ending at $i; used when no dictionary word ends here.
  594. $nc = $str[$i-1].$str[$i];
  595. //是否已经到最后两个字
  // Start of the run reached: emit the remaining character and stop.
  596. if( $i <= 2 )
  597. {
  598. $tmparr[] = $nc;
  599. $i = 0;
  600. break;
  601. }
  602. $isok = FALSE;
  // $i temporarily becomes the exclusive end offset of the candidate words.
  603. $i = $i + 1;
  // Try candidate lengths from $dicWordMax bytes downwards, two bytes at a time.
  604. for($k=$this->dicWordMax; $k>1; $k=$k-2)
  605. {
  606. if($i < $k) continue;
  607. $w = substr($str, $i-$k, $k);
  // Candidate shrank to one character: nothing matched, restore $i and give up.
  608. if( strlen($w) <= 2 )
  609. {
  610. $i = $i - 1;
  611. break;
  612. }
  613. if( $this->IsWord( $w ) )
  614. {
  615. $tmparr[] = $w;
  // Move $i to the second byte of the word's first character, so the loop's
  // "$i -= 2" continues with the character in front of the word.
  616. $i = $i - $k + 1;
  617. $isok = TRUE;
  618. break;
  619. }
  620. }
  621. //echo '<hr />';
  622. //没适合词
  // No dictionary word ends at this position: emit the single character.
  623. if(!$isok) $tmparr[] = $nc;
  624. }
  625. $wcount = count($tmparr);
  626. if( $wcount==0 ) return ;
  // Pieces were collected right to left; store them in reading order.
  627. $this->finallyResult[$spos] = array_reverse($tmparr);
  628. //优化结果(岐义处理、新词、数词、人名识别等)
  // Optional post-processing: merging of numerals, names, suffixes and new words.
  629. if( $optimize )
  630. {
  631. $this->_optimize_result( $this->finallyResult[$spos], $spos );
  632. }
  633. }
  /**
   * Post-process the pieces of one CJK run: merge numeral + unit, surname + given name,
   * word + suffix and pairs of single characters into new words.
   *
   * Add-on dictionary groups used here: 'c' numerals, 'u' units, 'n' leading words
   * (usually surnames), 'a' suffix words, 's' and 't' words that block a merge.
   * Every merged word is appended to $foundWordStr and registered through SetWordInfos().
   *
   * Dictionary entries written by MakeDict() are array(field 2, field 3) of the source line
   * with numeric keys, while entries registered through SetWordInfos() use the keys 'c'
   * and 'm'. Both layouts are read here; the original read only 'm'/'c', so the
   * part-of-speech / frequency veto never applied to real dictionary words.
   * NOTE(review): index 0 is taken as the frequency and index 1 as the part of speech,
   * matching the "/<1><0>" output of GetWordProperty() - confirm against the dictionary source.
   *
   * @param array $smarr pieces of the run (UCS-2 strings), replaced by the optimized list
   * @param int   $spos  index of the run in $simpleResult
   * @return void
   */
  function _optimize_result( &$smarr, $spos )
  {
    $newarr = array();
    $prePos = $spos - 1;
    $arlen = count($smarr);
    $i = $j = 0;
    // A unit at the start of this run is attached to a preceding number / numeral.
    if ($prePos > -1 && !isset($this->finallyResult[$prePos])) {
      $lastw = $this->simpleResult[$prePos]['w'];
      $lastt = $this->simpleResult[$prePos]['t'];
      if (($lastt == 4 || isset($this->addonDic['c'][$lastw])) && isset($this->addonDic['u'][$smarr[0]])) {
        $this->simpleResult[$prePos]['w'] = $lastw.$smarr[0];
        $this->simpleResult[$prePos]['t'] = 4;
        if (!isset($this->newWords[$this->simpleResult[$prePos]['w']])) {
          $this->foundWordStr .= $this->_out_string_encoding($this->simpleResult[$prePos]['w']).'/mu, ';
          $this->SetWordInfos($this->simpleResult[$prePos]['w'], array('c'=>1, 'm'=>'mu'));
        }
        $smarr[0] = '';
        $i++;
      }
    }
    for (; $i < $arlen; $i++) {
      // Last piece: nothing left to merge with.
      if (!isset($smarr[$i+1])) {
        $newarr[$j] = $smarr[$i];
        break;
      }
      $cw = $smarr[$i];
      $nw = $smarr[$i+1];
      $ischeck = FALSE;
      // Numeral followed by a unit
      if (isset($this->addonDic['c'][$cw]) && isset($this->addonDic['u'][$nw])) {
        // Maximum segmentation keeps the un-merged pieces, wrapped in ( and ).
        if ($this->differMax) {
          $newarr[$j] = chr(0).chr(0x28);
          $j++;
          $newarr[$j] = $cw;
          $j++;
          $newarr[$j] = $nw;
          $j++;
          $newarr[$j] = chr(0).chr(0x29);
          $j++;
        }
        $newarr[$j] = $cw.$nw;
        if (!isset($this->newWords[$newarr[$j]])) {
          $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/mu, ';
          $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu'));
        }
        $j++; $i++; $ischeck = TRUE;
      }
      // Leading word (usually a surname)
      else if (isset($this->addonDic['n'][$smarr[$i]])) {
        $is_rs = FALSE;
        // A following two-character word of type 'r' or 'c', or a very frequent one,
        // is not taken as a given name.
        if (strlen($nw) == 4) {
          $winfos = $this->GetWordInfos($nw);
          if (is_array($winfos)) {
            $wtype = isset($winfos['m']) ? $winfos['m'] : (isset($winfos[1]) ? $winfos[1] : '');
            $wfreq = isset($winfos['c']) ? $winfos['c'] : (isset($winfos[0]) ? $winfos[0] : 0);
            if ($wtype == 'r' || $wtype == 'c' || $wfreq > 500) {
              $is_rs = TRUE;
            }
          }
        }
        if (!isset($this->addonDic['s'][$nw]) && strlen($nw) < 5 && !$is_rs) {
          $newarr[$j] = $cw.$nw;
          // Try to take a third single character into the name.
          if (strlen($nw) == 2 && isset($smarr[$i+2]) && strlen($smarr[$i+2]) == 2 && !isset($this->addonDic['s'][$smarr[$i+2]])) {
            $newarr[$j] .= $smarr[$i+2];
            $i++;
          }
          if (!isset($this->newWords[$newarr[$j]])) {
            $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr'));
            $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, ';
          }
          // To limit mistakes, keep the parts of the name as well, wrapped in ( and ).
          if (strlen($nw) == 4) {
            $j++;
            $newarr[$j] = chr(0).chr(0x28);
            $j++;
            $newarr[$j] = $cw;
            $j++;
            $newarr[$j] = $nw;
            $j++;
            $newarr[$j] = chr(0).chr(0x29);
          }
          $j++; $i++; $ischeck = TRUE;
        }
      }
      // Suffix word (place names and the like)
      else if (isset($this->addonDic['a'][$nw])) {
        $is_rs = FALSE;
        // Words of type 'a', 'r' or 'c', or very frequent ones, do not take a suffix.
        if (strlen($cw) > 2) {
          $winfos = $this->GetWordInfos($cw);
          if (is_array($winfos)) {
            $wtype = isset($winfos['m']) ? $winfos['m'] : (isset($winfos[1]) ? $winfos[1] : '');
            $wfreq = isset($winfos['c']) ? $winfos['c'] : (isset($winfos[0]) ? $winfos[0] : 0);
            if ($wtype == 'a' || $wtype == 'r' || $wtype == 'c' || $wfreq > 500) {
              $is_rs = TRUE;
            }
          }
        }
        if (!isset($this->addonDic['s'][$cw]) && !$is_rs) {
          $newarr[$j] = $cw.$nw;
          if (!isset($this->newWords[$newarr[$j]])) {
            $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, ';
            $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na'));
          }
          $i++; $j++; $ischeck = TRUE;
        }
      }
      // New word from two adjacent single characters
      else if ($this->unitWord) {
        if (strlen($cw) == 2 && strlen($nw) == 2
          && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw])
          && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])) {
          $newarr[$j] = $cw.$nw;
          // Try to take a third single character when it is a suffix or a unit.
          if (isset($smarr[$i+2]) && strlen($smarr[$i+2]) == 2 && (isset($this->addonDic['a'][$smarr[$i+2]]) || isset($this->addonDic['u'][$smarr[$i+2]]))) {
            $newarr[$j] .= $smarr[$i+2];
            $i++;
          }
          if (!isset($this->newWords[$newarr[$j]])) {
            $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, ';
            $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms'));
          }
          $i++; $j++; $ischeck = TRUE;
        }
      }
      // No rule applied: keep the piece as it is.
      if (!$ischeck) {
        $newarr[$j] = $cw;
        // Maximum segmentation: look for dictionary words spanning the boundary
        // between this piece and the beginning of the next one.
        if ($this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7) {
          $slen = strlen($nw);
          for ($y = 2; $y <= $slen-2; $y = $y+2) {
            $nhead = substr($nw, $y-2, 2);
            $nfont = $cw.substr($nw, 0, $y-2);
            if ($this->IsWord($nfont.$nhead)) {
              if (strlen($cw) > 2) $j++;
              $newarr[$j] = $nfont.$nhead;
            }
          }
        }
        $j++;
      }
    }
    $smarr = $newarr;
  }
  /**
   * Flatten the analysis into $finallyResult: a list of array('w' => text, 't' => type).
   *
   * Runs that have deep-analysis pieces contribute those pieces with type 20; other runs
   * are copied as they are, except type 21 entries, which are dropped. Runs with an empty
   * text are skipped.
   *
   * @return void
   */
  function _sort_finally_result()
  {
    $flat = array();
    foreach ($this->simpleResult as $pos => $run) {
      if (empty($run['w'])) {
        continue;
      }
      $hasPieces = isset($this->finallyResult[$pos]) && count($this->finallyResult[$pos]) > 0;
      if ($hasPieces) {
        foreach ($this->finallyResult[$pos] as $piece) {
          if (empty($piece)) {
            continue;
          }
          $flat[] = array('w' => $piece, 't' => 20);
        }
        continue;
      }
      if ($run['t'] != 21) {
        $flat[] = array('w' => $run['w'], 't' => $run['t']);
      }
    }
    $this->finallyResult = $flat;
  }
  /**
   * Convert a UCS-2 string to the target charset.
   * Target code 1 gives UTF-8, 2 gives gb18030, every other code gives big5.
   *
   * @param string $str UCS-2 (big-endian) text
   * @return string
   */
  function _out_string_encoding( &$str )
  {
    $targetCode = $this->_source_result_charset();
    $utf8 = iconv(UCS2, 'utf-8', $str);
    if ($targetCode == 1) {
      return $utf8;
    }
    if ($targetCode == 2) {
      return iconv('utf-8', 'gb18030', $utf8);
    }
    return iconv('utf-8', 'big5', $utf8);
  }
  /**
   * Return the final tokens as one string in the target charset.
   *
   * Every token is preceded by $spword, so the string starts with the separator. With
   * $resultType 2 the tokens of type 3 and 5 are left out; tokens that are a single
   * space are always left out.
   *
   * @param string $spword        separator written before each token
   * @param bool   $word_meanings append the GetWordProperty() suffix to each token
   * @return string
   */
  function GetFinallyResult($spword=' ', $word_meanings=FALSE)
  {
    $out = '';
    foreach ($this->finallyResult as $token) {
      if ($this->resultType == 2 && ($token['t'] == 3 || $token['t'] == 5)) {
        continue;
      }
      $suffix = $word_meanings ? $this->GetWordProperty($token['w']) : '';
      $text = $this->_out_string_encoding($token['w']);
      if ($text == ' ') {
        continue;
      }
      $out .= $spword.$text.$suffix;
    }
    return $out;
  }
  /**
   * Return the texts of the coarse split in the target charset, without their types.
   * Empty entries and single spaces are left out.
   *
   * @return array list of strings
   */
  function GetSimpleResult()
  {
    $texts = array();
    foreach ($this->simpleResult as $run) {
      if (empty($run['w'])) {
        continue;
      }
      $text = $this->_out_string_encoding($run['w']);
      if ($text == ' ') {
        continue;
      }
      $texts[] = $text;
    }
    return $texts;
  }
  /**
   * Return the coarse split with types, keyed like $simpleResult.
   * Types: 1 CJK text, 2 ANSI word, 3 ANSI punctuation, 4 number, 5 other punctuation or
   * unrecognised character (full-width forms included). Single spaces are left out.
   *
   * @return array array(index => array('w' => text in the target charset, 't' => type))
   */
  function GetSimpleResultAll()
  {
    $runs = array();
    foreach ($this->simpleResult as $pos => $run) {
      $text = $this->_out_string_encoding($run['w']);
      if ($text == ' ') {
        continue;
      }
      $runs[$pos] = array('w' => $text, 't' => $run['t']);
    }
    return $runs;
  }
  /**
   * Count how often each final token occurs.
   * With $resultType 2 the tokens of type 3 and 5 are left out; single spaces are always
   * left out.
   *
   * @return array array(token in the target charset => count)
   */
  function GetFinallyIndex()
  {
    $counts = array();
    foreach ($this->finallyResult as $token) {
      if ($this->resultType == 2 && ($token['t'] == 3 || $token['t'] == 5)) {
        continue;
      }
      $text = $this->_out_string_encoding($token['w']);
      if ($text == ' ') {
        continue;
      }
      $counts[$text] = isset($counts[$text]) ? $counts[$text] + 1 : 1;
    }
    return $counts;
  }
  /**
   * Map $targetCharSet to a numeric code by its prefix.
   *
   * @return int 1 for utf*, 2 for gb*, 3 for big*, 4 for anything else
   */
  function _source_result_charset()
  {
    $codes = array(
      "/^utf/" => 1,
      "/^gb/"  => 2,
      "/^big/" => 3,
    );
    foreach ($codes as $pattern => $code) {
      if (preg_match($pattern, $this->targetCharSet)) {
        return $code;
      }
    }
    return 4;
  }
  /**
   * Compile a text dictionary into the binary format read by GetWordInfos().
   *
   * Source format: one "word,field2,field3" entry per line in UTF-8; lines starting with
   * '@' are skipped, and so are lines with fewer than three fields. Output: $mask_value
   * header slots of 8 bytes (data offset, data length, word count; pack format "Inn")
   * followed by the serialized hash buckets.
   * NOTE(review): length and count are 16-bit ("n"), so a bucket whose serialized form
   * exceeds 65535 bytes cannot be represented.
   * PHP needs enough memory to hold the whole dictionary.
   *
   * @param string $source_file text dictionary
   * @param string $target_file output file, defaults to $mainDicFile
   * @return void
   */
  function MakeDict( $source_file, $target_file='' )
  {
    $target_file = ($target_file=='' ? $this->mainDicFile : $target_file);
    $fp = fopen($source_file, 'r');
    if (!$fp) {
      trigger_error('MakeDict: cannot open '.$source_file, E_USER_WARNING);
      return;
    }
    $allk = array();
    // Compare with FALSE so that a line that evaluates to false does not end the loop.
    while (($line = fgets($fp, 512)) !== FALSE) {
      if ($line[0] == '@') {
        continue;
      }
      $fields = explode(',', $line);
      if (count($fields) < 3) {
        // Blank or malformed line.
        continue;
      }
      list($w, $r, $a) = $fields;
      $a = trim($a);
      $w = iconv('utf-8', UCS2, $w);
      if ($w === FALSE || $w === '') {
        continue;
      }
      $allk[$this->_get_index($w)][$w] = array($r, $a);
    }
    fclose($fp);
    $fp = fopen($target_file, 'wb');
    if (!$fp) {
      trigger_error('MakeDict: cannot write '.$target_file, E_USER_WARNING);
      return;
    }
    $heade_rarr = array();
    $alldat = '';
    // Bucket data starts right after the header table.
    $start_pos = $this->mask_value * 8;
    foreach ($allk as $k => $v) {
      $dat = serialize($v);
      $dlen = strlen($dat);
      $alldat .= $dat;
      $heade_rarr[$k] = array($start_pos, $dlen, count($v));
      $start_pos += $dlen;
    }
    unset($allk);
    for ($i = 0; $i < $this->mask_value; $i++) {
      $slot = isset($heade_rarr[$i]) ? $heade_rarr[$i] : array(0, 0, 0);
      fwrite($fp, pack("Inn", $slot[0], $slot[1], $slot[2]));
    }
    fwrite($fp, $alldat);
    fclose($fp);
  }
  /**
   * Export every entry of the compiled main dictionary as "word,field0,field1" lines
   * in UTF-8.
   *
   * @param string $targetfile file to write
   * @return bool TRUE on success, FALSE when the dictionary or the target cannot be opened
   */
  function ExportDict( $targetfile )
  {
    if (!$this->mainDicHand) {
      $this->mainDicHand = fopen($this->mainDicFile, 'rb');
      if (!$this->mainDicHand) {
        return FALSE;
      }
    }
    $fp = fopen($targetfile, 'w');
    if (!$fp) {
      return FALSE;
    }
    // The header holds $mask_value slots (0 .. $mask_value - 1); reading one more slot,
    // as the original did, interprets the first bytes of the bucket data as a header.
    for ($i = 0; $i < $this->mask_value; $i++) {
      fseek($this->mainDicHand, $i * 8, SEEK_SET);
      $dat = fread($this->mainDicHand, 8);
      if ($dat === FALSE || strlen($dat) < 8) {
        continue;
      }
      $arr = unpack('I1s/n1l/n1c', $dat);
      if ($arr['l'] == 0) {
        continue;
      }
      fseek($this->mainDicHand, $arr['s'], SEEK_SET);
      $data = @unserialize(fread($this->mainDicHand, $arr['l']));
      if (!is_array($data)) {
        continue;
      }
      foreach ($data as $k => $v) {
        $w = iconv(UCS2, 'utf-8', $k);
        fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
      }
    }
    fclose($fp);
    return TRUE;
  }
  /**
   * Build the compiled main dictionary from the zipped text dictionary.
   *
   * Extracts the package into DEDEINC/data/, compiles the first extracted file with
   * MakeDict() and deletes the extracted file afterwards. Terminates the script when the
   * memory limit cannot be changed.
   * NOTE(review): the limit is set to 128M unconditionally, which lowers it when the
   * configured limit is higher - confirm whether that is intended.
   *
   * @param string $targetfile path of the zip package
   * @return bool TRUE on success, FALSE when nothing was extracted
   */
  function InportDict( $targetfile )
  {
    // ini_set() returns the previous value on success; only FALSE means failure.
    if (ini_set('memory_limit', '128M') === FALSE)
      exit('设置内存错误,请到dede官网下载解压版的base_dic_full.dic!');
    require_once(DEDEINC.'/zip.class.php');
    $zip = new zip();
    // Extract() is expected to return an array keyed by the extracted file names.
    $extracted = $zip->Extract($targetfile, DEDEINC.'/data/');
    if (!is_array($extracted) || count($extracted) == 0) {
      return FALSE;
    }
    $unpackagefile = array_keys($extracted);
    $this->MakeDict(DEDEINC.'/data/'.$unpackagefile[0]);
    unlink(DEDEINC.'/data/'.$unpackagefile[0]);
    return true;
  }
  1074. }