国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

splitword.class.php 38KB

3 年前
3 年前
3 年前
3 年前
3 年前
3 年前
3 年前
3 年前
3 年前
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125
  1. <?php
  2. if (!defined('DEDEINC')) exit('dedebiz');
  3. /**
  4. * Unicode编码词典的php分词器
  5. *
  6. * 1、只适用于php5,必要函数 iconv
  7. * 2、本程序是使用RMM逆向匹配算法进行分词的,词库需要特别编译,本类里提供了 MakeDict() 方法
  8. * 3、简单操作流程:SetSource -> StartAnalysis -> Get***Result
  9. * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
  10. *
  11. * @version $Id: splitword.class.php 2 11:45 2011-2-14 itplato $
  12. * @package DedeBIZ.Libraries
  13. * @copyright Copyright (c) 2022, DedeBIZ.COM
  14. * @license https://www.dedebiz.com/license
  15. * @link https://www.dedebiz.com
  16. */
  17. //常量定义
  18. define('_SP_', chr(0xFF).chr(0xFE));
  19. define('UCS2', 'ucs-2be');
  20. class SplitWord
  21. {
  22. //hash算法选项
  23. var $mask_value = 0xFFFF;
  24. //输入和输出的字符编码(只允许 utf-8、gbk/gb2312/gb18030、big5 三种类型)
  25. var $sourceCharSet = 'utf-8';
  26. var $targetCharSet = 'utf-8';
  27. //生成的分词结果数据类型 1 为全部, 2为 词典词汇及单个中日韩简繁字符及英文, 3 为词典词汇及英文
  28. var $resultType = 1;
  29. //句子长度小于这个数值时不拆分,notSplitLen = n(个汉字) * 2 + 1
  30. var $notSplitLen = 5;
  31. //把英文单词全部转小写
  32. var $toLower = FALSE;
  33. //使用最大切分模式对二元词进行消岐
  34. var $differMax = FALSE;
  35. //尝试合并单字
  36. var $unitWord = TRUE;
  37. //初始化类时直接加载词典
  38. var $loadInit = TRUE;
  39. //使用热门词优先模式进行消岐
  40. var $differFreq = FALSE;
  41. //被转换为unicode的源字符串
  42. var $sourceString = '';
  43. //附加词典
  44. var $addonDic = array();
  45. var $addonDicFile = 'data/words_addons.dic';
  46. //主词典
  47. var $dicStr = '';
  48. var $mainDic = array();
  49. var $mainDicHand = FALSE;
  50. var $mainDicInfos = array();
  51. var $mainDicFile = 'data/base_dic_full.dic';
  52. //是否直接载入词典(选是载入速度较慢,但解析较快;选否载入较快,但解析较慢,需要时才会载入特定的词条)
  53. var $mainDicFileZip = 'data/base_dic_full.zip';
  54. var $isLoadAll = FALSE;
  55. var $isUnpacked = FALSE;
  56. //主词典词语最大长度 x / 2
  57. var $dicWordMax = 14;
  58. //粗分后的数组(通常是截取句子等用途)
  59. var $simpleResult = array();
  60. //最终结果(用空格分开的词汇列表)
  61. var $finallyResult = '';
  62. //是否已经载入词典
  63. var $isLoadDic = FALSE;
  64. //系统识别或合并的新词
  65. var $newWords = array();
  66. var $foundWordStr = '';
  67. //词库载入时间
  68. var $loadTime = 0;
  69. /**
  70. * 构造函数
  71. * @param $source_charset
  72. * @param $target_charset
  73. * @param $load_alldic
  74. * @param $source
  75. *
  76. * @return void
  77. */
  78. function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  79. {
  80. $this->SetSource( $source, $source_charset, $target_charset );
  81. $this->isLoadAll = $load_all;
  82. if(file_exists(DEDEINC.'/'.$this->mainDicFile)) $this->isUnpacked = TRUE;
  83. if($this->loadInit) $this->LoadDict();
  84. }
  85. function SplitWord($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  86. {
  87. $this->__construct($source_charset, $target_charset, $load_all, $source);
  88. }
  89. /**
  90. * 析构函数
  91. */
  92. function __destruct()
  93. {
  94. if( $this->mainDicHand !== FALSE )
  95. {
  96. @fclose( $this->mainDicHand );
  97. }
  98. }
  99. /**
  100. * 根据字符串计算key索引
  101. * @param $key
  102. * @return short int
  103. */
  104. function _get_index( $key )
  105. {
  106. $l = strlen($key);
  107. $h = 0x238f13af;
  108. while ($l--)
  109. {
  110. $h += ($h << 5);
  111. $h ^= ord($key[$l]);
  112. $h &= 0x7fffffff;
  113. }
  114. return ($h % $this->mask_value);
  115. }
  116. /**
  117. * 从文件获得词
  118. * @param $key
  119. * @param $type (类型 word 或 key_groups)
  120. * @return short int
  121. */
  122. function GetWordInfos( $key, $type='word' )
  123. {
  124. if( !$this->mainDicHand )
  125. {
  126. $this->mainDicHand = fopen($this->mainDicFile, 'r');
  127. }
  128. $p = 0;
  129. $keynum = $this->_get_index( $key );
  130. if( isset($this->mainDicInfos[ $keynum ]) )
  131. {
  132. $data = $this->mainDicInfos[ $keynum ];
  133. } else {
  134. //rewind( $this->mainDicHand );
  135. $move_pos = $keynum * 8;
  136. fseek($this->mainDicHand, $move_pos, SEEK_SET);
  137. $dat = fread($this->mainDicHand, 8);
  138. $arr = unpack('I1s/n1l/n1c', $dat);
  139. if( $arr['l'] == 0 )
  140. {
  141. return FALSE;
  142. }
  143. fseek($this->mainDicHand, $arr['s'], SEEK_SET);
  144. $data = @unserialize(fread($this->mainDicHand, $arr['l']));
  145. $this->mainDicInfos[ $keynum ] = $data;
  146. }
  147. if( !is_array($data) || !isset($data[$key]) )
  148. {
  149. return FALSE;
  150. }
  151. return ($type=='word' ? $data[$key] : $data);
  152. }
  153. /**
  154. * 设置源字符串
  155. * @param $source
  156. * @param $source_charset
  157. * @param $target_charset
  158. *
  159. * @return bool
  160. */
  161. function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
  162. {
  163. $this->sourceCharSet = strtolower($source_charset);
  164. $this->targetCharSet = strtolower($target_charset);
  165. $this->simpleResult = array();
  166. $this->finallyResult = array();
  167. $this->finallyIndex = array();
  168. if( $source != '' )
  169. {
  170. $rs = TRUE;
  171. if( preg_match("/^utf/", $source_charset) ) {
  172. $this->sourceString = @iconv('utf-8', UCS2, $source);
  173. }
  174. else if( preg_match("/^gb/", $source_charset) ) {
  175. $this->sourceString = @iconv('utf-8', UCS2, iconv('gb18030', 'utf-8', $source));
  176. }
  177. else if( preg_match("/^big/", $source_charset) ) {
  178. $this->sourceString = @iconv('utf-8', UCS2, iconv('big5', 'utf-8', $source));
  179. }
  180. else {
  181. $rs = FALSE;
  182. }
  183. } else {
  184. $rs = FALSE;
  185. }
  186. return $rs;
  187. }
  188. /**
  189. * 设置结果类型(只在获取finallyResult才有效)
  190. * @param $rstype 1 为全部, 2去除特殊符号
  191. *
  192. * @return void
  193. */
  194. function SetResultType( $rstype )
  195. {
  196. $this->resultType = $rstype;
  197. }
  198. /**
  199. * 载入词典
  200. *
  201. * @return void
  202. */
  203. function LoadDict( $maindic='' )
  204. {
  205. $this->addonDicFile = DEDEINC.'/'.$this->addonDicFile;
  206. $this->mainDicFile = DEDEINC.'/'.$this->mainDicFile;
  207. $this->mainDicFileZip = DEDEINC.'/'.$this->mainDicFileZip;
  208. $startt = microtime(TRUE);
  209. //正常读取文件
  210. $dicAddon = $this->addonDicFile;
  211. if($maindic=='' || !file_exists($maindic) )
  212. {
  213. $dicWords = $this->mainDicFile ;
  214. } else {
  215. $dicWords = $maindic;
  216. $this->mainDicFile = $maindic;
  217. }
  218. //加载主词典(只打开)
  219. if($this->isUnpacked){
  220. $this->mainDicHand = fopen($dicWords, 'r');
  221. } else {
  222. $this->InportDict($this->mainDicFileZip);
  223. }
  224. //载入副词典
  225. $hw = '';
  226. $ds = file($dicAddon);
  227. foreach($ds as $d)
  228. {
  229. $d = trim($d);
  230. if($d=='') continue;
  231. $estr = substr($d, 1, 1);
  232. if( $estr==':' ) {
  233. $hw = substr($d, 0, 1);
  234. }
  235. else
  236. {
  237. $spstr = _SP_;
  238. $spstr = iconv(UCS2, 'utf-8', $spstr);
  239. $ws = explode(',', $d);
  240. $wall = iconv('utf-8', UCS2, join($spstr, $ws));
  241. $ws = explode(_SP_, $wall);
  242. foreach($ws as $estr)
  243. {
  244. $this->addonDic[$hw][$estr] = strlen($estr);
  245. }
  246. }
  247. }
  248. $this->loadTime = microtime(TRUE) - $startt;
  249. $this->isLoadDic = TRUE;
  250. }
  251. /**
  252. * 检测某个词是否存在
  253. */
  254. function IsWord( $word )
  255. {
  256. $winfos = $this->GetWordInfos( $word );
  257. return ($winfos !== FALSE);
  258. }
  259. /**
  260. * 获得某个词的词性及词频信息
  261. * @parem $word unicode编码的词
  262. * @return void
  263. */
  264. function GetWordProperty($word)
  265. {
  266. if( strlen($word)<4 )
  267. {
  268. return '/s';
  269. }
  270. $infos = $this->GetWordInfos($word);
  271. return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s";
  272. }
  273. /**
  274. * 指定某词的词性信息(通常是新词)
  275. * @parem $word unicode编码的词
  276. * @parem $infos array('c' => 词频, 'm' => 词性);
  277. * @return void;
  278. */
  279. function SetWordInfos($word, $infos)
  280. {
  281. if( strlen($word)<4 )
  282. {
  283. return ;
  284. }
  285. if( isset($this->mainDicInfos[$word]) )
  286. {
  287. $this->newWords[$word]++;
  288. $this->mainDicInfos[$word]['c']++;
  289. } else {
  290. $this->newWords[$word] = 1;
  291. $this->mainDicInfos[$word] = $infos;
  292. }
  293. }
  294. /**
  295. * 开始执行分析
  296. * @parem bool optimize 是否对结果进行优化
  297. * @return bool
  298. */
  299. function StartAnalysis($optimize=TRUE)
  300. {
  301. if( !$this->isLoadDic )
  302. {
  303. $this->LoadDict();
  304. }
  305. $this->simpleResult = $this->finallyResult = array();
  306. $this->sourceString .= chr(0).chr(32);
  307. $slen = strlen($this->sourceString);
  308. $sbcArr = array();
  309. $j = 0;
  310. //全角与半角字符对照表
  311. for($i=0xFF00; $i < 0xFF5F; $i++)
  312. {
  313. $scb = 0x20 + $j;
  314. $j++;
  315. $sbcArr[$i] = $scb;
  316. }
  317. //对字符串进行粗分
  318. $onstr = '';
  319. $lastc = 1; //1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符
  320. $s = 0;
  321. $ansiWordMatch = "[0-9a-z@#%\+\.-]";
  322. $notNumberMatch = "[a-z@#%\+]";
  323. for($i=0; $i < $slen; $i++)
  324. {
  325. $c = $this->sourceString[$i].$this->sourceString[++$i];
  326. $cn = hexdec(bin2hex($c));
  327. $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
  328. //ANSI字符
  329. if($cn < 0x80)
  330. {
  331. if( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) )
  332. {
  333. if( $lastc != 2 && $onstr != '') {
  334. $this->simpleResult[$s]['w'] = $onstr;
  335. $this->simpleResult[$s]['t'] = $lastc;
  336. $this->_deep_analysis($onstr, $lastc, $s, $optimize);
  337. $s++;
  338. $onstr = '';
  339. }
  340. $lastc = 2;
  341. $onstr .= chr(0).chr($cn);
  342. }
  343. else
  344. {
  345. if( $onstr != '' )
  346. {
  347. $this->simpleResult[$s]['w'] = $onstr;
  348. if( $lastc==2 )
  349. {
  350. if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
  351. }
  352. $this->simpleResult[$s]['t'] = $lastc;
  353. if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
  354. $s++;
  355. }
  356. $onstr = '';
  357. $lastc = 3;
  358. if($cn < 31)
  359. {
  360. continue;
  361. }
  362. else
  363. {
  364. $this->simpleResult[$s]['w'] = chr(0).chr($cn);
  365. $this->simpleResult[$s]['t'] = 3;
  366. $s++;
  367. }
  368. }
  369. }
  370. //普通字符
  371. else
  372. {
  373. //正常文字
  374. if( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)
  375. || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) )
  376. {
  377. if( $lastc != 1 && $onstr != '')
  378. {
  379. $this->simpleResult[$s]['w'] = $onstr;
  380. if( $lastc==2 )
  381. {
  382. if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
  383. }
  384. $this->simpleResult[$s]['t'] = $lastc;
  385. if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
  386. $s++;
  387. $onstr = '';
  388. }
  389. $lastc = 1;
  390. $onstr .= $c;
  391. }
  392. //特殊符号
  393. else
  394. {
  395. if( $onstr != '' )
  396. {
  397. $this->simpleResult[$s]['w'] = $onstr;
  398. if( $lastc==2 )
  399. {
  400. if( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
  401. }
  402. $this->simpleResult[$s]['t'] = $lastc;
  403. if( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
  404. $s++;
  405. }
  406. //检测书名
  407. if( $cn == 0x300A )
  408. {
  409. $tmpw = '';
  410. $n = 1;
  411. $isok = FALSE;
  412. $ew = chr(0x30).chr(0x0B);
  413. while(TRUE)
  414. {
  415. if(!isset($this->sourceString[$i+$n]) && !isset($this->sourceString[$i+$n+1]))
  416. break;
  417. $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];
  418. if( $w == $ew )
  419. {
  420. $this->simpleResult[$s]['w'] = $c;
  421. $this->simpleResult[$s]['t'] = 5;
  422. $s++;
  423. $this->simpleResult[$s]['w'] = $tmpw;
  424. $this->newWords[$tmpw] = 1;
  425. if( !isset($this->newWords[$tmpw]) )
  426. {
  427. $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
  428. $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
  429. }
  430. $this->simpleResult[$s]['t'] = 13;
  431. $s++;
  432. //最大切分模式对书名继续分词
  433. if( $this->differMax )
  434. {
  435. $this->simpleResult[$s]['w'] = $tmpw;
  436. $this->simpleResult[$s]['t'] = 21;
  437. $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
  438. $s++;
  439. }
  440. $this->simpleResult[$s]['w'] = $ew;
  441. $this->simpleResult[$s]['t'] = 5;
  442. $s++;
  443. $i = $i + $n + 1;
  444. $isok = TRUE;
  445. $onstr = '';
  446. $lastc = 5;
  447. break;
  448. }
  449. else
  450. {
  451. $n = $n+2;
  452. $tmpw .= $w;
  453. if( strlen($tmpw) > 60 )
  454. {
  455. break;
  456. }
  457. }
  458. }//while
  459. if( !$isok )
  460. {
  461. $this->simpleResult[$s]['w'] = $c;
  462. $this->simpleResult[$s]['t'] = 5;
  463. $s++;
  464. $onstr = '';
  465. $lastc = 5;
  466. }
  467. continue;
  468. }
  469. $onstr = '';
  470. $lastc = 5;
  471. if( $cn==0x3000 )
  472. {
  473. continue;
  474. }
  475. else
  476. {
  477. $this->simpleResult[$s]['w'] = $c;
  478. $this->simpleResult[$s]['t'] = 5;
  479. $s++;
  480. }
  481. }//2byte symbol
  482. }//end 2byte char
  483. }//end for
  484. //处理分词后的结果
  485. $this->_sort_finally_result();
  486. }
  487. /**
  488. * 深入分词
  489. * @parem $str
  490. * @parem $ctype (2 英文类, 3 中/韩/日文类)
  491. * @parem $spos 当前粗分结果游标
  492. * @return bool
  493. */
  494. function _deep_analysis( &$str, $ctype, $spos, $optimize=TRUE )
  495. {
  496. //中文句子
  497. if( $ctype==1 )
  498. {
  499. $slen = strlen($str);
  500. //小于系统配置分词要求长度的句子
  501. if( $slen < $this->notSplitLen )
  502. {
  503. $tmpstr = '';
  504. $lastType = 0;
  505. if( $spos > 0 ) $lastType = $this->simpleResult[$spos-1]['t'];
  506. if($slen < 5)
  507. {
  508. //echo iconv(UCS2, 'utf-8', $str).'<br>';
  509. if( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) )
  510. {
  511. $str2 = '';
  512. if( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) )
  513. {
  514. $str2 = substr($str, 2, 2);
  515. $str = substr($str, 0, 2);
  516. }
  517. $ww = $this->simpleResult[$spos - 1]['w'].$str;
  518. $this->simpleResult[$spos - 1]['w'] = $ww;
  519. $this->simpleResult[$spos - 1]['t'] = 4;
  520. if( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) )
  521. {
  522. $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';
  523. $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
  524. }
  525. $this->simpleResult[$spos]['w'] = '';
  526. if( $str2 != '' )
  527. {
  528. $this->finallyResult[$spos-1][] = $ww;
  529. $this->finallyResult[$spos-1][] = $str2;
  530. }
  531. }
  532. else {
  533. $this->finallyResult[$spos][] = $str;
  534. }
  535. }
  536. else
  537. {
  538. $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
  539. }
  540. }
  541. //正常长度的句子,循环进行分词处理
  542. else
  543. {
  544. $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
  545. }
  546. }
  547. //英文句子,转为小写
  548. else
  549. {
  550. if( $this->toLower ) {
  551. $this->finallyResult[$spos][] = strtolower($str);
  552. }
  553. else {
  554. $this->finallyResult[$spos][] = $str;
  555. }
  556. }
  557. }
  558. /**
  559. * 中文的深入分词
  560. * @parem $str
  561. * @return void
  562. */
  563. function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=TRUE )
  564. {
  565. $quote1 = chr(0x20).chr(0x1C);
  566. $tmparr = array();
  567. $hasw = 0;
  568. //如果前一个词为 “ , 并且字符串小于3个字符当成一个词处理
  569. if( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 )
  570. {
  571. $tmparr[] = $str;
  572. if( !isset($this->newWords[$str]) )
  573. {
  574. $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
  575. $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
  576. }
  577. if( !$this->differMax )
  578. {
  579. $this->finallyResult[$spos][] = $str;
  580. return ;
  581. }
  582. }
  583. //进行切分
  584. for($i=$slen-1; $i > 0; $i -= 2)
  585. {
  586. //单个词
  587. $nc = $str[$i-1].$str[$i];
  588. //是否已经到最后两个字
  589. if( $i <= 2 )
  590. {
  591. $tmparr[] = $nc;
  592. $i = 0;
  593. break;
  594. }
  595. $isok = FALSE;
  596. $i = $i + 1;
  597. for($k=$this->dicWordMax; $k>1; $k=$k-2)
  598. {
  599. if($i < $k) continue;
  600. $w = substr($str, $i-$k, $k);
  601. if( strlen($w) <= 2 )
  602. {
  603. $i = $i - 1;
  604. break;
  605. }
  606. if( $this->IsWord( $w ) )
  607. {
  608. $tmparr[] = $w;
  609. $i = $i - $k + 1;
  610. $isok = TRUE;
  611. break;
  612. }
  613. }
  614. //echo '<hr />';
  615. //没适合词
  616. if(!$isok) $tmparr[] = $nc;
  617. }
  618. $wcount = count($tmparr);
  619. if( $wcount==0 ) return ;
  620. $this->finallyResult[$spos] = array_reverse($tmparr);
  621. //优化结果(岐义处理、新词、数词、人名识别等)
  622. if( $optimize )
  623. {
  624. $this->_optimize_result( $this->finallyResult[$spos], $spos );
  625. }
  626. }
  627. /**
  628. * 对最终分词结果进行优化(把simpleresult结果合并,并尝试新词识别、数词合并等)
  629. * @parem $optimize 是否优化合并的结果
  630. * @return bool
  631. */
  632. //t = 1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符
  633. function _optimize_result( &$smarr, $spos )
  634. {
  635. $newarr = array();
  636. $prePos = $spos - 1;
  637. $arlen = count($smarr);
  638. $i = $j = 0;
  639. //检测数量词
  640. if( $prePos > -1 && !isset($this->finallyResult[$prePos]) )
  641. {
  642. $lastw = $this->simpleResult[$prePos]['w'];
  643. $lastt = $this->simpleResult[$prePos]['t'];
  644. if( ($lastt==4 || isset( $this->addonDic['c'][$lastw] )) && isset( $this->addonDic['u'][$smarr[0]] ) )
  645. {
  646. $this->simpleResult[$prePos]['w'] = $lastw.$smarr[0];
  647. $this->simpleResult[$prePos]['t'] = 4;
  648. if( !isset($this->newWords[ $this->simpleResult[$prePos]['w'] ]) )
  649. {
  650. $this->foundWordStr .= $this->_out_string_encoding( $this->simpleResult[$prePos]['w'] ).'/mu, ';
  651. $this->SetWordInfos($this->simpleResult[$prePos]['w'], array('c'=>1, 'm'=>'mu'));
  652. }
  653. $smarr[0] = '';
  654. $i++;
  655. }
  656. }
  657. for(; $i < $arlen; $i++)
  658. {
  659. if( !isset( $smarr[$i+1] ) )
  660. {
  661. $newarr[$j] = $smarr[$i];
  662. break;
  663. }
  664. $cw = $smarr[$i];
  665. $nw = $smarr[$i+1];
  666. $ischeck = FALSE;
  667. //检测数量词
  668. if( isset( $this->addonDic['c'][$cw] ) && isset( $this->addonDic['u'][$nw] ) )
  669. {
  670. //最大切分时保留合并前的词
  671. if($this->differMax)
  672. {
  673. $newarr[$j] = chr(0).chr(0x28);
  674. $j++;
  675. $newarr[$j] = $cw;
  676. $j++;
  677. $newarr[$j] = $nw;
  678. $j++;
  679. $newarr[$j] = chr(0).chr(0x29);
  680. $j++;
  681. }
  682. $newarr[$j] = $cw.$nw;
  683. if( !isset($this->newWords[$newarr[$j]]) )
  684. {
  685. $this->foundWordStr .= $this->_out_string_encoding( $newarr[$j] ).'/mu, ';
  686. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu'));
  687. }
  688. $j++; $i++; $ischeck = TRUE;
  689. }
  690. //检测前导词(通常是姓)
  691. else if( isset( $this->addonDic['n'][ $smarr[$i] ] ) )
  692. {
  693. $is_rs = FALSE;
  694. //词语是副词或介词或频率很高的词不作为人名
  695. if( strlen($nw)==4 )
  696. {
  697. $winfos = $this->GetWordInfos($nw);
  698. if(isset($winfos['m']) && ($winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
  699. {
  700. $is_rs = TRUE;
  701. }
  702. }
  703. if( !isset($this->addonDic['s'][$nw]) && strlen($nw)<5 && !$is_rs )
  704. {
  705. $newarr[$j] = $cw.$nw;
  706. //echo iconv(UCS2, 'utf-8', $newarr[$j])."<br>";
  707. //尝试检测第三个词
  708. if( strlen($nw)==2 && isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && !isset( $this->addonDic['s'][$smarr[$i+2]] ) )
  709. {
  710. $newarr[$j] .= $smarr[$i+2];
  711. $i++;
  712. }
  713. if( !isset($this->newWords[$newarr[$j]]) )
  714. {
  715. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr'));
  716. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, ';
  717. }
  718. //为了防止错误,保留合并前的姓名
  719. if(strlen($nw)==4)
  720. {
  721. $j++;
  722. $newarr[$j] = chr(0).chr(0x28);
  723. $j++;
  724. $newarr[$j] = $cw;
  725. $j++;
  726. $newarr[$j] = $nw;
  727. $j++;
  728. $newarr[$j] = chr(0).chr(0x29);
  729. }
  730. $j++; $i++; $ischeck = TRUE;
  731. }
  732. }
  733. //检测后缀词(地名等)
  734. else if( isset($this->addonDic['a'][$nw]) )
  735. {
  736. $is_rs = FALSE;
  737. //词语是副词或介词不作为前缀
  738. if( strlen($cw)>2 )
  739. {
  740. $winfos = $this->GetWordInfos($cw);
  741. if(isset($winfos['m']) && ($winfos['m']=='a' || $winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
  742. {
  743. $is_rs = TRUE;
  744. }
  745. }
  746. if( !isset($this->addonDic['s'][$cw]) && !$is_rs )
  747. {
  748. $newarr[$j] = $cw.$nw;
  749. if( !isset($this->newWords[$newarr[$j]]) )
  750. {
  751. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, ';
  752. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na'));
  753. }
  754. $i++; $j++; $ischeck = TRUE;
  755. }
  756. }
  757. //新词识别(暂无规则)
  758. else if($this->unitWord)
  759. {
  760. if(strlen($cw)==2 && strlen($nw)==2
  761. && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw])
  762. && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw]))
  763. {
  764. $newarr[$j] = $cw.$nw;
  765. //尝试检测第三个词
  766. if( isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && (isset( $this->addonDic['a'][$smarr[$i+2]] ) || isset( $this->addonDic['u'][$smarr[$i+2]] )) )
  767. {
  768. $newarr[$j] .= $smarr[$i+2];
  769. $i++;
  770. }
  771. if( !isset($this->newWords[$newarr[$j]]) )
  772. {
  773. $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, ';
  774. $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms'));
  775. }
  776. $i++; $j++; $ischeck = TRUE;
  777. }
  778. }
  779. //不符合规则
  780. if( !$ischeck )
  781. {
  782. $newarr[$j] = $cw;
  783. //二元消岐处理——最大切分模式
  784. if( $this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7)
  785. {
  786. $slen = strlen($nw);
  787. $hasDiff = FALSE;
  788. for($y=2; $y <= $slen-2; $y=$y+2)
  789. {
  790. $nhead = substr($nw, $y-2, 2);
  791. $nfont = $cw.substr($nw, 0, $y-2);
  792. if( $this->IsWord( $nfont.$nhead ) )
  793. {
  794. if( strlen($cw) > 2 ) $j++;
  795. $hasDiff = TRUE;
  796. $newarr[$j] = $nfont.$nhead;
  797. }
  798. }
  799. }
  800. $j++;
  801. }
  802. }//end for
  803. $smarr = $newarr;
  804. }
  805. /**
  806. * 转换最终分词结果到 finallyResult 数组
  807. * @return void
  808. */
  809. function _sort_finally_result()
  810. {
  811. $newarr = array();
  812. $i = 0;
  813. foreach($this->simpleResult as $k=>$v)
  814. {
  815. if( empty($v['w']) ) continue;
  816. if( isset($this->finallyResult[$k]) && count($this->finallyResult[$k]) > 0 )
  817. {
  818. foreach($this->finallyResult[$k] as $w)
  819. {
  820. if(!empty($w))
  821. {
  822. $newarr[$i]['w'] = $w;
  823. $newarr[$i]['t'] = 20;
  824. $i++;
  825. }
  826. }
  827. }
  828. else if($v['t'] != 21)
  829. {
  830. $newarr[$i]['w'] = $v['w'];
  831. $newarr[$i]['t'] = $v['t'];
  832. $i++;
  833. }
  834. }
  835. $this->finallyResult = $newarr;
  836. $newarr = '';
  837. }
  838. /**
  839. * 把uncode字符串转换为输出字符串
  840. * @parem str
  841. * return string
  842. */
  843. function _out_string_encoding( &$str )
  844. {
  845. $rsc = $this->_source_result_charset();
  846. if( $rsc==1 ) {
  847. $rsstr = iconv(UCS2, 'utf-8', $str);
  848. }
  849. else if( $rsc==2 ) {
  850. $rsstr = iconv('utf-8', 'gb18030', iconv(UCS2, 'utf-8', $str) );
  851. }
  852. else{
  853. $rsstr = iconv('utf-8', 'big5', iconv(UCS2, 'utf-8', $str) );
  854. }
  855. return $rsstr;
  856. }
  857. /**
  858. * 获取最终结果字符串(用空格分开后的分词结果)
  859. * @return string
  860. */
  861. function GetFinallyResult($spword=' ', $word_meanings=FALSE)
  862. {
  863. $rsstr = '';
  864. foreach($this->finallyResult as $v)
  865. {
  866. if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) )
  867. {
  868. continue;
  869. }
  870. $m = '';
  871. if( $word_meanings )
  872. {
  873. $m = $this->GetWordProperty($v['w']);
  874. }
  875. $w = $this->_out_string_encoding($v['w']);
  876. if( $w != ' ' )
  877. {
  878. if($word_meanings) {
  879. $rsstr .= $spword.$w.$m;
  880. }
  881. else {
  882. $rsstr .= $spword.$w;
  883. }
  884. }
  885. }
  886. return $rsstr;
  887. }
  888. /**
  889. * 获取粗分结果,不包含粗分属性
  890. * @return array()
  891. */
  892. function GetSimpleResult()
  893. {
  894. $rearr = array();
  895. foreach($this->simpleResult as $k=>$v)
  896. {
  897. if( empty($v['w']) ) continue;
  898. $w = $this->_out_string_encoding($v['w']);
  899. if( $w != ' ' ) $rearr[] = $w;
  900. }
  901. return $rearr;
  902. }
  903. /**
  904. * 获取粗分结果,包含粗分属性(1中文词句、2 ANSI词汇(包括全角),3 ANSI标点符号(包括全角),4数字(包括全角),5 中文标点或无法识别字符)
  905. * @return array()
  906. */
  907. function GetSimpleResultAll()
  908. {
  909. $rearr = array();
  910. foreach($this->simpleResult as $k=>$v)
  911. {
  912. $w = $this->_out_string_encoding($v['w']);
  913. if( $w != ' ' )
  914. {
  915. $rearr[$k]['w'] = $w;
  916. $rearr[$k]['t'] = $v['t'];
  917. }
  918. }
  919. return $rearr;
  920. }
  921. /**
  922. * 获取索引hash数组
  923. * @return array('word'=>count,)
  924. */
  925. function GetFinallyIndex()
  926. {
  927. $rearr = array();
  928. foreach($this->finallyResult as $v)
  929. {
  930. if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) )
  931. {
  932. continue;
  933. }
  934. $w = $this->_out_string_encoding($v['w']);
  935. if( $w == ' ' )
  936. {
  937. continue;
  938. }
  939. if( isset($rearr[$w]) )
  940. {
  941. $rearr[$w]++;
  942. }
  943. else
  944. {
  945. $rearr[$w] = 1;
  946. }
  947. }
  948. return $rearr;
  949. }
  950. /**
  951. * 获得保存目标编码
  952. * @return int
  953. */
  954. function _source_result_charset()
  955. {
  956. if( preg_match("/^utf/", $this->targetCharSet) ) {
  957. $rs = 1;
  958. }
  959. else if( preg_match("/^gb/", $this->targetCharSet) ) {
  960. $rs = 2;
  961. }
  962. else if( preg_match("/^big/", $this->targetCharSet) ) {
  963. $rs = 3;
  964. }
  965. else {
  966. $rs = 4;
  967. }
  968. return $rs;
  969. }
  970. /**
  971. * 编译词典
  972. * @parem $sourcefile utf-8编码的文本词典数据文件<参见范例dict/not-build/base_dic_full.txt>
  973. * 注意, 需要PHP开放足够的内存才能完成操作
  974. * @return void
  975. */
  976. function MakeDict( $source_file, $target_file='' )
  977. {
  978. $target_file = ($target_file=='' ? $this->mainDicFile : $target_file);
  979. $allk = array();
  980. $fp = fopen($source_file, 'r');
  981. while( $line = fgets($fp, 512) )
  982. {
  983. if( $line[0]=='@' ) continue;
  984. list($w, $r, $a) = explode(',', $line);
  985. $a = trim( $a );
  986. $w = iconv('utf-8', UCS2, $w);
  987. $k = $this->_get_index( $w );
  988. if( isset($allk[ $k ]) )
  989. $allk[ $k ][ $w ] = array($r, $a);
  990. else
  991. $allk[ $k ][ $w ] = array($r, $a);
  992. }
  993. fclose( $fp );
  994. $fp = fopen($target_file, 'w');
  995. $heade_rarr = array();
  996. $alldat = '';
  997. $start_pos = $this->mask_value * 8;
  998. foreach( $allk as $k => $v )
  999. {
  1000. $dat = serialize( $v );
  1001. $dlen = strlen($dat);
  1002. $alldat .= $dat;
  1003. $heade_rarr[ $k ][0] = $start_pos;
  1004. $heade_rarr[ $k ][1] = $dlen;
  1005. $heade_rarr[ $k ][2] = count( $v );
  1006. $start_pos += $dlen;
  1007. }
  1008. unset( $allk );
  1009. for($i=0; $i < $this->mask_value; $i++)
  1010. {
  1011. if( !isset($heade_rarr[$i]) )
  1012. {
  1013. $heade_rarr[$i] = array(0, 0, 0);
  1014. }
  1015. fwrite($fp, pack("Inn", $heade_rarr[$i][0], $heade_rarr[$i][1], $heade_rarr[$i][2]));
  1016. }
  1017. fwrite( $fp, $alldat);
  1018. fclose( $fp );
  1019. }
  1020. /**
  1021. * 导出词典的词条
  1022. * @parem $targetfile 保存位置
  1023. * @return void
  1024. */
  1025. function ExportDict( $targetfile )
  1026. {
  1027. if( !$this->mainDicHand )
  1028. {
  1029. $this->mainDicHand = fopen($this->mainDicFile, 'rw');
  1030. }
  1031. $fp = fopen($targetfile, 'w');
  1032. for($i=0; $i <= $this->mask_value; $i++)
  1033. {
  1034. $move_pos = $i * 8;
  1035. fseek($this->mainDicHand, $move_pos, SEEK_SET);
  1036. $dat = fread($this->mainDicHand, 8);
  1037. $arr = unpack('I1s/n1l/n1c', $dat);
  1038. if( $arr['l'] == 0 )
  1039. {
  1040. continue;
  1041. }
  1042. fseek($this->mainDicHand, $arr['s'], SEEK_SET);
  1043. $data = @unserialize(fread($this->mainDicHand, $arr['l']));
  1044. if( !is_array($data) ) continue;
  1045. foreach($data as $k => $v)
  1046. {
  1047. $w = iconv(UCS2, 'utf-8', $k);
  1048. fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
  1049. }
  1050. }
  1051. fclose( $fp );
  1052. return TRUE;
  1053. }
  1054. function InportDict( $targetfile )
  1055. {
  1056. if(!ini_set('memory_limit', '128M'))
  1057. exit('设置内存错误,请到dede官网下载解压版的base_dic_full.dic!');
  1058. require_once(DEDEINC.'/libraries/zip.class.php');
  1059. $zip = new zip();
  1060. //echo $targetfile;
  1061. $unpackagefile = array_keys($zip->Extract($targetfile,DEDEINC.'/data/'));
  1062. //exit();
  1063. $this->MakeDict(DEDEINC.'/data/'.$unpackagefile[0]);
  1064. unlink(DEDEINC.'/data/'.$unpackagefile[0]);
  1065. return true;
  1066. }
  1067. }