StartAnalysis -> Get***Result
 *  4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
 *
 * @version        $id:splitword.class.php 2 11:45 2011-2-14 itplato $
 * @package        DedeBIZ.Libraries
 * @copyright      Copyright (c) 2022 DedeBIZ.COM
 * @license        GNU GPL v2 (https://www.dedebiz.com/license)
 * @link           https://www.dedebiz.com
 */
// Constant definitions.
// _SP_ : two-byte marker (0xFF 0xFE) used as a separator between words inside UCS-2 strings.
// UCS2 : iconv encoding name used for the internal (big-endian UCS-2) string representation.
define('_SP_', "\xFF\xFE");
define('UCS2', 'ucs-2be');
class SplitWord
{
    // Hash option: modulus applied by _get_index() to map a key to a bucket number
    var $mask_value = 0xFFFF;
    // Input and output character encodings (only utf-8, gbk/gb2312/gb18030 and big5 are allowed)
    var $sourceCharSet = 'utf-8';
    var $targetCharSet = 'utf-8';
    // Type of generated result: 1 = everything, 2 = dictionary words plus single CJK characters and English,
    // 3 = dictionary words and English
    var $resultType = 1;
    // Sentences shorter than this value are not split; notSplitLen = n (number of Chinese characters) * 2 + 1
    var $notSplitLen = 5;
    // Convert all English words to lower case
    var $toLower = FALSE;
    // Use maximum-segmentation mode to disambiguate two-character words
    var $differMax = FALSE;
    // Try to merge single characters
    var $unitWord = TRUE;
    // Load the dictionary directly when the class is constructed
    var $loadInit = TRUE;
    // Use frequent-word-first mode for disambiguation
    var $differFreq = FALSE;
    // Source string after conversion to unicode (UCS-2)
    var $sourceString = '';
    // Add-on dictionary (filled by LoadDict() from $addonDicFile)
    var $addonDic = array();
    var $addonDicFile = 'data/words_addons.dic';
    // Main dictionary
    var $dicStr = '';
    var $mainDic = array();
    // File handle of the main dictionary, FALSE while not opened
    var $mainDicHand = FALSE;
    // Cache of dictionary buckets already read from the main dictionary file
    var $mainDicInfos = array();
    var $mainDicFile = 'data/base_dic_full.dic';
    // Whether to load the whole dictionary directly (slower to load but faster to parse; otherwise
    // loading is faster, parsing slower, and entries are only loaded when needed).
    // NOTE(review): this description appears to belong to $isLoadAll, not to $mainDicFileZip - confirm.
    var $mainDicFileZip = 'data/base_dic_full.zip';
    var $isLoadAll = FALSE;
    var $isUnpacked = FALSE;
    // Maximum length of a main-dictionary word (in bytes, i.e. characters = x / 2)
    var $dicWordMax = 14;
    // Array produced by the rough segmentation pass (usually used for cutting sentences and similar)
    var $simpleResult = array();
    // Final result (the original note describes it as a space-separated word list; initialised as an array)
    var $finallyResult = array();
    // Whether the dictionary has already been loaded
    var $isLoadDic = FALSE;
    // New words recognised or merged by the system
    var $newWords = array();
    var $foundWordStr = '';
    // Time spent loading the dictionary
    var $loadTime = 0;
    /**
     * Constructor: stores the source text and, when $loadInit is set, loads the dictionaries.
     *
     * @param $source_charset  encoding of $source
     * @param $target_charset  encoding used for output
     * @param $load_all        value stored in $isLoadAll
     * @param $source          text to analyse (may be empty)
     *
     * @return void
     */
    function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
    {
        $this->SetSource( $source, $source_charset, $target_charset );
        $this->isLoadAll = $load_all;
        // NOTE(review): this probes DEDEINC.'/'.<file> while LoadDict() resolves the same file under
        // DEDEINC.'/libraries/' - confirm which directory actually holds the dictionary.
        $unpackedDicPath = DEDEINC.'/'.$this->mainDicFile;
        if ( file_exists($unpackedDicPath) )
        {
            $this->isUnpacked = TRUE;
        }
        if ( $this->loadInit )
        {
            $this->LoadDict();
        }
    }
    // Method named after the class (old-style constructor name); it simply forwards all
    // arguments to __construct().
    function SplitWord($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
    {
        $this->__construct($source_charset, $target_charset, $load_all, $source);
    }
    /**
     * Destructor: closes the main dictionary file handle if one was opened.
     */
    function __destruct()
    {
        if ( $this->mainDicHand === FALSE )
        {
            return;
        }
        @fclose( $this->mainDicHand );
    }
    /**
     * Compute the bucket index of a key.
     *
     * The key is hashed byte by byte from its last byte to its first; the running
     * hash is kept within 31 bits and finally reduced modulo $mask_value.
     *
     * @param $key  string to hash
     * @return int  bucket index
     */
    function _get_index( $key )
    {
        $hash = 0x238f13af;
        for ($pos = strlen($key) - 1; $pos >= 0; $pos--)
        {
            $hash = (($hash + ($hash << 5)) ^ ord($key[$pos])) & 0x7fffffff;
        }
        return ($hash % $this->mask_value);
    }
    /**
     * Look a word up in the main dictionary file.
     *
     * The file starts with a table of 8-byte index records (one per bucket number
     * returned by _get_index()); each record gives the offset and length of a
     * serialized bucket. Buckets that were read are cached in $mainDicInfos.
     *
     * @param $key   word to look up
     * @param $type  'word' returns the entry stored for $key, any other value
     *               (e.g. 'key_groups') returns the whole bucket containing $key
     * @return mixed the entry or bucket, or FALSE when the word is not present or
     *               the dictionary cannot be read
     */
    function GetWordInfos( $key, $type='word' )
    {
        if ( !$this->mainDicHand )
        {
            $this->mainDicHand = fopen($this->mainDicFile, 'r');
            // Without a usable handle every fseek/fread below would fail, so give up early
            if ( !$this->mainDicHand )
            {
                return FALSE;
            }
        }
        $keynum = (int)$this->_get_index( $key );
        if ( isset($this->mainDicInfos[ $keynum ]) )
        {
            $data = $this->mainDicInfos[ $keynum ];
        } else {
            // Read the index record of this bucket
            fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
            $dat = fread($this->mainDicHand, 8);
            if ( $dat === FALSE || strlen($dat) < 8 )
            {
                // Truncated or unreadable index: treat as "not found"
                return FALSE;
            }
            $arr = unpack('I1s/n1l/n1c', $dat);
            if ( !is_array($arr) || $arr['l'] == 0 )
            {
                return FALSE;
            }
            // Read and decode the bucket itself.
            // NOTE(review): unserialize() is applied to file content; the dictionary file must be trusted.
            fseek($this->mainDicHand, $arr['s'], SEEK_SET);
            $raw = fread($this->mainDicHand, $arr['l']);
            $data = ($raw === FALSE) ? FALSE : @unserialize($raw);
            $this->mainDicInfos[ $keynum ] = $data;
        }
        if ( !is_array($data) || !isset($data[$key]) )
        {
            return FALSE;
        }
        return ($type=='word' ? $data[$key] : $data);
    }
    /**
     * Set the text to analyse and the input/output encodings.
     *
     * The text is converted to the internal UCS-2 representation and stored in
     * $sourceString; previous segmentation results are cleared.
     *
     * @param $source          text to analyse
     * @param $source_charset  encoding of $source (utf-8, gb*, big5; case-insensitive)
     * @param $target_charset  encoding used for output
     *
     * @return bool TRUE when the text was stored, FALSE when $source is empty,
     *              the encoding is not supported or the conversion failed
     */
    function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
    {
        $this->sourceCharSet = strtolower($source_charset);
        $this->targetCharSet = strtolower($target_charset);
        $this->simpleResult = array();
        $this->finallyResult = array();
        if ( $source == '' )
        {
            return FALSE;
        }
        // Match against the lower-cased name so that 'UTF-8', 'GBK', 'Big5' ... are accepted too
        if ( preg_match("/^utf/", $this->sourceCharSet) ) {
            $utf8 = $source;
        }
        else if ( preg_match("/^gb/", $this->sourceCharSet) ) {
            $utf8 = iconv('gb18030', 'utf-8', $source);
        }
        else if ( preg_match("/^big/", $this->sourceCharSet) ) {
            $utf8 = iconv('big5', 'utf-8', $source);
        } else {
            return FALSE;
        }
        $converted = ($utf8 === FALSE) ? FALSE : @iconv('utf-8', UCS2, $utf8);
        if ( $converted === FALSE )
        {
            // Conversion failed: keep $sourceString a string and report the failure
            $this->sourceString = '';
            return FALSE;
        }
        $this->sourceString = $converted;
        return TRUE;
    }
    /**
     * Set the result type (per the original note, only effective when fetching finallyResult).
     * @param $rstype 1 = everything, 2 = strip special symbols
     *
     * @return void
     */
    function SetResultType( $rstype )
    {
        $this->resultType = $rstype;
    }
    /**
     * Load the dictionaries.
     *
     * Resolves the dictionary paths below DEDEINC.'/libraries/', opens the main
     * dictionary file (only when $isUnpacked is set) and reads the add-on
     * dictionary into $addonDic. Safe to call more than once.
     *
     * @param $maindic  optional path of an alternative main dictionary; ignored
     *                  when empty or when the file does not exist
     *
     * @return void
     */
    function LoadDict( $maindic='' )
    {
        // Prefix each path only once, otherwise a second call would corrupt the paths
        $baseDir = DEDEINC.'/libraries/';
        foreach (array('addonDicFile', 'mainDicFile', 'mainDicFileZip') as $prop)
        {
            if ( strpos($this->$prop, $baseDir) !== 0 )
            {
                $this->$prop = $baseDir.$this->$prop;
            }
        }
        $startt = microtime(TRUE);
        if ( $maindic != '' && file_exists($maindic) )
        {
            $this->mainDicFile = $maindic;
        }
        $dicWords = $this->mainDicFile;
        // Main dictionary: only opened here, entries are read on demand
        if ( $this->isUnpacked )
        {
            if ( is_resource($this->mainDicHand) )
            {
                // Do not leak the handle opened by a previous call
                fclose($this->mainDicHand);
            }
            $this->mainDicHand = fopen($dicWords, 'r');
        }
        // Add-on dictionary: "x:" lines start a group, other lines hold comma-separated words
        $hw = '';
        $ds = file($this->addonDicFile);
        if ( !is_array($ds) )
        {
            $ds = array();
        }
        // UTF-8 form of the separator, so a whole line can be converted to UCS-2 in one call
        $spstr = iconv(UCS2, 'utf-8', _SP_);
        foreach($ds as $d)
        {
            $d = trim($d);
            if ($d=='') continue;
            if ( substr($d, 1, 1)==':' )
            {
                $hw = substr($d, 0, 1);
                continue;
            }
            $wall = iconv('utf-8', UCS2, join($spstr, explode(',', $d)));
            if ( $wall === FALSE )
            {
                // Line is not valid UTF-8: skip it instead of registering an empty word
                continue;
            }
            foreach(explode(_SP_, $wall) as $word)
            {
                $this->addonDic[$hw][$word] = strlen($word);
            }
        }
        $this->loadTime = microtime(TRUE) - $startt;
        $this->isLoadDic = TRUE;
    }
    /**
     * Tell whether a word exists in the main dictionary.
     * @param $word  word to look up
     * @return bool  TRUE when GetWordInfos() finds an entry for $word
     */
    function IsWord( $word )
    {
        return ($this->GetWordInfos( $word ) !== FALSE);
    }
    /**
     * Get the part-of-speech / frequency tag of a word.
     * @param $word  unicode-encoded word
     * @return string "/" followed by elements 1 and 0 of the dictionary entry, or "/s"
     *                when the word is shorter than 4 bytes or has no such entry
     */
    function GetWordProperty($word)
    {
        if ( strlen($word) >= 4 )
        {
            $entry = $this->GetWordInfos($word);
            if ( isset($entry[1]) )
            {
                return "/{$entry[1]}{$entry[0]}";
            }
        }
        return "/s";
    }
    /**
     * Register the attributes of a word (usually a newly found word).
     *
     * Words shorter than 4 bytes are ignored. A word that is already registered
     * gets its counters increased, otherwise it is stored with the given $infos.
     *
     * @param $word   unicode-encoded word
     * @param $infos  array('c' => frequency, 'm' => part of speech)
     * @return void
     */
    function SetWordInfos($word, $infos)
    {
        if ( strlen($word)<4 )
        {
            return ;
        }
        if ( isset($this->mainDicInfos[$word]) )
        {
            // Guard both counters: incrementing a missing key raises a warning on current PHP
            $this->newWords[$word] = isset($this->newWords[$word]) ? $this->newWords[$word] + 1 : 1;
            $this->mainDicInfos[$word]['c'] = isset($this->mainDicInfos[$word]['c'])
                ? $this->mainDicInfos[$word]['c'] + 1
                : 1;
        } else {
            $this->newWords[$word] = 1;
            $this->mainDicInfos[$word] = $infos;
        }
    }
    /**
     * Run the analysis on $sourceString.
     *
     * The string is first cut into rough tokens (stored in $simpleResult with a
     * type code), each CJK / ANSI-word token is handed to _deep_analysis(), and
     * finally _sort_finally_result() post-processes the result.
     *
     * Rough token types written to $simpleResult[..]['t']:
     *   1 CJK text, 2 ANSI word (letters, digits, '.', '@', '#', '+', ...),
     *   3 ANSI symbol, 4 pure number, 5 other symbol,
     *   13 book title found between U+300A and U+300B, 21 book title queued for further splitting.
     *
     * @param bool $optimize  forwarded to _deep_analysis()
     * @return void
     */
    function StartAnalysis($optimize=TRUE)
    {
        if ( !$this->isLoadDic )
        {
            $this->LoadDict();
        }
        $this->simpleResult = $this->finallyResult = array();
        // Trailing UCS-2 space so that the last pending token is flushed by the loop below
        $this->sourceString .= chr(0).chr(32);
        $slen = strlen($this->sourceString);
        $sbcArr = array();
        $j = 0;
        // Full-width to half-width mapping table (U+FF00.. -> 0x20..)
        for($i=0xFF00; $i < 0xFF5F; $i++)
        {
            $scb = 0x20 + $j;
            $j++;
            $sbcArr[$i] = $scb;
        }
        // Rough segmentation of the string
        $onstr = '';
        $lastc = 1; // type of the pending token, see the list in the docblock
        $s = 0;
        $ansiWordMatch = "[0-9a-z@#%\+\.-]";
        $notNumberMatch = "[a-z@#%\+]";
        // Characters are two bytes wide; stop before a dangling last byte
        for($i=0; $i + 1 < $slen; $i++)
        {
            $c = $this->sourceString[$i].$this->sourceString[++$i];
            $cn = hexdec(bin2hex($c));
            $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
            // ANSI character
            if ($cn < 0x80)
            {
                if ( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) )
                {
                    // Character belongs to an ANSI word: flush a pending token of another type
                    if ( $lastc != 2 && $onstr != '') {
                        $this->simpleResult[$s]['w'] = $onstr;
                        $this->simpleResult[$s]['t'] = $lastc;
                        $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                        $onstr = '';
                    }
                    $lastc = 2;
                    $onstr .= chr(0).chr($cn);
                } else {
                    // ANSI symbol: flush the pending token
                    if ( $onstr != '' )
                    {
                        $this->simpleResult[$s]['w'] = $onstr;
                        if ( $lastc==2 )
                        {
                            // An ANSI word without letters or '@#%+' is a pure number
                            if ( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                        }
                        $this->simpleResult[$s]['t'] = $lastc;
                        if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                    }
                    $onstr = '';
                    $lastc = 3;
                    // Control characters are dropped, other symbols become their own token
                    if ($cn < 31)
                    {
                        continue;
                    } else {
                        $this->simpleResult[$s]['w'] = chr(0).chr($cn);
                        $this->simpleResult[$s]['t'] = 3;
                        $s++;
                    }
                }
            }
            // Non-ANSI character
            else
            {
                // Regular text (code-point ranges accepted as CJK text)
                if ( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)
                    || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) )
                {
                    if ( $lastc != 1 && $onstr != '')
                    {
                        $this->simpleResult[$s]['w'] = $onstr;
                        if ( $lastc==2 )
                        {
                            if ( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                        }
                        $this->simpleResult[$s]['t'] = $lastc;
                        if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                        $onstr = '';
                    }
                    $lastc = 1;
                    $onstr .= $c;
                }
                // Special symbol
                else
                {
                    if ( $onstr != '' )
                    {
                        $this->simpleResult[$s]['w'] = $onstr;
                        if ( $lastc==2 )
                        {
                            if ( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                        }
                        $this->simpleResult[$s]['t'] = $lastc;
                        if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                        $s++;
                    }
                    // Book title detection: text between U+300A and U+300B
                    if ( $cn == 0x300A )
                    {
                        $tmpw = '';
                        $n = 1;
                        $isok = FALSE;
                        $ew = chr(0x30).chr(0x0B);
                        while(TRUE)
                        {
                            // Both bytes of the next character must exist
                            if ( !isset($this->sourceString[$i+$n]) || !isset($this->sourceString[$i+$n+1]) )
                            {
                                break;
                            }
                            $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];
                            if ( $w == $ew )
                            {
                                $this->simpleResult[$s]['w'] = $c;
                                $this->simpleResult[$s]['t'] = 5;
                                $s++;

                                $this->simpleResult[$s]['w'] = $tmpw;
                                // Decide whether the title is new BEFORE marking it as known,
                                // otherwise the registration below can never run
                                $isNewTitle = ( $tmpw !== '' && !isset($this->newWords[$tmpw]) );
                                $this->newWords[$tmpw] = 1;
                                if ( $isNewTitle )
                                {
                                    $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
                                    $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
                                }
                                $this->simpleResult[$s]['t'] = 13;
                                $s++;
                                // In maximum-segmentation mode the title is split further
                                if ( $this->differMax )
                                {
                                    $this->simpleResult[$s]['w'] = $tmpw;
                                    $this->simpleResult[$s]['t'] = 21;
                                    $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                                    $s++;
                                }
                                $this->simpleResult[$s]['w'] = $ew;
                                $this->simpleResult[$s]['t'] =  5;
                                $s++;
                                // Continue after the closing mark
                                $i = $i + $n + 1;
                                $isok = TRUE;
                                $onstr = '';
                                $lastc = 5;
                                break;
                            } else {
                                $n = $n+2;
                                $tmpw .= $w;
                                // Give up when no closing mark shows up within 60 bytes
                                if ( strlen($tmpw) > 60 )
                                {
                                    break;
                                }
                            }
                        }//while
                        if ( !$isok )
                        {
                            // No closing mark: the opening mark is an ordinary symbol
                            $this->simpleResult[$s]['w'] = $c;
                            $this->simpleResult[$s]['t'] = 5;
                            $s++;
                            $onstr = '';
                            $lastc = 5;
                        }
                        continue;
                    }
                    $onstr = '';
                    $lastc = 5;
                    // U+3000 (ideographic space) is dropped, other symbols become their own token
                    if ( $cn==0x3000 )
                    {
                        continue;
                    } else {
                        $this->simpleResult[$s]['w'] = $c;
                        $this->simpleResult[$s]['t'] = 5;
                        $s++;
                    }
                }
            }
        }
        // Post-process the segmentation result
        $this->_sort_finally_result();
    }
    /**
     * Fine segmentation of one rough token.
     *
     * Tokens whose type is not 1 are stored unchanged (lower-cased when $toLower
     * is set). Type 1 (CJK) tokens shorter than both $notSplitLen and 5 bytes are
     * kept whole or merged into a preceding pure-number token; longer ones are
     * passed to _deep_analysis_cn().
     *
     * @param $str       token text (by reference; may be shortened when it is merged)
     * @param $ctype     rough token type (1 = CJK text, anything else is stored as-is)
     * @param $spos      index of the token in $simpleResult
     * @param $optimize  forwarded to _deep_analysis_cn()
     * @return void
     */
    function _deep_analysis( &$str, $ctype, $spos, $optimize=TRUE )
    {
        // Non-CJK token: store it, optionally lower-cased
        if ( $ctype != 1 )
        {
            $this->finallyResult[$spos][] = $this->toLower ? strtolower($str) : $str;
            return;
        }
        $slen = strlen($str);
        // Long enough to be split by the dictionary-based pass
        if ( $slen >= $this->notSplitLen || $slen >= 5 )
        {
            $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
            return;
        }
        $lastType = ( $spos > 0 ) ? $this->simpleResult[$spos-1]['t'] : 0;
        // Previous token is a pure number and this one (or its first character) is listed in
        // group 'u' of the add-on dictionary: append it to the number
        if ( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) )
        {
            $str2 = '';
            // Only the first character matched: split off the second one when it is listed in group 's'
            if ( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) )
            {
                $str2 = substr($str, 2, 2);
                $str  = substr($str, 0, 2);
            }
            $ww = $this->simpleResult[$spos - 1]['w'].$str;
            $this->simpleResult[$spos - 1]['w'] = $ww;
            $this->simpleResult[$spos - 1]['t'] = 4;
            if ( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) )
            {
                $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';
                $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
            }
            // The current rough token has been absorbed by the previous one
            $this->simpleResult[$spos]['w'] = '';
            if ( $str2 != '' )
            {
                $this->finallyResult[$spos-1][] = $ww;
                $this->finallyResult[$spos-1][] = $str2;
            }
        } else {
            $this->finallyResult[$spos][] = $str;
        }
    }
    /**
     * 中文的深入分词
     * @parem $str
     * @return void
     */
    function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=TRUE )
    {
        $quote1 = chr(0x20).chr(0x1C);
        $tmparr = array();
        $hasw = 0;
        //如果前一个词为“,并且字符串小于3个字符当成一个词处理
        if ( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 )
        {
            $tmparr[] = $str;
            if ( !isset($this->newWords[$str]) )
            {
                $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
                $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
            }
            if ( !$this->differMax )
            {
                $this->finallyResult[$spos][] = $str;
                return ;
            }
        }
        //进行切分
        for($i=$slen-1; $i > 0; $i -= 2)
        {
            //单个词
            $nc = $str[$i-1].$str[$i];
            //是否已经到最后两个字
            if ( $i <= 2 )
            {
                $tmparr[] = $nc;
                $i = 0;
                break;
            }
            $isok = FALSE;
            $i = $i + 1;
            for($k=$this->dicWordMax; $k>1; $k=$k-2)
            {
                if ($i < $k) continue;
                $w = substr($str, $i-$k, $k);
                if ( strlen($w) <= 2 )
                {
                    $i = $i - 1;
                    break;
                }
                if ( $this->IsWord( $w ) )
                {
                    $tmparr[] = $w;
                    $i = $i - $k + 1;
                    $isok = TRUE;
                    break;
                }
            }
            //echo '