Home » lucene-3.0.1-src » org.apache.lucene.analysis.cn.smart.hhmm » [javadoc | source]

    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.lucene.analysis.cn.smart.hhmm;
   19   
   20   import java.util.List;
   21   
   22   import org.apache.lucene.analysis.cn.smart.CharType;
   23   import org.apache.lucene.analysis.cn.smart.Utility;
   24   import org.apache.lucene.analysis.cn.smart.WordType;
   25   import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
   26   
   27   /**
   28    * Finds the optimal segmentation of a sentence into Chinese words
   29    * <p><font color="#FF0000">
   30    * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
   31    * The APIs and file formats introduced here might change in the future and will not be 
   32    * supported anymore in such a case.</font>
   33    * </p>
   34    */
   35   public class HHMMSegmenter {
   36   
   37     private static WordDictionary wordDict = WordDictionary.getInstance();
   38   
   39     /**
   40      * Create the {@link SegGraph} for a sentence.
   41      * 
   42      * @param sentence input sentence, without start and end markers
   43      * @return {@link SegGraph} corresponding to the input sentence.
   44      */
   45     private SegGraph createSegGraph(String sentence) {
   46       int i = 0, j;
   47       int length = sentence.length();
   48       int foundIndex;
   49       int[] charTypeArray = getCharTypes(sentence);
   50       StringBuilder wordBuf = new StringBuilder();
   51       SegToken token;
   52       int frequency = 0; // the number of times word appears.
   53       boolean hasFullWidth;
   54       int wordType;
   55       char[] charArray;
   56   
   57       SegGraph segGraph = new SegGraph();
   58       while (i < length) {
   59         hasFullWidth = false;
   60         switch (charTypeArray[i]) {
   61           case CharType.SPACE_LIKE:
   62             i++;
   63             break;
   64           case CharType.HANZI:
   65             j = i + 1;
   66             wordBuf.delete(0, wordBuf.length());
   67             // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, 
   68             // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will 
   69             // cause word division.
   70             wordBuf.append(sentence.charAt(i));
   71             charArray = new char[] { sentence.charAt(i) };
   72             frequency = wordDict.getFrequency(charArray);
   73             token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
   74                 frequency);
   75             segGraph.addToken(token);
   76   
   77             foundIndex = wordDict.getPrefixMatch(charArray);
   78             while (j <= length && foundIndex != -1) {
   79               if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
   80                 // It is the phrase we are looking for; In other words, we have found a phrase SegToken
   81                 // from i to j.  It is not a monosyllabic word (single word).
   82                 frequency = wordDict.getFrequency(charArray);
   83                 token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
   84                     frequency);
   85                 segGraph.addToken(token);
   86               }
   87   
   88               while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
   89                 j++;
   90   
   91               if (j < length && charTypeArray[j] == CharType.HANZI) {
   92                 wordBuf.append(sentence.charAt(j));
   93                 charArray = new char[wordBuf.length()];
   94                 wordBuf.getChars(0, charArray.length, charArray, 0);
   95                 // idArray has been found (foundWordIndex!=-1) as a prefix before.  
   96                 // Therefore, idArray after it has been lengthened can only appear after foundWordIndex.  
   97                 // So start searching after foundWordIndex.
   98                 foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
   99                 j++;
  100               } else {
  101                 break;
  102               }
  103             }
  104             i++;
  105             break;
  106           case CharType.FULLWIDTH_LETTER:
  107             hasFullWidth = true;
  108           case CharType.LETTER:
  109             j = i + 1;
  110             while (j < length
  111                 && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
  112               if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
  113                 hasFullWidth = true;
  114               j++;
  115             }
  116             // Found a Token from i to j. Type is LETTER char string.
  117             charArray = Utility.STRING_CHAR_ARRAY;
  118             frequency = wordDict.getFrequency(charArray);
  119             wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
  120             token = new SegToken(charArray, i, j, wordType, frequency);
  121             segGraph.addToken(token);
  122             i = j;
  123             break;
  124           case CharType.FULLWIDTH_DIGIT:
  125             hasFullWidth = true;
  126           case CharType.DIGIT:
  127             j = i + 1;
  128             while (j < length
  129                 && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
  130               if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
  131                 hasFullWidth = true;
  132               j++;
  133             }
  134             // Found a Token from i to j. Type is NUMBER char string.
  135             charArray = Utility.NUMBER_CHAR_ARRAY;
  136             frequency = wordDict.getFrequency(charArray);
  137             wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
  138             token = new SegToken(charArray, i, j, wordType, frequency);
  139             segGraph.addToken(token);
  140             i = j;
  141             break;
  142           case CharType.DELIMITER:
  143             j = i + 1;
  144             // No need to search the weight for the punctuation.  Picking the highest frequency will work.
  145             frequency = Utility.MAX_FREQUENCE;
  146             charArray = new char[] { sentence.charAt(i) };
  147             token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
  148             segGraph.addToken(token);
  149             i = j;
  150             break;
  151           default:
  152             j = i + 1;
  153             // Treat the unrecognized char symbol as unknown string.
  154             // For example, any symbol not in GB2312 is treated as one of these.
  155             charArray = Utility.STRING_CHAR_ARRAY;
  156             frequency = wordDict.getFrequency(charArray);
  157             token = new SegToken(charArray, i, j, WordType.STRING, frequency);
  158             segGraph.addToken(token);
  159             i = j;
  160             break;
  161         }
  162       }
  163   
  164       // Add two more Tokens: "beginning xx beginning"
  165       charArray = Utility.START_CHAR_ARRAY;
  166       frequency = wordDict.getFrequency(charArray);
  167       token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
  168       segGraph.addToken(token);
  169   
  170       // "end xx end"
  171       charArray = Utility.END_CHAR_ARRAY;
  172       frequency = wordDict.getFrequency(charArray);
  173       token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
  174           frequency);
  175       segGraph.addToken(token);
  176   
  177       return segGraph;
  178     }
  179   
  180     /**
  181      * Get the character types for every character in a sentence.
  182      * 
  183      * @see Utility#getCharType(char)
  184      * @param sentence input sentence
  185      * @return array of character types corresponding to character positions in the sentence
  186      */
  187     private static int[] getCharTypes(String sentence) {
  188       int length = sentence.length();
  189       int[] charTypeArray = new int[length];
  190       // the type of each character by position
  191       for (int i = 0; i < length; i++) {
  192         charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
  193       }
  194   
  195       return charTypeArray;
  196     }
  197   
  198     /**
  199      * Return a list of {@link SegToken} representing the best segmentation of a sentence
  200      * @param sentence input sentence
  201      * @return best segmentation as a {@link List}
  202      */
  203     public List<SegToken> process(String sentence) {
  204       SegGraph segGraph = createSegGraph(sentence);
  205       BiSegGraph biSegGraph = new BiSegGraph(segGraph);
  206       List<SegToken> shortPath = biSegGraph.getShortPath();
  207       return shortPath;
  208     }
  209   }

Home » lucene-3.0.1-src » org.apache.lucene.analysis.cn.smart.hhmm » [javadoc | source]