Home » lucene-3.0.1-src » org.apache.lucene.analysis.cn.smart.hhmm » [javadoc | source]

    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.lucene.analysis.cn.smart.hhmm;
   19   
   20   import java.util.ArrayList;
   21   import java.util.Collection;
   22   import java.util.HashMap;
   23   import java.util.List;
   24   import java.util.Map;
   25   
   26   import org.apache.lucene.analysis.cn.smart.Utility;
   27   
   28   /**
   29    * Graph representing possible token pairs (bigrams) at each start offset in the sentence.
   30    * <p>
   31    * For each start offset, a list of possible token pairs is stored.
   32    * </p>
   33    * <p><font color="#FF0000">
   34    * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
   35    * The APIs and file formats introduced here might change in the future and will not be 
   36    * supported anymore in such a case.</font>
   37    * </p>
   38    */
   39   class BiSegGraph {
   40   
   41     private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<Integer,ArrayList<SegTokenPair>>();
   42   
   43     private List<SegToken> segTokenList;
   44   
   45     private static BigramDictionary bigramDict = BigramDictionary.getInstance();
   46   
   47     public BiSegGraph(SegGraph segGraph) {
   48       segTokenList = segGraph.makeIndex();
   49       generateBiSegGraph(segGraph);
   50     }
   51   
   52     /*
   53      * Generate a BiSegGraph based upon a SegGraph
   54      */
   55     private void generateBiSegGraph(SegGraph segGraph) {
   56       double smooth = 0.1;
   57       int wordPairFreq = 0;
   58       int maxStart = segGraph.getMaxStart();
   59       double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
   60   
   61       int next;
   62       char[] idBuffer;
   63       // get the list of tokens ordered and indexed
   64       segTokenList = segGraph.makeIndex();
   65       // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
   66       int key = -1;
   67       List<SegToken> nextTokens = null;
   68       while (key < maxStart) {
   69         if (segGraph.isStartExist(key)) {
   70   
   71           List<SegToken> tokenList = segGraph.getStartList(key);
   72   
   73           // Calculate all tokens for a given key.
   74           for (SegToken t1 : tokenList) {
   75             oneWordFreq = t1.weight;
   76             next = t1.endOffset;
   77             nextTokens = null;
   78             // Find the next corresponding Token.
   79             // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
   80             // If we cannot find the next Token, then go to the end and repeat the same cycle.
   81             while (next <= maxStart) {
   82               // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
   83               if (segGraph.isStartExist(next)) {
   84                 nextTokens = segGraph.getStartList(next);
   85                 break;
   86               }
   87               next++;
   88             }
   89             if (nextTokens == null) {
   90               break;
   91             }
   92             for (SegToken t2 : nextTokens) {
   93               idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
   94               System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
   95               idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
   96               System.arraycopy(t2.charArray, 0, idBuffer,
   97                   t1.charArray.length + 1, t2.charArray.length);
   98   
   99               // Two linked Words frequency
  100               wordPairFreq = bigramDict.getFrequency(idBuffer);
  101   
  102               // Smoothing
  103   
  104               // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
  105               weight = -Math
  106                   .log(smooth
  107                       * (1.0 + oneWordFreq)
  108                       / (Utility.MAX_FREQUENCE + 0.0)
  109                       + (1.0 - smooth)
  110                       * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
  111   
  112               SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
  113                   t2.index, weight);
  114               this.addSegTokenPair(tokenPair);
  115             }
  116           }
  117         }
  118         key++;
  119       }
  120   
  121     }
  122   
  123     /**
  124      * Returns true if their is a list of token pairs at this offset (index of the second token)
  125      * 
  126      * @param to index of the second token in the token pair
  127      * @return true if a token pair exists
  128      */
  129     public boolean isToExist(int to) {
  130       return tokenPairListTable.get(Integer.valueOf(to)) != null;
  131     }
  132   
  133     /**
  134      * Return a {@link List} of all token pairs at this offset (index of the second token)
  135      * 
  136      * @param to index of the second token in the token pair
  137      * @return {@link List} of token pairs.
  138      */
  139     public List<SegTokenPair> getToList(int to) {
  140       return tokenPairListTable.get(to);
  141     }
  142   
  143     /**
  144      * Add a {@link SegTokenPair}
  145      * 
  146      * @param tokenPair {@link SegTokenPair}
  147      */
  148     public void addSegTokenPair(SegTokenPair tokenPair) {
  149       int to = tokenPair.to;
  150       if (!isToExist(to)) {
  151         ArrayList<SegTokenPair> newlist = new ArrayList<SegTokenPair>();
  152         newlist.add(tokenPair);
  153         tokenPairListTable.put(to, newlist);
  154       } else {
  155         List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
  156         tokenPairList.add(tokenPair);
  157       }
  158     }
  159   
  160     /**
  161      * Get the number of {@link SegTokenPair} entries in the table.
  162      * @return number of {@link SegTokenPair} entries
  163      */
  164     public int getToCount() {
  165       return tokenPairListTable.size();
  166     }
  167   
  168     /**
  169      * Find the shortest path with the Viterbi algorithm.
  170      * @return {@link List}
  171      */
  172     public List<SegToken> getShortPath() {
  173       int current;
  174       int nodeCount = getToCount();
  175       List<PathNode> path = new ArrayList<PathNode>();
  176       PathNode zeroPath = new PathNode();
  177       zeroPath.weight = 0;
  178       zeroPath.preNode = 0;
  179       path.add(zeroPath);
  180       for (current = 1; current <= nodeCount; current++) {
  181         double weight;
  182         List<SegTokenPair> edges = getToList(current);
  183   
  184         double minWeight = Double.MAX_VALUE;
  185         SegTokenPair minEdge = null;
  186         for (SegTokenPair edge : edges) {
  187           weight = edge.weight;
  188           PathNode preNode = path.get(edge.from);
  189           if (preNode.weight + weight < minWeight) {
  190             minWeight = preNode.weight + weight;
  191             minEdge = edge;
  192           }
  193         }
  194         PathNode newNode = new PathNode();
  195         newNode.weight = minWeight;
  196         newNode.preNode = minEdge.from;
  197         path.add(newNode);
  198       }
  199   
  200       // Calculate PathNodes
  201       int preNode, lastNode;
  202       lastNode = path.size() - 1;
  203       current = lastNode;
  204       List<Integer> rpath = new ArrayList<Integer>();
  205       List<SegToken> resultPath = new ArrayList<SegToken>();
  206   
  207       rpath.add(current);
  208       while (current != 0) {
  209         PathNode currentPathNode = path.get(current);
  210         preNode = currentPathNode.preNode;
  211         rpath.add(Integer.valueOf(preNode));
  212         current = preNode;
  213       }
  214       for (int j = rpath.size() - 1; j >= 0; j--) {
  215         Integer idInteger = (Integer) rpath.get(j);
  216         int id = idInteger.intValue();
  217         SegToken t = segTokenList.get(id);
  218         resultPath.add(t);
  219       }
  220       return resultPath;
  221   
  222     }
  223   
  224     @Override
  225     public String toString() {
  226       StringBuilder sb = new StringBuilder();
  227       Collection<ArrayList<SegTokenPair>>  values = tokenPairListTable.values();
  228       for (ArrayList<SegTokenPair> segList : values) {
  229         for (SegTokenPair pair : segList) {
  230           sb.append(pair + "\n");
  231         }
  232       }
  233       return sb.toString();
  234     }
  235   
  236   }

Home » lucene-3.0.1-src » org.apache.lucene.analysis.cn.smart.hhmm » [javadoc | source]