
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;

/**
 * SmartChineseAnalyzer Word Dictionary
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
 * The APIs and file formats introduced here might change in the future and will not be
 * supported anymore in such a case.</font>
 * </p>
 */
class WordDictionary extends AbstractDictionary {

  private WordDictionary() {
  }

  private static WordDictionary singleInstance;

  /**
   * Large prime number for hash function
   */
  public static final int PRIME_INDEX_LENGTH = 12071;

  /**
   * wordIndexTable hashes all Chinese characters in Unicode into an array of
   * length PRIME_INDEX_LENGTH. Collisions are possible, but in practice this
   * program only handles the 6768 characters found in GB2312 plus some ASCII
   * characters, so to keep lookups unambiguous the original character is also
   * stored in charIndexTable.
   */
  private short[] wordIndexTable;

  private char[] charIndexTable;

  /**
   * To save space, the lexicon is stored in two parallel multidimensional
   * arrays, one for words and one for frequencies. Each word is stored as a
   * char[], where each char is a Chinese character or other symbol, and each
   * frequency is stored as an int. The two arrays correspond one-to-one, so
   * wordItem_charArrayTable[i][j] looks up a word in the lexicon and
   * wordItem_frequencyTable[i][j] looks up its frequency.
   */
  private char[][][] wordItem_charArrayTable;

  private int[][] wordItem_frequencyTable;
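
  /*
   * Illustration (added for clarity, not part of the original source): as the
   * lookups in findInTable and isEqual below show, each stored entry omits the
   * word's leading character, which is implied by the chart index. For example,
   * if chart i belongs to the character '中', the tables might conceptually hold
   *
   *   wordItem_charArrayTable[i][0] = {'国'}        wordItem_frequencyTable[i][0] = 1620
   *   wordItem_charArrayTable[i][1] = {'国', '人'}  wordItem_frequencyTable[i][1] = 127
   *
   * The concrete frequency values above are invented for illustration only.
   */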

  // static Logger log = Logger.getLogger(WordDictionary.class);

  /**
   * Get the singleton dictionary instance.
   * @return singleton
   */
  public synchronized static WordDictionary getInstance() {
    if (singleInstance == null) {
      singleInstance = new WordDictionary();
      try {
        singleInstance.load();
      } catch (IOException e) {
        String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
        singleInstance.load(wordDictRoot);
      } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
      }
    }
    return singleInstance;
  }
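
  /*
   * Usage sketch (illustrative, not part of the original source): callers in
   * this package, such as the HHMM segmenter, obtain the shared instance and
   * query it with the char[] of a candidate word:
   *
   *   WordDictionary dict = WordDictionary.getInstance();
   *   int freq = dict.getFrequency("中国".toCharArray());
   *   int firstMatch = dict.getPrefixMatch("中国".toCharArray());
   *
   * The literal "中国" is only an example; any word present in coredict works
   * the same way.
   */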

  /**
   * Attempt to load the dictionary from the provided directory, first trying
   * coredict.mem and falling back on coredict.dct
   *
   * @param dctFileRoot path to dictionary directory
   */
  public void load(String dctFileRoot) {
    String dctFilePath = dctFileRoot + "/coredict.dct";
    File serialObj = new File(dctFileRoot + "/coredict.mem");

    // If a serialized dictionary exists and loads cleanly, we are done;
    // otherwise parse the raw .dct file and serialize it for next time.
    if (serialObj.exists() && loadFromObj(serialObj)) {

    } else {
      try {
        wordIndexTable = new short[PRIME_INDEX_LENGTH];
        charIndexTable = new char[PRIME_INDEX_LENGTH];
        for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
          charIndexTable[i] = 0;
          wordIndexTable[i] = -1;
        }
        wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
        wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
        // int total =
        loadMainDataFromFile(dctFilePath);
        expandDelimiterData();
        mergeSameWords();
        sortEachItems();
        // log.info("load dictionary: " + dctFilePath + " total:" + total);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

      saveToObj(serialObj);
    }

  }

  /**
   * Load coredict.mem internally from the jar file.
   *
   * @throws ClassNotFoundException
   * @throws IOException
   */
  public void load() throws IOException, ClassNotFoundException {
    InputStream input = this.getClass().getResourceAsStream("coredict.mem");
    loadFromObjectInputStream(input);
  }

  private boolean loadFromObj(File serialObj) {
    try {
      loadFromObjectInputStream(new FileInputStream(serialObj));
      return true;
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    }
    return false;
  }

  private void loadFromObjectInputStream(InputStream serialObjectInputStream)
      throws IOException, ClassNotFoundException {
    ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
    wordIndexTable = (short[]) input.readObject();
    charIndexTable = (char[]) input.readObject();
    wordItem_charArrayTable = (char[][][]) input.readObject();
    wordItem_frequencyTable = (int[][]) input.readObject();
    // log.info("load core dict from serialization.");
    input.close();
  }
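
  /*
   * Note (added for clarity): coredict.mem is simply a Java object-serialization
   * stream containing the four tables above, written in a fixed order by
   * saveToObj(File): wordIndexTable, charIndexTable, wordItem_charArrayTable,
   * wordItem_frequencyTable. loadFromObjectInputStream must read them back in
   * exactly that order.
   */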

  private void saveToObj(File serialObj) {
    try {
      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
          serialObj));
      output.writeObject(wordIndexTable);
      output.writeObject(charIndexTable);
      output.writeObject(wordItem_charArrayTable);
      output.writeObject(wordItem_frequencyTable);
      output.close();
      // log.info("serialize core dict.");
    } catch (Exception e) {
      // log.warn(e.getMessage());
    }
  }

  /**
   * Load the datafile into this WordDictionary
   *
   * @param dctFilePath path to word dictionary (coredict.dct)
   * @return number of words read
   * @throws FileNotFoundException
   * @throws IOException
   * @throws UnsupportedEncodingException
   */
  private int loadMainDataFromFile(String dctFilePath)
      throws FileNotFoundException, IOException, UnsupportedEncodingException {
    int i, cnt, length, total = 0;
    // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
    // The 3756th is used (as a header) to store information.
    int[] buffer = new int[3];
    byte[] intBuffer = new byte[4];
    String tmpword;
    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");

    // GB2312 characters 0 - 6768
    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      // if (i == 5231)
      // System.out.println(i);

      dctFile.read(intBuffer);
      // the dictionary was developed for C, and byte order must be converted to work with Java
      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
      if (cnt <= 0) {
        wordItem_charArrayTable[i] = null;
        wordItem_frequencyTable[i] = null;
        continue;
      }
      wordItem_charArrayTable[i] = new char[cnt][];
      wordItem_frequencyTable[i] = new int[cnt];
      total += cnt;
      int j = 0;
      while (j < cnt) {
        // wordItemTable[i][j] = new WordItem();
        dctFile.read(intBuffer);
        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// frequency
        dctFile.read(intBuffer);
        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// length
        dctFile.read(intBuffer);
        buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// handle

        // wordItemTable[i][j].frequency = buffer[0];
        wordItem_frequencyTable[i][j] = buffer[0];

        length = buffer[1];
        if (length > 0) {
          byte[] lchBuffer = new byte[length];
          dctFile.read(lchBuffer);
          tmpword = new String(lchBuffer, "GB2312");
          // indexTable[i].wordItems[j].word = tmpword;
          // wordItemTable[i][j].charArray = tmpword.toCharArray();
          wordItem_charArrayTable[i][j] = tmpword.toCharArray();
        } else {
          // wordItemTable[i][j].charArray = null;
          wordItem_charArrayTable[i][j] = null;
        }
        // System.out.println(indexTable[i].wordItems[j]);
        j++;
      }

      String str = getCCByGB2312Id(i);
      setTableIndex(str.charAt(0), i);
    }
    dctFile.close();
    return total;
  }
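
  /*
   * Record layout read above (reconstructed from the code, for reference): for
   * each of the CHAR_NUM_IN_FILE charts the file stores a little-endian int32
   * count, followed by count records of
   *
   *   int32 frequency | int32 length | int32 handle | length bytes (GB2312-encoded word)
   *
   * The handle field is read into buffer[2] but not used here.
   */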

  /**
   * In the original lexicon, all words that begin with a punctuation symbol are
   * stored together in a single chart (offset 3755). Here that data is expanded:
   * each entry is moved into the chart of its actual leading symbol.
   */
  private void expandDelimiterData() {
    int i;
    int cnt;
    // All entries whose first character is punctuation were stored under the
    // chart at offset 3755; distribute them into the charts of their actual
    // leading punctuation characters.
    int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
    i = 0;
    while (i < wordItem_charArrayTable[delimiterIndex].length) {
      char c = wordItem_charArrayTable[delimiterIndex][i][0];
      int j = getGB2312Id(c);// the id value of the punctuation
      if (wordItem_charArrayTable[j] == null) {

        int k = i;
        // Starting from i, count how many consecutive entries begin with the
        // same punctuation character c.
        while (k < wordItem_charArrayTable[delimiterIndex].length
            && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
          k++;
        }
        // c is the punctuation character, j is the id value of c;
        // k - 1 is the index of the last entry beginning with c.
        cnt = k - i;
        if (cnt != 0) {
          wordItem_charArrayTable[j] = new char[cnt][];
          wordItem_frequencyTable[j] = new int[cnt];
        }

        // Assign value for each wordItem, dropping the leading punctuation
        // character (it is implied by the chart index j).
        for (k = 0; k < cnt; k++, i++) {
          // wordItemTable[j][k] = new WordItem();
          wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
          wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
          System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
              wordItem_charArrayTable[j][k], 0,
              wordItem_charArrayTable[j][k].length);
        }
        setTableIndex(c, j);
      }
    }
    // Delete the original corresponding symbol array.
    wordItem_charArrayTable[delimiterIndex] = null;
    wordItem_frequencyTable[delimiterIndex] = null;
  }
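
  /*
   * Example of the expansion above (symbols chosen for illustration): an entry
   * in the delimiter chart whose chars are {'，', c1, c2} with frequency f is
   * moved to the chart at index getGB2312Id('，') and stored there as {c1, c2}
   * with the same frequency f.
   */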

  /*
   * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
   */
  private void mergeSameWords() {
    int i;
    for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      if (wordItem_charArrayTable[i] == null)
        continue;
      int len = 1;
      for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
        if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
            wordItem_charArrayTable[i][j - 1], 0) != 0)
          len++;
      }
      if (len < wordItem_charArrayTable[i].length) {
        char[][] tempArray = new char[len][];
        int[] tempFreq = new int[len];
        int k = 0;
        tempArray[0] = wordItem_charArrayTable[i][0];
        tempFreq[0] = wordItem_frequencyTable[i][0];
        for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
          if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
              tempArray[k], 0) != 0) {
            k++;
            // temp[k] = wordItemTable[i][j];
            tempArray[k] = wordItem_charArrayTable[i][j];
            tempFreq[k] = wordItem_frequencyTable[i][j];
          } else {
            // temp[k].frequency += wordItemTable[i][j].frequency;
            tempFreq[k] += wordItem_frequencyTable[i][j];
          }
        }
        // wordItemTable[i] = temp;
        wordItem_charArrayTable[i] = tempArray;
        wordItem_frequencyTable[i] = tempFreq;
      }
    }
  }
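
  /*
   * Example of the merge above (frequencies invented for illustration): if a
   * chart contains two adjacent, identical char arrays with frequencies 3 and 5
   * (the same word listed under two parts of speech), they collapse into a
   * single entry with frequency 8.
   */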

  private void sortEachItems() {
    char[] tmpArray;
    int tmpFreq;
    for (int i = 0; i < wordItem_charArrayTable.length; i++) {
      if (wordItem_charArrayTable[i] != null
          && wordItem_charArrayTable[i].length > 1) {
        for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
          for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
            if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
                wordItem_charArrayTable[i][j2], 0) > 0) {
              tmpArray = wordItem_charArrayTable[i][j];
              tmpFreq = wordItem_frequencyTable[i][j];
              wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
              wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
              wordItem_charArrayTable[i][j2] = tmpArray;
              wordItem_frequencyTable[i][j2] = tmpFreq;
            }
          }
        }
      }
    }
  }
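
  /*
   * Note (added for clarity): this is a simple O(n^2) exchange sort that orders
   * each chart by Utility.compareArray. The resulting sorted order is what
   * allows findInTable and getPrefixMatch below to use binary search.
   */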

  /*
   * Calculate character c's position in the hash table, then record c and its
   * chart index j at that position in charIndexTable and wordIndexTable.
   */
  private boolean setTableIndex(char c, int j) {
    int index = getAvaliableTableIndex(c);
    if (index != -1) {
      charIndexTable[index] = c;
      wordIndexTable[index] = (short) j;
      return true;
    } else
      return false;
  }

  private short getAvaliableTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }
    // System.out.println(i - 1);

    if (i < PRIME_INDEX_LENGTH
        && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
      return (short) index;
    } else
      return -1;
  }
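
  /*
   * Note (added for clarity): this is open addressing with double hashing. The
   * probe sequence for a character c is
   *
   *   index(i) = (hash1(c) + i * hash2(c)) % PRIME_INDEX_LENGTH,  i = 0, 1, 2, ...
   *
   * and probing stops at the first free slot (charIndexTable[index] == 0) or at
   * a slot already holding c. PRIME_INDEX_LENGTH being prime keeps the probe
   * sequence from cycling through only a small subset of the table.
   */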

  private short getWordItemTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }

    if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
      return (short) index;
    } else
      return -1;
  }

  /**
   * Look up the word corresponding to the given char array and return its
   * position in the word list.
   *
   * @param knownHashIndex the hash-table position of the word's first character
   *   charArray[0], as previously computed by getWordItemTableIndex(char);
   *   passing it in avoids recomputing the hash.
   * @param charArray the char array of the word to look up.
   * @return the word's location in the word list, or -1 if not found.
   */
  private int findInTable(short knownHashIndex, char[] charArray) {
    if (charArray == null || charArray.length == 0)
      return -1;

    char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
    int start = 0, end = items.length - 1;
    int mid = (start + end) / 2, cmpResult;

    // Binary search for charArray (minus its first character) in the sorted word list
    while (start <= end) {
      cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);

      if (cmpResult == 0)
        return mid;// found it
      else if (cmpResult < 0)
        start = mid + 1;
      else if (cmpResult > 0)
        end = mid - 1;

      mid = (start + end) / 2;
    }
    return -1;
  }
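
  /*
   * Walkthrough (illustrative): to look up the two-character word {c0, c1}, a
   * caller first computes knownHashIndex = getWordItemTableIndex(c0). The word
   * list for c0's chart stores each word with its leading character stripped,
   * so the binary search above compares the stored tails against charArray
   * starting at offset 1, i.e. against {c1}.
   */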

  /**
   * Find the first word in the dictionary that starts with the supplied prefix
   *
   * @see #getPrefixMatch(char[], int)
   * @param charArray input prefix
   * @return index of word, or -1 if not found
   */
  public int getPrefixMatch(char[] charArray) {
    return getPrefixMatch(charArray, 0);
  }

  /**
   * Find the nth word in the dictionary that starts with the supplied prefix
   *
   * @see #getPrefixMatch(char[])
   * @param charArray input prefix
   * @param knownStart relative position in the dictionary to start
   * @return index of word, or -1 if not found
   */
  public int getPrefixMatch(char[] charArray, int knownStart) {
    short index = getWordItemTableIndex(charArray[0]);
    if (index == -1)
      return -1;
    char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
    int start = knownStart, end = items.length - 1;

    int mid = (start + end) / 2, cmpResult;

    // Binary search for the prefix in the sorted word list
    while (start <= end) {
      cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
      if (cmpResult == 0) {
        // Walk back to the first item that matches the prefix
        while (mid >= 0
            && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
          mid--;
        mid++;
        return mid;// the first word that has charArray as a prefix
      } else if (cmpResult < 0)
        end = mid - 1;
      else
        start = mid + 1;
      mid = (start + end) / 2;
    }
    return -1;
  }
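
  /*
   * Usage sketch (illustrative, not taken verbatim from any caller): the
   * knownStart parameter helps when a prefix is grown one character at a time,
   * since, given the sorted word lists, matches for the longer prefix will not
   * start before the first match of the shorter one:
   *
   *   int idx = dict.getPrefixMatch(shortPrefix);
   *   if (idx != -1) {
   *     if (dict.isEqual(shortPrefix, idx)) {
   *       // shortPrefix itself is a word; its count is dict.getFrequency(shortPrefix)
   *     }
   *     // longerPrefix appends one character to shortPrefix
   *     int next = dict.getPrefixMatch(longerPrefix, idx);
   *   }
   *
   * shortPrefix and longerPrefix are hypothetical char[] values used only for
   * this sketch.
   */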

  /**
   * Get the frequency of a word from the dictionary
   *
   * @param charArray input word
   * @return word frequency, or zero if the word is not found
   */
  public int getFrequency(char[] charArray) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    if (hashIndex == -1)
      return 0;
    int itemIndex = findInTable(hashIndex, charArray);
    if (itemIndex != -1)
      return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
    return 0;
  }

  /**
   * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
   *
   * @param charArray input word
   * @param itemIndex item index for table charArray[0]
   * @return true if the entry exists
   */
  public boolean isEqual(char[] charArray, int itemIndex) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    return Utility.compareArray(charArray, 1,
        wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
  }

}
