Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » cn » [javadoc | source]
    1   package org.apache.lucene.analysis.cn;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   
   21   import java.io.IOException;
   22   import java.io.Reader;
   23   
   24   import org.apache.lucene.analysis.Tokenizer;
   25   import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
   26   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
   27   import org.apache.lucene.util.AttributeSource;
   28   
   29   
   30   /**
   31    * Tokenize Chinese text as individual chinese characters.
   32    * 
   33    * <p>
   34    * The difference between ChineseTokenizer and
   35    * CJKTokenizer is that they have different
   36    * token parsing logic.
   37    * </p>
   38    * <p>
   39    * For example, if the Chinese text
   40    * "C1C2C3C4" is to be indexed:
   41    * <ul>
   42    * <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4. 
   43    * <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
   44    * </ul>
   45    * </p>
   46    * <p>
   47    * Therefore the index created by CJKTokenizer is much larger.
   48    * </p>
   49    * <p>
   50    * The problem is that when searching for C1, C1C2, C1C3,
   51    * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
   52    * CJKTokenizer will not work.
   53    * </p>
   54    * @version 1.0
   55    *
   56    */
   57   
public final class ChineseTokenizer extends Tokenizer {


    /** Creates a new ChineseTokenizer reading from the given {@link Reader}. */
    public ChineseTokenizer(Reader in) {
      super(in);
      init();
    }

    /** Creates a new ChineseTokenizer sharing the attributes of {@code source}. */
    public ChineseTokenizer(AttributeSource source, Reader in) {
      super(source, in);
      init();
    }

    /** Creates a new ChineseTokenizer using the given {@link AttributeFactory}. */
    public ChineseTokenizer(AttributeFactory factory, Reader in) {
      super(factory, in);
      init();
    }
    
    private void init() {
      // Register the attributes that incrementToken()/flush() populate.
      termAtt = addAttribute(TermAttribute.class);
      offsetAtt = addAttribute(OffsetAttribute.class);
    }
    
    // offset:      absolute character position in the input stream (incremented
    //              speculatively at the top of the read loop, decremented again
    //              on EOF/pushback);
    // bufferIndex: read cursor into ioBuffer;
    // dataLen:     number of valid chars in ioBuffer, or -1 at end of input.
    private int offset = 0, bufferIndex=0, dataLen=0;
    // Maximum number of characters emitted in a single token.
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    // Accumulates the text of the token currently being built.
    private final char[] buffer = new char[MAX_WORD_LEN];
    // Raw read buffer refilled from the input Reader.
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];


    // Number of chars currently buffered for the pending token.
    private int length;
    // Absolute start offset of the pending token.
    private int start;

    private TermAttribute termAtt;
    private OffsetAttribute offsetAtt;
    
    /**
     * Appends {@code c} (lowercased) to the pending token, recording the
     * token's start offset when it is the token's first character.
     * {@code offset-1} compensates for the speculative increment done in
     * {@link #incrementToken()} before the character was classified.
     */
    private final void push(char c) {

        if (length == 0) start = offset-1;            // start of token
        buffer[length++] = Character.toLowerCase(c);  // buffer it

    }

    /**
     * Copies the pending token into the term and offset attributes.
     *
     * @return {@code true} if a token was emitted, {@code false} if nothing
     *         was buffered (caller then reports end-of-stream)
     */
    private final boolean flush() {

        if (length>0) {
            //System.out.println(new String(buffer, 0,
            //length));
          termAtt.setTermBuffer(buffer, 0, length);
          offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
          return true;
        }
        else
            return false;
    }

    /**
     * Advances to the next token.
     *
     * <p>Decimal digits and upper/lowercase letters are accumulated into a
     * single lowercased token (force-split at {@link #MAX_WORD_LEN});
     * characters whose {@link Character#getType(char)} is
     * {@code OTHER_LETTER} (the category containing CJK ideographs — see the
     * class Javadoc) are each emitted as a one-character token; every other
     * character acts as a delimiter.</p>
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();

        length = 0;
        start = offset;


        while (true) {

            final char c;
            offset++;  // speculative: undone on EOF or pushback below

            // Refill the I/O buffer when it has been consumed.
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            if (dataLen == -1) {
              // End of input: undo the speculative increment and emit any
              // pending token; flush() returns false when nothing is buffered.
              offset--;
              return flush();
            } else
                c = ioBuffer[bufferIndex++];


            switch(Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                // Latin letters/digits accumulate into one token.
                push(c);
                if (length == MAX_WORD_LEN) return flush();
                break;

            case Character.OTHER_LETTER:
                // If a letter/digit token is pending, push this character
                // back (both cursor and offset) and emit the pending token
                // first; it will be re-read on the next call.
                if (length>0) {
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                // Otherwise emit this character on its own: one token per
                // ideograph, as described in the class Javadoc.
                push(c);
                return flush();

            default:
                // Delimiter: terminates a pending token, otherwise skipped.
                if (length>0) return flush();
                break;
            }
        }
    }
    
    @Override
    public final void end() {
      // set final offset
      final int finalOffset = correctOffset(offset);
      this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    /** Resets the cursor state so a new (or re-set) Reader can be consumed. */
    @Override
    public void reset() throws IOException {
      super.reset();
      offset = bufferIndex = dataLen = 0;
    }
    
    @Override
    public void reset(Reader input) throws IOException {
      super.reset(input);
      reset();
    }
}

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » cn » [javadoc | source]