// Source listing: lucene-3.0.1-src — package org.apache.lucene.analysis.compound
    1   package org.apache.lucene.analysis.compound;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
   37   
   38   /**
   39    * Base class for decomposition token filters.
   40    */
   41   public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   42     /**
   43      * The default for minimal word length that gets decomposed
   44      */
   45     public static final int DEFAULT_MIN_WORD_SIZE = 5;
   46   
   47     /**
   48      * The default for minimal length of subwords that get propagated to the output of this filter
   49      */
   50     public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;
   51   
   52     /**
   53      * The default for maximal length of subwords that get propagated to the output of this filter
   54      */
   55     public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
   56     
   57     protected final CharArraySet dictionary;
   58     protected final LinkedList tokens;
   59     protected final int minWordSize;
   60     protected final int minSubwordSize;
   61     protected final int maxSubwordSize;
   62     protected final boolean onlyLongestMatch;
   63     
   64     private TermAttribute termAtt;
   65     private OffsetAttribute offsetAtt;
   66     private FlagsAttribute flagsAtt;
   67     private PositionIncrementAttribute posIncAtt;
   68     private TypeAttribute typeAtt;
   69     private PayloadAttribute payloadAtt;
   70     
   71     private final Token wrapper = new Token();
   72   
   73     protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
   74       this(input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
   75     }
   76     
   77     protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
   78       this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   79     }
   80   
   81     protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, boolean onlyLongestMatch) {
   82       this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   83     }
   84   
   85     protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
   86       this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   87     }
   88   
   89     protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary) {
   90       this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   91     }
   92   
   93     protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
   94       super(input);
   95       
   96       this.tokens=new LinkedList();
   97       this.minWordSize=minWordSize;
   98       this.minSubwordSize=minSubwordSize;
   99       this.maxSubwordSize=maxSubwordSize;
  100       this.onlyLongestMatch=onlyLongestMatch;
  101       
  102       if (dictionary instanceof CharArraySet) {
  103         this.dictionary = (CharArraySet) dictionary;
  104       } else {
  105         this.dictionary = new CharArraySet(dictionary.size(), false);
  106         addAllLowerCase(this.dictionary, dictionary);
  107       }
  108       
  109       termAtt = addAttribute(TermAttribute.class);
  110       offsetAtt = addAttribute(OffsetAttribute.class);
  111       flagsAtt = addAttribute(FlagsAttribute.class);
  112       posIncAtt = addAttribute(PositionIncrementAttribute.class);
  113       typeAtt = addAttribute(TypeAttribute.class);
  114       payloadAtt = addAttribute(PayloadAttribute.class);
  115     }
  116   
  117     /**
  118      * Create a set of words from an array
  119      * The resulting Set does case insensitive matching
  120      * TODO We should look for a faster dictionary lookup approach.
  121      * @param dictionary 
  122      * @return {@link Set} of lowercased terms 
  123      */
  124     public static final Set makeDictionary(final String[] dictionary) {
  125       // is the below really case insensitive? 
  126       CharArraySet dict = new CharArraySet(dictionary.length, false);
  127       addAllLowerCase(dict, Arrays.asList(dictionary));
  128       return dict;
  129     }
  130     
  131     private final void setToken(final Token token) throws IOException {
  132       clearAttributes();
  133       termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
  134       flagsAtt.setFlags(token.getFlags());
  135       typeAtt.setType(token.type());
  136       offsetAtt.setOffset(token.startOffset(), token.endOffset());
  137       posIncAtt.setPositionIncrement(token.getPositionIncrement());
  138       payloadAtt.setPayload(token.getPayload());
  139     }
  140     
  141     @Override
  142     public final boolean incrementToken() throws IOException {
  143       if (tokens.size() > 0) {
  144         setToken((Token)tokens.removeFirst());
  145         return true;
  146       }
  147   
  148       if (input.incrementToken() == false)
  149         return false;
  150       
  151       wrapper.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
  152       wrapper.setStartOffset(offsetAtt.startOffset());
  153       wrapper.setEndOffset(offsetAtt.endOffset());
  154       wrapper.setFlags(flagsAtt.getFlags());
  155       wrapper.setType(typeAtt.type());
  156       wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
  157       wrapper.setPayload(payloadAtt.getPayload());
  158       
  159       decompose(wrapper);
  160   
  161       if (tokens.size() > 0) {
  162         setToken((Token)tokens.removeFirst());
  163         return true;
  164       } else {
  165         return false;
  166       }
  167     }
  168     
  169     protected static final void addAllLowerCase(Set target, Collection col) {
  170       Iterator iter=col.iterator();
  171       
  172       while (iter.hasNext()) {
  173         target.add(((String)iter.next()).toLowerCase());
  174       }
  175     }
  176     
  177     protected static char[] makeLowerCaseCopy(final char[] buffer) {
  178       char[] result=new char[buffer.length];
  179       System.arraycopy(buffer, 0, result, 0, buffer.length);
  180       
  181       for (int i=0;i<buffer.length;++i) {
  182          result[i]=Character.toLowerCase(buffer[i]);
  183       }
  184       
  185       return result;
  186     }
  187     
  188     protected final Token createToken(final int offset, final int length,
  189         final Token prototype) {
  190       int newStart = prototype.startOffset() + offset;
  191       Token t = prototype.clone(prototype.termBuffer(), offset, length, newStart, newStart+length);
  192       t.setPositionIncrement(0);
  193       return t;
  194     }
  195   
  196     protected void decompose(final Token token) {
  197       // In any case we give the original token back
  198       tokens.add((Token) token.clone());
  199   
  200       // Only words longer than minWordSize get processed
  201       if (token.termLength() < this.minWordSize) {
  202         return;
  203       }
  204       
  205       decomposeInternal(token);
  206     }
  207     
  208     protected abstract void decomposeInternal(final Token token);
  209   
  210     @Override
  211     public void reset() throws IOException {
  212       super.reset();
  213       tokens.clear();
  214     }
  215   }

// End of source listing: org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase