Home » lucene-3.0.1-src » org.apache » lucene » analysis » compound » [javadoc | source]

    1   package org.apache.lucene.analysis.compound;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   
   21   import java.util.Set;
   22   
   23   import org.apache.lucene.analysis.Token;
   24   import org.apache.lucene.analysis.TokenFilter; // for javadocs
   25   import org.apache.lucene.analysis.TokenStream;
   26   
   27   /**
   28    * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
   29    * <p>
   30    * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
   31    * "Donaudampfschiff" even when you only enter "schiff". 
   32    *  It uses a brute-force algorithm to achieve this.
   33    * </p>
   34    */
   35   public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
   36     /**
   37      * 
   38      * @param input the {@link TokenStream} to process
   39      * @param dictionary the word dictionary to match against
   40      * @param minWordSize only words longer than this get processed
   41      * @param minSubwordSize only subwords longer than this get to the output stream
   42      * @param maxSubwordSize only subwords shorter than this get to the output stream
   43      * @param onlyLongestMatch Add only the longest matching subword to the stream
   44      */
   45     public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
   46         int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
   47       super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
   48     }
   49   
   50     /**
   51      * 
   52      * @param input the {@link TokenStream} to process
   53      * @param dictionary the word dictionary to match against
   54      */
   55     public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
   56       super(input, dictionary);
   57     }
   58   
   59     /**
   60      * 
   61      * @param input the {@link TokenStream} to process
   62      * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   63      *        lower case strings. 
   64      */
   65     public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
   66       super(input, dictionary);
   67     }
   68   
   69     /**
   70      * 
   71      * @param input the {@link TokenStream} to process
   72      * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   73      *        lower case strings. 
   74      * @param minWordSize only words longer than this get processed
   75      * @param minSubwordSize only subwords longer than this get to the output stream
   76      * @param maxSubwordSize only subwords shorter than this get to the output stream
   77      * @param onlyLongestMatch Add only the longest matching subword to the stream
   78      */
   79     public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
   80         int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
   81       super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
   82     }
   83   
   84     @Override
   85     protected void decomposeInternal(final Token token) {
   86       // Only words longer than minWordSize get processed
   87       if (token.termLength() < this.minWordSize) {
   88         return;
   89       }
   90       
   91       char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.termBuffer());
   92       
   93       for (int i=0;i<token.termLength()-this.minSubwordSize;++i) {
   94           Token longestMatchToken=null;
   95           for (int j=this.minSubwordSize-1;j<this.maxSubwordSize;++j) {
   96               if(i+j>token.termLength()) {
   97                   break;
   98               }
   99               if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
  100                   if (this.onlyLongestMatch) {
  101                      if (longestMatchToken!=null) {
  102                        if (longestMatchToken.termLength()<j) {
  103                          longestMatchToken=createToken(i,j,token);
  104                        }
  105                      } else {
  106                        longestMatchToken=createToken(i,j,token);
  107                      }
  108                   } else {
  109                      tokens.add(createToken(i,j,token));
  110                   }
  111               } 
  112           }
  113           if (this.onlyLongestMatch && longestMatchToken!=null) {
  114             tokens.add(longestMatchToken);
  115           }
  116       }
  117     }
  118   }

Home » lucene-3.0.1-src » org.apache » lucene » analysis » compound » [javadoc | source]