package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
/**
 * Loader for text files that represent a list of stopwords.
 */
public class WordlistLoader {

  /**
   * Loads a text file and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain
   * only one word. The words must be lowercase if you use an Analyzer that
   * applies LowerCaseFilter (such as StandardAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   */
  public static HashSet<String> getWordSet(File wordfile) throws IOException {
    HashSet<String> result = new HashSet<String>();
    FileReader reader = null;
    try {
      reader = new FileReader(wordfile);
      result = getWordSet(reader);
    }
    finally {
      if (reader != null)
        reader.close();
    }
    return result;
  }

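  // A minimal usage sketch (the file name "stopwords.txt" is hypothetical,
  // one word per line); the resulting set can then back a stop filter, e.g.
  // via Lucene 3.0's StopAnalyzer(Version, Set) constructor:
  //
  //   HashSet<String> stopWords =
  //       WordlistLoader.getWordSet(new File("stopwords.txt"));
  //   Analyzer analyzer = new StopAnalyzer(Version.LUCENE_30, stopWords);
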
  /**
   * Loads a text file and adds every non-comment line as an entry to a HashSet
   * (omitting leading and trailing whitespace). Every line of the file should
   * contain only one word. The words must be lowercase if you use an Analyzer
   * that applies LowerCaseFilter (such as StandardAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @param comment The string marking a comment line; lines starting with it are skipped
   * @return A HashSet with the file's words
   */
  public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
    HashSet<String> result = new HashSet<String>();
    FileReader reader = null;
    try {
      reader = new FileReader(wordfile);
      result = getWordSet(reader, comment);
    }
    finally {
      if (reader != null)
        reader.close();
    }
    return result;
  }

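  // Sketch: the same load, but skipping lines that start with '#'
  // (file name hypothetical):
  //
  //   HashSet<String> words =
  //       WordlistLoader.getWordSet(new File("stopwords.txt"), "#");
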
  /**
   * Reads lines from a Reader and adds every line as an entry to a HashSet
   * (omitting leading and trailing whitespace). Every line of the Reader should
   * contain only one word. The words must be lowercase if you use an Analyzer
   * that applies LowerCaseFilter (such as StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist; it is closed when this method returns
   * @return A HashSet with the reader's words
   */
  public static HashSet<String> getWordSet(Reader reader) throws IOException {
    HashSet<String> result = new HashSet<String>();
    BufferedReader br = null;
    try {
      // Wrap in a BufferedReader only if the caller did not pass one already.
      if (reader instanceof BufferedReader) {
        br = (BufferedReader) reader;
      } else {
        br = new BufferedReader(reader);
      }
      String word = null;
      while ((word = br.readLine()) != null) {
        result.add(word.trim());
      }
    }
    finally {
      // Closing the BufferedReader also closes the wrapped Reader.
      if (br != null)
        br.close();
    }
    return result;
  }

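  // Sketch: the Reader overload accepts any Reader, e.g. an in-memory
  // java.io.StringReader (closed by this method on return):
  //
  //   HashSet<String> words =
  //       WordlistLoader.getWordSet(new StringReader("foo\nbar\nbaz"));
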
  /**
   * Reads lines from a Reader and adds every non-comment line as an entry to a
   * HashSet (omitting leading and trailing whitespace). Every line of the Reader
   * should contain only one word. The words must be lowercase if you use an
   * Analyzer that applies LowerCaseFilter (such as StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist; it is closed when this method returns
   * @param comment The string marking a comment line; lines starting with it are skipped
   * @return A HashSet with the reader's words
   */
  public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
    HashSet<String> result = new HashSet<String>();
    BufferedReader br = null;
    try {
      if (reader instanceof BufferedReader) {
        br = (BufferedReader) reader;
      } else {
        br = new BufferedReader(reader);
      }
      String word = null;
      while ((word = br.readLine()) != null) {
        // The comment marker is tested before trimming, so it only counts
        // when it appears at the very start of the line.
        if (!word.startsWith(comment)) {
          result.add(word.trim());
        }
      }
    }
    finally {
      if (br != null)
        br.close();
    }
    return result;
  }

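  // Sketch: combining a Reader source with a '#' comment marker:
  //
  //   HashSet<String> words = WordlistLoader.getWordSet(
  //       new StringReader("# comment line\nfoo\nbar"), "#");
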
  /**
   * Reads a stem dictionary. Each line contains:
   * <pre>word<b>\t</b>stem</pre>
   * (i.e. two tab-separated words)
   *
   * @param wordstemfile File containing the stem dictionary
   * @return stem dictionary that overrules the stemming algorithm
   * @throws IOException
   */
  public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
    if (wordstemfile == null)
      throw new NullPointerException("wordstemfile may not be null");
    HashMap<String, String> result = new HashMap<String, String>();
    BufferedReader br = null;
    FileReader fr = null;
    try {
      fr = new FileReader(wordstemfile);
      br = new BufferedReader(fr);
      String line;
      while ((line = br.readLine()) != null) {
        // Split on the first tab only; everything after it is the stem.
        String[] wordstem = line.split("\t", 2);
        result.put(wordstem[0], wordstem[1]);
      }
    } finally {
      // Closing the BufferedReader also closes the underlying FileReader;
      // close the FileReader directly only if wrapping never happened.
      if (br != null)
        br.close();
      else if (fr != null)
        fr.close();
    }
    return result;
  }
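
  // Sketch: a stem dictionary file (name hypothetical) holds one
  // tab-separated pair per line, e.g.
  //
  //   mice<TAB>mouse
  //   feet<TAB>foot
  //
  // and is loaded as:
  //
  //   HashMap<String, String> stemDict =
  //       WordlistLoader.getStemDict(new File("stems.txt"));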

}
