Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » [javadoc | source]
    1   package org.apache.lucene.analysis;
    2   
    3   import java.util.AbstractSet;
    4   import java.util.Collection;
    5   import java.util.Collections;
    6   import java.util.Iterator;
    7   import java.util.Set;
    8   
    9   /**
   10    * Licensed to the Apache Software Foundation (ASF) under one or more
   11    * contributor license agreements.  See the NOTICE file distributed with
   12    * this work for additional information regarding copyright ownership.
   13    * The ASF licenses this file to You under the Apache License, Version 2.0
   14    * (the "License"); you may not use this file except in compliance with
   15    * the License.  You may obtain a copy of the License at
   16    *
   17    *     http://www.apache.org/licenses/LICENSE-2.0
   18    *
   19    * Unless required by applicable law or agreed to in writing, software
   20    * distributed under the License is distributed on an "AS IS" BASIS,
   21    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   22    * See the License for the specific language governing permissions and
   23    * limitations under the License.
   24    */
   25   
   26   
   27   /**
   28    * A simple class that stores Strings as char[]'s in a
   29    * hash table.  Note that this is not a general purpose
   30    * class.  For example, it cannot remove items from the
   31    * set, nor does it resize its hash table to be smaller,
   32    * etc.  It is designed to be quick to test if a char[]
   33    * is in the set without the necessity of converting it
   34    * to a String first.
   35    * <P>
   36    * <em>Please note:</em> This class implements {@link java.util.Set Set} but
   37    * does not behave like it should in all cases. The generic type is
   38    * {@code Set<Object>}, because you can add any object to it,
   39    * that has a string representation. The add methods will use
   40    * {@link Object#toString} and store the result using a {@code char[]}
   41    * buffer. The {@code contains()} methods behave the same way.
   42    * The {@link #iterator()} returns an {@code Iterator<String>}.
   43    * For type safety also {@link #stringIterator()} is provided.
   44    */
   45   
   46   public class CharArraySet extends AbstractSet<Object> {
   47     private final static int INIT_SIZE = 8;
   48     private char[][] entries;
   49     private int count;
   50     private final boolean ignoreCase;
   51     public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(new CharArraySet(0, false));
   52   
   53     /** Create set with enough capacity to hold startSize
   54      *  terms */
   55     public CharArraySet(int startSize, boolean ignoreCase) {
   56       this.ignoreCase = ignoreCase;
   57       int size = INIT_SIZE;
   58       while(startSize + (startSize>>2) > size)
   59         size <<= 1;
   60       entries = new char[size][];
   61     }
   62   
   63     /** Create set from a Collection of char[] or String */
   64     public CharArraySet(Collection<? extends Object> c, boolean ignoreCase) {
   65       this(c.size(), ignoreCase);
   66       addAll(c);
   67     }
   68     
   69     /** Create set from entries */
   70     private CharArraySet(char[][] entries, boolean ignoreCase, int count){
   71       this.entries = entries;
   72       this.ignoreCase = ignoreCase;
   73       this.count = count;
   74     }
   75   
   76     /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
   77      * are in the set */
   78     public boolean contains(char[] text, int off, int len) {
   79       return entries[getSlot(text, off, len)] != null;
   80     }
   81   
   82     /** true if the <code>CharSequence</code> is in the set */
   83     public boolean contains(CharSequence cs) {
   84       return entries[getSlot(cs)] != null;
   85     }
   86   
   87     private int getSlot(char[] text, int off, int len) {
   88       int code = getHashCode(text, off, len);
   89       int pos = code & (entries.length-1);
   90       char[] text2 = entries[pos];
   91       if (text2 != null && !equals(text, off, len, text2)) {
   92         final int inc = ((code>>8)+code)|1;
   93         do {
   94           code += inc;
   95           pos = code & (entries.length-1);
   96           text2 = entries[pos];
   97         } while (text2 != null && !equals(text, off, len, text2));
   98       }
   99       return pos;
  100     }
  101   
  102     /** Returns true if the String is in the set */  
  103     private int getSlot(CharSequence text) {
  104       int code = getHashCode(text);
  105       int pos = code & (entries.length-1);
  106       char[] text2 = entries[pos];
  107       if (text2 != null && !equals(text, text2)) {
  108         final int inc = ((code>>8)+code)|1;
  109         do {
  110           code += inc;
  111           pos = code & (entries.length-1);
  112           text2 = entries[pos];
  113         } while (text2 != null && !equals(text, text2));
  114       }
  115       return pos;
  116     }
  117   
  118     /** Add this CharSequence into the set */
  119     public boolean add(CharSequence text) {
  120       return add(text.toString()); // could be more efficient
  121     }
  122     
  123     /** Add this String into the set */
  124     public boolean add(String text) {
  125       return add(text.toCharArray());
  126     }
  127   
  128     /** Add this char[] directly to the set.
  129      * If ignoreCase is true for this Set, the text array will be directly modified.
  130      * The user should never modify this text array after calling this method.
  131      */
  132     public boolean add(char[] text) {
  133       if (ignoreCase)
  134         for(int i=0;i<text.length;i++)
  135           text[i] = Character.toLowerCase(text[i]);
  136       int slot = getSlot(text, 0, text.length);
  137       if (entries[slot] != null) return false;
  138       entries[slot] = text;
  139       count++;
  140   
  141       if (count + (count>>2) > entries.length) {
  142         rehash();
  143       }
  144   
  145       return true;
  146     }
  147   
  148     private boolean equals(char[] text1, int off, int len, char[] text2) {
  149       if (len != text2.length)
  150         return false;
  151       if (ignoreCase) {
  152         for(int i=0;i<len;i++) {
  153           if (Character.toLowerCase(text1[off+i]) != text2[i])
  154             return false;
  155         }
  156       } else {
  157         for(int i=0;i<len;i++) {
  158           if (text1[off+i] != text2[i])
  159             return false;
  160         }
  161       }
  162       return true;
  163     }
  164   
  165     private boolean equals(CharSequence text1, char[] text2) {
  166       int len = text1.length();
  167       if (len != text2.length)
  168         return false;
  169       if (ignoreCase) {
  170         for(int i=0;i<len;i++) {
  171           if (Character.toLowerCase(text1.charAt(i)) != text2[i])
  172             return false;
  173         }
  174       } else {
  175         for(int i=0;i<len;i++) {
  176           if (text1.charAt(i) != text2[i])
  177             return false;
  178         }
  179       }
  180       return true;
  181     }
  182   
  183     private void rehash() {
  184       final int newSize = 2*entries.length;
  185       char[][] oldEntries = entries;
  186       entries = new char[newSize][];
  187   
  188       for(int i=0;i<oldEntries.length;i++) {
  189         char[] text = oldEntries[i];
  190         if (text != null) {
  191           // todo: could be faster... no need to compare strings on collision
  192           entries[getSlot(text,0,text.length)] = text;
  193         }
  194       }
  195     }
  196     
  197     private int getHashCode(char[] text, int offset, int len) {
  198       int code = 0;
  199       final int stop = offset + len;
  200       if (ignoreCase) {
  201         for (int i=offset; i<stop; i++) {
  202           code = code*31 + Character.toLowerCase(text[i]);
  203         }
  204       } else {
  205         for (int i=offset; i<stop; i++) {
  206           code = code*31 + text[i];
  207         }
  208       }
  209       return code;
  210     }
  211   
  212     private int getHashCode(CharSequence text) {
  213       int code = 0;
  214       int len = text.length();
  215       if (ignoreCase) {
  216         for (int i=0; i<len; i++) {
  217           code = code*31 + Character.toLowerCase(text.charAt(i));
  218         }
  219       } else {
  220         for (int i=0; i<len; i++) {
  221           code = code*31 + text.charAt(i);
  222         }
  223       }
  224       return code;
  225     }
  226   
  227   
  228     @Override
  229     public int size() {
  230       return count;
  231     }
  232   
  233     @Override
  234     public boolean isEmpty() {
  235       return count==0;
  236     }
  237   
  238     @Override
  239     public boolean contains(Object o) {
  240       if (o instanceof char[]) {
  241         final char[] text = (char[])o;
  242         return contains(text, 0, text.length);
  243       } 
  244       return contains(o.toString());
  245     }
  246   
  247     @Override
  248     public boolean add(Object o) {
  249       if (o instanceof char[]) {
  250         return add((char[])o);
  251       }
  252       return add(o.toString());
  253     }
  254     
  255     /**
  256      * Returns an unmodifiable {@link CharArraySet}. This allows to provide
  257      * unmodifiable views of internal sets for "read-only" use.
  258      * 
  259      * @param set
  260      *          a set for which the unmodifiable set is returned.
  261      * @return an new unmodifiable {@link CharArraySet}.
  262      * @throws NullPointerException
  263      *           if the given set is <code>null</code>.
  264      */
  265     public static CharArraySet unmodifiableSet(CharArraySet set) {
  266       if (set == null)
  267         throw new NullPointerException("Given set is null");
  268       if (set == EMPTY_SET)
  269         return EMPTY_SET;
  270       if (set instanceof UnmodifiableCharArraySet)
  271         return set;
  272   
  273       /*
  274        * Instead of delegating calls to the given set copy the low-level values to
  275        * the unmodifiable Subclass
  276        */
  277       return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
  278     }
  279   
  280     /**
  281      * Returns a copy of the given set as a {@link CharArraySet}. If the given set
  282      * is a {@link CharArraySet} the ignoreCase property will be preserved.
  283      * 
  284      * @param set
  285      *          a set to copy
  286      * @return a copy of the given set as a {@link CharArraySet}. If the given set
  287      *         is a {@link CharArraySet} the ignoreCase property will be
  288      *         preserved.
  289      */
  290     public static CharArraySet copy(Set<?> set) {
  291       if (set == null)
  292         throw new NullPointerException("Given set is null");
  293       if(set == EMPTY_SET)
  294         return EMPTY_SET;
  295       final boolean ignoreCase = set instanceof CharArraySet ? ((CharArraySet) set).ignoreCase
  296           : false;
  297       return new CharArraySet(set, ignoreCase);
  298     }
  299     
  300   
  301     /** The Iterator<String> for this set.  Strings are constructed on the fly, so
  302      * use <code>nextCharArray</code> for more efficient access. */
  303     public class CharArraySetIterator implements Iterator<String> {
  304       int pos=-1;
  305       char[] next;
  306       CharArraySetIterator() {
  307         goNext();
  308       }
  309   
  310       private void goNext() {
  311         next = null;
  312         pos++;
  313         while (pos < entries.length && (next=entries[pos]) == null) pos++;
  314       }
  315   
  316       public boolean hasNext() {
  317         return next != null;
  318       }
  319   
  320       /** do not modify the returned char[] */
  321       public char[] nextCharArray() {
  322         char[] ret = next;
  323         goNext();
  324         return ret;
  325       }
  326   
  327       /** Returns the next String, as a Set<String> would...
  328        * use nextCharArray() for better efficiency. */
  329       public String next() {
  330         return new String(nextCharArray());
  331       }
  332   
  333       public void remove() {
  334         throw new UnsupportedOperationException();
  335       }
  336     }
  337   
  338     /** returns an iterator of new allocated Strings */
  339     public Iterator<String> stringIterator() {
  340       return new CharArraySetIterator();
  341     }
  342   
  343     /** returns an iterator of new allocated Strings, this method violates the Set interface */
  344     @Override
  345     @SuppressWarnings("unchecked")
  346     public Iterator<Object> iterator() {
  347       return (Iterator) stringIterator();
  348     }
  349     
  350     /**
  351      * Efficient unmodifiable {@link CharArraySet}. This implementation does not
  352      * delegate calls to a give {@link CharArraySet} like
  353      * {@link Collections#unmodifiableSet(java.util.Set)} does. Instead is passes
  354      * the internal representation of a {@link CharArraySet} to a super
  355      * constructor and overrides all mutators. 
  356      */
  357     private static final class UnmodifiableCharArraySet extends CharArraySet {
  358   
  359       private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
  360           int count) {
  361         super(entries, ignoreCase, count);
  362       }
  363   
  364       @Override
  365       public boolean add(Object o){
  366         throw new UnsupportedOperationException();
  367       }
  368       
  369       @Override
  370       public boolean addAll(Collection<? extends Object> coll) {
  371         throw new UnsupportedOperationException();
  372       }
  373       
  374       @Override
  375       public boolean add(char[] text) {
  376         throw new UnsupportedOperationException();
  377       }
  378   
  379       @Override
  380       public boolean add(CharSequence text) {
  381         throw new UnsupportedOperationException();
  382       }
  383   
  384       @Override
  385       public boolean add(String text) {
  386         throw new UnsupportedOperationException();
  387       }
  388     }
  389   
  390   }

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » [javadoc | source]