Save This Page
Home » mahout-collections-1.0-src » org.apache.mahout.math.set » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one
    3    * or more contributor license agreements. See the NOTICE file
    4    * distributed with this work for additional information
    5    * regarding copyright ownership. The ASF licenses this file
    6    * to you under the Apache License, Version 2.0 (the
    7    * "License"); you may not use this file except in compliance
    8    * with the License. You may obtain a copy of the License at
    9    *
   10    * http://www.apache.org/licenses/LICENSE-2.0
   11    *
   12    * Unless required by applicable law or agreed to in writing,
   13    * software distributed under the License is distributed on an
   14    * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   15    * KIND, either express or implied. See the License for the
   16    * specific language governing permissions and limitations
   17    * under the License.
   18    */
   19   
   20   package org.apache.mahout.math.set;
   21   
   22   import java.util.ArrayList;
   23   import java.util.Arrays;
   24   import java.util.Collection;
   25   import java.util.Iterator;
   26   import java.util.List;
   27   import java.util.Set;
   28   
   29   import org.apache.mahout.math.function.ObjectProcedure;
   30   import org.apache.mahout.math.map.PrimeFinder;
   31   
   32   /**
   33     * Open hashing alternative to java.util.HashSet.
   34    **/
   35   public class OpenHashSet<T> extends AbstractSet implements Set<T>  {
   36     protected static final byte FREE = 0;
   37     protected static final byte FULL = 1;
   38     protected static final byte REMOVED = 2;
   39     protected static final char NO_KEY_VALUE = 0;
   40   
   41     /** The hash table keys. */
   42     private Object[] table;
   43   
   44     /** The state of each hash table entry (FREE, FULL, REMOVED). */
   45     private byte[] state;
   46   
   47     /** The number of table entries in state==FREE. */
   48     private int freeEntries;
   49   
   50   
   51     /** Constructs an empty map with default capacity and default load factors. */
   52     public OpenHashSet() {
   53       this(defaultCapacity);
   54     }
   55   
   56     /**
   57      * Constructs an empty map with the specified initial capacity and default load factors.
   58      *
   59      * @param initialCapacity the initial capacity of the map.
   60      * @throws IllegalArgumentException if the initial capacity is less than zero.
   61      */
   62     public OpenHashSet(int initialCapacity) {
   63       this(initialCapacity, defaultMinLoadFactor, defaultMaxLoadFactor);
   64     }
   65   
   66     /**
   67      * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
   68      *
   69      * @param initialCapacity the initial capacity.
   70      * @param minLoadFactor   the minimum load factor.
   71      * @param maxLoadFactor   the maximum load factor.
   72      * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
   73      *                                  (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
   74      *                                  maxLoadFactor)</tt>.
   75      */
   76     public OpenHashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
   77       setUp(initialCapacity, minLoadFactor, maxLoadFactor);
   78     }
   79   
   80     /** Removes all values associations from the receiver. Implicitly calls <tt>trimToSize()</tt>. */
   81     @Override
   82     public void clear() {
   83       Arrays.fill(this.state, 0, state.length - 1, FREE);
   84       distinct = 0;
   85       freeEntries = table.length; // delta
   86       trimToSize();
   87     }
   88   
   89     /**
   90      * Returns a deep copy of the receiver.
   91      *
   92      * @return a deep copy of the receiver.
   93      */
   94     @SuppressWarnings("unchecked")
   95     @Override
   96     public Object clone() {
   97       OpenHashSet<T> copy = (OpenHashSet<T>) super.clone();
   98       copy.table = copy.table.clone();
   99       copy.state = copy.state.clone();
  100       return copy;
  101     }
  102   
  103     /**
  104      * Returns <tt>true</tt> if the receiver contains the specified key.
  105      *
  106      * @return <tt>true</tt> if the receiver contains the specified key.
  107      */
  108     @Override
  109     @SuppressWarnings("unchecked")
  110     public boolean contains(Object key) {
  111       return indexOfKey((T)key) >= 0;
  112     }
  113   
  114     /**
  115      * Ensures that the receiver can hold at least the specified number of associations without needing to allocate new
  116      * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
  117      * method never need be called; it is for performance tuning only. Calling this method before <tt>add()</tt>ing a
  118      * large number of associations boosts performance, because the receiver will grow only once instead of potentially
  119      * many times and hash collisions get less probable.
  120      *
  121      * @param minCapacity the desired minimum capacity.
  122      */
  123     @Override
  124     public void ensureCapacity(int minCapacity) {
  125       if (table.length < minCapacity) {
  126         int newCapacity = nextPrime(minCapacity);
  127         rehash(newCapacity);
  128       }
  129     }
  130   
  131     /**
  132      * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
  133      * Subclasses can define a particular order, for example, "sorted by key". All methods which <i>can</i> be expressed
  134      * in terms of this method (most methods can) <i>must guarantee</i> to use the <i>same</i> order defined by this
  135      * method, even if it is no particular order. This is necessary so that, for example, methods <tt>keys</tt> and
  136      * <tt>values</tt> will yield association pairs, not two uncorrelated lists.
  137      *
  138      * @param procedure the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise
  139      *                  continues.
  140      * @return <tt>false</tt> if the procedure stopped before all keys where iterated over, <tt>true</tt> otherwise.
  141      */
  142     @SuppressWarnings("unchecked")
  143     public boolean forEachKey(ObjectProcedure<T> procedure) {
  144       for (int i = table.length; i-- > 0;) {
  145         if (state[i] == FULL) {
  146           if (!procedure.apply((T)table[i])) {
  147             return false;
  148           }
  149         }
  150       }
  151       return true;
  152     }
  153   
  154     /**
  155      * @param key the key to be added to the receiver.
  156      * @return the index where the key would need to be inserted, if it is not already contained. Returns -index-1 if the
  157      *         key is already contained at slot index. Therefore, if the returned index < 0, then it is already contained
  158      *         at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
  159      *         slot index.
  160      */
  161     protected int indexOfInsertion(T key) {
  162       Object[] tab = table;
  163       byte[] stat = state;
  164       int length = tab.length;
  165   
  166       int hash = key.hashCode() & 0x7FFFFFFF;
  167       int i = hash % length;
  168       int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
  169       //int decrement = (hash / length) % length;
  170       if (decrement == 0) {
  171         decrement = 1;
  172       }
  173   
  174       // stop if we find a removed or free slot, or if we find the key itself
  175       // do NOT skip over removed slots (yes, open addressing is like that...)
  176       while (stat[i] == FULL && tab[i] != key) {
  177         i -= decrement;
  178         //hashCollisions++;
  179         if (i < 0) {
  180           i += length;
  181         }
  182       }
  183   
  184       if (stat[i] == REMOVED) {
  185         // stop if we find a free slot, or if we find the key itself.
  186         // do skip over removed slots (yes, open addressing is like that...)
  187         // assertion: there is at least one FREE slot.
  188         int j = i;
  189         while (stat[i] != FREE && (stat[i] == REMOVED || tab[i] != key)) {
  190           i -= decrement;
  191           //hashCollisions++;
  192           if (i < 0) {
  193             i += length;
  194           }
  195         }
  196         if (stat[i] == FREE) {
  197           i = j;
  198         }
  199       }
  200   
  201   
  202       if (stat[i] == FULL) {
  203         // key already contained at slot i.
  204         // return a negative number identifying the slot.
  205         return -i - 1;
  206       }
  207       // not already contained, should be inserted at slot i.
  208       // return a number >= 0 identifying the slot.
  209       return i;
  210     }
  211   
  212     /**
  213      * @param key the key to be searched in the receiver.
  214      * @return the index where the key is contained in the receiver, returns -1 if the key was not found.
  215      */
  216     protected int indexOfKey(T key) {
  217       Object[] tab = table;
  218       byte[] stat = state;
  219       int length = tab.length;
  220   
  221       int hash = key.hashCode() & 0x7FFFFFFF;
  222       int i = hash % length;
  223       int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
  224       //int decrement = (hash / length) % length;
  225       if (decrement == 0) {
  226         decrement = 1;
  227       }
  228   
  229       // stop if we find a free slot, or if we find the key itself.
  230       // do skip over removed slots (yes, open addressing is like that...)
  231       while (stat[i] != FREE && (stat[i] == REMOVED || (!key.equals(tab[i])))) {
  232         i -= decrement;
  233         //hashCollisions++;
  234         if (i < 0) {
  235           i += length;
  236         }
  237       }
  238   
  239       if (stat[i] == FREE) {
  240         return -1;
  241       } // not found
  242       return i; //found, return index where key is contained
  243     }
  244   
  245     /**
  246      * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
  247      * call returns the specified list has a new size that equals <tt>this.size()</tt>. 
  248      * This method can be used
  249      * to iterate over the keys of the receiver.
  250      *
  251      * @param list the list to be filled, can have any size.
  252      */
  253     @SuppressWarnings("unchecked")
  254     public void keys(List<T> list) {
  255       list.clear();
  256     
  257   
  258       Object [] tab = table;
  259       byte[] stat = state;
  260   
  261       for (int i = tab.length; i-- > 0;) {
  262         if (stat[i] == FULL) {
  263           list.add((T)tab[i]);
  264         }
  265       }
  266     }
  267   
  268     @SuppressWarnings("unchecked")
  269     @Override
  270     public boolean add(Object key) {
  271       int i = indexOfInsertion((T)key);
  272       if (i < 0) { //already contained
  273         return false;
  274       }
  275   
  276       if (this.distinct > this.highWaterMark) {
  277         int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
  278         rehash(newCapacity);
  279         return add(key);
  280       }
  281   
  282       this.table[i] = key;
  283       if (this.state[i] == FREE) {
  284         this.freeEntries--;
  285       }
  286       this.state[i] = FULL;
  287       this.distinct++;
  288   
  289       if (this.freeEntries < 1) { //delta
  290         int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
  291         rehash(newCapacity);
  292         return add(key);
  293       }
  294       
  295       return true;
  296     }
  297   
  298     /**
  299      * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
  300      * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
  301      * mark.
  302      */
  303     @SuppressWarnings("unchecked")
  304     protected void rehash(int newCapacity) {
  305       int oldCapacity = table.length;
  306       //if (oldCapacity == newCapacity) return;
  307   
  308       Object[] oldTable = table;
  309       byte[] oldState = state;
  310   
  311       Object[] newTable = new Object[newCapacity];
  312       byte[] newState = new byte[newCapacity];
  313   
  314       this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
  315       this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
  316   
  317       this.table = newTable;
  318       this.state = newState;
  319       this.freeEntries = newCapacity - this.distinct; // delta
  320   
  321       for (int i = oldCapacity; i-- > 0;) {
  322         if (oldState[i] == FULL) {
  323           Object element = oldTable[i];
  324           int index = indexOfInsertion((T)element);
  325           newTable[index] = element;
  326           newState[index] = FULL;
  327         }
  328       }
  329     }
  330   
  331     /**
  332      * Removes the given key with its associated element from the receiver, if present.
  333      *
  334      * @param key the key to be removed from the receiver.
  335      * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
  336      */
  337     @SuppressWarnings("unchecked")
  338     @Override
  339     public boolean remove(Object key) {
  340       int i = indexOfKey((T)key);
  341       if (i < 0) {
  342         return false;
  343       } // key not contained
  344   
  345       this.state[i] = REMOVED;
  346       this.distinct--;
  347   
  348       if (this.distinct < this.lowWaterMark) {
  349         int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
  350         rehash(newCapacity);
  351       }
  352   
  353       return true;
  354     }
  355   
  356     /**
  357      * Initializes the receiver.
  358      *
  359      * @param initialCapacity the initial capacity of the receiver.
  360      * @param minLoadFactor   the minLoadFactor of the receiver.
  361      * @param maxLoadFactor   the maxLoadFactor of the receiver.
  362      * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
  363      *                                  (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
  364      *                                  maxLoadFactor)</tt>.
  365      */
  366     @Override
  367     protected void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
  368       int capacity = initialCapacity;
  369       super.setUp(capacity, minLoadFactor, maxLoadFactor);
  370       capacity = nextPrime(capacity);
  371       if (capacity == 0) {
  372         capacity = 1;
  373       } // open addressing needs at least one FREE slot at any time.
  374   
  375       this.table = new Object[capacity];
  376       this.state = new byte[capacity];
  377   
  378       // memory will be exhausted long before this pathological case happens, anyway.
  379       this.minLoadFactor = minLoadFactor;
  380       if (capacity == PrimeFinder.largestPrime) {
  381         this.maxLoadFactor = 1.0;
  382       } else {
  383         this.maxLoadFactor = maxLoadFactor;
  384       }
  385   
  386       this.distinct = 0;
  387       this.freeEntries = capacity; // delta
  388   
  389       // lowWaterMark will be established upon first expansion.
  390       // establishing it now (upon instance construction) would immediately make the table shrink upon first put(...).
  391       // After all the idea of an "initialCapacity" implies violating lowWaterMarks when an object is young.
  392       // See ensureCapacity(...)
  393       this.lowWaterMark = 0;
  394       this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
  395     }
  396   
  397     /**
  398      * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
  399      * application can use this operation to minimize the storage of the receiver.
  400      */
  401     @Override
  402     public void trimToSize() {
  403       // * 1.2 because open addressing's performance exponentially degrades beyond that point
  404       // so that even rehashing the table can take very long
  405       int newCapacity = nextPrime((int) (1 + 1.2 * size()));
  406       if (table.length > newCapacity) {
  407         rehash(newCapacity);
  408       }
  409     }
  410   
  411     /**
  412      * Access for unit tests.
  413      * @param capacity
  414      * @param minLoadFactor
  415      * @param maxLoadFactor
  416      */
  417     void getInternalFactors(int[] capacity, 
  418         double[] minLoadFactor, 
  419         double[] maxLoadFactor) {
  420       capacity[0] = table.length;
  421       minLoadFactor[0] = this.minLoadFactor;
  422       maxLoadFactor[0] = this.maxLoadFactor;
  423     }
  424   
  425     @Override
  426     public boolean isEmpty() {
  427       return size() == 0;
  428     }
  429     
  430     /**
  431      * OpenHashSet instances are only equal to other OpenHashSet instances, not to 
  432      * any other collection. Hypothetically, we should check for and permit
  433      * equals on other Sets.
  434      */
  435     @SuppressWarnings("unchecked")
  436     public boolean equals(Object obj) {
  437       if (obj == this) {
  438         return true;
  439       }
  440   
  441       if (!(obj instanceof OpenHashSet)) {
  442         return false;
  443       }
  444       final OpenHashSet<T> other = (OpenHashSet<T>) obj;
  445       if (other.size() != size()) {
  446         return false;
  447       }
  448   
  449       return
  450           forEachKey(
  451               new ObjectProcedure<T>() {
  452                 @Override
  453                 public boolean apply(T key) {
  454                   return other.contains(key);
  455                 }
  456               }
  457           );
  458     }
  459     
  460     /**
  461      * Implement the standard Java Collections iterator. Note that 'remove' is silently
  462      * ineffectual here. This method is provided for convenience, only.
  463      */
  464     @Override
  465     public Iterator<T> iterator() {
  466       List<T> keyList = new ArrayList<T>();
  467       keys(keyList);
  468       return keyList.iterator();
  469     }
  470   
  471     @Override
  472     public Object[] toArray() {
  473       List<T> keyList = new ArrayList<T>();
  474       keys(keyList);
  475       return keyList.toArray();
  476     }
  477   
  478     @Override
  479     public boolean addAll(Collection<? extends T> c) {
  480       boolean anyAdded = false;
  481       for(T o : c) {
  482         boolean added = add(o);
  483         anyAdded |= added;
  484       }
  485       return anyAdded;
  486     }
  487   
  488     @Override
  489     public boolean containsAll(Collection<?> c) {
  490       for (Object o : c) {
  491         if (!contains(o)) {
  492           return false;
  493         }
  494       }
  495       return true;
  496     }
  497   
  498     @Override
  499     public boolean removeAll(Collection<?> c) {
  500       boolean anyRemoved = false;
  501       for(Object o : c) {
  502         boolean removed = remove(o);
  503         anyRemoved |= removed;
  504       }
  505       return anyRemoved;
  506     }
  507   
  508     @Override
  509     public boolean retainAll(Collection<?> c) {
  510       final Collection<?> finalCollection = c;
  511       final boolean[] modified = new boolean[1];
  512       modified[0] = false;
  513       forEachKey(new ObjectProcedure<T>() {
  514   
  515         @Override
  516         public boolean apply(T element) {
  517           if (!finalCollection.contains(element)) {
  518             remove(element);
  519             modified[0] = true;
  520           }
  521           return true;
  522         }});
  523       return modified[0];
  524     }
  525   
  526     @Override
  527     public <T2> T2[] toArray(T2[] a) {
  528       List<T> keys = new ArrayList<T>();
  529       keys(keys);
  530       return keys.toArray(a);
  531     }
  532   
  533     public List<T> keys() {
  534       List<T> keys = new ArrayList<T>();
  535       keys(keys);
  536       return keys;
  537     }
  538   }

Save This Page
Home » mahout-collections-1.0-src » org.apache.mahout.math.set » [javadoc | source]