package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.lang.ref.WeakReference;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

   29   /**
   30    * This TokenFilter provides the ability to set aside attribute states
   31    * that have already been analyzed.  This is useful in situations where multiple fields share
   32    * many common analysis steps and then go their separate ways.
   33    * <p/>
   34    * It is also useful for doing things like entity extraction or proper noun analysis as
   35    * part of the analysis workflow and saving off those tokens for use in another field.
   36    *
   37    * <pre>
   38   TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
   39   TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
   40   TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
   41   
   42   TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
   43   source2.addSinkTokenStream(sink1);
   44   source2.addSinkTokenStream(sink2);
   45   
   46   TokenStream final1 = new LowerCaseFilter(source1);
   47   TokenStream final2 = source2;
   48   TokenStream final3 = new EntityDetect(sink1);
   49   TokenStream final4 = new URLDetect(sink2);
   50   
   51   d.add(new Field("f1", final1));
   52   d.add(new Field("f2", final2));
   53   d.add(new Field("f3", final3));
   54   d.add(new Field("f4", final4));
   55    * </pre>
   56    * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
   57    * <code>reader1</code> and <code>reader2</code> after whitespace tokenizer
   58    * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
   59    * It is important, that tees are consumed before sinks (in the above example, the field names must be
   60    * less the sink's field names). If you are not sure, which stream is consumed first, you can simply
   61    * add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
   62    * This TokenFilter is exhausted after this. In the above example, change
   63    * the example above to:
   64    * <pre>
   65   ...
   66   TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
   67   TokenStream final2 = source2.newSinkTokenStream();
   68   sink1.consumeAllTokens();
   69   sink2.consumeAllTokens();
   70   ...
   71    * </pre>
   72    * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
   73    * <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
   74    */
   75   public final class TeeSinkTokenFilter extends TokenFilter {
   76     private final List<WeakReference<SinkTokenStream>> sinks = new LinkedList<WeakReference<SinkTokenStream>>();
   77     
   78     /**
   79      * Instantiates a new TeeSinkTokenFilter.
   80      */
   81     public TeeSinkTokenFilter(TokenStream input) {
   82       super(input);
   83     }
   84   
   85     /**
   86      * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
   87      */
   88     public SinkTokenStream newSinkTokenStream() {
   89       return newSinkTokenStream(ACCEPT_ALL_FILTER);
   90     }
   91     
   92     /**
   93      * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
   94      * that pass the supplied filter.
   95      * @see SinkFilter
   96      */
   97     public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
   98       SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
   99       this.sinks.add(new WeakReference<SinkTokenStream>(sink));
  100       return sink;
  101     }
  102     
  103     /**
  104      * Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
  105      * to this one. The supplied stream will also receive all consumed tokens.
  106      * This method can be used to pass tokens from two different tees to one sink.
  107      */
  108     public void addSinkTokenStream(final SinkTokenStream sink) {
  109       // check that sink has correct factory
  110       if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
  111         throw new IllegalArgumentException("The supplied sink is not compatible to this tee");
  112       }
  113       // add eventually missing attribute impls to the existing sink
  114       for (Iterator<AttributeImpl> it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
  115         sink.addAttributeImpl(it.next());
  116       }
  117       this.sinks.add(new WeakReference<SinkTokenStream>(sink));
  118     }
  119     
  120     /**
  121      * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
  122      * when itself is consumed. To be sure, that all tokens from the input
  123      * stream are passed to the sinks, you can call this methods.
  124      * This instance is exhausted after this, but all sinks are instant available.
  125      */
  126     public void consumeAllTokens() throws IOException {
  127       while (incrementToken());
  128     }
  129     
  130     @Override
  131     public boolean incrementToken() throws IOException {
  132       if (input.incrementToken()) {
  133         // capture state lazily - maybe no SinkFilter accepts this state
  134         AttributeSource.State state = null;
  135         for (WeakReference<SinkTokenStream> ref : sinks) {
  136           final SinkTokenStream sink = ref.get();
  137           if (sink != null) {
  138             if (sink.accept(this)) {
  139               if (state == null) {
  140                 state = this.captureState();
  141               }
  142               sink.addState(state);
  143             }
  144           }
  145         }
  146         return true;
  147       }
  148       
  149       return false;
  150     }
  151     
  152     @Override
  153     public final void end() throws IOException {
  154       super.end();
  155       AttributeSource.State finalState = captureState();
  156       for (WeakReference<SinkTokenStream> ref : sinks) {
  157         final SinkTokenStream sink = ref.get();
  158         if (sink != null) {
  159           sink.setFinalState(finalState);
  160         }
  161       }
  162     }
  163     
  164     /**
  165      * A filter that decides which {@link AttributeSource} states to store in the sink.
  166      */
  167     public static abstract class SinkFilter {
  168       /**
  169        * Returns true, iff the current state of the passed-in {@link AttributeSource} shall be stored
  170        * in the sink. 
  171        */
  172       public abstract boolean accept(AttributeSource source);
  173       
  174       /**
  175        * Called by {@link SinkTokenStream#reset()}. This method does nothing by default
  176        * and can optionally be overridden.
  177        */
  178       public void reset() throws IOException {
  179         // nothing to do; can be overridden
  180       }
  181     }
  182     
  183     public static final class SinkTokenStream extends TokenStream {
  184       private final List<AttributeSource.State> cachedStates = new LinkedList<AttributeSource.State>();
  185       private AttributeSource.State finalState;
  186       private Iterator<AttributeSource.State> it = null;
  187       private SinkFilter filter;
  188       
  189       private SinkTokenStream(AttributeSource source, SinkFilter filter) {
  190         super(source);
  191         this.filter = filter;
  192       }
  193       
  194       private boolean accept(AttributeSource source) {
  195         return filter.accept(source);
  196       }
  197       
  198       private void addState(AttributeSource.State state) {
  199         if (it != null) {
  200           throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
  201         }
  202         cachedStates.add(state);
  203       }
  204       
  205       private void setFinalState(AttributeSource.State finalState) {
  206         this.finalState = finalState;
  207       }
  208       
  209       @Override
  210       public final boolean incrementToken() throws IOException {
  211         // lazy init the iterator
  212         if (it == null) {
  213           it = cachedStates.iterator();
  214         }
  215       
  216         if (!it.hasNext()) {
  217           return false;
  218         }
  219         
  220         AttributeSource.State state = it.next();
  221         restoreState(state);
  222         return true;
  223       }
  224     
  225       @Override
  226       public final void end() throws IOException {
  227         if (finalState != null) {
  228           restoreState(finalState);
  229         }
  230       }
  231       
  232       @Override
  233       public final void reset() {
  234         it = cachedStates.iterator();
  235       }
  236     }
  237       
  238     private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
  239       @Override
  240       public boolean accept(AttributeSource source) {
  241         return true;
  242       }
  243     };
  244     
  245   }
