/*
 * Created on 28-Oct-2004
 */
package org.apache.lucene.search.highlight;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;

/**
 * Hides implementation issues associated with obtaining a TokenStream for use with
 * the highlighter - can obtain one from TermFreqVectors with offsets and (optionally) positions,
 * or from an Analyzer re-parsing the stored content.
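 * <p>
 * A minimal usage sketch (the <code>reader</code>, <code>query</code>, <code>docId</code> and
 * <code>analyzer</code> names below are illustrative, not part of this class):
 * <pre>
 *   TokenStream ts = TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);
 *   Highlighter highlighter = new Highlighter(new QueryScorer(query));
 *   String fragment = highlighter.getBestFragment(ts, reader.document(docId).get("body"));
 * </pre>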
 */
public class TokenSources
{
  /**
   * A convenience method that first tries to get a TermPositionVector for the specified docId and
   * falls back to using the passed-in {@link org.apache.lucene.document.Document} to retrieve the
   * TokenStream. This is useful when you already have the document but would prefer to use the
   * vector first.
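   * <p>
   * A sketch of use when the document is already loaded (the <code>reader</code>, <code>docId</code>
   * and <code>analyzer</code> names are illustrative):
   * <pre>
   *   Document doc = reader.document(docId);
   *   TokenStream ts = TokenSources.getAnyTokenStream(reader, docId, "body", doc, analyzer);
   * </pre>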
   * @param reader The {@link org.apache.lucene.index.IndexReader} to try to get the vector from
   * @param docId The docId to retrieve.
   * @param field The field to retrieve on the document
   * @param doc The document to fall back on
   * @param analyzer The analyzer to use for creating the TokenStream if the vector doesn't exist
   * @return The {@link org.apache.lucene.analysis.TokenStream} for the {@link org.apache.lucene.document.Fieldable} on the {@link org.apache.lucene.document.Document}
   * @throws IOException if there was an error loading
   */
  public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Document doc, Analyzer analyzer) throws IOException {
    TokenStream ts = null;

    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv != null) {
      if (tfv instanceof TermPositionVector) {
        ts = getTokenStream((TermPositionVector) tfv);
      }
    }
    // No token info stored so fall back to analyzing raw content
    if (ts == null) {
      ts = getTokenStream(doc, field, analyzer);
    }
    return ts;
  }

  /**
   * A convenience method that tries a number of approaches to getting a token stream.
   * The cost of discovering that there are no term vectors in the index is minimal (1000
   * invocations still register 0 ms), so this flexible approach is acceptable in practice.
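   * <p>
   * For example (hypothetical <code>reader</code> and <code>analyzer</code>):
   * <pre>
   *   // uses the term vector if present, otherwise loads and re-analyzes the stored field
   *   TokenStream ts = TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);
   * </pre>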
   * @param reader The {@link org.apache.lucene.index.IndexReader} to try to get the vector from
   * @param docId The docId of the document to examine
   * @param field The field whose TokenStream is required
   * @param analyzer The analyzer to use if the stored content must be re-parsed
   * @return a TokenStream, rebuilt from the term vector where possible, otherwise obtained by
   *         re-analyzing the stored field content
   * @throws IOException if there was an error loading
   */
  public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException {
    TokenStream ts = null;

    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv != null) {
      if (tfv instanceof TermPositionVector) {
        ts = getTokenStream((TermPositionVector) tfv);
      }
    }
    // No token info stored so fall back to analyzing raw content
    if (ts == null) {
      ts = getTokenStream(reader, docId, field, analyzer);
    }
    return ts;
  }

  public static TokenStream getTokenStream(TermPositionVector tpv) {
    // assumes the worst and makes no assumptions about token position sequences.
    return getTokenStream(tpv, false);
  }

  /**
   * Low level API.
   * Returns a token stream or null if no offset info is available in the index.
   * This can be used to feed the highlighter with a pre-parsed token stream.
   *
   * In my tests the speeds to recreate 1000 token streams using this method are:
   * - with TermVector offset only data stored - 420 milliseconds
   * - with TermVector offset AND position data stored - 271 milliseconds
   *   (nb timings for TermVector with position data are based on a tokenizer with contiguous
   *   positions - no overlaps or gaps)
   * The cost of not using TermPositionVector to store pre-parsed content, and using an analyzer
   * to re-parse the original content, is:
   * - reanalyzing the original content - 980 milliseconds
   *
   * The re-analyze timings will typically vary depending on:
   * 1) The complexity of the analyzer code (timings above were using a
   *    stemmer/lowercaser/stopword combo)
   * 2) The number of other fields (Lucene reads ALL fields off the disk
   *    when accessing just one document field - this can cost dear!)
   * 3) Use of compression on field storage - could be faster due to compression (less disk IO)
   *    or slower (more CPU burn) depending on the content.
   *
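   * A sketch of typical use (hypothetical <code>reader</code> and <code>docId</code>; the field
   * must have been indexed with term vector offsets for a non-null result):
   * <pre>
   *   TermFreqVector tfv = reader.getTermFreqVector(docId, "body");
   *   if (tfv instanceof TermPositionVector) {
   *     TokenStream ts = TokenSources.getTokenStream((TermPositionVector) tfv, false);
   *   }
   * </pre>
   *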
   * @param tpv the TermPositionVector to rebuild the token stream from
   * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps
   *        or gaps. If looking to eke out the last drops of performance, set to true. If in doubt,
   *        set to false.
   */
  public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) {
    // an object used to iterate across an array of tokens
    class StoredTokenStream extends TokenStream {
      Token tokens[];
      int currentToken = 0;
      TermAttribute termAtt;
      OffsetAttribute offsetAtt;

      StoredTokenStream(Token tokens[]) {
        this.tokens = tokens;
        termAtt = addAttribute(TermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (currentToken >= tokens.length) {
          return false;
        }
        clearAttributes();
        Token token = tokens[currentToken++];
        termAtt.setTermBuffer(token.term());
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        return true;
      }
    }
    // code to reconstruct the original sequence of Tokens
    String[] terms = tpv.getTerms();
    int[] freq = tpv.getTermFrequencies();
    int totalTokens = 0;

    for (int t = 0; t < freq.length; t++) {
      totalTokens += freq[t];
    }
    Token tokensInOriginalOrder[] = new Token[totalTokens];
    ArrayList<Token> unsortedTokens = null;
    for (int t = 0; t < freq.length; t++) {
      TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
      if (offsets == null) {
        return null;
      }

      int[] pos = null;
      if (tokenPositionsGuaranteedContiguous) {
        // try to get the token position info to speed up assembly of tokens into sorted sequence
        pos = tpv.getTermPositions(t);
      }
      if (pos == null) {
        // tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
        if (unsortedTokens == null) {
          unsortedTokens = new ArrayList<Token>();
        }
        for (int tp = 0; tp < offsets.length; tp++) {
          Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
          token.setTermBuffer(terms[t]);
          unsortedTokens.add(token);
        }
      } else {
        // We have positions stored and a guarantee that the token position information is contiguous

        // This may be fast BUT won't work if Tokenizers are used which create >1 token in the same
        // position or create jumps in position numbers - this code would fail under those circumstances

        // tokens stored with positions - can use this to index straight into the sorted array
        for (int tp = 0; tp < pos.length; tp++) {
          Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
          tokensInOriginalOrder[pos[tp]] = token;
        }
      }
    }
    // If the field has been stored without position data we must perform a sort
    if (unsortedTokens != null) {
      tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens.size()]);
      Arrays.sort(tokensInOriginalOrder, new Comparator<Token>() {
        public int compare(Token t1, Token t2) {
          if (t1.startOffset() > t2.startOffset())
            return 1;
          if (t1.startOffset() < t2.startOffset())
            return -1;
          return 0;
        }
      });
    }
    return new StoredTokenStream(tokensInOriginalOrder);
  }

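  /**
   * Returns a token stream rebuilt from the field's term vector, throwing an
   * {@link IllegalArgumentException} if no term position data is stored for the field.
   * A sketch of use (hypothetical <code>reader</code> and <code>docId</code>):
   * <pre>
   *   TokenStream ts = TokenSources.getTokenStream(reader, docId, "body");
   * </pre>
   */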
  public static TokenStream getTokenStream(IndexReader reader, int docId, String field) throws IOException {
    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv == null) {
      throw new IllegalArgumentException(field + " in doc #" + docId
          + " does not have any term position data stored");
    }
    if (tfv instanceof TermPositionVector) {
      TermPositionVector tpv = (TermPositionVector) tfv;
      return getTokenStream(tpv);
    }
    throw new IllegalArgumentException(field + " in doc #" + docId
        + " does not have any term position data stored");
  }

  // convenience method
  public static TokenStream getTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException {
    Document doc = reader.document(docId);
    return getTokenStream(doc, field, analyzer);
  }

  public static TokenStream getTokenStream(Document doc, String field, Analyzer analyzer) {
    String contents = doc.get(field);
    if (contents == null) {
      throw new IllegalArgumentException("Field " + field + " in document is not stored and cannot be analyzed");
    }
    return getTokenStream(field, contents, analyzer);
  }
  /**
   * Convenience method that analyzes the supplied field contents directly with the given Analyzer.
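   * A sketch (the field name and text below are illustrative):
   * <pre>
   *   TokenStream ts = TokenSources.getTokenStream("body", "some stored text", analyzer);
   * </pre>
   */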
  public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer) {
    return analyzer.tokenStream(field, new StringReader(contents));
  }

}