Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » search » highlight » [javadoc | source]
    1   package org.apache.lucene.search.highlight;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import java.io.IOException;
   21   import java.util.HashMap;
   22   import java.util.HashSet;
   23   import java.util.Map;
   24   import java.util.Set;
   25   
   26   import org.apache.lucene.analysis.CachingTokenFilter;
   27   import org.apache.lucene.analysis.TokenStream;
   28   import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
   29   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
   30   import org.apache.lucene.index.IndexReader;
   31   import org.apache.lucene.index.memory.MemoryIndex;
   32   import org.apache.lucene.search.Query;
   33   import org.apache.lucene.search.spans.SpanQuery;
   34   import org.apache.lucene.util.StringHelper;
   35   
   36   /**
   37    * {@link Scorer} implementation which scores text fragments by the number of
   38    * unique query terms found. This class converts appropriate {@link Query}s to
   39    * {@link SpanQuery}s and attempts to score only those terms that participated in
   40    * generating the 'hit' on the document.
   41    */
   42   public class QueryScorer implements Scorer {
   43     private float totalScore;
   44     private Set<String> foundTerms;
   45     private Map<String,WeightedSpanTerm> fieldWeightedSpanTerms;
   46     private float maxTermWeight;
   47     private int position = -1;
   48     private String defaultField;
   49     private TermAttribute termAtt;
   50     private PositionIncrementAttribute posIncAtt;
   51     private boolean expandMultiTermQuery = true;
   52     private Query query;
   53     private String field;
   54     private IndexReader reader;
   55     private boolean skipInitExtractor;
   56     private boolean wrapToCaching = true;
   57   
   58     /**
   59      * @param query Query to use for highlighting
   60      */
   61     public QueryScorer(Query query) {
   62       init(query, null, null, true);
   63     }
   64   
   65     /**
   66      * @param query Query to use for highlighting
   67      * @param field Field to highlight - pass null to ignore fields
   68      */
   69     public QueryScorer(Query query, String field) {
   70       init(query, field, null, true);
   71     }
   72   
   73     /**
   74      * @param query Query to use for highlighting
   75      * @param field Field to highlight - pass null to ignore fields
   76      * @param reader {@link IndexReader} to use for quasi tf/idf scoring
   77      */
   78     public QueryScorer(Query query, IndexReader reader, String field) {
   79       init(query, field, reader, true);
   80     }
   81   
   82   
   83     /**
   84      * @param query to use for highlighting
   85      * @param reader {@link IndexReader} to use for quasi tf/idf scoring
   86      * @param field to highlight - pass null to ignore fields
   87      * @param defaultField
   88      */
   89     public QueryScorer(Query query, IndexReader reader, String field, String defaultField) {
   90       this.defaultField = StringHelper.intern(defaultField);
   91       init(query, field, reader, true);
   92     }
   93   
   94     /**
   95      * @param defaultField - The default field for queries with the field name unspecified
   96      */
   97     public QueryScorer(Query query, String field, String defaultField) {
   98       this.defaultField = StringHelper.intern(defaultField);
   99       init(query, field, null, true);
  100     }
  101   
  102     /**
  103      * @param weightedTerms an array of pre-created {@link WeightedSpanTerm}s
  104      */
  105     public QueryScorer(WeightedSpanTerm[] weightedTerms) {
  106       this.fieldWeightedSpanTerms = new HashMap<String,WeightedSpanTerm>(weightedTerms.length);
  107   
  108       for (int i = 0; i < weightedTerms.length; i++) {
  109         WeightedSpanTerm existingTerm = fieldWeightedSpanTerms.get(weightedTerms[i].term);
  110   
  111         if ((existingTerm == null) ||
  112               (existingTerm.weight < weightedTerms[i].weight)) {
  113           // if a term is defined more than once, always use the highest
  114           // scoring weight
  115           fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]);
  116           maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
  117         }
  118       }
  119       skipInitExtractor = true;
  120     }
  121   
  122     /*
  123      * (non-Javadoc)
  124      *
  125      * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
  126      */
  127     public float getFragmentScore() {
  128       return totalScore;
  129     }
  130   
  131     /**
  132      *
  133      * @return The highest weighted term (useful for passing to
  134      *         GradientFormatter to set top end of coloring scale).
  135      */
  136     public float getMaxTermWeight() {
  137       return maxTermWeight;
  138     }
  139   
  140     /*
  141      * (non-Javadoc)
  142      *
  143      * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
  144      *      int)
  145      */
  146     public float getTokenScore() {
  147       position += posIncAtt.getPositionIncrement();
  148       String termText = termAtt.term();
  149   
  150       WeightedSpanTerm weightedSpanTerm;
  151   
  152       if ((weightedSpanTerm = fieldWeightedSpanTerms.get(
  153                 termText)) == null) {
  154         return 0;
  155       }
  156   
  157       if (weightedSpanTerm.positionSensitive &&
  158             !weightedSpanTerm.checkPosition(position)) {
  159         return 0;
  160       }
  161   
  162       float score = weightedSpanTerm.getWeight();
  163   
  164       // found a query term - is it unique in this doc?
  165       if (!foundTerms.contains(termText)) {
  166         totalScore += score;
  167         foundTerms.add(termText);
  168       }
  169   
  170       return score;
  171     }
  172   
  173     /* (non-Javadoc)
  174      * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
  175      */
  176     public TokenStream init(TokenStream tokenStream) throws IOException {
  177       position = -1;
  178       termAtt = tokenStream.addAttribute(TermAttribute.class);
  179       posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  180       if(!skipInitExtractor) {
  181         if(fieldWeightedSpanTerms != null) {
  182           fieldWeightedSpanTerms.clear();
  183         }
  184         return initExtractor(tokenStream);
  185       }
  186       return null;
  187     }
  188     
  189     /**
  190      * Retrieve the {@link WeightedSpanTerm} for the specified token. Useful for passing
  191      * Span information to a {@link Fragmenter}.
  192      *
  193      * @param token to get {@link WeightedSpanTerm} for
  194      * @return WeightedSpanTerm for token
  195      */
  196     public WeightedSpanTerm getWeightedSpanTerm(String token) {
  197       return fieldWeightedSpanTerms.get(token);
  198     }
  199   
  200     /**
  201      */
  202     private void init(Query query, String field, IndexReader reader, boolean expandMultiTermQuery) {
  203       this.reader = reader;
  204       this.expandMultiTermQuery = expandMultiTermQuery;
  205       this.query = query;
  206       this.field = field;
  207     }
  208     
  209     private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
  210       WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
  211           : new WeightedSpanTermExtractor(defaultField);
  212   
  213       qse.setExpandMultiTermQuery(expandMultiTermQuery);
  214       qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
  215       if (reader == null) {
  216         this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
  217             tokenStream, field);
  218       } else {
  219         this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query,
  220             tokenStream, field, reader);
  221       }
  222       if(qse.isCachedTokenStream()) {
  223         return qse.getTokenStream();
  224       }
  225       
  226       return null;
  227     }
  228   
  229     /*
  230      * (non-Javadoc)
  231      *
  232      * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
  233      */
  234     public void startFragment(TextFragment newFragment) {
  235       foundTerms = new HashSet<String>();
  236       totalScore = 0;
  237     }
  238     
  239     /**
  240      * @return true if multi-term queries should be expanded
  241      */
  242     public boolean isExpandMultiTermQuery() {
  243       return expandMultiTermQuery;
  244     }
  245   
  246     /**
  247      * Controls whether or not multi-term queries are expanded
  248      * against a {@link MemoryIndex} {@link IndexReader}.
  249      * 
  250      * @param expandMultiTermQuery true if multi-term queries should be expanded
  251      */
  252     public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
  253       this.expandMultiTermQuery = expandMultiTermQuery;
  254     }
  255     
  256     /**
  257      * By default, {@link TokenStream}s that are not of the type
  258      * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
  259      * ensure an efficient reset - if you are already using a different caching
  260      * {@link TokenStream} impl and you don't want it to be wrapped, set this to
  261      * false.
  262      * 
  263      * @param wrap
  264      */
  265     public void setWrapIfNotCachingTokenFilter(boolean wrap) {
  266       this.wrapToCaching = wrap;
  267     }
  268   }

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » search » highlight » [javadoc | source]