Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » search » highlight » [javadoc | source]
    1   package org.apache.lucene.search.highlight;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   import java.io.IOException;
   20   import java.util.ArrayList;
   21   import java.util.Collection;
   22   import java.util.HashMap;
   23   import java.util.HashSet;
   24   import java.util.Iterator;
   25   import java.util.List;
   26   import java.util.Map;
   27   import java.util.Set;
   28   
   29   import org.apache.lucene.analysis.CachingTokenFilter;
   30   import org.apache.lucene.analysis.TokenStream;
   31   import org.apache.lucene.index.FilterIndexReader;
   32   import org.apache.lucene.index.IndexReader;
   33   import org.apache.lucene.index.Term;
   34   import org.apache.lucene.index.TermEnum;
   35   import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.*;
   37   import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
   38   import org.apache.lucene.search.spans.SpanFirstQuery;
   39   import org.apache.lucene.search.spans.SpanNearQuery;
   40   import org.apache.lucene.search.spans.SpanNotQuery;
   41   import org.apache.lucene.search.spans.SpanOrQuery;
   42   import org.apache.lucene.search.spans.SpanQuery;
   43   import org.apache.lucene.search.spans.SpanTermQuery;
   44   import org.apache.lucene.search.spans.Spans;
   45   import org.apache.lucene.util.StringHelper;
   46   
   47   /**
   48    * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether 
   49    * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}.
   50    */
   51   public class WeightedSpanTermExtractor {
   52   
   53     private String fieldName;
   54     private TokenStream tokenStream;
   55     private Map<String,IndexReader> readers = new HashMap<String,IndexReader>(10); 
   56     private String defaultField;
   57     private boolean expandMultiTermQuery;
   58     private boolean cachedTokenStream;
   59     private boolean wrapToCaching = true;
   60   
  /** Creates an extractor with no default field. */
  public WeightedSpanTermExtractor() {
  }
   63   
  /**
   * Creates an extractor that also treats terms of the given default field as matches.
   *
   * @param defaultField
   *          field name matched in addition to the requested field; interned so
   *          that identity comparison works in {@code fieldNameComparator}
   */
  public WeightedSpanTermExtractor(String defaultField) {
    if (defaultField != null) {
      this.defaultField = StringHelper.intern(defaultField);
    }
  }
   69   
   70     private void closeReaders() {
   71       Collection<IndexReader> readerSet = readers.values();
   72   
   73       for (final IndexReader reader : readerSet) {
   74         try {
   75           reader.close();
   76         } catch (IOException e) {
   77           // alert?
   78         }
   79       }
   80     }
   81   
   82     /**
   83      * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>Query</code>.
   84      * 
   85      * @param query
   86      *          Query to extract Terms from
   87      * @param terms
   88      *          Map to place created WeightedSpanTerms in
   89      * @throws IOException
   90      */
   91     private void extract(Query query, Map<String,WeightedSpanTerm> terms) throws IOException {
   92       if (query instanceof BooleanQuery) {
   93         BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
   94   
   95         for (int i = 0; i < queryClauses.length; i++) {
   96           if (!queryClauses[i].isProhibited()) {
   97             extract(queryClauses[i].getQuery(), terms);
   98           }
   99         }
  100       } else if (query instanceof PhraseQuery) {
  101         PhraseQuery phraseQuery = ((PhraseQuery) query);
  102         Term[] phraseQueryTerms = phraseQuery.getTerms();
  103         SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];
  104         for (int i = 0; i < phraseQueryTerms.length; i++) {
  105           clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
  106         }
  107         int slop = phraseQuery.getSlop();
  108         int[] positions = phraseQuery.getPositions();
  109         // add largest position increment to slop
  110         if (positions.length > 0) {
  111           int lastPos = positions[0];
  112           int largestInc = 0;
  113           int sz = positions.length;
  114           for (int i = 1; i < sz; i++) {
  115             int pos = positions[i];
  116             int inc = pos - lastPos;
  117             if (inc > largestInc) {
  118               largestInc = inc;
  119             }
  120             lastPos = pos;
  121           }
  122           if(largestInc > 1) {
  123             slop += largestInc;
  124           }
  125         }
  126   
  127         boolean inorder = false;
  128   
  129         if (slop == 0) {
  130           inorder = true;
  131         }
  132   
  133         SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
  134         sp.setBoost(query.getBoost());
  135         extractWeightedSpanTerms(terms, sp);
  136       } else if (query instanceof TermQuery) {
  137         extractWeightedTerms(terms, query);
  138       } else if (query instanceof SpanQuery) {
  139         extractWeightedSpanTerms(terms, (SpanQuery) query);
  140       } else if (query instanceof FilteredQuery) {
  141         extract(((FilteredQuery) query).getQuery(), terms);
  142       } else if (query instanceof DisjunctionMaxQuery) {
  143         for (Iterator<Query> iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) {
  144           extract(iterator.next(), terms);
  145         }
  146       } else if (query instanceof MultiTermQuery && expandMultiTermQuery) {
  147         MultiTermQuery mtq = ((MultiTermQuery)query);
  148         if(mtq.getRewriteMethod() != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) {
  149           mtq = (MultiTermQuery) mtq.clone();
  150           mtq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
  151           query = mtq;
  152         }
  153         FakeReader fReader = new FakeReader();
  154         MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.rewrite(fReader, mtq);
  155         if (fReader.field != null) {
  156           IndexReader ir = getReaderForField(fReader.field);
  157           extract(query.rewrite(ir), terms);
  158         }
  159       } else if (query instanceof MultiPhraseQuery) {
  160         final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
  161         final List<Term[]> termArrays = mpq.getTermArrays();
  162         final int[] positions = mpq.getPositions();
  163         if (positions.length > 0) {
  164   
  165           int maxPosition = positions[positions.length - 1];
  166           for (int i = 0; i < positions.length - 1; ++i) {
  167             if (positions[i] > maxPosition) {
  168               maxPosition = positions[i];
  169             }
  170           }
  171   
  172           final List<SpanQuery>[] disjunctLists = new List[maxPosition + 1];
  173           int distinctPositions = 0;
  174   
  175           for (int i = 0; i < termArrays.size(); ++i) {
  176             final Term[] termArray = termArrays.get(i);
  177             List<SpanQuery> disjuncts = disjunctLists[positions[i]];
  178             if (disjuncts == null) {
  179               disjuncts = (disjunctLists[positions[i]] = new ArrayList<SpanQuery>(termArray.length));
  180               ++distinctPositions;
  181             }
  182             for (int j = 0; j < termArray.length; ++j) {
  183               disjuncts.add(new SpanTermQuery(termArray[j]));
  184             }
  185           }
  186   
  187           int positionGaps = 0;
  188           int position = 0;
  189           final SpanQuery[] clauses = new SpanQuery[distinctPositions];
  190           for (int i = 0; i < disjunctLists.length; ++i) {
  191             List<SpanQuery> disjuncts = disjunctLists[i];
  192             if (disjuncts != null) {
  193               clauses[position++] = new SpanOrQuery(disjuncts
  194                   .toArray(new SpanQuery[disjuncts.size()]));
  195             } else {
  196               ++positionGaps;
  197             }
  198           }
  199   
  200           final int slop = mpq.getSlop();
  201           final boolean inorder = (slop == 0);
  202   
  203           SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
  204           sp.setBoost(query.getBoost());
  205           extractWeightedSpanTerms(terms, sp);
  206         }
  207       }
  208     }
  209   
  210     /**
  211      * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>SpanQuery</code>.
  212      * 
  213      * @param terms
  214      *          Map to place created WeightedSpanTerms in
  215      * @param spanQuery
  216      *          SpanQuery to extract Terms from
  217      * @throws IOException
  218      */
  219     private void extractWeightedSpanTerms(Map<String,WeightedSpanTerm> terms, SpanQuery spanQuery) throws IOException {
  220       Set<String> fieldNames;
  221   
  222       if (fieldName == null) {
  223         fieldNames = new HashSet<String>();
  224         collectSpanQueryFields(spanQuery, fieldNames);
  225       } else {
  226         fieldNames = new HashSet<String>(1);
  227         fieldNames.add(fieldName);
  228       }
  229       // To support the use of the default field name
  230       if (defaultField != null) {
  231         fieldNames.add(defaultField);
  232       }
  233       
  234       Map<String, SpanQuery> queries = new HashMap<String, SpanQuery>();
  235    
  236       Set<Term> nonWeightedTerms = new HashSet<Term>();
  237       final boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
  238       if (mustRewriteQuery) {
  239         for (final String field : fieldNames) {
  240           final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getReaderForField(field));
  241           queries.put(field, rewrittenQuery);
  242           rewrittenQuery.extractTerms(nonWeightedTerms);
  243         }
  244       } else {
  245         spanQuery.extractTerms(nonWeightedTerms);
  246       }
  247   
  248       List<PositionSpan> spanPositions = new ArrayList<PositionSpan>();
  249   
  250       for (final String field : fieldNames) {
  251   
  252         IndexReader reader = getReaderForField(field);
  253         final Spans spans;
  254         if (mustRewriteQuery) {
  255           spans = queries.get(field).getSpans(reader);
  256         } else {
  257           spans = spanQuery.getSpans(reader);
  258         }
  259   
  260   
  261         // collect span positions
  262         while (spans.next()) {
  263           spanPositions.add(new PositionSpan(spans.start(), spans.end() - 1));
  264         }
  265         
  266       }
  267   
  268       if (spanPositions.size() == 0) {
  269         // no spans found
  270         return;
  271       }
  272   
  273       for (final Term queryTerm :  nonWeightedTerms) {
  274   
  275         if (fieldNameComparator(queryTerm.field())) {
  276           WeightedSpanTerm weightedSpanTerm = terms.get(queryTerm.text());
  277   
  278           if (weightedSpanTerm == null) {
  279             weightedSpanTerm = new WeightedSpanTerm(spanQuery.getBoost(), queryTerm.text());
  280             weightedSpanTerm.addPositionSpans(spanPositions);
  281             weightedSpanTerm.positionSensitive = true;
  282             terms.put(queryTerm.text(), weightedSpanTerm);
  283           } else {
  284             if (spanPositions.size() > 0) {
  285               weightedSpanTerm.addPositionSpans(spanPositions);
  286             }
  287           }
  288         }
  289       }
  290     }
  291   
  292     /**
  293      * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>Query</code>.
  294      * 
  295      * @param terms
  296      *          Map to place created WeightedSpanTerms in
  297      * @param query
  298      *          Query to extract Terms from
  299      * @throws IOException
  300      */
  301     private void extractWeightedTerms(Map<String,WeightedSpanTerm> terms, Query query) throws IOException {
  302       Set<Term> nonWeightedTerms = new HashSet<Term>();
  303       query.extractTerms(nonWeightedTerms);
  304   
  305       for (final Term queryTerm : nonWeightedTerms) {
  306   
  307         if (fieldNameComparator(queryTerm.field())) {
  308           WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), queryTerm.text());
  309           terms.put(queryTerm.text(), weightedSpanTerm);
  310         }
  311       }
  312     }
  313   
  314     /**
  315      * Necessary to implement matches for queries against <code>defaultField</code>
  316      */
  317     private boolean fieldNameComparator(String fieldNameToCheck) {
  318       boolean rv = fieldName == null || fieldNameToCheck == fieldName
  319           || fieldNameToCheck == defaultField;
  320       return rv;
  321     }
  322   
  /**
   * Returns a (cached) {@link IndexReader} over a one-document {@link MemoryIndex}
   * in which the token stream has been indexed under the given field.
   */
  private IndexReader getReaderForField(String field) throws IOException {
    // Wrap the stream in a CachingTokenFilter so it can be replayed for each
    // field, unless the caller opted out or the stream already caches.
    if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
      tokenStream = new CachingTokenFilter(tokenStream);
      cachedTokenStream = true;
    }
    IndexReader reader = readers.get(field);
    if (reader == null) {
      MemoryIndex indexer = new MemoryIndex();
      indexer.addField(field, tokenStream);
      // Reset after consumption so the (caching) stream can be replayed for the next field.
      tokenStream.reset();
      IndexSearcher searcher = indexer.createSearcher();
      reader = searcher.getIndexReader();
      readers.put(field, reader);
    }

    return reader;
  }
  340   
  341     /**
  342      * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
  343      * 
  344      * <p>
  345      * 
  346      * @param query
  347      *          that caused hit
  348      * @param tokenStream
  349      *          of text to be highlighted
  350      * @return Map containing WeightedSpanTerms
  351      * @throws IOException
  352      */
  353     public Map<String,WeightedSpanTerm> getWeightedSpanTerms(Query query, TokenStream tokenStream)
  354         throws IOException {
  355       return getWeightedSpanTerms(query, tokenStream, null);
  356     }
  357   
  358     /**
  359      * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
  360      * 
  361      * <p>
  362      * 
  363      * @param query
  364      *          that caused hit
  365      * @param tokenStream
  366      *          of text to be highlighted
  367      * @param fieldName
  368      *          restricts Term's used based on field name
  369      * @return Map containing WeightedSpanTerms
  370      * @throws IOException
  371      */
  372     public Map<String,WeightedSpanTerm> getWeightedSpanTerms(Query query, TokenStream tokenStream,
  373         String fieldName) throws IOException {
  374       if (fieldName != null) {
  375         this.fieldName = StringHelper.intern(fieldName);
  376       } else {
  377         this.fieldName = null;
  378       }
  379   
  380       Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<String>();
  381       this.tokenStream = tokenStream;
  382       try {
  383         extract(query, terms);
  384       } finally {
  385         closeReaders();
  386       }
  387   
  388       return terms;
  389     }
  390   
  391     /**
  392      * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
  393      * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
  394      * 
  395      * <p>
  396      * 
  397      * @param query
  398      *          that caused hit
  399      * @param tokenStream
  400      *          of text to be highlighted
  401      * @param fieldName
  402      *          restricts Term's used based on field name
  403      * @param reader
  404      *          to use for scoring
  405      * @return Map of WeightedSpanTerms with quasi tf/idf scores
  406      * @throws IOException
  407      */
  408     public Map<String,WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName,
  409         IndexReader reader) throws IOException {
  410       if (fieldName != null) {
  411         this.fieldName = StringHelper.intern(fieldName);
  412       } else {
  413         this.fieldName = null;
  414       }
  415       this.tokenStream = tokenStream;
  416   
  417       Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<String>();
  418       extract(query, terms);
  419   
  420       int totalNumDocs = reader.numDocs();
  421       Set<String> weightedTerms = terms.keySet();
  422       Iterator<String> it = weightedTerms.iterator();
  423   
  424       try {
  425         while (it.hasNext()) {
  426           WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
  427           int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
  428           // docFreq counts deletes
  429           if(totalNumDocs < docFreq) {
  430             docFreq = totalNumDocs;
  431           }
  432           // IDF algorithm taken from DefaultSimilarity class
  433           float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
  434           weightedSpanTerm.weight *= idf;
  435         }
  436       } finally {
  437   
  438         closeReaders();
  439       }
  440   
  441       return terms;
  442     }
  443     
  444     private void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
  445       if (spanQuery instanceof FieldMaskingSpanQuery) {
  446         collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
  447       } else if (spanQuery instanceof SpanFirstQuery) {
  448         collectSpanQueryFields(((SpanFirstQuery)spanQuery).getMatch(), fieldNames);
  449       } else if (spanQuery instanceof SpanNearQuery) {
  450         for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) {
  451           collectSpanQueryFields(clause, fieldNames);
  452         }
  453       } else if (spanQuery instanceof SpanNotQuery) {
  454         collectSpanQueryFields(((SpanNotQuery)spanQuery).getInclude(), fieldNames);
  455       } else if (spanQuery instanceof SpanOrQuery) {
  456         for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) {
  457           collectSpanQueryFields(clause, fieldNames);
  458         }
  459       } else {
  460         fieldNames.add(spanQuery.getField());
  461       }
  462     }
  463     
  464     private boolean mustRewriteQuery(SpanQuery spanQuery) {
  465       if (!expandMultiTermQuery) {
  466         return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
  467       } else if (spanQuery instanceof FieldMaskingSpanQuery) {
  468         return mustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery());
  469       } else if (spanQuery instanceof SpanFirstQuery) {
  470         return mustRewriteQuery(((SpanFirstQuery)spanQuery).getMatch());
  471       } else if (spanQuery instanceof SpanNearQuery) {
  472         for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) {
  473           if (mustRewriteQuery(clause)) {
  474             return true;
  475           }
  476         }
  477         return false; 
  478       } else if (spanQuery instanceof SpanNotQuery) {
  479         SpanNotQuery spanNotQuery = (SpanNotQuery)spanQuery;
  480         return mustRewriteQuery(spanNotQuery.getInclude()) || mustRewriteQuery(spanNotQuery.getExclude());
  481       } else if (spanQuery instanceof SpanOrQuery) {
  482         for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) {
  483           if (mustRewriteQuery(clause)) {
  484             return true;
  485           }
  486         }
  487         return false; 
  488       } else if (spanQuery instanceof SpanTermQuery) {
  489         return false;
  490       } else {
  491         return true;
  492       }
  493     }
  494     
  495     /**
  496      * This class makes sure that if both position sensitive and insensitive
  497      * versions of the same term are added, the position insensitive one wins.
  498      */
  499     static private class PositionCheckingMap<K> extends HashMap<K,WeightedSpanTerm> {
  500   
  501       @Override
  502       public void putAll(Map m) {
  503         Iterator<Map.Entry<K, WeightedSpanTerm>> it = m.entrySet().iterator();
  504         while (it.hasNext()) {
  505           Map.Entry<K, WeightedSpanTerm> entry = it.next();
  506           this.put(entry.getKey(), entry.getValue());
  507         }
  508       }
  509   
  510       @Override
  511       public WeightedSpanTerm put(K key, WeightedSpanTerm value) {
  512         WeightedSpanTerm prev = super.put(key, value);
  513         if (prev == null) return prev;
  514         WeightedSpanTerm prevTerm = prev;
  515         WeightedSpanTerm newTerm = value;
  516         if (!prevTerm.positionSensitive) {
  517           newTerm.positionSensitive = false;
  518         }
  519         return prev;
  520       }
  521       
  522     }
  523     
  /** Returns true if multi-term queries will be expanded (rewritten) during extraction. */
  public boolean getExpandMultiTermQuery() {
    return expandMultiTermQuery;
  }
  527   
  /**
   * Controls whether multi-term queries are expanded (rewritten against an
   * in-memory index) so their matching terms can be highlighted.
   */
  public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
    this.expandMultiTermQuery = expandMultiTermQuery;
  }
  531     
  /** Returns true if the token stream was wrapped in a {@link CachingTokenFilter} by this extractor. */
  public boolean isCachedTokenStream() {
    return cachedTokenStream;
  }
  535     
  /** Returns the token stream in use — possibly a caching wrapper around the one supplied by the caller. */
  public TokenStream getTokenStream() {
    return tokenStream;
  }
  539     
  540     /**
  541      * By default, {@link TokenStream}s that are not of the type
  542      * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
  543      * ensure an efficient reset - if you are already using a different caching
  544      * {@link TokenStream} impl and you don't want it to be wrapped, set this to
  545      * false.
  546      * 
  547      * @param wrap
  548      */
  549     public void setWrapIfNotCachingTokenFilter(boolean wrap) {
  550       this.wrapToCaching = wrap;
  551     }
  552     
  553     /**
  554      * 
  555      * A fake IndexReader class to extract the field from a MultiTermQuery
  556      * 
  557      */
  558     static final class FakeReader extends FilterIndexReader {
  559   
  560       private static final IndexReader EMPTY_MEMORY_INDEX_READER =
  561         new MemoryIndex().createSearcher().getIndexReader();
  562       
  563       String field;
  564   
  565       FakeReader() {
  566         super(EMPTY_MEMORY_INDEX_READER);
  567       }
  568   
  569       @Override
  570       public TermEnum terms(final Term t) throws IOException {
  571         // only set first fieldname, maybe use a Set?
  572         if (t != null && field == null)
  573           field = t.field();
  574         return super.terms(t);
  575       }
  576   
  577   
  578     }
  579   
  580   }

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » search » highlight » [javadoc | source]