Home » lucene-3.0.1-src » org.apache » lucene » search » spans » [javadoc | source]

    1   package org.apache.lucene.search.spans;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import org.apache.lucene.index.IndexReader;
   21   
   22   import java.io.IOException;
   23   import java.util.ArrayList;
   24   import java.util.Arrays;
   25   import java.util.Comparator;
   26   import java.util.HashSet;
   27   import java.util.LinkedList;
   28   import java.util.List;
   29   import java.util.Collection;
   30   import java.util.Set;
   31   
   32   /** A Spans that is formed from the ordered subspans of a SpanNearQuery
   33    * where the subspans do not overlap and have a maximum slop between them.
   34    * <p>
   35    * The formed spans only contains minimum slop matches.<br>
   36    * The matching slop is computed from the distance(s) between
   37    * the non overlapping matching Spans.<br>
   38    * Successive matches are always formed from the successive Spans
   39    * of the SpanNearQuery.
   40    * <p>
   41    * The formed spans may contain overlaps when the slop is at least 1.
   42    * For example, when querying using
   43    * <pre>t1 t2 t3</pre>
   44    * with slop at least 1, the fragment:
   45    * <pre>t1 t2 t1 t3 t2 t3</pre>
   46    * matches twice:
   47    * <pre>t1 t2 .. t3      </pre>
   48    * <pre>      t1 .. t2 t3</pre>
   49    *
   50    *
   51    * Expert:
   52    * Only public for subclassing.  Most implementations should not need this class
   53    */
   54   public class NearSpansOrdered extends Spans {
   55     private final int allowedSlop;
   56     private boolean firstTime = true;
   57     private boolean more = false;
   58   
   59     /** The spans in the same order as the SpanNearQuery */
   60     private final Spans[] subSpans;
   61   
   62     /** Indicates that all subSpans have same doc() */
   63     private boolean inSameDoc = false;
   64   
   65     private int matchDoc = -1;
   66     private int matchStart = -1;
   67     private int matchEnd = -1;
   68     private List<byte[]> matchPayload;
   69   
   70     private final Spans[] subSpansByDoc;
   71     private final Comparator<Spans> spanDocComparator = new Comparator<Spans>() {
   72       public int compare(Spans o1, Spans o2) {
   73         return o1.doc() - o2.doc();
   74       }
   75     };
   76     
   77     private SpanNearQuery query;
   78     private boolean collectPayloads = true;
   79     
   80     public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader) throws IOException {
   81       this(spanNearQuery, reader, true);
   82     }
   83   
   84     public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, boolean collectPayloads)
   85     throws IOException {
   86       if (spanNearQuery.getClauses().length < 2) {
   87         throw new IllegalArgumentException("Less than 2 clauses: "
   88                                            + spanNearQuery);
   89       }
   90       this.collectPayloads = collectPayloads;
   91       allowedSlop = spanNearQuery.getSlop();
   92       SpanQuery[] clauses = spanNearQuery.getClauses();
   93       subSpans = new Spans[clauses.length];
   94       matchPayload = new LinkedList<byte[]>();
   95       subSpansByDoc = new Spans[clauses.length];
   96       for (int i = 0; i < clauses.length; i++) {
   97         subSpans[i] = clauses[i].getSpans(reader);
   98         subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
   99       }
  100       query = spanNearQuery; // kept for toString() only.
  101     }
  102   
  103     // inherit javadocs
  104     @Override
  105     public int doc() { return matchDoc; }
  106   
  107     // inherit javadocs
  108     @Override
  109     public int start() { return matchStart; }
  110   
  111     // inherit javadocs
  112     @Override
  113     public int end() { return matchEnd; }
  114     
  115     public Spans[] getSubSpans() {
  116   	  return subSpans;
  117     }  
  118   
  119     // TODO: Remove warning after API has been finalized
  120     // TODO: Would be nice to be able to lazy load payloads
  121     @Override
  122     public Collection<byte[]> getPayload() throws IOException {
  123       return matchPayload;
  124     }
  125   
  126     // TODO: Remove warning after API has been finalized
  127     @Override
  128     public boolean isPayloadAvailable() {
  129       return matchPayload.isEmpty() == false;
  130     }
  131   
  132     // inherit javadocs
  133     @Override
  134     public boolean next() throws IOException {
  135       if (firstTime) {
  136         firstTime = false;
  137         for (int i = 0; i < subSpans.length; i++) {
  138           if (! subSpans[i].next()) {
  139             more = false;
  140             return false;
  141           }
  142         }
  143         more = true;
  144       }
  145       if(collectPayloads) {
  146         matchPayload.clear();
  147       }
  148       return advanceAfterOrdered();
  149     }
  150   
  151     // inherit javadocs
  152     @Override
  153     public boolean skipTo(int target) throws IOException {
  154       if (firstTime) {
  155         firstTime = false;
  156         for (int i = 0; i < subSpans.length; i++) {
  157           if (! subSpans[i].skipTo(target)) {
  158             more = false;
  159             return false;
  160           }
  161         }
  162         more = true;
  163       } else if (more && (subSpans[0].doc() < target)) {
  164         if (subSpans[0].skipTo(target)) {
  165           inSameDoc = false;
  166         } else {
  167           more = false;
  168           return false;
  169         }
  170       }
  171       if(collectPayloads) {
  172         matchPayload.clear();
  173       }
  174       return advanceAfterOrdered();
  175     }
  176     
  177     /** Advances the subSpans to just after an ordered match with a minimum slop
  178      * that is smaller than the slop allowed by the SpanNearQuery.
  179      * @return true iff there is such a match.
  180      */
  181     private boolean advanceAfterOrdered() throws IOException {
  182       while (more && (inSameDoc || toSameDoc())) {
  183         if (stretchToOrder() && shrinkToAfterShortestMatch()) {
  184           return true;
  185         }
  186       }
  187       return false; // no more matches
  188     }
  189   
  190   
  191     /** Advance the subSpans to the same document */
  192     private boolean toSameDoc() throws IOException {
  193       Arrays.sort(subSpansByDoc, spanDocComparator);
  194       int firstIndex = 0;
  195       int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc();
  196       while (subSpansByDoc[firstIndex].doc() != maxDoc) {
  197         if (! subSpansByDoc[firstIndex].skipTo(maxDoc)) {
  198           more = false;
  199           inSameDoc = false;
  200           return false;
  201         }
  202         maxDoc = subSpansByDoc[firstIndex].doc();
  203         if (++firstIndex == subSpansByDoc.length) {
  204           firstIndex = 0;
  205         }
  206       }
  207       for (int i = 0; i < subSpansByDoc.length; i++) {
  208         assert (subSpansByDoc[i].doc() == maxDoc)
  209                : " NearSpansOrdered.toSameDoc() spans " + subSpansByDoc[0]
  210                                    + "\n at doc " + subSpansByDoc[i].doc()
  211                                    + ", but should be at " + maxDoc;
  212       }
  213       inSameDoc = true;
  214       return true;
  215     }
  216     
  217     /** Check whether two Spans in the same document are ordered.
  218      * @param spans1 
  219      * @param spans2 
  220      * @return true iff spans1 starts before spans2
  221      *              or the spans start at the same position,
  222      *              and spans1 ends before spans2.
  223      */
  224     static final boolean docSpansOrdered(Spans spans1, Spans spans2) {
  225       assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc() + " != doc2 " + spans2.doc();
  226       int start1 = spans1.start();
  227       int start2 = spans2.start();
  228       /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */
  229       return (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2);
  230     }
  231   
  232     /** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans
  233      * starts and ends as parameters.
  234      */
  235     private static final boolean docSpansOrdered(int start1, int end1, int start2, int end2) {
  236       return (start1 == start2) ? (end1 < end2) : (start1 < start2);
  237     }
  238   
  239     /** Order the subSpans within the same document by advancing all later spans
  240      * after the previous one.
  241      */
  242     private boolean stretchToOrder() throws IOException {
  243       matchDoc = subSpans[0].doc();
  244       for (int i = 1; inSameDoc && (i < subSpans.length); i++) {
  245         while (! docSpansOrdered(subSpans[i-1], subSpans[i])) {
  246           if (! subSpans[i].next()) {
  247             inSameDoc = false;
  248             more = false;
  249             break;
  250           } else if (matchDoc != subSpans[i].doc()) {
  251             inSameDoc = false;
  252             break;
  253           }
  254         }
  255       }
  256       return inSameDoc;
  257     }
  258   
  259     /** The subSpans are ordered in the same doc, so there is a possible match.
  260      * Compute the slop while making the match as short as possible by advancing
  261      * all subSpans except the last one in reverse order.
  262      */
  263     private boolean shrinkToAfterShortestMatch() throws IOException {
  264       matchStart = subSpans[subSpans.length - 1].start();
  265       matchEnd = subSpans[subSpans.length - 1].end();
  266       Set<byte[]> possibleMatchPayloads = new HashSet<byte[]>();
  267       if (subSpans[subSpans.length - 1].isPayloadAvailable()) {
  268         possibleMatchPayloads.addAll(subSpans[subSpans.length - 1].getPayload());
  269       }
  270   
  271       Collection<byte[]> possiblePayload = null;
  272       
  273       int matchSlop = 0;
  274       int lastStart = matchStart;
  275       int lastEnd = matchEnd;
  276       for (int i = subSpans.length - 2; i >= 0; i--) {
  277         Spans prevSpans = subSpans[i];
  278         if (collectPayloads && prevSpans.isPayloadAvailable()) {
  279           Collection<byte[]> payload = prevSpans.getPayload();
  280           possiblePayload = new ArrayList<byte[]>(payload.size());
  281           possiblePayload.addAll(payload);
  282         }
  283         
  284         int prevStart = prevSpans.start();
  285         int prevEnd = prevSpans.end();
  286         while (true) { // Advance prevSpans until after (lastStart, lastEnd)
  287           if (! prevSpans.next()) {
  288             inSameDoc = false;
  289             more = false;
  290             break; // Check remaining subSpans for final match.
  291           } else if (matchDoc != prevSpans.doc()) {
  292             inSameDoc = false; // The last subSpans is not advanced here.
  293             break; // Check remaining subSpans for last match in this document.
  294           } else {
  295             int ppStart = prevSpans.start();
  296             int ppEnd = prevSpans.end(); // Cannot avoid invoking .end()
  297             if (! docSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) {
  298               break; // Check remaining subSpans.
  299             } else { // prevSpans still before (lastStart, lastEnd)
  300               prevStart = ppStart;
  301               prevEnd = ppEnd;
  302               if (collectPayloads && prevSpans.isPayloadAvailable()) {
  303                 Collection<byte[]> payload = prevSpans.getPayload();
  304                 possiblePayload = new ArrayList<byte[]>(payload.size());
  305                 possiblePayload.addAll(payload);
  306               }
  307             }
  308           }
  309         }
  310   
  311         if (collectPayloads && possiblePayload != null) {
  312           possibleMatchPayloads.addAll(possiblePayload);
  313         }
  314         
  315         assert prevStart <= matchStart;
  316         if (matchStart > prevEnd) { // Only non overlapping spans add to slop.
  317           matchSlop += (matchStart - prevEnd);
  318         }
  319   
  320         /* Do not break on (matchSlop > allowedSlop) here to make sure
  321          * that subSpans[0] is advanced after the match, if any.
  322          */
  323         matchStart = prevStart;
  324         lastStart = prevStart;
  325         lastEnd = prevEnd;
  326       }
  327       
  328       boolean match = matchSlop <= allowedSlop;
  329       
  330       if(collectPayloads && match && possibleMatchPayloads.size() > 0) {
  331         matchPayload.addAll(possibleMatchPayloads);
  332       }
  333   
  334       return match; // ordered and allowed slop
  335     }
  336   
  337     @Override
  338     public String toString() {
  339       return getClass().getName() + "("+query.toString()+")@"+
  340         (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END"));
  341     }
  342   }
  343   

Home » lucene-3.0.1-src » org.apache » lucene » search » spans » [javadoc | source]