1# Copyright 2010 Matt Chaput. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are met:
5#
6#    1. Redistributions of source code must retain the above copyright notice,
7#       this list of conditions and the following disclaimer.
8#
9#    2. Redistributions in binary form must reproduce the above copyright
10#       notice, this list of conditions and the following disclaimer in the
11#       documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23#
24# The views and conclusions contained in the software and documentation are
25# those of the authors and should not be interpreted as representing official
26# policies, either expressed or implied, of Matt Chaput.
27
28"""
29This module contains Query objects that deal with "spans".
30
31Span queries allow for positional constraints on matching documents. For
32example, the :class:`whoosh.spans.SpanNear` query matches documents where one
33term occurs near another. Because you can nest span queries, and wrap them
34around almost any non-span query, you can create very complex constraints.
35
36For example, to find documents containing "whoosh" at most 5 positions before
37"library" in the "text" field::
38
39    from whoosh import query, spans
40    t1 = query.Term("text", "whoosh")
41    t2 = query.Term("text", "library")
42    q = spans.SpanNear(t1, t2, slop=5)
43
44"""
45
46from whoosh.matching import mcore, wrappers, binary
47from whoosh.query import Query, And, AndMaybe, Or, Term
48from whoosh.util import make_binary_tree
49
50
51# Span class
52
53class Span(object):
54    __slots__ = ("start", "end", "startchar", "endchar", "boost")
55
56    def __init__(self, start, end=None, startchar=None, endchar=None,
57                 boost=1.0):
58        if end is None:
59            end = start
60        assert start <= end
61        self.start = start
62        self.end = end
63        self.startchar = startchar
64        self.endchar = endchar
65        self.boost = boost
66
67    def __repr__(self):
68        if self.startchar is not None or self.endchar is not None:
69            return "<%d-%d %d:%d>" % (self.start, self.end, self.startchar,
70                                      self.endchar)
71        else:
72            return "<%d-%d>" % (self.start, self.end)
73
74    def __eq__(self, span):
75        return (self.start == span.start
76                and self.end == span.end
77                and self.startchar == span.startchar
78                and self.endchar == span.endchar)
79
80    def __ne__(self, span):
81        return self.start != span.start or self.end != span.end
82
83    def __lt__(self, span):
84        return self.start < span.start
85
86    def __gt__(self, span):
87        return self.start > span.start
88
89    def __hash__(self):
90        return hash((self.start, self.end))
91
92    @classmethod
93    def merge(cls, spans):
94        """Merges overlapping and touches spans in the given list of spans.
95
96        Note that this modifies the original list.
97
98        >>> spans = [Span(1,2), Span(3)]
99        >>> Span.merge(spans)
100        >>> spans
101        [<1-3>]
102        """
103
104        i = 0
105        while i < len(spans) - 1:
106            here = spans[i]
107            j = i + 1
108            while j < len(spans):
109                there = spans[j]
110                if there.start > here.end + 1:
111                    break
112                if here.touches(there) or here.overlaps(there):
113                    here = here.to(there)
114                    spans[i] = here
115                    del spans[j]
116                else:
117                    j += 1
118            i += 1
119        return spans
120
121    def to(self, span):
122        if self.startchar is None:
123            minchar = span.startchar
124        elif span.startchar is None:
125            minchar = self.startchar
126        else:
127            minchar = min(self.startchar, span.startchar)
128        if self.endchar is None:
129            maxchar = span.endchar
130        elif span.endchar is None:
131            maxchar = self.endchar
132        else:
133            maxchar = max(self.endchar, span.endchar)
134
135        minpos = min(self.start, span.start)
136        maxpos = max(self.end, span.end)
137        return self.__class__(minpos, maxpos, minchar, maxchar)
138
139    def overlaps(self, span):
140        return ((self.start >= span.start and self.start <= span.end)
141                or (self.end >= span.start and self.end <= span.end)
142                or (span.start >= self.start and span.start <= self.end)
143                or (span.end >= self.start and span.end <= self.end))
144
145    def surrounds(self, span):
146        return self.start < span.start and self.end > span.end
147
148    def is_within(self, span):
149        return self.start >= span.start and self.end <= span.end
150
151    def is_before(self, span):
152        return self.end < span.start
153
154    def is_after(self, span):
155        return self.start > span.end
156
157    def touches(self, span):
158        return self.start == span.end + 1 or self.end == span.start - 1
159
160    def distance_to(self, span):
161        if self.overlaps(span):
162            return 0
163        elif self.is_before(span):
164            return span.start - self.end
165        else:
166            return self.start - span.end
167
168
169def bisect_spans(spans, start):
170    lo = 0
171    hi = len(spans)
172    while lo < hi:
173        mid = (lo + hi) // 2
174        if spans[mid].start < start:
175            lo = mid + 1
176        else:
177            hi = mid
178    return lo
179
180
181# Base matchers
182
183class SpanWrappingMatcher(wrappers.WrappingMatcher):
184    """An abstract matcher class that wraps a "regular" matcher. This matcher
185    uses the sub-matcher's matching logic, but only matches documents that have
186    matching spans, i.e. where ``_get_spans()`` returns a non-empty list.
187
188    Subclasses must implement the ``_get_spans()`` method, which returns a list
189    of valid spans for the current document.
190    """
191
192    def __init__(self, child):
193        super(SpanWrappingMatcher, self).__init__(child)
194        self._spans = None
195        if self.is_active():
196            self._find_next()
197
198    def copy(self):
199        m = self.__class__(self.child.copy())
200        m._spans = self._spans
201        return m
202
203    def _replacement(self, newchild):
204        return self.__class__(newchild)
205
206    def _find_next(self):
207        if not self.is_active():
208            return
209
210        child = self.child
211        r = False
212
213        spans = self._get_spans()
214        while child.is_active() and not spans:
215            r = child.next() or r
216            if not child.is_active():
217                return True
218            spans = self._get_spans()
219        self._spans = spans
220
221        return r
222
223    def spans(self):
224        return self._spans
225
226    def next(self):
227        self.child.next()
228        self._find_next()
229
230    def skip_to(self, id):
231        self.child.skip_to(id)
232        self._find_next()
233
234    def all_ids(self):
235        while self.is_active():
236            if self.spans():
237                yield self.id()
238            self.next()
239
240
241class SpanBiMatcher(SpanWrappingMatcher):
242    def copy(self):
243        return self.__class__(self.a.copy(), self.b.copy())
244
245    def depth(self):
246        return 1 + max(self.a.depth(), self.b.depth())
247
248    def replace(self, minquality=0):
249        # TODO: fix this
250        if not self.is_active():
251            return mcore.NullMatcher()
252        return self
253
254
255# Queries
256
257class SpanQuery(Query):
258    """Abstract base class for span-based queries. Each span query type wraps
259    a "regular" query that implements the basic document-matching functionality
260    (for example, SpanNear wraps an And query, because SpanNear requires that
261    the two sub-queries occur in the same documents. The wrapped query is
262    stored in the ``q`` attribute.
263
264    Subclasses usually only need to implement the initializer to set the
265    wrapped query, and ``matcher()`` to return a span-aware matcher object.
266    """
267
268    def _subm(self, s, context=None):
269        return self.q.matcher(s, context)
270
271    def __repr__(self):
272        return "%s(%r)" % (self.__class__.__name__, self.q)
273
274    def __eq__(self, other):
275        return (other and self.__class__ is other.__class__
276                and self.q == other.q)
277
278    def __hash__(self):
279        return hash(self.__class__.__name__) ^ hash(self.q)
280
281    def field(self):
282        return None
283
284    def needs_spans(self):
285        return True
286
287
288class WrappingSpan(SpanQuery):
289    def is_leaf(self):
290        return False
291
292    def apply(self, fn):
293        return self.__class__(fn(self.q), limit=self.limit)
294
295    def field(self):
296        return self.q.field()
297
298
299class SpanFirst(WrappingSpan):
300    """Matches spans that end within the first N positions. This lets you
301    for example only match terms near the beginning of the document.
302    """
303
304    def __init__(self, q, limit=0):
305        """
306        :param q: the query to match.
307        :param limit: the query must match within this position at the start
308            of a document. The default is ``0``, which means the query must
309            match at the first position.
310        """
311
312        self.q = q
313        self.limit = limit
314
315    def __eq__(self, other):
316        return (other and self.__class__ is other.__class__
317                and self.q == other.q and self.limit == other.limit)
318
319    def __hash__(self):
320        return hash(self.q) ^ hash(self.limit)
321
322    def matcher(self, searcher, context=None):
323        m = self._subm(searcher, context)
324        return SpanFirst.SpanFirstMatcher(m, limit=self.limit)
325
326    class SpanFirstMatcher(SpanWrappingMatcher):
327        def __init__(self, child, limit=0):
328            self.limit = limit
329            super(SpanFirst.SpanFirstMatcher, self).__init__(child)
330
331        def copy(self):
332            return self.__class__(self.child.copy(), limit=self.limit)
333
334        def _replacement(self, newchild):
335            return self.__class__(newchild, limit=self.limit)
336
337        def _get_spans(self):
338            return [span for span in self.child.spans()
339                    if span.end <= self.limit]
340
341
342class SpanNear(SpanQuery):
343    """
344    Note: for new code, use :class:`SpanNear2` instead of this class. SpanNear2
345    takes a list of sub-queries instead of requiring you to create a binary
346    tree of query objects.
347
348    Matches queries that occur near each other. By default, only matches
349    queries that occur right next to each other (slop=1) and in order
350    (ordered=True).
351
352    For example, to find documents where "whoosh" occurs next to "library"
353    in the "text" field::
354
355        from whoosh import query, spans
356        t1 = query.Term("text", "whoosh")
357        t2 = query.Term("text", "library")
358        q = spans.SpanNear(t1, t2)
359
360    To find documents where "whoosh" occurs at most 5 positions before
361    "library"::
362
363        q = spans.SpanNear(t1, t2, slop=5)
364
365    To find documents where "whoosh" occurs at most 5 positions before or after
366    "library"::
367
368        q = spans.SpanNear(t1, t2, slop=5, ordered=False)
369
370    You can use the ``phrase()`` class method to create a tree of SpanNear
371    queries to match a list of terms::
372
373        q = spans.SpanNear.phrase("text", ["whoosh", "search", "library"],
374                                  slop=2)
375    """
376
377    def __init__(self, a, b, slop=1, ordered=True, mindist=1):
378        """
379        :param a: the first query to match.
380        :param b: the second query that must occur within "slop" positions of
381            the first query.
382        :param slop: the number of positions within which the queries must
383            occur. Default is 1, meaning the queries must occur right next
384            to each other.
385        :param ordered: whether a must occur before b. Default is True.
386        :pram mindist: the minimum distance allowed between the queries.
387        """
388
389        self.q = And([a, b])
390        self.a = a
391        self.b = b
392        self.slop = slop
393        self.ordered = ordered
394        self.mindist = mindist
395
396    def __repr__(self):
397        return ("%s(%r, slop=%d, ordered=%s, mindist=%d)"
398                % (self.__class__.__name__, self.q, self.slop, self.ordered,
399                   self.mindist))
400
401    def __eq__(self, other):
402        return (other and self.__class__ == other.__class__
403                and self.q == other.q and self.slop == other.slop
404                and self.ordered == other.ordered
405                and self.mindist == other.mindist)
406
407    def __hash__(self):
408        return (hash(self.a) ^ hash(self.b) ^ hash(self.slop)
409                ^ hash(self.ordered) ^ hash(self.mindist))
410
411    def is_leaf(self):
412        return False
413
414    def apply(self, fn):
415        return self.__class__(fn(self.a), fn(self.b), slop=self.slop,
416                              ordered=self.ordered, mindist=self.mindist)
417
418    def matcher(self, searcher, context=None):
419        ma = self.a.matcher(searcher, context)
420        mb = self.b.matcher(searcher, context)
421        return SpanNear.SpanNearMatcher(ma, mb, slop=self.slop,
422                                        ordered=self.ordered,
423                                        mindist=self.mindist)
424
425    @classmethod
426    def phrase(cls, fieldname, words, slop=1, ordered=True):
427        """Returns a tree of SpanNear queries to match a list of terms.
428
429        This class method is a convenience for constructing a phrase query
430        using a binary tree of SpanNear queries::
431
432            SpanNear.phrase("content", ["alfa", "bravo", "charlie", "delta"])
433
434        :param fieldname: the name of the field to search in.
435        :param words: a sequence of texts to search for.
436        :param slop: the number of positions within which the terms must
437            occur. Default is 1, meaning the terms must occur right next
438            to each other.
439        :param ordered: whether the terms must occur in order. Default is True.
440        """
441
442        terms = [Term(fieldname, word) for word in words]
443        return make_binary_tree(cls, terms, slop=slop, ordered=ordered)
444
445    class SpanNearMatcher(SpanWrappingMatcher):
446        def __init__(self, a, b, slop=1, ordered=True, mindist=1):
447            self.a = a
448            self.b = b
449            self.slop = slop
450            self.ordered = ordered
451            self.mindist = mindist
452            isect = binary.IntersectionMatcher(a, b)
453            super(SpanNear.SpanNearMatcher, self).__init__(isect)
454
455        def copy(self):
456            return self.__class__(self.a.copy(), self.b.copy(), slop=self.slop,
457                                  ordered=self.ordered, mindist=self.mindist)
458
459        def replace(self, minquality=0):
460            # TODO: fix this
461            if not self.is_active():
462                return mcore.NullMatcher()
463            return self
464
465        def _get_spans(self):
466            slop = self.slop
467            mindist = self.mindist
468            ordered = self.ordered
469            spans = set()
470
471            bspans = self.b.spans()
472            for aspan in self.a.spans():
473                for bspan in bspans:
474                    if (bspan.end < aspan.start - slop
475                        or (ordered and aspan.start > bspan.start)):
476                        # B is too far in front of A, or B is in front of A
477                        # *at all* when ordered is True
478                        continue
479                    if bspan.start > aspan.end + slop:
480                        # B is too far from A. Since spans are listed in
481                        # start position order, we know that all spans after
482                        # this one will also be too far.
483                        break
484
485                    # Check the distance between the spans
486                    dist = aspan.distance_to(bspan)
487                    if mindist <= dist <= slop:
488                        spans.add(aspan.to(bspan))
489
490            return sorted(spans)
491
492
493class SpanNear2(SpanQuery):
494    """
495    Matches queries that occur near each other. By default, only matches
496    queries that occur right next to each other (slop=1) and in order
497    (ordered=True).
498
499    New code should use this query type instead of :class:`SpanNear`.
500
501    (Unlike :class:`SpanNear`, this query takes a list of subqueries instead of
502    requiring you to build a binary tree of query objects. This query should
503    also be slightly faster due to less overhead.)
504
505    For example, to find documents where "whoosh" occurs next to "library"
506    in the "text" field::
507
508        from whoosh import query, spans
509        t1 = query.Term("text", "whoosh")
510        t2 = query.Term("text", "library")
511        q = spans.SpanNear2([t1, t2])
512
513    To find documents where "whoosh" occurs at most 5 positions before
514    "library"::
515
516        q = spans.SpanNear2([t1, t2], slop=5)
517
518    To find documents where "whoosh" occurs at most 5 positions before or after
519    "library"::
520
521        q = spans.SpanNear2(t1, t2, slop=5, ordered=False)
522    """
523
524    def __init__(self, qs, slop=1, ordered=True, mindist=1):
525        """
526        :param qs: a sequence of sub-queries to match.
527        :param slop: the number of positions within which the queries must
528            occur. Default is 1, meaning the queries must occur right next
529            to each other.
530        :param ordered: whether a must occur before b. Default is True.
531        :pram mindist: the minimum distance allowed between the queries.
532        """
533
534        self.qs = qs
535        self.slop = slop
536        self.ordered = ordered
537        self.mindist = mindist
538
539    def __repr__(self):
540        return ("%s(%r, slop=%d, ordered=%s, mindist=%d)"
541                % (self.__class__.__name__, self.qs, self.slop, self.ordered,
542                   self.mindist))
543
544    def __eq__(self, other):
545        return (other and self.__class__ == other.__class__
546                and self.qs == other.qs and self.slop == other.slop
547                and self.ordered == other.ordered
548                and self.mindist == other.mindist)
549
550    def __hash__(self):
551        h = hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist)
552        for q in self.qs:
553            h ^= hash(q)
554        return h
555
556    def _and_query(self):
557        return q.And(self.qs)
558
559    def estimate_size(self, ixreader):
560        return self._and_query().estimate_size(ixreader)
561
562    def estimate_min_size(self, ixreader):
563        return self._and_query().estimate_min_size(ixreader)
564
565    def is_leaf(self):
566        return False
567
568    def children(self):
569        return self.qs
570
571    def apply(self, fn):
572        return self.__class__([fn(q) for q in self.qs], slop=self.slop,
573                              ordered=self.ordered, mindist=self.mindist)
574
575    def matcher(self, searcher, context=None):
576        ms = [q.matcher(searcher, context) for q in self.qs]
577        return self.SpanNear2Matcher(ms, slop=self.slop, ordered=self.ordered,
578                                     mindist=self.mindist)
579
580    class SpanNear2Matcher(SpanWrappingMatcher):
581        def __init__(self, ms, slop=1, ordered=True, mindist=1):
582            self.ms = ms
583            self.slop = slop
584            self.ordered = ordered
585            self.mindist = mindist
586            isect = make_binary_tree(binary.IntersectionMatcher, ms)
587            super(SpanNear2.SpanNear2Matcher, self).__init__(isect)
588
589        def copy(self):
590            return self.__class__([m.copy() for m in self.ms], slop=self.slop,
591                                  ordered=self.ordered, mindist=self.mindist)
592
593        def replace(self, minquality=0):
594            # TODO: fix this
595            if not self.is_active():
596                return mcore.NullMatcher()
597            return self
598
599        def _get_spans(self):
600            slop = self.slop
601            mindist = self.mindist
602            ordered = self.ordered
603            ms = self.ms
604
605            aspans = ms[0].spans()
606            i = 1
607            while i < len(ms) and aspans:
608                bspans = ms[i].spans()
609                spans = set()
610                for aspan in aspans:
611                    # Use a binary search to find the first position we should
612                    # start looking for possible matches
613                    if ordered:
614                        start = aspan.start
615                    else:
616                        start = max(0, aspan.start - slop)
617                    j = bisect_spans(bspans, start)
618
619                    while j < len(bspans):
620                        bspan = bspans[j]
621                        j += 1
622
623                        if (bspan.end < aspan.start - slop
624                            or (ordered and aspan.start > bspan.start)):
625                            # B is too far in front of A, or B is in front of A
626                            # *at all* when ordered is True
627                            continue
628                        if bspan.start > aspan.end + slop:
629                            # B is too far from A. Since spans are listed in
630                            # start position order, we know that all spans after
631                            # this one will also be too far.
632                            break
633
634                        # Check the distance between the spans
635                        dist = aspan.distance_to(bspan)
636                        if mindist <= dist <= slop:
637                            spans.add(aspan.to(bspan))
638                aspans = sorted(spans)
639                i += 1
640
641            if i == len(ms):
642                return aspans
643            else:
644                return []
645
646
647class SpanOr(SpanQuery):
648    """Matches documents that match any of a list of sub-queries. Unlike
649    query.Or, this class merges together matching spans from the different
650    sub-queries when they overlap.
651    """
652
653    def __init__(self, subqs):
654        """
655        :param subqs: a list of queries to match.
656        """
657
658        self.q = Or(subqs)
659        self.subqs = subqs
660
661    def is_leaf(self):
662        return False
663
664    def apply(self, fn):
665        return self.__class__([fn(sq) for sq in self.subqs])
666
667    def matcher(self, searcher, context=None):
668        matchers = [q.matcher(searcher, context) for q in self.subqs]
669        return make_binary_tree(SpanOr.SpanOrMatcher, matchers)
670
671    class SpanOrMatcher(SpanBiMatcher):
672        def __init__(self, a, b):
673            self.a = a
674            self.b = b
675            um = binary.UnionMatcher(a, b)
676            super(SpanOr.SpanOrMatcher, self).__init__(um)
677
678        def _get_spans(self):
679            a_active = self.a.is_active()
680            b_active = self.b.is_active()
681
682            if a_active:
683                a_id = self.a.id()
684                if b_active:
685                    b_id = self.b.id()
686                    if a_id == b_id:
687                        spans = sorted(set(self.a.spans())
688                                       | set(self.b.spans()))
689                    elif a_id < b_id:
690                        spans = self.a.spans()
691                    else:
692                        spans = self.b.spans()
693                else:
694                    spans = self.a.spans()
695            else:
696                spans = self.b.spans()
697
698            Span.merge(spans)
699            return spans
700
701
702class SpanBiQuery(SpanQuery):
703    # Intermediate base class for methods common to "a/b" span query types
704
705    def is_leaf(self):
706        return False
707
708    def apply(self, fn):
709        return self.__class__(fn(self.a), fn(self.b))
710
711    def matcher(self, searcher, context=None):
712        ma = self.a.matcher(searcher, context)
713        mb = self.b.matcher(searcher, context)
714        return self._Matcher(ma, mb)
715
716
717class SpanNot(SpanBiQuery):
718    """Matches spans from the first query only if they don't overlap with
719    spans from the second query. If there are no non-overlapping spans, the
720    document does not match.
721
722    For example, to match documents that contain "bear" at most 2 places after
723    "apple" in the "text" field but don't have "cute" between them::
724
725        from whoosh import query, spans
726        t1 = query.Term("text", "apple")
727        t2 = query.Term("text", "bear")
728        near = spans.SpanNear(t1, t2, slop=2)
729        q = spans.SpanNot(near, query.Term("text", "cute"))
730    """
731
732    def __init__(self, a, b):
733        """
734        :param a: the query to match.
735        :param b: do not match any spans that overlap with spans from this
736            query.
737        """
738
739        self.q = AndMaybe(a, b)
740        self.a = a
741        self.b = b
742
743    class _Matcher(SpanBiMatcher):
744        def __init__(self, a, b):
745            self.a = a
746            self.b = b
747            amm = binary.AndMaybeMatcher(a, b)
748            super(SpanNot._Matcher, self).__init__(amm)
749
750        def _get_spans(self):
751            if self.a.id() == self.b.id():
752                spans = []
753                bspans = self.b.spans()
754                for aspan in self.a.spans():
755                    overlapped = False
756                    for bspan in bspans:
757                        if aspan.overlaps(bspan):
758                            overlapped = True
759                            break
760                    if not overlapped:
761                        spans.append(aspan)
762                return spans
763            else:
764                return self.a.spans()
765
766
767class SpanContains(SpanBiQuery):
768    """Matches documents where the spans of the first query contain any spans
769    of the second query.
770
771    For example, to match documents where "apple" occurs at most 10 places
772    before "bear" in the "text" field and "cute" is between them::
773
774        from whoosh import query, spans
775        t1 = query.Term("text", "apple")
776        t2 = query.Term("text", "bear")
777        near = spans.SpanNear(t1, t2, slop=10)
778        q = spans.SpanContains(near, query.Term("text", "cute"))
779    """
780
781    def __init__(self, a, b):
782        """
783        :param a: the query to match.
784        :param b: the query whose spans must occur within the matching spans
785            of the first query.
786        """
787
788        self.q = And([a, b])
789        self.a = a
790        self.b = b
791
792    class _Matcher(SpanBiMatcher):
793        def __init__(self, a, b):
794            self.a = a
795            self.b = b
796            im = binary.IntersectionMatcher(a, b)
797            super(SpanContains._Matcher, self).__init__(im)
798
799        def _get_spans(self):
800            spans = []
801            bspans = self.b.spans()
802            for aspan in self.a.spans():
803                for bspan in bspans:
804                    if aspan.start > bspan.end:
805                        continue
806                    if aspan.end < bspan.start:
807                        break
808
809                    if bspan.is_within(aspan):
810                        spans.append(aspan)
811                        break
812            return spans
813
814
815class SpanBefore(SpanBiQuery):
816    """Matches documents where the spans of the first query occur before any
817    spans of the second query.
818
819    For example, to match documents where "apple" occurs anywhere before
820    "bear"::
821
822        from whoosh import query, spans
823        t1 = query.Term("text", "apple")
824        t2 = query.Term("text", "bear")
825        q = spans.SpanBefore(t1, t2)
826    """
827
828    def __init__(self, a, b):
829        """
830        :param a: the query that must occur before the second.
831        :param b: the query that must occur after the first.
832        """
833
834        self.a = a
835        self.b = b
836        self.q = And([a, b])
837
838    class _Matcher(SpanBiMatcher):
839        def __init__(self, a, b):
840            self.a = a
841            self.b = b
842            im = binary.IntersectionMatcher(a, b)
843            super(SpanBefore._Matcher, self).__init__(im)
844
845        def _get_spans(self):
846            bminstart = min(bspan.start for bspan in self.b.spans())
847            return [aspan for aspan in self.a.spans() if aspan.end < bminstart]
848
849
850class SpanCondition(SpanBiQuery):
851    """Matches documents that satisfy both subqueries, but only uses the spans
852    from the first subquery.
853
854    This is useful when you want to place conditions on matches but not have
855    those conditions affect the spans returned.
856
857    For example, to get spans for the term ``alfa`` in documents that also
858    must contain the term ``bravo``::
859
860        SpanCondition(Term("text", u"alfa"), Term("text", u"bravo"))
861
862    """
863
864    def __init__(self, a, b):
865        self.a = a
866        self.b = b
867        self.q = And([a, b])
868
869    class _Matcher(SpanBiMatcher):
870        def __init__(self, a, b):
871            self.a = a
872            im = binary.IntersectionMatcher(a, b)
873            super(SpanCondition._Matcher, self).__init__(im)
874
875        def _get_spans(self):
876            return self.a.spans()
877
878
879
880
881
882