1# Copyright 2010 Matt Chaput. All rights reserved. 2# 3# Redistribution and use in source and binary forms, with or without 4# modification, are permitted provided that the following conditions are met: 5# 6# 1. Redistributions of source code must retain the above copyright notice, 7# this list of conditions and the following disclaimer. 8# 9# 2. Redistributions in binary form must reproduce the above copyright 10# notice, this list of conditions and the following disclaimer in the 11# documentation and/or other materials provided with the distribution. 12# 13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23# 24# The views and conclusions contained in the software and documentation are 25# those of the authors and should not be interpreted as representing official 26# policies, either expressed or implied, of Matt Chaput. 27 28""" 29This module contains Query objects that deal with "spans". 30 31Span queries allow for positional constraints on matching documents. For 32example, the :class:`whoosh.spans.SpanNear` query matches documents where one 33term occurs near another. Because you can nest span queries, and wrap them 34around almost any non-span query, you can create very complex constraints. 35 36For example, to find documents containing "whoosh" at most 5 positions before 37"library" in the "text" field:: 38 39 from whoosh import query, spans 40 t1 = query.Term("text", "whoosh") 41 t2 = query.Term("text", "library") 42 q = spans.SpanNear(t1, t2, slop=5) 43 44""" 45 46from whoosh.matching import mcore, wrappers, binary 47from whoosh.query import Query, And, AndMaybe, Or, Term 48from whoosh.util import make_binary_tree 49 50 51# Span class 52 53class Span(object): 54 __slots__ = ("start", "end", "startchar", "endchar", "boost") 55 56 def __init__(self, start, end=None, startchar=None, endchar=None, 57 boost=1.0): 58 if end is None: 59 end = start 60 assert start <= end 61 self.start = start 62 self.end = end 63 self.startchar = startchar 64 self.endchar = endchar 65 self.boost = boost 66 67 def __repr__(self): 68 if self.startchar is not None or self.endchar is not None: 69 return "<%d-%d %d:%d>" % (self.start, self.end, self.startchar, 70 self.endchar) 71 else: 72 return "<%d-%d>" % (self.start, self.end) 73 74 def __eq__(self, span): 75 return (self.start == span.start 76 and self.end == span.end 77 and self.startchar == span.startchar 78 and self.endchar == span.endchar) 79 80 def __ne__(self, span): 81 return self.start != span.start or self.end != span.end 82 83 def __lt__(self, span): 84 return self.start < span.start 85 86 def __gt__(self, span): 87 return self.start > span.start 88 89 def __hash__(self): 90 return hash((self.start, self.end)) 91 92 @classmethod 93 def merge(cls, spans): 94 """Merges overlapping and touches spans in the given list of spans. 95 96 Note that this modifies the original list. 97 98 >>> spans = [Span(1,2), Span(3)] 99 >>> Span.merge(spans) 100 >>> spans 101 [<1-3>] 102 """ 103 104 i = 0 105 while i < len(spans) - 1: 106 here = spans[i] 107 j = i + 1 108 while j < len(spans): 109 there = spans[j] 110 if there.start > here.end + 1: 111 break 112 if here.touches(there) or here.overlaps(there): 113 here = here.to(there) 114 spans[i] = here 115 del spans[j] 116 else: 117 j += 1 118 i += 1 119 return spans 120 121 def to(self, span): 122 if self.startchar is None: 123 minchar = span.startchar 124 elif span.startchar is None: 125 minchar = self.startchar 126 else: 127 minchar = min(self.startchar, span.startchar) 128 if self.endchar is None: 129 maxchar = span.endchar 130 elif span.endchar is None: 131 maxchar = self.endchar 132 else: 133 maxchar = max(self.endchar, span.endchar) 134 135 minpos = min(self.start, span.start) 136 maxpos = max(self.end, span.end) 137 return self.__class__(minpos, maxpos, minchar, maxchar) 138 139 def overlaps(self, span): 140 return ((self.start >= span.start and self.start <= span.end) 141 or (self.end >= span.start and self.end <= span.end) 142 or (span.start >= self.start and span.start <= self.end) 143 or (span.end >= self.start and span.end <= self.end)) 144 145 def surrounds(self, span): 146 return self.start < span.start and self.end > span.end 147 148 def is_within(self, span): 149 return self.start >= span.start and self.end <= span.end 150 151 def is_before(self, span): 152 return self.end < span.start 153 154 def is_after(self, span): 155 return self.start > span.end 156 157 def touches(self, span): 158 return self.start == span.end + 1 or self.end == span.start - 1 159 160 def distance_to(self, span): 161 if self.overlaps(span): 162 return 0 163 elif self.is_before(span): 164 return span.start - self.end 165 else: 166 return self.start - span.end 167 168 169def bisect_spans(spans, start): 170 lo = 0 171 hi = len(spans) 172 while lo < hi: 173 mid = (lo + hi) // 2 174 if spans[mid].start < start: 175 lo = mid + 1 176 else: 177 hi = mid 178 return lo 179 180 181# Base matchers 182 183class SpanWrappingMatcher(wrappers.WrappingMatcher): 184 """An abstract matcher class that wraps a "regular" matcher. This matcher 185 uses the sub-matcher's matching logic, but only matches documents that have 186 matching spans, i.e. where ``_get_spans()`` returns a non-empty list. 187 188 Subclasses must implement the ``_get_spans()`` method, which returns a list 189 of valid spans for the current document. 190 """ 191 192 def __init__(self, child): 193 super(SpanWrappingMatcher, self).__init__(child) 194 self._spans = None 195 if self.is_active(): 196 self._find_next() 197 198 def copy(self): 199 m = self.__class__(self.child.copy()) 200 m._spans = self._spans 201 return m 202 203 def _replacement(self, newchild): 204 return self.__class__(newchild) 205 206 def _find_next(self): 207 if not self.is_active(): 208 return 209 210 child = self.child 211 r = False 212 213 spans = self._get_spans() 214 while child.is_active() and not spans: 215 r = child.next() or r 216 if not child.is_active(): 217 return True 218 spans = self._get_spans() 219 self._spans = spans 220 221 return r 222 223 def spans(self): 224 return self._spans 225 226 def next(self): 227 self.child.next() 228 self._find_next() 229 230 def skip_to(self, id): 231 self.child.skip_to(id) 232 self._find_next() 233 234 def all_ids(self): 235 while self.is_active(): 236 if self.spans(): 237 yield self.id() 238 self.next() 239 240 241class SpanBiMatcher(SpanWrappingMatcher): 242 def copy(self): 243 return self.__class__(self.a.copy(), self.b.copy()) 244 245 def depth(self): 246 return 1 + max(self.a.depth(), self.b.depth()) 247 248 def replace(self, minquality=0): 249 # TODO: fix this 250 if not self.is_active(): 251 return mcore.NullMatcher() 252 return self 253 254 255# Queries 256 257class SpanQuery(Query): 258 """Abstract base class for span-based queries. Each span query type wraps 259 a "regular" query that implements the basic document-matching functionality 260 (for example, SpanNear wraps an And query, because SpanNear requires that 261 the two sub-queries occur in the same documents. The wrapped query is 262 stored in the ``q`` attribute. 263 264 Subclasses usually only need to implement the initializer to set the 265 wrapped query, and ``matcher()`` to return a span-aware matcher object. 266 """ 267 268 def _subm(self, s, context=None): 269 return self.q.matcher(s, context) 270 271 def __repr__(self): 272 return "%s(%r)" % (self.__class__.__name__, self.q) 273 274 def __eq__(self, other): 275 return (other and self.__class__ is other.__class__ 276 and self.q == other.q) 277 278 def __hash__(self): 279 return hash(self.__class__.__name__) ^ hash(self.q) 280 281 def field(self): 282 return None 283 284 def needs_spans(self): 285 return True 286 287 288class WrappingSpan(SpanQuery): 289 def is_leaf(self): 290 return False 291 292 def apply(self, fn): 293 return self.__class__(fn(self.q), limit=self.limit) 294 295 def field(self): 296 return self.q.field() 297 298 299class SpanFirst(WrappingSpan): 300 """Matches spans that end within the first N positions. This lets you 301 for example only match terms near the beginning of the document. 302 """ 303 304 def __init__(self, q, limit=0): 305 """ 306 :param q: the query to match. 307 :param limit: the query must match within this position at the start 308 of a document. The default is ``0``, which means the query must 309 match at the first position. 310 """ 311 312 self.q = q 313 self.limit = limit 314 315 def __eq__(self, other): 316 return (other and self.__class__ is other.__class__ 317 and self.q == other.q and self.limit == other.limit) 318 319 def __hash__(self): 320 return hash(self.q) ^ hash(self.limit) 321 322 def matcher(self, searcher, context=None): 323 m = self._subm(searcher, context) 324 return SpanFirst.SpanFirstMatcher(m, limit=self.limit) 325 326 class SpanFirstMatcher(SpanWrappingMatcher): 327 def __init__(self, child, limit=0): 328 self.limit = limit 329 super(SpanFirst.SpanFirstMatcher, self).__init__(child) 330 331 def copy(self): 332 return self.__class__(self.child.copy(), limit=self.limit) 333 334 def _replacement(self, newchild): 335 return self.__class__(newchild, limit=self.limit) 336 337 def _get_spans(self): 338 return [span for span in self.child.spans() 339 if span.end <= self.limit] 340 341 342class SpanNear(SpanQuery): 343 """ 344 Note: for new code, use :class:`SpanNear2` instead of this class. SpanNear2 345 takes a list of sub-queries instead of requiring you to create a binary 346 tree of query objects. 347 348 Matches queries that occur near each other. By default, only matches 349 queries that occur right next to each other (slop=1) and in order 350 (ordered=True). 351 352 For example, to find documents where "whoosh" occurs next to "library" 353 in the "text" field:: 354 355 from whoosh import query, spans 356 t1 = query.Term("text", "whoosh") 357 t2 = query.Term("text", "library") 358 q = spans.SpanNear(t1, t2) 359 360 To find documents where "whoosh" occurs at most 5 positions before 361 "library":: 362 363 q = spans.SpanNear(t1, t2, slop=5) 364 365 To find documents where "whoosh" occurs at most 5 positions before or after 366 "library":: 367 368 q = spans.SpanNear(t1, t2, slop=5, ordered=False) 369 370 You can use the ``phrase()`` class method to create a tree of SpanNear 371 queries to match a list of terms:: 372 373 q = spans.SpanNear.phrase("text", ["whoosh", "search", "library"], 374 slop=2) 375 """ 376 377 def __init__(self, a, b, slop=1, ordered=True, mindist=1): 378 """ 379 :param a: the first query to match. 380 :param b: the second query that must occur within "slop" positions of 381 the first query. 382 :param slop: the number of positions within which the queries must 383 occur. Default is 1, meaning the queries must occur right next 384 to each other. 385 :param ordered: whether a must occur before b. Default is True. 386 :pram mindist: the minimum distance allowed between the queries. 387 """ 388 389 self.q = And([a, b]) 390 self.a = a 391 self.b = b 392 self.slop = slop 393 self.ordered = ordered 394 self.mindist = mindist 395 396 def __repr__(self): 397 return ("%s(%r, slop=%d, ordered=%s, mindist=%d)" 398 % (self.__class__.__name__, self.q, self.slop, self.ordered, 399 self.mindist)) 400 401 def __eq__(self, other): 402 return (other and self.__class__ == other.__class__ 403 and self.q == other.q and self.slop == other.slop 404 and self.ordered == other.ordered 405 and self.mindist == other.mindist) 406 407 def __hash__(self): 408 return (hash(self.a) ^ hash(self.b) ^ hash(self.slop) 409 ^ hash(self.ordered) ^ hash(self.mindist)) 410 411 def is_leaf(self): 412 return False 413 414 def apply(self, fn): 415 return self.__class__(fn(self.a), fn(self.b), slop=self.slop, 416 ordered=self.ordered, mindist=self.mindist) 417 418 def matcher(self, searcher, context=None): 419 ma = self.a.matcher(searcher, context) 420 mb = self.b.matcher(searcher, context) 421 return SpanNear.SpanNearMatcher(ma, mb, slop=self.slop, 422 ordered=self.ordered, 423 mindist=self.mindist) 424 425 @classmethod 426 def phrase(cls, fieldname, words, slop=1, ordered=True): 427 """Returns a tree of SpanNear queries to match a list of terms. 428 429 This class method is a convenience for constructing a phrase query 430 using a binary tree of SpanNear queries:: 431 432 SpanNear.phrase("content", ["alfa", "bravo", "charlie", "delta"]) 433 434 :param fieldname: the name of the field to search in. 435 :param words: a sequence of texts to search for. 436 :param slop: the number of positions within which the terms must 437 occur. Default is 1, meaning the terms must occur right next 438 to each other. 439 :param ordered: whether the terms must occur in order. Default is True. 440 """ 441 442 terms = [Term(fieldname, word) for word in words] 443 return make_binary_tree(cls, terms, slop=slop, ordered=ordered) 444 445 class SpanNearMatcher(SpanWrappingMatcher): 446 def __init__(self, a, b, slop=1, ordered=True, mindist=1): 447 self.a = a 448 self.b = b 449 self.slop = slop 450 self.ordered = ordered 451 self.mindist = mindist 452 isect = binary.IntersectionMatcher(a, b) 453 super(SpanNear.SpanNearMatcher, self).__init__(isect) 454 455 def copy(self): 456 return self.__class__(self.a.copy(), self.b.copy(), slop=self.slop, 457 ordered=self.ordered, mindist=self.mindist) 458 459 def replace(self, minquality=0): 460 # TODO: fix this 461 if not self.is_active(): 462 return mcore.NullMatcher() 463 return self 464 465 def _get_spans(self): 466 slop = self.slop 467 mindist = self.mindist 468 ordered = self.ordered 469 spans = set() 470 471 bspans = self.b.spans() 472 for aspan in self.a.spans(): 473 for bspan in bspans: 474 if (bspan.end < aspan.start - slop 475 or (ordered and aspan.start > bspan.start)): 476 # B is too far in front of A, or B is in front of A 477 # *at all* when ordered is True 478 continue 479 if bspan.start > aspan.end + slop: 480 # B is too far from A. Since spans are listed in 481 # start position order, we know that all spans after 482 # this one will also be too far. 483 break 484 485 # Check the distance between the spans 486 dist = aspan.distance_to(bspan) 487 if mindist <= dist <= slop: 488 spans.add(aspan.to(bspan)) 489 490 return sorted(spans) 491 492 493class SpanNear2(SpanQuery): 494 """ 495 Matches queries that occur near each other. By default, only matches 496 queries that occur right next to each other (slop=1) and in order 497 (ordered=True). 498 499 New code should use this query type instead of :class:`SpanNear`. 500 501 (Unlike :class:`SpanNear`, this query takes a list of subqueries instead of 502 requiring you to build a binary tree of query objects. This query should 503 also be slightly faster due to less overhead.) 504 505 For example, to find documents where "whoosh" occurs next to "library" 506 in the "text" field:: 507 508 from whoosh import query, spans 509 t1 = query.Term("text", "whoosh") 510 t2 = query.Term("text", "library") 511 q = spans.SpanNear2([t1, t2]) 512 513 To find documents where "whoosh" occurs at most 5 positions before 514 "library":: 515 516 q = spans.SpanNear2([t1, t2], slop=5) 517 518 To find documents where "whoosh" occurs at most 5 positions before or after 519 "library":: 520 521 q = spans.SpanNear2(t1, t2, slop=5, ordered=False) 522 """ 523 524 def __init__(self, qs, slop=1, ordered=True, mindist=1): 525 """ 526 :param qs: a sequence of sub-queries to match. 527 :param slop: the number of positions within which the queries must 528 occur. Default is 1, meaning the queries must occur right next 529 to each other. 530 :param ordered: whether a must occur before b. Default is True. 531 :pram mindist: the minimum distance allowed between the queries. 532 """ 533 534 self.qs = qs 535 self.slop = slop 536 self.ordered = ordered 537 self.mindist = mindist 538 539 def __repr__(self): 540 return ("%s(%r, slop=%d, ordered=%s, mindist=%d)" 541 % (self.__class__.__name__, self.qs, self.slop, self.ordered, 542 self.mindist)) 543 544 def __eq__(self, other): 545 return (other and self.__class__ == other.__class__ 546 and self.qs == other.qs and self.slop == other.slop 547 and self.ordered == other.ordered 548 and self.mindist == other.mindist) 549 550 def __hash__(self): 551 h = hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist) 552 for q in self.qs: 553 h ^= hash(q) 554 return h 555 556 def _and_query(self): 557 return q.And(self.qs) 558 559 def estimate_size(self, ixreader): 560 return self._and_query().estimate_size(ixreader) 561 562 def estimate_min_size(self, ixreader): 563 return self._and_query().estimate_min_size(ixreader) 564 565 def is_leaf(self): 566 return False 567 568 def children(self): 569 return self.qs 570 571 def apply(self, fn): 572 return self.__class__([fn(q) for q in self.qs], slop=self.slop, 573 ordered=self.ordered, mindist=self.mindist) 574 575 def matcher(self, searcher, context=None): 576 ms = [q.matcher(searcher, context) for q in self.qs] 577 return self.SpanNear2Matcher(ms, slop=self.slop, ordered=self.ordered, 578 mindist=self.mindist) 579 580 class SpanNear2Matcher(SpanWrappingMatcher): 581 def __init__(self, ms, slop=1, ordered=True, mindist=1): 582 self.ms = ms 583 self.slop = slop 584 self.ordered = ordered 585 self.mindist = mindist 586 isect = make_binary_tree(binary.IntersectionMatcher, ms) 587 super(SpanNear2.SpanNear2Matcher, self).__init__(isect) 588 589 def copy(self): 590 return self.__class__([m.copy() for m in self.ms], slop=self.slop, 591 ordered=self.ordered, mindist=self.mindist) 592 593 def replace(self, minquality=0): 594 # TODO: fix this 595 if not self.is_active(): 596 return mcore.NullMatcher() 597 return self 598 599 def _get_spans(self): 600 slop = self.slop 601 mindist = self.mindist 602 ordered = self.ordered 603 ms = self.ms 604 605 aspans = ms[0].spans() 606 i = 1 607 while i < len(ms) and aspans: 608 bspans = ms[i].spans() 609 spans = set() 610 for aspan in aspans: 611 # Use a binary search to find the first position we should 612 # start looking for possible matches 613 if ordered: 614 start = aspan.start 615 else: 616 start = max(0, aspan.start - slop) 617 j = bisect_spans(bspans, start) 618 619 while j < len(bspans): 620 bspan = bspans[j] 621 j += 1 622 623 if (bspan.end < aspan.start - slop 624 or (ordered and aspan.start > bspan.start)): 625 # B is too far in front of A, or B is in front of A 626 # *at all* when ordered is True 627 continue 628 if bspan.start > aspan.end + slop: 629 # B is too far from A. Since spans are listed in 630 # start position order, we know that all spans after 631 # this one will also be too far. 632 break 633 634 # Check the distance between the spans 635 dist = aspan.distance_to(bspan) 636 if mindist <= dist <= slop: 637 spans.add(aspan.to(bspan)) 638 aspans = sorted(spans) 639 i += 1 640 641 if i == len(ms): 642 return aspans 643 else: 644 return [] 645 646 647class SpanOr(SpanQuery): 648 """Matches documents that match any of a list of sub-queries. Unlike 649 query.Or, this class merges together matching spans from the different 650 sub-queries when they overlap. 651 """ 652 653 def __init__(self, subqs): 654 """ 655 :param subqs: a list of queries to match. 656 """ 657 658 self.q = Or(subqs) 659 self.subqs = subqs 660 661 def is_leaf(self): 662 return False 663 664 def apply(self, fn): 665 return self.__class__([fn(sq) for sq in self.subqs]) 666 667 def matcher(self, searcher, context=None): 668 matchers = [q.matcher(searcher, context) for q in self.subqs] 669 return make_binary_tree(SpanOr.SpanOrMatcher, matchers) 670 671 class SpanOrMatcher(SpanBiMatcher): 672 def __init__(self, a, b): 673 self.a = a 674 self.b = b 675 um = binary.UnionMatcher(a, b) 676 super(SpanOr.SpanOrMatcher, self).__init__(um) 677 678 def _get_spans(self): 679 a_active = self.a.is_active() 680 b_active = self.b.is_active() 681 682 if a_active: 683 a_id = self.a.id() 684 if b_active: 685 b_id = self.b.id() 686 if a_id == b_id: 687 spans = sorted(set(self.a.spans()) 688 | set(self.b.spans())) 689 elif a_id < b_id: 690 spans = self.a.spans() 691 else: 692 spans = self.b.spans() 693 else: 694 spans = self.a.spans() 695 else: 696 spans = self.b.spans() 697 698 Span.merge(spans) 699 return spans 700 701 702class SpanBiQuery(SpanQuery): 703 # Intermediate base class for methods common to "a/b" span query types 704 705 def is_leaf(self): 706 return False 707 708 def apply(self, fn): 709 return self.__class__(fn(self.a), fn(self.b)) 710 711 def matcher(self, searcher, context=None): 712 ma = self.a.matcher(searcher, context) 713 mb = self.b.matcher(searcher, context) 714 return self._Matcher(ma, mb) 715 716 717class SpanNot(SpanBiQuery): 718 """Matches spans from the first query only if they don't overlap with 719 spans from the second query. If there are no non-overlapping spans, the 720 document does not match. 721 722 For example, to match documents that contain "bear" at most 2 places after 723 "apple" in the "text" field but don't have "cute" between them:: 724 725 from whoosh import query, spans 726 t1 = query.Term("text", "apple") 727 t2 = query.Term("text", "bear") 728 near = spans.SpanNear(t1, t2, slop=2) 729 q = spans.SpanNot(near, query.Term("text", "cute")) 730 """ 731 732 def __init__(self, a, b): 733 """ 734 :param a: the query to match. 735 :param b: do not match any spans that overlap with spans from this 736 query. 737 """ 738 739 self.q = AndMaybe(a, b) 740 self.a = a 741 self.b = b 742 743 class _Matcher(SpanBiMatcher): 744 def __init__(self, a, b): 745 self.a = a 746 self.b = b 747 amm = binary.AndMaybeMatcher(a, b) 748 super(SpanNot._Matcher, self).__init__(amm) 749 750 def _get_spans(self): 751 if self.a.id() == self.b.id(): 752 spans = [] 753 bspans = self.b.spans() 754 for aspan in self.a.spans(): 755 overlapped = False 756 for bspan in bspans: 757 if aspan.overlaps(bspan): 758 overlapped = True 759 break 760 if not overlapped: 761 spans.append(aspan) 762 return spans 763 else: 764 return self.a.spans() 765 766 767class SpanContains(SpanBiQuery): 768 """Matches documents where the spans of the first query contain any spans 769 of the second query. 770 771 For example, to match documents where "apple" occurs at most 10 places 772 before "bear" in the "text" field and "cute" is between them:: 773 774 from whoosh import query, spans 775 t1 = query.Term("text", "apple") 776 t2 = query.Term("text", "bear") 777 near = spans.SpanNear(t1, t2, slop=10) 778 q = spans.SpanContains(near, query.Term("text", "cute")) 779 """ 780 781 def __init__(self, a, b): 782 """ 783 :param a: the query to match. 784 :param b: the query whose spans must occur within the matching spans 785 of the first query. 786 """ 787 788 self.q = And([a, b]) 789 self.a = a 790 self.b = b 791 792 class _Matcher(SpanBiMatcher): 793 def __init__(self, a, b): 794 self.a = a 795 self.b = b 796 im = binary.IntersectionMatcher(a, b) 797 super(SpanContains._Matcher, self).__init__(im) 798 799 def _get_spans(self): 800 spans = [] 801 bspans = self.b.spans() 802 for aspan in self.a.spans(): 803 for bspan in bspans: 804 if aspan.start > bspan.end: 805 continue 806 if aspan.end < bspan.start: 807 break 808 809 if bspan.is_within(aspan): 810 spans.append(aspan) 811 break 812 return spans 813 814 815class SpanBefore(SpanBiQuery): 816 """Matches documents where the spans of the first query occur before any 817 spans of the second query. 818 819 For example, to match documents where "apple" occurs anywhere before 820 "bear":: 821 822 from whoosh import query, spans 823 t1 = query.Term("text", "apple") 824 t2 = query.Term("text", "bear") 825 q = spans.SpanBefore(t1, t2) 826 """ 827 828 def __init__(self, a, b): 829 """ 830 :param a: the query that must occur before the second. 831 :param b: the query that must occur after the first. 832 """ 833 834 self.a = a 835 self.b = b 836 self.q = And([a, b]) 837 838 class _Matcher(SpanBiMatcher): 839 def __init__(self, a, b): 840 self.a = a 841 self.b = b 842 im = binary.IntersectionMatcher(a, b) 843 super(SpanBefore._Matcher, self).__init__(im) 844 845 def _get_spans(self): 846 bminstart = min(bspan.start for bspan in self.b.spans()) 847 return [aspan for aspan in self.a.spans() if aspan.end < bminstart] 848 849 850class SpanCondition(SpanBiQuery): 851 """Matches documents that satisfy both subqueries, but only uses the spans 852 from the first subquery. 853 854 This is useful when you want to place conditions on matches but not have 855 those conditions affect the spans returned. 856 857 For example, to get spans for the term ``alfa`` in documents that also 858 must contain the term ``bravo``:: 859 860 SpanCondition(Term("text", u"alfa"), Term("text", u"bravo")) 861 862 """ 863 864 def __init__(self, a, b): 865 self.a = a 866 self.b = b 867 self.q = And([a, b]) 868 869 class _Matcher(SpanBiMatcher): 870 def __init__(self, a, b): 871 self.a = a 872 im = binary.IntersectionMatcher(a, b) 873 super(SpanCondition._Matcher, self).__init__(im) 874 875 def _get_spans(self): 876 return self.a.spans() 877 878 879 880 881 882