# Copyright 2008 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""The highlight module contains classes and functions for displaying short
excerpts from hit documents in the search results you present to the user,
with query terms highlighted.

The highlighting system has four main elements.

* **Fragmenters** chop up the original text into *fragments*, based on the
  locations of matched terms in the text.

* **Scorers** assign a score to each fragment, allowing the system to rank
  the best fragments by whatever criterion you choose.

* **Order functions** control the order in which the top-scoring fragments
  are presented to the user. For example, you can show the fragments in the
  order they appear in the document (FIRST) or show higher-scoring fragments
  first (SCORE).

* **Formatters** turn the fragment objects into human-readable output, such
  as an HTML string.

See :doc:`/highlight` for more information.
"""

from __future__ import division
from collections import deque
from heapq import nlargest
from itertools import groupby

from whoosh.compat import htmlescape
from whoosh.analysis import Token


# The default value for the maximum chars to examine when fragmenting
DEFAULT_CHARLIMIT = 2 ** 15


# Fragment object

def mkfrag(text, tokens, startchar=None, endchar=None,
           charsbefore=0, charsafter=0):
    """Returns a :class:`Fragment` object based on the :class:`analysis.Token`
    objects in ``tokens``.
    """

    if startchar is None:
        startchar = tokens[0].startchar if tokens else 0
    if endchar is None:
        endchar = tokens[-1].endchar if tokens else len(text)

    startchar = max(0, startchar - charsbefore)
    endchar = min(len(text), endchar + charsafter)

    return Fragment(text, tokens, startchar, endchar)
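
# Illustrative sketch (not part of the API): how the optional charsbefore/
# charsafter arguments pad a fragment with extra context, clamped to the
# bounds of the text. The sample text and token are invented; Token stores
# arbitrary keyword arguments as attributes.

def _example_mkfrag():
    text = u"alpha bravo charlie delta"
    token = Token(text=u"bravo", startchar=6, endchar=11)
    frag = mkfrag(text, [token], charsbefore=5, charsafter=5)
    # The window is widened by 5 chars on each side:
    # frag.startchar == 1, frag.endchar == 16
    return frag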

class Fragment(object):
    """Represents a fragment (extract) from a hit document. This object is
    mainly used to keep track of the start and end points of the fragment and
    the "matched" character ranges inside; it does not contain the text of the
    fragment or do much else.

    The useful attributes are:

    ``Fragment.text``
        The entire original text from which this fragment is taken.

    ``Fragment.matches``
        An ordered list of objects representing the matched terms in the
        fragment. These objects have ``startchar`` and ``endchar`` attributes.

    ``Fragment.startchar``
        The index of the first character in the fragment.

    ``Fragment.endchar``
        The index immediately after the last character in the fragment.

    ``Fragment.matched_terms``
        A ``set`` of the ``text`` of the matched terms in the fragment (if
        available).
    """

    def __init__(self, text, matches, startchar=0, endchar=-1):
        """
        :param text: the source text of the fragment.
        :param matches: a list of objects which have ``startchar`` and
            ``endchar`` attributes, and optionally a ``text`` attribute.
        :param startchar: the index into ``text`` at which the fragment starts.
            The default is 0.
        :param endchar: the index into ``text`` at which the fragment ends.
            The default is -1, which is interpreted as the length of ``text``.
        """

        self.text = text
        self.matches = matches

        if endchar == -1:
            endchar = len(text)
        self.startchar = startchar
        self.endchar = endchar

        self.matched_terms = set()
        for t in matches:
            if hasattr(t, "text"):
                self.matched_terms.add(t.text)

    def __repr__(self):
        return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
                                        len(self.matches))

    def __len__(self):
        return self.endchar - self.startchar

    def overlaps(self, fragment):
        sc = self.startchar
        ec = self.endchar
        fsc = fragment.startchar
        fec = fragment.endchar
        return (sc < fsc < ec) or (sc < fec < ec)

    def overlapped_length(self, fragment):
        # Note: this returns the length of the span covering both fragments,
        # not the length of just the overlapping region
        sc = self.startchar
        ec = self.endchar
        fsc = fragment.startchar
        fec = fragment.endchar
        return max(ec, fec) - min(sc, fsc)

    def __lt__(self, other):
        return id(self) < id(other)


# Tokenizing

def set_matched_filter(tokens, termset):
    for t in tokens:
        t.matched = t.text in termset
        yield t


# Fragmenters

class Fragmenter(object):
    def must_retokenize(self):
        """Returns True if this fragmenter requires retokenized text.

        If this method returns True, the fragmenter's ``fragment_tokens``
        method will be called with an iterator of ALL tokens from the text,
        with the tokens for matched terms having the ``matched`` attribute set
        to True.

        If this method returns False, the fragmenter's ``fragment_matches``
        method will be called with a LIST of matching tokens.
        """

        return True

    def fragment_tokens(self, text, all_tokens):
        """Yields :class:`Fragment` objects based on the tokenized text.

        :param text: the string being highlighted.
        :param all_tokens: an iterator of :class:`analysis.Token` objects
            from the string.
        """

        raise NotImplementedError

    def fragment_matches(self, text, matched_tokens):
        """Yields :class:`Fragment` objects based on the text and the matched
        terms.

        :param text: the string being highlighted.
        :param matched_tokens: a list of :class:`analysis.Token` objects
            representing the term matches in the string.
        """

        raise NotImplementedError
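
# Illustrative sketch (not part of the API): a minimal custom retokenizing
# fragmenter, to show the shape of the interface. It emits one fixed-size
# fragment around each matched token; the real fragmenters below are more
# careful about limits, context, and overlaps.

class _ExampleWindowFragmenter(Fragmenter):
    def __init__(self, window=30):
        self.window = window

    def fragment_tokens(self, text, all_tokens):
        # must_retokenize() is inherited and returns True, so this method
        # receives every token, with .matched set on the matching ones
        for t in all_tokens:
            if t.matched:
                yield mkfrag(text, [t.copy()],
                             charsbefore=self.window, charsafter=self.window)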

class WholeFragmenter(Fragmenter):
    """Doesn't fragment the token stream. This object just returns the entire
    stream as one "fragment". This is useful if you want to highlight the
    entire text.

    Note that even if you use the `WholeFragmenter`, the highlight code will
    return no fragment if no terms matched in the given field. To return the
    whole fragment even in that case, call `highlights()` with `minscore=0`::

        # Query where no terms match in the "text" field
        q = query.Term("tag", "new")

        r = mysearcher.search(q)
        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        # Since no terms in the "text" field matched, we get no fragments back
        assert r[0].highlights("text") == ""

        # If we lower the minimum score to 0, we get a fragment even though it
        # has no matching terms
        assert r[0].highlights("text", minscore=0) == "This is the text field."
    """

    def __init__(self, charlimit=DEFAULT_CHARLIMIT):
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        charlimit = self.charlimit
        matches = []
        for t in tokens:
            if charlimit and t.endchar > charlimit:
                break
            if t.matched:
                matches.append(t.copy())
        return [Fragment(text, matches)]


# Backwards compatibility
NullFragmenter = WholeFragmenter


class SentenceFragmenter(Fragmenter):
    """Breaks the text up on sentence-ending punctuation characters
    (".", "!", or "?"). This object works by looking in the original text for
    a sentence end as the next character after each token's ``endchar``.

    When highlighting with this fragmenter, you should use an analyzer that
    does NOT remove stop words, for example::

        sa = StandardAnalyzer(stoplist=None)
    """

    def __init__(self, maxchars=200, sentencechars=".!?",
                 charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        """

        self.maxchars = maxchars
        self.sentencechars = frozenset(sentencechars)
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        maxchars = self.maxchars
        sentencechars = self.sentencechars
        charlimit = self.charlimit

        textlen = len(text)
        # startchar of the first token in the current sentence
        first = None
        # Buffer for matched tokens in the current sentence
        tks = []
        endchar = None
        # Number of chars in the current sentence
        currentlen = 0

        for t in tokens:
            startchar = t.startchar
            endchar = t.endchar
            if charlimit and endchar > charlimit:
                break

            if first is None:
                # Remember the startchar of the first token in a sentence
                first = startchar
                currentlen = 0

            tlength = endchar - startchar
            currentlen += tlength

            if t.matched:
                tks.append(t.copy())

            # If the character after the current token is end-of-sentence
            # punctuation, finish the sentence and reset
            if endchar < textlen and text[endchar] in sentencechars:
                # Don't break for two periods in a row (e.g. ignore "...")
                if endchar + 1 < textlen and text[endchar + 1] in sentencechars:
                    continue

                # If the sentence had matches and it's not too long, yield it
                # as a fragment
                if tks and currentlen <= maxchars:
                    yield mkfrag(text, tks, startchar=first, endchar=endchar)
                # Reset the counts
                tks = []
                first = None
                currentlen = 0

        # If we get to the end of the text and there's still a sentence
        # in the buffer, yield it
        if tks:
            yield mkfrag(text, tks, startchar=first, endchar=endchar)
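
# Illustrative sketch (not part of the API): SentenceFragmenter driven
# through the module-level highlight() function (defined further down),
# using an analyzer that keeps stop words as the class docstring recommends.
# The sample text and term are invented.

def _example_sentence_highlight():
    from whoosh.analysis import StandardAnalyzer

    text = u"This is a sentence. This one mentions indexing. The end."
    sa = StandardAnalyzer(stoplist=None)
    return highlight(text, frozenset([u"indexing"]), sa,
                     SentenceFragmenter(), UppercaseFormatter())
    # -> u"This one mentions INDEXING"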

class ContextFragmenter(Fragmenter):
    """Looks for matched terms and aggregates them with their surrounding
    context.
    """

    def __init__(self, maxchars=200, surround=20, charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param surround: The number of extra characters of context to add both
            before the first matched term and after the last matched term.
        """

        self.maxchars = maxchars
        self.surround = surround
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        maxchars = self.maxchars
        surround = self.surround
        charlimit = self.charlimit

        # startchar of the first token in the fragment
        first = None
        # Stack of startchars
        firsts = deque()
        # Each time we see a matched token, we reset the countdown to
        # finishing the fragment. This also indicates whether we're currently
        # inside a fragment (< 0 not in fragment, >= 0 in fragment)
        countdown = -1
        # Tokens in the current fragment
        tks = []
        endchar = None
        # Number of chars in the current fragment
        currentlen = 0

        for t in tokens:
            startchar = t.startchar
            endchar = t.endchar
            tlength = endchar - startchar
            if charlimit and endchar > charlimit:
                break

            if countdown < 0 and not t.matched:
                # We're not in a fragment currently, so just maintain the
                # "charsbefore" buffer
                firsts.append(startchar)
                while firsts and endchar - firsts[0] > surround:
                    firsts.popleft()
            elif currentlen + tlength > maxchars:
                # We're in a fragment, but adding this token would put us past
                # the maximum size. Zero the countdown so the code below will
                # cause the fragment to be emitted
                countdown = 0
            elif t.matched:
                # Start/restart the countdown
                countdown = surround
                # Remember the first char of this fragment
                if first is None:
                    if firsts:
                        first = firsts[0]
                    else:
                        first = startchar
                        # Add on unused front context
                        countdown += surround
                tks.append(t.copy())

            # If we're in a fragment...
            if countdown >= 0:
                # Update the counts
                currentlen += tlength
                countdown -= tlength

                # If the countdown is expired
                if countdown <= 0:
                    # Finish the fragment
                    yield mkfrag(text, tks, startchar=first, endchar=endchar)
                    # Reset the counts
                    tks = []
                    firsts = deque()
                    first = None
                    currentlen = 0

        # If there's a fragment left over at the end, yield it
        if tks:
            yield mkfrag(text, tks, startchar=first, endchar=endchar)
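
# Illustrative sketch (not part of the API): ContextFragmenter with the
# module-level highlight() function. Unlike SentenceFragmenter, it builds
# fragments from a window of characters around each match, so the result
# includes some words on either side of the matched term. The sample text
# and term are invented.

def _example_context_highlight():
    from whoosh.analysis import StandardAnalyzer

    text = u"The quick brown fox jumped over the lazy dog"
    sa = StandardAnalyzer(stoplist=None)
    return highlight(text, frozenset([u"fox"]), sa,
                     ContextFragmenter(maxchars=100, surround=20),
                     UppercaseFormatter())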

class PinpointFragmenter(Fragmenter):
    """This is a NON-RETOKENIZING fragmenter. It builds fragments from the
    positions of the matched terms.
    """

    def __init__(self, maxchars=200, surround=20, autotrim=False,
                 charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param surround: The number of extra characters of context to add both
            before the first matched term and after the last matched term.
        :param autotrim: automatically trims text before the first space and
            after the last space in the fragments, to try to avoid truncated
            words at the start and end. For short fragments or fragments with
            long runs between spaces this may give strange results.
        """

        self.maxchars = maxchars
        self.surround = surround
        self.autotrim = autotrim
        self.charlimit = charlimit

    def must_retokenize(self):
        return False

    def fragment_tokens(self, text, tokens):
        matched = [t for t in tokens if t.matched]
        return self.fragment_matches(text, matched)

    @staticmethod
    def _autotrim(fragment):
        text = fragment.text
        startchar = fragment.startchar
        endchar = fragment.endchar

        firstspace = text.find(" ", startchar, endchar)
        if firstspace > 0:
            startchar = firstspace + 1
        lastspace = text.rfind(" ", startchar, endchar)
        if lastspace > 0:
            endchar = lastspace

        # Make sure the trimmed fragment still covers all the matches
        if fragment.matches:
            startchar = min(startchar, fragment.matches[0].startchar)
            endchar = max(endchar, fragment.matches[-1].endchar)

        fragment.startchar = startchar
        fragment.endchar = endchar

    def fragment_matches(self, text, tokens):
        maxchars = self.maxchars
        surround = self.surround
        autotrim = self.autotrim
        charlimit = self.charlimit

        j = -1

        for i, t in enumerate(tokens):
            if j >= i:
                continue
            j = i
            left = t.startchar
            right = t.endchar
            if charlimit and right > charlimit:
                break

            # Absorb following tokens into this fragment while they are close
            # enough and the fragment stays under the size limit
            currentlen = right - left
            while j < len(tokens) - 1 and currentlen < maxchars:
                next_token = tokens[j + 1]
                ec = next_token.endchar
                if ec - right <= surround and ec - left <= maxchars:
                    j += 1
                    right = ec
                    currentlen += (ec - next_token.startchar)
                else:
                    break

            left = max(0, left - surround)
            right = min(len(text), right + surround)
            fragment = Fragment(text, tokens[i:j + 1], left, right)
            if autotrim:
                self._autotrim(fragment)
            yield fragment


# Fragment scorers

class FragmentScorer(object):
    pass


class BasicFragmentScorer(FragmentScorer):
    def __call__(self, f):
        # Add up the boosts for the matched terms in this passage
        score = sum(t.boost for t in f.matches)

        # Favor diversity: multiply score by the number of separate
        # terms matched
        score *= (len(f.matched_terms) * 100) or 1

        return score


# Fragment sorters

def SCORE(fragment):
    "Sorts higher scored passages first."
    # top_fragments() selects the fragments in descending score order, and
    # Python's sort is stable, so a constant key preserves that order
    return 1


def FIRST(fragment):
    "Sorts passages from earlier in the document first."
    return fragment.startchar


def LONGER(fragment):
    "Sorts longer passages first."
    return 0 - len(fragment)


def SHORTER(fragment):
    "Sorts shorter passages first."
    return len(fragment)


# Formatters

def get_text(original, token, replace):
    """Convenience function for getting the text to use for a match when
    formatting.

    If ``replace`` is False, returns the part of ``original`` between
    ``token.startchar`` and ``token.endchar``. If ``replace`` is True, returns
    ``token.text``.
    """

    if replace:
        return token.text
    else:
        return original[token.startchar:token.endchar]
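
# Illustrative sketch (not part of the API): order functions like FIRST and
# SCORE above are just sort keys, so you can pass your own wherever an
# ``order`` argument is accepted. This one shows fragments with more
# distinct matched terms first.

def MOST_DISTINCT(fragment):
    "Sorts passages with more distinct matched terms first."
    return -len(fragment.matched_terms)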

class Formatter(object):
    """Base class for formatters.

    For highlighters that return strings, it is usually only necessary to
    override :meth:`Formatter.format_token`.

    Use the :func:`get_text` function as a convenience to get the token text::

        class MyFormatter(Formatter):
            def format_token(self, text, token, replace=False):
                ttext = get_text(text, token, replace)
                return "[%s]" % ttext
    """

    between = "..."

    def _text(self, text):
        return text

    def format_token(self, text, token, replace=False):
        """Returns a formatted version of the given "token" object, which
        should have at least ``startchar`` and ``endchar`` attributes, and
        a ``text`` attribute if ``replace`` is True.

        :param text: the original fragment text being highlighted.
        :param token: an object having ``startchar`` and ``endchar``
            attributes and optionally a ``text`` attribute (if ``replace``
            is True).
        :param replace: if True, the original text between the token's
            ``startchar`` and ``endchar`` indices will be replaced with the
            value of the token's ``text`` attribute.
        """

        raise NotImplementedError

    def format_fragment(self, fragment, replace=False):
        """Returns a formatted version of the given text, using the "token"
        objects in the given :class:`Fragment`.

        :param fragment: a :class:`Fragment` object representing a list of
            matches in the text.
        :param replace: if True, the original text corresponding to each
            match will be replaced with the value of the token object's
            ``text`` attribute.
        """

        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
                output.append(self._text(text[index:t.startchar]))
            output.append(self.format_token(text, t, replace))
            index = t.endchar
        output.append(self._text(text[index:fragment.endchar]))

        out_string = "".join(output)
        return out_string

    def format(self, fragments, replace=False):
        """Returns a formatted version of the given text, using a list of
        :class:`Fragment` objects.
        """

        formatted = [self.format_fragment(f, replace=replace)
                     for f in fragments]
        return self.between.join(formatted)

    def __call__(self, text, fragments):
        # For backwards compatibility
        return self.format(fragments)


class NullFormatter(Formatter):
    """Formatter that does not modify the string.
    """

    def format_token(self, text, token, replace=False):
        return get_text(text, token, replace)


class UppercaseFormatter(Formatter):
    """Returns a string in which the matched terms are in UPPERCASE.
    """

    def __init__(self, between="..."):
        """
        :param between: the text to add between fragments.
        """

        self.between = between

    def format_token(self, text, token, replace=False):
        ttxt = get_text(text, token, replace)
        return ttxt.upper()
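
# Illustrative sketch (not part of the API): a complete custom string
# formatter, following the pattern in the Formatter base class docstring.
# Only format_token() needs to be overridden; format_fragment() and format()
# are inherited.

class _ExampleBracketFormatter(Formatter):
    def format_token(self, text, token, replace=False):
        # Wrap each matched term in square brackets
        return "[%s]" % get_text(text, token, replace)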

class HtmlFormatter(Formatter):
    """Returns a string containing HTML formatting around the matched terms.

    This formatter wraps matched terms in an HTML element with two class
    names. The first class name (set with the constructor argument
    ``classname``) is the same for each match. The second class name (set
    with the constructor argument ``termclass``) is different depending on
    which term matched. This allows you to give different formatting (for
    example, different background colors) to the different terms in the
    excerpt.

    >>> hf = HtmlFormatter(tagname="span", classname="match", termclass="term")
    >>> hf(mytext, myfragments)
    "The <span class="match term0">template</span> <span class="match term1">geometry</span> is..."

    This object maintains a dictionary mapping terms to HTML class names
    (e.g. ``term0`` and ``term1`` above), so that multiple excerpts will use
    the same class for the same term. If you want to re-use the same
    HtmlFormatter object with different searches, you should call
    HtmlFormatter.clean() between searches to clear the mapping.
    """

    template = '<%(tag)s class=%(q)s%(cls)s%(tn)s%(q)s>%(t)s</%(tag)s>'

    def __init__(self, tagname="strong", between="...",
                 classname="match", termclass="term", maxclasses=5,
                 attrquote='"'):
        """
        :param tagname: the tag to wrap around matching terms.
        :param between: the text to add between fragments.
        :param classname: the class name to add to the elements wrapped
            around matching terms.
        :param termclass: the class name prefix for the second class, which
            is different for each matched term.
        :param maxclasses: the maximum number of term classes to produce.
            This limits the number of classes you have to define in CSS by
            recycling term class names. For example, if you set maxclasses to
            3 and have 5 terms, the 5 terms will use the CSS classes
            ``term0``, ``term1``, ``term2``, ``term0``, ``term1``.
        """

        self.between = between
        self.tagname = tagname
        self.classname = classname
        self.termclass = termclass
        self.attrquote = attrquote
        self.maxclasses = maxclasses
        self.seen = {}
        self.htmlclass = " ".join((self.classname, self.termclass))

    def _text(self, text):
        return htmlescape(text, quote=False)

    def format_token(self, text, token, replace=False):
        seen = self.seen
        ttext = self._text(get_text(text, token, replace))
        if ttext in seen:
            termnum = seen[ttext]
        else:
            termnum = len(seen) % self.maxclasses
            seen[ttext] = termnum

        return self.template % {"tag": self.tagname, "q": self.attrquote,
                                "cls": self.htmlclass, "t": ttext,
                                "tn": termnum}

    def clean(self):
        """Clears the dictionary mapping terms to HTML class names.
        """
        self.seen = {}
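
# Illustrative sketch (not part of the API): what a single formatted match
# looks like. The token here is a stand-in built directly with
# whoosh.analysis.Token rather than coming from a real analysis chain.

def _example_html_format_token():
    hf = HtmlFormatter(tagname="span", maxclasses=2)
    fake = Token(text=u"apple", startchar=0, endchar=5)
    return hf.format_token(u"apple", fake, replace=True)
    # -> u'<span class="match term0">apple</span>'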

class GenshiFormatter(Formatter):
    """Returns a Genshi event stream containing HTML formatting around the
    matched terms.
    """

    def __init__(self, qname="strong", between="..."):
        """
        :param qname: the QName for the tag to wrap around matched terms.
        :param between: the text to add between fragments.
        """

        self.qname = qname
        self.between = between

        from genshi.core import START, END, TEXT  # @UnresolvedImport
        from genshi.core import Attrs, Stream  # @UnresolvedImport
        self.START, self.END, self.TEXT = START, END, TEXT
        self.Attrs, self.Stream = Attrs, Stream

    def _add_text(self, text, output):
        if output and output[-1][0] == self.TEXT:
            output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])
        else:
            output.append((self.TEXT, text, (None, -1, -1)))

    def format_token(self, text, token, replace=False):
        qn = self.qname
        txt = get_text(text, token, replace)
        return self.Stream([(self.START, (qn, self.Attrs()), (None, -1, -1)),
                            (self.TEXT, txt, (None, -1, -1)),
                            (self.END, qn, (None, -1, -1))])

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar > index:
                self._add_text(text[index:t.startchar], output)
            # Extend the output with the events for the matched term (rather
            # than appending the raw arguments as a tuple)
            output.extend(self.format_token(text, t, replace))
            index = t.endchar
        if index < len(text):
            self._add_text(text[index:], output)
        return self.Stream(output)

    def format(self, fragments, replace=False):
        output = []
        first = True
        for fragment in fragments:
            if not first:
                self._add_text(self.between, output)
            output += self.format_fragment(fragment, replace=replace)
            first = False
        return self.Stream(output)


# Highlighting

def top_fragments(fragments, count, scorer, order, minscore=1):
    scored_fragments = ((scorer(f), f) for f in fragments)
    scored_fragments = nlargest(count, scored_fragments)
    best_fragments = [sf for score, sf in scored_fragments
                      if score >= minscore]
    best_fragments.sort(key=order)
    return best_fragments


def highlight(text, terms, analyzer, fragmenter, formatter, top=3,
              scorer=None, minscore=1, order=FIRST, mode="query"):
    if scorer is None:
        scorer = BasicFragmentScorer()

    if type(fragmenter) is type:
        fragmenter = fragmenter()
    if type(formatter) is type:
        formatter = formatter()
    if type(scorer) is type:
        scorer = scorer()

    termset = frozenset(terms)
    tokens = analyzer(text, chars=True, mode=mode, removestops=False)
    tokens = set_matched_filter(tokens, termset)
    fragments = fragmenter.fragment_tokens(text, tokens)
    fragments = top_fragments(fragments, top, scorer, order, minscore)
    return formatter(text, fragments)
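
# Illustrative sketch (not part of the API): the convenience highlight()
# function run end to end. As the code above shows, classes may be passed in
# place of instances; they are instantiated with default arguments. The
# sample text and term are invented.

def _example_highlight():
    from whoosh.analysis import StandardAnalyzer

    text = u"The template geometry is defined in the template file"
    sa = StandardAnalyzer(stoplist=None)
    # Both occurrences of "template" get the same term class in the output
    return highlight(text, frozenset([u"template"]), sa,
                     WholeFragmenter, HtmlFormatter)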

class Highlighter(object):
    def __init__(self, fragmenter=None, scorer=None, formatter=None,
                 always_retokenize=False, order=FIRST):
        self.fragmenter = fragmenter or ContextFragmenter()
        self.scorer = scorer or BasicFragmentScorer()
        self.formatter = formatter or HtmlFormatter(tagname="b")
        self.order = order
        self.always_retokenize = always_retokenize

    def can_load_chars(self, results, fieldname):
        # Is it possible to build a mapping between the matched terms/docs
        # and their start and end chars for "pinpoint" highlighting (i.e.
        # without re-tokenizing the text)?

        if self.always_retokenize:
            # No, we've been configured to always retokenize some text
            return False
        if not results.has_matched_terms():
            # No, we don't know what the matched terms are yet
            return False
        if self.fragmenter.must_retokenize():
            # No, the configured fragmenter doesn't support it
            return False

        # Maybe, if the field was configured to store characters
        field = results.searcher.schema[fieldname]
        return field.supports("characters")

    @staticmethod
    def _load_chars(results, fieldname, texts, to_bytes):
        # For each docnum, create a mapping of text -> [(startchar, endchar)]
        # for the matched terms

        results._char_cache[fieldname] = cache = {}
        sorted_ids = sorted(docnum for _, docnum in results.top_n)

        for docnum in sorted_ids:
            cache[docnum] = {}

        for text in texts:
            btext = to_bytes(text)
            m = results.searcher.postings(fieldname, btext)
            docset = set(results.termdocs[(fieldname, btext)])
            for docnum in sorted_ids:
                if docnum in docset:
                    m.skip_to(docnum)
                    assert m.id() == docnum
                    cache[docnum][text] = m.value_as("characters")

    @staticmethod
    def _merge_matched_tokens(tokens):
        # Merges consecutive matched tokens together, so they are highlighted
        # as one

        token = None

        for t in tokens:
            if not t.matched:
                if token is not None:
                    yield token
                    token = None
                yield t
                continue

            if token is None:
                token = t.copy()
            elif t.startchar <= token.endchar:
                if t.endchar > token.endchar:
                    token.text += t.text[token.endchar - t.endchar:]
                    token.endchar = t.endchar
            else:
                # t doesn't touch the buffered token; emit the buffer and
                # start a new merge candidate from t, so that later tokens
                # can still merge with it
                yield token
                token = t.copy()

        if token is not None:
            yield token
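    # Illustrative sketch (not part of the API): overlapping matched tokens
    # are merged into a single token, so e.g. matches for "full" (chars 0-4)
    # and "fulltext" (chars 0-8) are highlighted as one region:
    #
    #     t1 = Token(text=u"full", matched=True, startchar=0, endchar=4)
    #     t2 = Token(text=u"fulltext", matched=True, startchar=0, endchar=8)
    #     merged = list(Highlighter._merge_matched_tokens([t1, t2]))
    #     # merged is a single token with text u"fulltext" and endchar 8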
    def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
        results = hitobj.results
        schema = results.searcher.schema
        field = schema[fieldname]
        to_bytes = field.to_bytes
        from_bytes = field.from_bytes

        if text is None:
            if fieldname not in hitobj:
                raise KeyError("Field %r is not stored." % fieldname)
            text = hitobj[fieldname]

        # Get the terms searched for/matched in this field
        if results.has_matched_terms():
            bterms = (term for term in results.matched_terms()
                      if term[0] == fieldname)
        else:
            bterms = results.query_terms(expand=True, fieldname=fieldname)
        # Convert bytes to unicode
        words = frozenset(from_bytes(term[1]) for term in bterms)

        # If we can do "pinpoint" highlighting...
        if self.can_load_chars(results, fieldname):
            # Build the docnum -> [(startchar, endchar)] map
            if fieldname not in results._char_cache:
                self._load_chars(results, fieldname, words, to_bytes)

            hitterms = (from_bytes(term[1])
                        for term in hitobj.matched_terms()
                        if term[0] == fieldname)

            # Grab the word -> [(startchar, endchar)] map for this docnum
            cmap = results._char_cache[fieldname][hitobj.docnum]
            # A list of Token objects for matched words
            tokens = []
            charlimit = self.fragmenter.charlimit
            for word in hitterms:
                chars = cmap[word]
                for pos, startchar, endchar in chars:
                    if charlimit and endchar > charlimit:
                        break
                    tokens.append(Token(text=word, pos=pos,
                                        startchar=startchar, endchar=endchar))
            tokens.sort(key=lambda t: t.startchar)
            tokens = [max(group, key=lambda t: t.endchar - t.startchar)
                      for key, group in groupby(tokens, lambda t: t.startchar)]
            fragments = self.fragmenter.fragment_matches(text, tokens)
        else:
            # Retokenize the text
            analyzer = results.searcher.schema[fieldname].analyzer
            tokens = analyzer(text, positions=True, chars=True, mode="index",
                              removestops=False)
            # Set Token.matched attribute for tokens that match a query term
            tokens = set_matched_filter(tokens, words)
            tokens = self._merge_matched_tokens(tokens)
            fragments = self.fragmenter.fragment_tokens(text, tokens)

        fragments = top_fragments(fragments, top, self.scorer, self.order,
                                  minscore=minscore)
        output = self.formatter.format(fragments)
        return output
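
# Illustrative sketch (not part of the API): wiring a configured Highlighter
# into a search. The searcher, query, and "content" field name are invented
# for this example; passing terms=True to search() records the matched terms
# so that pinpoint highlighting can be used when the field supports it.

def _example_search_highlight(searcher, query):
    hl = Highlighter(fragmenter=PinpointFragmenter(autotrim=True),
                     formatter=HtmlFormatter(tagname="em"))
    results = searcher.search(query, terms=True)
    return [hl.highlight_hit(hit, "content") for hit in results]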