# Copyright 2008 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""The highlight module contains classes and functions for displaying short
excerpts from hit documents in the search results you present to the user,
with query terms highlighted.

The highlighting system has four main elements:

* **Fragmenters** chop up the original text into *fragments*, based on the
  locations of matched terms in the text.

* **Scorers** assign a score to each fragment, allowing the system to rank the
  best fragments by whatever criterion you choose.

* **Order functions** control the order in which the top-scoring fragments are
  presented to the user. For example, you can show the fragments in the order
  they appear in the document (FIRST) or show higher-scoring fragments first
  (SCORE).

* **Formatters** turn the fragment objects into human-readable output, such as
  an HTML string.

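A typical way to use the system is through the :class:`Highlighter` class
defined below. For example, a minimal sketch (``mysearcher``, ``myquery``, and
the ``"content"`` field name are placeholders for your own objects)::

    from whoosh.highlight import (ContextFragmenter, Highlighter,
                                  HtmlFormatter)

    hl = Highlighter(fragmenter=ContextFragmenter(surround=30),
                     formatter=HtmlFormatter(tagname="em"))
    results = mysearcher.search(myquery)
    for hit in results:
        print(hl.highlight_hit(hit, "content"))
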
See :doc:`/highlight` for more information.
"""

from __future__ import division
from collections import deque
from heapq import nlargest
from itertools import groupby

from whoosh.compat import htmlescape
from whoosh.analysis import Token


# The default value for the maximum chars to examine when fragmenting
DEFAULT_CHARLIMIT = 2 ** 15


# Fragment object

def mkfrag(text, tokens, startchar=None, endchar=None,
           charsbefore=0, charsafter=0):
    """Returns a :class:`Fragment` object based on the :class:`analysis.Token`
    objects in ``tokens``.
    """

    if startchar is None:
        startchar = tokens[0].startchar if tokens else 0
    if endchar is None:
        endchar = tokens[-1].endchar if tokens else len(text)

    startchar = max(0, startchar - charsbefore)
    endchar = min(len(text), endchar + charsafter)

    return Fragment(text, tokens, startchar, endchar)


class Fragment(object):
    """Represents a fragment (extract) from a hit document. This object is
    mainly used to keep track of the start and end points of the fragment and
    the "matched" character ranges inside; it does not contain the text of the
    fragment or do much else.

    The useful attributes are:

    ``Fragment.text``
        The entire original text from which this fragment is taken.

    ``Fragment.matches``
        An ordered list of objects representing the matched terms in the
        fragment. These objects have ``startchar`` and ``endchar`` attributes.

    ``Fragment.startchar``
        The index of the first character in the fragment.

    ``Fragment.endchar``
        The index of the character following the end of the fragment (that is,
        the fragment covers ``text[startchar:endchar]``).

    ``Fragment.matched_terms``
        A ``set`` of the ``text`` of the matched terms in the fragment (if
        available).
    """

    def __init__(self, text, matches, startchar=0, endchar=-1):
        """
        :param text: the source text of the fragment.
        :param matches: a list of objects which have ``startchar`` and
            ``endchar`` attributes, and optionally a ``text`` attribute.
        :param startchar: the index into ``text`` at which the fragment starts.
            The default is 0.
        :param endchar: the index into ``text`` at which the fragment ends.
            The default is -1, which is interpreted as the length of ``text``.
        """

        self.text = text
        self.matches = matches

        if endchar == -1:
            endchar = len(text)
        self.startchar = startchar
        self.endchar = endchar

        self.matched_terms = set()
        for t in matches:
            if hasattr(t, "text"):
                self.matched_terms.add(t.text)

    def __repr__(self):
        return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
                                        len(self.matches))

    def __len__(self):
        return self.endchar - self.startchar

    def overlaps(self, fragment):
        sc = self.startchar
        ec = self.endchar
        fsc = fragment.startchar
        fec = fragment.endchar
        return (sc < fsc < ec) or (sc < fec < ec)

    def overlapped_length(self, fragment):
        sc = self.startchar
        ec = self.endchar
        fsc = fragment.startchar
        fec = fragment.endchar
        return max(ec, fec) - min(sc, fsc)

    def __lt__(self, other):
        return id(self) < id(other)


# Tokenizing

def set_matched_filter(tokens, termset):
    for t in tokens:
        t.matched = t.text in termset
        yield t


# Fragmenters

class Fragmenter(object):
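    """Base class for fragmenters, which split the highlighted text into
    :class:`Fragment` objects around the matched terms.

    Subclasses override :meth:`Fragmenter.fragment_tokens` (and
    :meth:`Fragmenter.fragment_matches` when :meth:`must_retokenize` returns
    False). As a minimal sketch, a fragmenter that puts each matched token in
    its own fragment could look like this::

        class EachMatchFragmenter(Fragmenter):
            def fragment_tokens(self, text, tokens):
                for t in tokens:
                    if t.matched:
                        yield mkfrag(text, [t.copy()])
    """
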
    def must_retokenize(self):
        """Returns True if this fragmenter requires retokenized text.

        If this method returns True, the fragmenter's ``fragment_tokens``
        method will be called with an iterator of ALL tokens from the text,
        with the tokens for matched terms having the ``matched`` attribute set
        to True.

        If this method returns False, the fragmenter's ``fragment_matches``
        method will be called with a LIST of matching tokens.
        """

        return True

    def fragment_tokens(self, text, all_tokens):
        """Yields :class:`Fragment` objects based on the tokenized text.

        :param text: the string being highlighted.
        :param all_tokens: an iterator of :class:`analysis.Token`
            objects from the string.
        """

        raise NotImplementedError

    def fragment_matches(self, text, matched_tokens):
        """Yields :class:`Fragment` objects based on the text and the matched
        terms.

        :param text: the string being highlighted.
        :param matched_tokens: a list of :class:`analysis.Token` objects
            representing the term matches in the string.
        """

        raise NotImplementedError


class WholeFragmenter(Fragmenter):
    """Doesn't fragment the token stream. This object just returns the entire
    stream as one "fragment". This is useful if you want to highlight the
    entire text.

    Note that even if you use the `WholeFragmenter`, the highlight code will
    return no fragment if no terms matched in the given field. To return the
    whole fragment even in that case, call `highlights()` with `minscore=0`::

        # Query where no terms match in the "text" field
        q = query.Term("tag", "new")

        r = mysearcher.search(q)
        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        # Since no terms in the "text" field matched, we get no fragments back
        assert r[0].highlights("text") == ""

        # If we lower the minimum score to 0, we get a fragment even though it
        # has no matching terms
        assert r[0].highlights("text", minscore=0) == "This is the text field."

    """

    def __init__(self, charlimit=DEFAULT_CHARLIMIT):
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        charlimit = self.charlimit
        matches = []
        for t in tokens:
            if charlimit and t.endchar > charlimit:
                break
            if t.matched:
                matches.append(t.copy())
        return [Fragment(text, matches)]


# Backwards compatibility
NullFragmeter = WholeFragmenter


class SentenceFragmenter(Fragmenter):
    """Breaks the text up on sentence end punctuation characters
    (".", "!", or "?"). This object works by looking in the original text for a
    sentence end as the next character after each token's 'endchar'.

    When highlighting with this fragmenter, you should use an analyzer that
    does NOT remove stop words, for example::

        sa = StandardAnalyzer(stoplist=None)
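
    To use this fragmenter, install it on the results object or a
    :class:`Highlighter`, for example::

        results.fragmenter = SentenceFragmenter(maxchars=300)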
    """

    def __init__(self, maxchars=200, sentencechars=".!?",
                 charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param sentencechars: The characters to treat as sentence-ending
            punctuation.
        :param charlimit: The maximum number of characters of text to examine
            when fragmenting. Text past this limit is ignored.
        """

        self.maxchars = maxchars
        self.sentencechars = frozenset(sentencechars)
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        maxchars = self.maxchars
        sentencechars = self.sentencechars
        charlimit = self.charlimit

        textlen = len(text)
        # startchar of first token in the current sentence
        first = None
        # Buffer for matched tokens in the current sentence
        tks = []
        endchar = None
        # Number of chars in the current sentence
        currentlen = 0

        for t in tokens:
            startchar = t.startchar
            endchar = t.endchar
            if charlimit and endchar > charlimit:
                break

            if first is None:
                # Remember the startchar of the first token in a sentence
                first = startchar
                currentlen = 0

            tlength = endchar - startchar
            currentlen += tlength

            if t.matched:
                tks.append(t.copy())

            # If the character after the current token is end-of-sentence
            # punctuation, finish the sentence and reset
            if endchar < textlen and text[endchar] in sentencechars:
                # Don't break for two periods in a row (e.g. ignore "...")
                if endchar + 1 < textlen and text[endchar + 1] in sentencechars:
                    continue

                # If the sentence had matches and it's not too long, yield it
                # as a fragment
                if tks and currentlen <= maxchars:
                    yield mkfrag(text, tks, startchar=first, endchar=endchar)
                # Reset the counts
                tks = []
                first = None
                currentlen = 0

        # If we get to the end of the text and there's still a sentence
        # in the buffer, yield it
        if tks:
            yield mkfrag(text, tks, startchar=first, endchar=endchar)


class ContextFragmenter(Fragmenter):
    """Looks for matched terms and aggregates them with their surrounding
    context.
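
    For example, to use fragments of at most 120 characters with 40 characters
    of context around each matched term::

        results.fragmenter = ContextFragmenter(maxchars=120, surround=40)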
    """

    def __init__(self, maxchars=200, surround=20, charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param surround: The number of extra characters of context to add both
            before the first matched term and after the last matched term.
        :param charlimit: The maximum number of characters of text to examine
            when fragmenting. Text past this limit is ignored.
        """

        self.maxchars = maxchars
        self.surround = surround
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        maxchars = self.maxchars
        surround = self.surround
        charlimit = self.charlimit

        # startchar of the first token in the fragment
        first = None
        # Stack of startchars
        firsts = deque()
        # Each time we see a matched token, we reset the countdown to finishing
        # the fragment. This also indicates whether we're currently inside a
        # fragment (< 0 not in fragment, >= 0 in fragment)
        countdown = -1
        # Tokens in current fragment
        tks = []
        endchar = None
        # Number of chars in the current fragment
        currentlen = 0

        for t in tokens:
            startchar = t.startchar
            endchar = t.endchar
            tlength = endchar - startchar
            if charlimit and endchar > charlimit:
                break

            if countdown < 0 and not t.matched:
                # We're not in a fragment currently, so just maintain the
                # "charsbefore" buffer
                firsts.append(startchar)
                while firsts and endchar - firsts[0] > surround:
                    firsts.popleft()
            elif currentlen + tlength > maxchars:
                # We're in a fragment, but adding this token would put us past
                # the maximum size. Zero the countdown so the code below will
                # cause the fragment to be emitted
                countdown = 0
            elif t.matched:
                # Start/restart the countdown
                countdown = surround
                # Remember the first char of this fragment
                if first is None:
                    if firsts:
                        first = firsts[0]
                    else:
                        first = startchar
                        # Add on unused front context
                        countdown += surround
                tks.append(t.copy())

            # If we're in a fragment...
            if countdown >= 0:
                # Update the counts
                currentlen += tlength
                countdown -= tlength

                # If the countdown is expired
                if countdown <= 0:
                    # Finish the fragment
                    yield mkfrag(text, tks, startchar=first, endchar=endchar)
                    # Reset the counts
                    tks = []
                    firsts = deque()
                    first = None
                    currentlen = 0

        # If there's a fragment left over at the end, yield it
        if tks:
            yield mkfrag(text, tks, startchar=first, endchar=endchar)


class PinpointFragmenter(Fragmenter):
    """This is a NON-RETOKENIZING fragmenter. It builds fragments from the
    positions of the matched terms.
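
    When used through :class:`Highlighter`, this fragmenter can work directly
    from character data stored in the index (see
    :meth:`Highlighter.can_load_chars`) instead of re-analyzing the stored
    text. For example (a minimal sketch; ``results`` is assumed to come from a
    search with ``terms=True`` on a field that stores character data)::

        results.fragmenter = PinpointFragmenter(autotrim=True)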
    """

    def __init__(self, maxchars=200, surround=20, autotrim=False,
                 charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param surround: The number of extra characters of context to add both
            before the first matched term and after the last matched term.
        :param autotrim: automatically trims text before the first space and
            after the last space in the fragments, to try to avoid truncated
            words at the start and end. For short fragments or fragments with
            long runs between spaces this may give strange results.
        :param charlimit: The maximum number of characters of text to examine
            when fragmenting. Text past this limit is ignored.
        """

        self.maxchars = maxchars
        self.surround = surround
        self.autotrim = autotrim
        self.charlimit = charlimit

    def must_retokenize(self):
        return False

    def fragment_tokens(self, text, tokens):
        matched = [t for t in tokens if t.matched]
        return self.fragment_matches(text, matched)

    @staticmethod
    def _autotrim(fragment):
        text = fragment.text
        startchar = fragment.startchar
        endchar = fragment.endchar

        firstspace = text.find(" ", startchar, endchar)
        if firstspace > 0:
            startchar = firstspace + 1
        lastspace = text.rfind(" ", startchar, endchar)
        if lastspace > 0:
            endchar = lastspace

        if fragment.matches:
            startchar = min(startchar, fragment.matches[0].startchar)
            endchar = max(endchar, fragment.matches[-1].endchar)

        fragment.startchar = startchar
        fragment.endchar = endchar

    def fragment_matches(self, text, tokens):
        maxchars = self.maxchars
        surround = self.surround
        autotrim = self.autotrim
        charlimit = self.charlimit

        j = -1

        for i, t in enumerate(tokens):
            if j >= i:
                continue
            j = i
            left = t.startchar
            right = t.endchar
            if charlimit and right > charlimit:
                break

            currentlen = right - left
            while j < len(tokens) - 1 and currentlen < maxchars:
                next = tokens[j + 1]
                ec = next.endchar
                if ec - right <= surround and ec - left <= maxchars:
                    j += 1
                    right = ec
                    currentlen += (ec - next.startchar)
                else:
                    break

            left = max(0, left - surround)
            right = min(len(text), right + surround)
            fragment = Fragment(text, tokens[i:j + 1], left, right)
            if autotrim:
                self._autotrim(fragment)
            yield fragment


# Fragment scorers

class FragmentScorer(object):
    pass


class BasicFragmentScorer(FragmentScorer):
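    """Scores a fragment by adding up the boosts of its matched terms, then
    strongly favoring fragments that match several distinct terms.

    A scorer can be any callable that takes a :class:`Fragment` and returns a
    number, so you can plug in your own. For example, a minimal sketch of a
    scorer that also gives a small bonus to longer fragments::

        class LengthBoostScorer(FragmentScorer):
            def __call__(self, f):
                return sum(t.boost for t in f.matches) + len(f) * 0.1

        hl = Highlighter(scorer=LengthBoostScorer())
    """
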
    def __call__(self, f):
        # Add up the boosts for the matched terms in this passage
        score = sum(t.boost for t in f.matches)

        # Favor diversity: multiply score by the number of separate
        # terms matched
        score *= (len(f.matched_terms) * 100) or 1

        return score


# Fragment sorters

def SCORE(fragment):
    "Sorts higher-scored passages first."
    return 1


def FIRST(fragment):
    "Sorts passages from earlier in the document first."
    return fragment.startchar


def LONGER(fragment):
    "Sorts longer passages first."
    return 0 - len(fragment)


def SHORTER(fragment):
    "Sorts shorter passages first."
    return len(fragment)
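
# These are key functions used to sort the selected fragments: pass one as the
# ``order`` argument to ``top_fragments()`` or ``Highlighter(order=...)``; for
# example, ``Highlighter(order=LONGER)`` shows the longest fragments first.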


# Formatters

def get_text(original, token, replace):
    """Convenience function for getting the text to use for a match when
    formatting.

    If ``replace`` is False, returns the part of ``original`` between
    ``token.startchar`` and ``token.endchar``. If ``replace`` is True, returns
    ``token.text``.
    """

    if replace:
        return token.text
    else:
        return original[token.startchar:token.endchar]


class Formatter(object):
    """Base class for formatters.

    For highlighters that return strings, it is usually only necessary to
    override :meth:`Formatter.format_token`.

    Use the :func:`get_text` function as a convenience to get the token text::

        class MyFormatter(Formatter):
            def format_token(self, text, token, replace=False):
                ttext = get_text(text, token, replace)
                return "[%s]" % ttext
    """

    between = "..."

    def _text(self, text):
        return text

    def format_token(self, text, token, replace=False):
        """Returns a formatted version of the given "token" object, which
        should have at least ``startchar`` and ``endchar`` attributes, and
        a ``text`` attribute if ``replace`` is True.

        :param text: the original fragment text being highlighted.
        :param token: an object having ``startchar`` and ``endchar`` attributes
            and optionally a ``text`` attribute (if ``replace`` is True).
        :param replace: if True, the original text between the token's
            ``startchar`` and ``endchar`` indices will be replaced with the
            value of the token's ``text`` attribute.
        """

        raise NotImplementedError

    def format_fragment(self, fragment, replace=False):
        """Returns a formatted version of the given text, using the "token"
        objects in the given :class:`Fragment`.

        :param fragment: a :class:`Fragment` object representing a list of
            matches in the text.
        :param replace: if True, the original text corresponding to each
            match will be replaced with the value of the token object's
            ``text`` attribute.
        """

        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
                output.append(self._text(text[index:t.startchar]))
            output.append(self.format_token(text, t, replace))
            index = t.endchar
        output.append(self._text(text[index:fragment.endchar]))

        out_string = "".join(output)
        return out_string

    def format(self, fragments, replace=False):
        """Returns a formatted version of the given text, using a list of
        :class:`Fragment` objects.
        """

        formatted = [self.format_fragment(f, replace=replace)
                     for f in fragments]
        return self.between.join(formatted)

    def __call__(self, text, fragments):
        # For backwards compatibility
        return self.format(fragments)


class NullFormatter(Formatter):
    """Formatter that does not modify the string.
    """

    def format_token(self, text, token, replace=False):
        return get_text(text, token, replace)


class UppercaseFormatter(Formatter):
    """Returns a string in which the matched terms are in UPPERCASE.
    """

    def __init__(self, between="..."):
        """
        :param between: the text to add between fragments.
        """

        self.between = between

    def format_token(self, text, token, replace=False):
        ttxt = get_text(text, token, replace)
        return ttxt.upper()


class HtmlFormatter(Formatter):
    """Returns a string containing HTML formatting around the matched terms.

    This formatter wraps matched terms in an HTML element with two class names.
    The first class name (set with the constructor argument ``classname``) is
    the same for each match. The second class name (set with the constructor
    argument ``termclass``) is different depending on which term matched. This
    allows you to give different formatting (for example, different background
    colors) to the different terms in the excerpt.

    >>> hf = HtmlFormatter(tagname="span", classname="match", termclass="term")
    >>> hf(mytext, myfragments)
    "The <span class="match term0">template</span> <span class="match term1">geometry</span> is..."

    This object maintains a dictionary mapping terms to HTML class names (e.g.
    ``term0`` and ``term1`` above), so that multiple excerpts will use the same
    class for the same term. If you want to re-use the same HtmlFormatter
    object with different searches, you should call HtmlFormatter.clean()
    between searches to clear the mapping.
    """

    template = '<%(tag)s class=%(q)s%(cls)s%(tn)s%(q)s>%(t)s</%(tag)s>'

    def __init__(self, tagname="strong", between="...",
                 classname="match", termclass="term", maxclasses=5,
                 attrquote='"'):
        """
        :param tagname: the tag to wrap around matching terms.
        :param between: the text to add between fragments.
        :param classname: the class name to add to the elements wrapped around
            matching terms.
        :param termclass: the class name prefix for the second class which is
            different for each matched term.
        :param maxclasses: the maximum number of term classes to produce. This
            limits the number of classes you have to define in CSS by recycling
            term class names. For example, if you set maxclasses to 3 and have
            5 terms, the 5 terms will use the CSS classes ``term0``, ``term1``,
            ``term2``, ``term0``, ``term1``.
        """

        self.between = between
        self.tagname = tagname
        self.classname = classname
        self.termclass = termclass
        self.attrquote = attrquote
        self.maxclasses = maxclasses
        self.seen = {}
        self.htmlclass = " ".join((self.classname, self.termclass))

    def _text(self, text):
        return htmlescape(text, quote=False)

    def format_token(self, text, token, replace=False):
        seen = self.seen
        ttext = self._text(get_text(text, token, replace))
        if ttext in seen:
            termnum = seen[ttext]
        else:
            termnum = len(seen) % self.maxclasses
            seen[ttext] = termnum

        return self.template % {"tag": self.tagname, "q": self.attrquote,
                                "cls": self.htmlclass, "t": ttext,
                                "tn": termnum}

    def clean(self):
        """Clears the dictionary mapping terms to HTML classnames.
        """
        self.seen = {}


class GenshiFormatter(Formatter):
    """Returns a Genshi event stream containing HTML formatting around the
    matched terms.
    """

    def __init__(self, qname="strong", between="..."):
        """
        :param qname: the QName for the tag to wrap around matched terms.
        :param between: the text to add between fragments.
        """

        self.qname = qname
        self.between = between

        from genshi.core import START, END, TEXT  # @UnresolvedImport
        from genshi.core import Attrs, Stream  # @UnresolvedImport
        self.START, self.END, self.TEXT = START, END, TEXT
        self.Attrs, self.Stream = Attrs, Stream

    def _add_text(self, text, output):
        if output and output[-1][0] == self.TEXT:
            output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])
        else:
            output.append((self.TEXT, text, (None, -1, -1)))

    def format_token(self, text, token, replace=False):
        qn = self.qname
        txt = get_text(text, token, replace)
        return self.Stream([(self.START, (qn, self.Attrs()), (None, -1, -1)),
                            (self.TEXT, txt, (None, -1, -1)),
                            (self.END, qn, (None, -1, -1))])

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar > index:
                self._add_text(text[index:t.startchar], output)
            output.extend(self.format_token(text, t, replace))
            index = t.endchar
        if index < len(text):
            self._add_text(text[index:], output)
        return self.Stream(output)

    def format(self, fragments, replace=False):
        output = []
        first = True
        for fragment in fragments:
            if not first:
                self._add_text(self.between, output)
            output += self.format_fragment(fragment, replace=replace)
            first = False
        return self.Stream(output)


# Highlighting

def top_fragments(fragments, count, scorer, order, minscore=1):
    scored_fragments = ((scorer(f), f) for f in fragments)
    scored_fragments = nlargest(count, scored_fragments)
    best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
    best_fragments.sort(key=order)
    return best_fragments


def highlight(text, terms, analyzer, fragmenter, formatter, top=3,
              scorer=None, minscore=1, order=FIRST, mode="query"):
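    """Convenience function that analyzes ``text``, fragments it around the
    given ``terms``, and returns the formatted output in one call. For
    example, a minimal sketch (the text, terms, and analyzer here are purely
    illustrative)::

        from whoosh.analysis import StandardAnalyzer

        excerpt = highlight("The quick brown fox jumped over the lazy dog",
                            ["fox", "dog"], StandardAnalyzer(),
                            ContextFragmenter(), UppercaseFormatter())
    """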

    if scorer is None:
        scorer = BasicFragmentScorer()

    if type(fragmenter) is type:
        fragmenter = fragmenter()
    if type(formatter) is type:
        formatter = formatter()
    if type(scorer) is type:
        scorer = scorer()

    termset = frozenset(terms)
    tokens = analyzer(text, chars=True, mode=mode, removestops=False)
    tokens = set_matched_filter(tokens, termset)
    fragments = fragmenter.fragment_tokens(text, tokens)
    fragments = top_fragments(fragments, top, scorer, order, minscore)
    return formatter(text, fragments)


class Highlighter(object):
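    """Reusable object that ties together a fragmenter, scorer, formatter, and
    order function to produce highlighted excerpts for search hits.

    For example, a minimal sketch (``mysearcher`` and ``myquery`` are
    placeholders for your own searcher and query, and ``"content"`` for the
    field to highlight)::

        hl = Highlighter(fragmenter=SentenceFragmenter(),
                         formatter=UppercaseFormatter())
        results = mysearcher.search(myquery, terms=True)
        for hit in results:
            print(hl.highlight_hit(hit, "content"))
    """
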
    def __init__(self, fragmenter=None, scorer=None, formatter=None,
                 always_retokenize=False, order=FIRST):
        self.fragmenter = fragmenter or ContextFragmenter()
        self.scorer = scorer or BasicFragmentScorer()
        self.formatter = formatter or HtmlFormatter(tagname="b")
        self.order = order
        self.always_retokenize = always_retokenize

    def can_load_chars(self, results, fieldname):
        # Is it possible to build a mapping between the matched terms/docs and
        # their start and end chars for "pinpoint" highlighting (i.e. without
        # re-tokenizing the text)?

        if self.always_retokenize:
            # No, we've been configured to always retokenize the text
            return False
        if not results.has_matched_terms():
            # No, we don't know what the matched terms are yet
            return False
        if self.fragmenter.must_retokenize():
            # No, the configured fragmenter doesn't support it
            return False

        # Maybe, if the field was configured to store characters
        field = results.searcher.schema[fieldname]
        return field.supports("characters")

    @staticmethod
    def _load_chars(results, fieldname, texts, to_bytes):
        # For each docnum, create a mapping of text -> [(startchar, endchar)]
        # for the matched terms

        results._char_cache[fieldname] = cache = {}
        sorted_ids = sorted(docnum for _, docnum in results.top_n)

        for docnum in sorted_ids:
            cache[docnum] = {}

        for text in texts:
            btext = to_bytes(text)
            m = results.searcher.postings(fieldname, btext)
            docset = set(results.termdocs[(fieldname, btext)])
            for docnum in sorted_ids:
                if docnum in docset:
                    m.skip_to(docnum)
                    assert m.id() == docnum
                    cache[docnum][text] = m.value_as("characters")

    @staticmethod
    def _merge_matched_tokens(tokens):
        # Merges consecutive matched tokens together, so they are highlighted
        # as one

        token = None

        for t in tokens:
            if not t.matched:
                if token is not None:
                    yield token
                    token = None
                yield t
                continue

            if token is None:
                token = t.copy()
            elif t.startchar <= token.endchar:
                if t.endchar > token.endchar:
                    token.text += t.text[token.endchar - t.endchar:]
                    token.endchar = t.endchar
            else:
                yield token
                token = None
                # t was not merged, also has to be yielded
                yield t

        if token is not None:
            yield token

    def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
        results = hitobj.results
        schema = results.searcher.schema
        field = schema[fieldname]
        to_bytes = field.to_bytes
        from_bytes = field.from_bytes

        if text is None:
            if fieldname not in hitobj:
                raise KeyError("Field %r is not stored." % fieldname)
            text = hitobj[fieldname]

        # Get the terms searched for/matched in this field
        if results.has_matched_terms():
            bterms = (term for term in results.matched_terms()
                      if term[0] == fieldname)
        else:
            bterms = results.query_terms(expand=True, fieldname=fieldname)
        # Convert bytes to unicode
        words = frozenset(from_bytes(term[1]) for term in bterms)

        # If we can do "pinpoint" highlighting...
        if self.can_load_chars(results, fieldname):
            # Build the docnum->[(startchar, endchar),] map
            if fieldname not in results._char_cache:
                self._load_chars(results, fieldname, words, to_bytes)

            hitterms = (from_bytes(term[1]) for term in hitobj.matched_terms()
                        if term[0] == fieldname)

            # Grab the word->[(startchar, endchar)] map for this docnum
            cmap = results._char_cache[fieldname][hitobj.docnum]
            # A list of Token objects for matched words
            tokens = []
            charlimit = self.fragmenter.charlimit
            for word in hitterms:
                chars = cmap[word]
                for pos, startchar, endchar in chars:
                    if charlimit and endchar > charlimit:
                        break
                    tokens.append(Token(text=word, pos=pos,
                                        startchar=startchar, endchar=endchar))
            tokens.sort(key=lambda t: t.startchar)
            tokens = [max(group, key=lambda t: t.endchar - t.startchar)
                      for key, group in groupby(tokens, lambda t: t.startchar)]
            fragments = self.fragmenter.fragment_matches(text, tokens)
        else:
            # Retokenize the text
            analyzer = results.searcher.schema[fieldname].analyzer
            tokens = analyzer(text, positions=True, chars=True, mode="index",
                              removestops=False)
            # Set Token.matched attribute for tokens that match a query term
            tokens = set_matched_filter(tokens, words)
            tokens = self._merge_matched_tokens(tokens)
            fragments = self.fragmenter.fragment_tokens(text, tokens)

        fragments = top_fragments(fragments, top, self.scorer, self.order,
                                  minscore=minscore)
        output = self.formatter.format(fragments)
        return output
