1# Copyright 2007 Matt Chaput. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are met:
5#
6#    1. Redistributions of source code must retain the above copyright notice,
7#       this list of conditions and the following disclaimer.
8#
9#    2. Redistributions in binary form must reproduce the above copyright
10#       notice, this list of conditions and the following disclaimer in the
11#       documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23#
24# The views and conclusions contained in the software and documentation are
25# those of the authors and should not be interpreted as representing official
26# policies, either expressed or implied, of Matt Chaput.
27
28"""
29This module contains helper functions for correcting typos in user queries.
30"""
31
32from bisect import bisect_left
33from heapq import heappush, heapreplace
34
35from whoosh import highlight
36from whoosh.compat import iteritems, xrange
37
38
39# Corrector objects
40
41class Corrector(object):
42    """
43    Base class for spelling correction objects. Concrete sub-classes should
44    implement the ``_suggestions`` method.
45    """
46
47    def suggest(self, text, limit=5, maxdist=2, prefix=0):
48        """
49        :param text: the text to check. This word will **not** be added to the
50            suggestions, even if it appears in the word graph.
51        :param limit: only return up to this many suggestions. If there are not
52            enough terms in the field within ``maxdist`` of the given word, the
53            returned list will be shorter than this number.
54        :param maxdist: the largest edit distance from the given word to look
55            at. Values higher than 2 are not very effective or efficient.
56        :param prefix: require suggestions to share a prefix of this length
57            with the given word. This is often justifiable since most
58            misspellings do not involve the first letter of the word. Using a
59            prefix dramatically decreases the time it takes to generate the
60            list of words.
61        """
62
63        _suggestions = self._suggestions
64
65        heap = []
66        for item in _suggestions(text, maxdist, prefix):
67            # Note that the *higher* scores (item[0]) are better!
68            if len(heap) < limit:
69                heappush(heap, item)
70            elif item > heap[0]:
71                heapreplace(heap, item)
72
73        sugs = sorted(heap, key=lambda x: (0 - x[0], x[1]))
74        return [sug for _, sug in sugs]
75
76    def _suggestions(self, text, maxdist, prefix):
77        """
78        Low-level method that yields a series of (score, "suggestion")
79        tuples.
80
81        :param text: the text to check.
82        :param maxdist: the maximum edit distance.
83        :param prefix: require suggestions to share a prefix of this length
84            with the given word.
85        """
86
87        raise NotImplementedError
88
89
90class ReaderCorrector(Corrector):
91    """
92    Suggests corrections based on the content of a field in a reader.
93
94    Ranks suggestions by the edit distance, then by highest to lowest
95    frequency.
96    """
97
98    def __init__(self, reader, fieldname, fieldobj):
99        self.reader = reader
100        self.fieldname = fieldname
101        self.fieldobj = fieldobj
102
103    def _suggestions(self, text, maxdist, prefix):
104        reader = self.reader
105        freq = reader.frequency
106
107        fieldname = self.fieldname
108        fieldobj = reader.schema[fieldname]
109        sugfield = fieldobj.spelling_fieldname(fieldname)
110
111        for sug in reader.terms_within(sugfield, text, maxdist, prefix=prefix):
112            # Higher scores are better, so negate the distance and frequency
113            f = freq(fieldname, sug) or 1
114            score = 0 - (maxdist + (1.0 / f * 0.5))
115            yield (score, sug)
116
117
118class ListCorrector(Corrector):
119    """
120    Suggests corrections based on the content of a sorted list of strings.
121    """
122
123    def __init__(self, wordlist):
124        self.wordlist = wordlist
125
126    def _suggestions(self, text, maxdist, prefix):
127        from whoosh.automata.lev import levenshtein_automaton
128        from whoosh.automata.fsa import find_all_matches
129
130        seen = set()
131        for i in xrange(1, maxdist + 1):
132            dfa = levenshtein_automaton(text, maxdist, prefix).to_dfa()
133            sk = self.Skipper(self.wordlist)
134            for sug in find_all_matches(dfa, sk):
135                if sug not in seen:
136                    seen.add(sug)
137                    yield (0 - maxdist), sug
138
139    class Skipper(object):
140        def __init__(self, data):
141            self.data = data
142            self.i = 0
143
144        def __call__(self, w):
145            if self.data[self.i] == w:
146                return w
147            self.i += 1
148            pos = bisect_left(self.data, w, self.i)
149            if pos < len(self.data):
150                return self.data[pos]
151            else:
152                return None
153
154
155class MultiCorrector(Corrector):
156    """
157    Merges suggestions from a list of sub-correctors.
158    """
159
160    def __init__(self, correctors, op):
161        self.correctors = correctors
162        self.op = op
163
164    def _suggestions(self, text, maxdist, prefix):
165        op = self.op
166        seen = {}
167        for corr in self.correctors:
168            for score, sug in corr._suggestions(text, maxdist, prefix):
169                if sug in seen:
170                    seen[sug] = op(seen[sug], score)
171                else:
172                    seen[sug] = score
173        return iteritems(seen)
174
175
176# Query correction
177
178class Correction(object):
179    """
180    Represents the corrected version of a user query string. Has the
181    following attributes:
182
183    ``query``
184        The corrected :class:`whoosh.query.Query` object.
185    ``string``
186        The corrected user query string.
187    ``original_query``
188        The original :class:`whoosh.query.Query` object that was corrected.
189    ``original_string``
190        The original user query string.
191    ``tokens``
192        A list of token objects representing the corrected words.
193
194    You can also use the :meth:`Correction.format_string` method to reformat the
195    corrected query string using a :class:`whoosh.highlight.Formatter` class.
196    For example, to display the corrected query string as HTML with the
197    changed words emphasized::
198
199        from whoosh import highlight
200
201        correction = mysearcher.correct_query(q, qstring)
202
203        hf = highlight.HtmlFormatter(classname="change")
204        html = correction.format_string(hf)
205    """
206
207    def __init__(self, q, qstring, corr_q, tokens):
208        self.original_query = q
209        self.query = corr_q
210        self.original_string = qstring
211        self.tokens = tokens
212
213        if self.original_string:
214            self.string = self.format_string(highlight.NullFormatter())
215        else:
216            self.string = ''
217
218    def __repr__(self):
219        return "%s(%r, %r)" % (self.__class__.__name__, self.query,
220                               self.string)
221
222    def format_string(self, formatter):
223        """
224        Highlights the corrected words in the original query string using the
225        given :class:`~whoosh.highlight.Formatter`.
226
227        :param formatter: A :class:`whoosh.highlight.Formatter` instance.
228        :return: the output of the formatter (usually a string).
229        """
230
231        if not self.original_string:
232            return ''
233        if isinstance(formatter, type):
234            formatter = formatter()
235
236        fragment = highlight.Fragment(self.original_string, self.tokens)
237        return formatter.format_fragment(fragment, replace=True)
238
239
240# QueryCorrector objects
241
242class QueryCorrector(object):
243    """
244    Base class for objects that correct words in a user query.
245    """
246
247    def __init__(self, fieldname):
248        self.fieldname = fieldname
249
250    def correct_query(self, q, qstring):
251        """
252        Returns a :class:`Correction` object representing the corrected
253        form of the given query.
254
255        :param q: the original :class:`whoosh.query.Query` tree to be
256            corrected.
257        :param qstring: the original user query. This may be None if the
258            original query string is not available, in which case the
259            ``Correction.string`` attribute will also be None.
260        :rtype: :class:`Correction`
261        """
262
263        raise NotImplementedError
264
265    def field(self):
266        return self.fieldname
267
268
269class SimpleQueryCorrector(QueryCorrector):
270    """
271    A simple query corrector based on a mapping of field names to
272    :class:`Corrector` objects, and a list of ``("fieldname", "text")`` tuples
273    to correct. And terms in the query that appear in list of term tuples are
274    corrected using the appropriate corrector.
275    """
276
277    def __init__(self, correctors, terms, aliases=None, prefix=0, maxdist=2):
278        """
279        :param correctors: a dictionary mapping field names to
280            :class:`Corrector` objects.
281        :param terms: a sequence of ``("fieldname", "text")`` tuples
282            representing terms to be corrected.
283        :param aliases: a dictionary mapping field names in the query to
284            field names for spelling suggestions.
285        :param prefix: suggested replacement words must share this number of
286            initial characters with the original word. Increasing this even to
287            just ``1`` can dramatically speed up suggestions, and may be
288            justifiable since spellling mistakes rarely involve the first
289            letter of a word.
290        :param maxdist: the maximum number of "edits" (insertions, deletions,
291            subsitutions, or transpositions of letters) allowed between the
292            original word and any suggestion. Values higher than ``2`` may be
293            slow.
294        """
295
296        self.correctors = correctors
297        self.aliases = aliases or {}
298        self.termset = frozenset(terms)
299        self.prefix = prefix
300        self.maxdist = maxdist
301
302    def correct_query(self, q, qstring):
303        correctors = self.correctors
304        aliases = self.aliases
305        termset = self.termset
306        prefix = self.prefix
307        maxdist = self.maxdist
308
309        # A list of tokens that were changed by a corrector
310        corrected_tokens = []
311
312        # The corrected query tree. We don't need to deepcopy the original
313        # because we use Query.replace() to find-and-replace the corrected
314        # words and it returns a copy of the query tree.
315        corrected_q = q
316
317        # For every word in the original query...
318        # Note we can't put these in a set, because we must preserve WHERE
319        # in the query each token occured so we can format them later
320        for token in q.all_tokens():
321            fname = token.fieldname
322            aname = aliases.get(fname, fname)
323
324            # If this is one of the words we're supposed to correct...
325            if (fname, token.text) in termset:
326                c = correctors[aname]
327                sugs = c.suggest(token.text, prefix=prefix, maxdist=maxdist)
328                if sugs:
329                    # This is a "simple" corrector, so we just pick the first
330                    # suggestion :/
331                    sug = sugs[0]
332
333                    # Return a new copy of the original query with this word
334                    # replaced by the correction
335                    corrected_q = corrected_q.replace(token.fieldname,
336                                                      token.text, sug)
337                    # Add the token to the list of corrected tokens (for the
338                    # formatter to use later)
339                    token.original = token.text
340                    token.text = sug
341                    corrected_tokens.append(token)
342
343        return Correction(q, qstring, corrected_q, corrected_tokens)
344