1# Copyright 2007 Matt Chaput. All rights reserved. 2# 3# Redistribution and use in source and binary forms, with or without 4# modification, are permitted provided that the following conditions are met: 5# 6# 1. Redistributions of source code must retain the above copyright notice, 7# this list of conditions and the following disclaimer. 8# 9# 2. Redistributions in binary form must reproduce the above copyright 10# notice, this list of conditions and the following disclaimer in the 11# documentation and/or other materials provided with the distribution. 12# 13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23# 24# The views and conclusions contained in the software and documentation are 25# those of the authors and should not be interpreted as representing official 26# policies, either expressed or implied, of Matt Chaput. 27 28""" 29This module contains helper functions for correcting typos in user queries. 30""" 31 32from bisect import bisect_left 33from heapq import heappush, heapreplace 34 35from whoosh import highlight 36from whoosh.compat import iteritems, xrange 37 38 39# Corrector objects 40 41class Corrector(object): 42 """ 43 Base class for spelling correction objects. Concrete sub-classes should 44 implement the ``_suggestions`` method. 45 """ 46 47 def suggest(self, text, limit=5, maxdist=2, prefix=0): 48 """ 49 :param text: the text to check. This word will **not** be added to the 50 suggestions, even if it appears in the word graph. 51 :param limit: only return up to this many suggestions. If there are not 52 enough terms in the field within ``maxdist`` of the given word, the 53 returned list will be shorter than this number. 54 :param maxdist: the largest edit distance from the given word to look 55 at. Values higher than 2 are not very effective or efficient. 56 :param prefix: require suggestions to share a prefix of this length 57 with the given word. This is often justifiable since most 58 misspellings do not involve the first letter of the word. Using a 59 prefix dramatically decreases the time it takes to generate the 60 list of words. 61 """ 62 63 _suggestions = self._suggestions 64 65 heap = [] 66 for item in _suggestions(text, maxdist, prefix): 67 # Note that the *higher* scores (item[0]) are better! 68 if len(heap) < limit: 69 heappush(heap, item) 70 elif item > heap[0]: 71 heapreplace(heap, item) 72 73 sugs = sorted(heap, key=lambda x: (0 - x[0], x[1])) 74 return [sug for _, sug in sugs] 75 76 def _suggestions(self, text, maxdist, prefix): 77 """ 78 Low-level method that yields a series of (score, "suggestion") 79 tuples. 80 81 :param text: the text to check. 82 :param maxdist: the maximum edit distance. 83 :param prefix: require suggestions to share a prefix of this length 84 with the given word. 85 """ 86 87 raise NotImplementedError 88 89 90class ReaderCorrector(Corrector): 91 """ 92 Suggests corrections based on the content of a field in a reader. 93 94 Ranks suggestions by the edit distance, then by highest to lowest 95 frequency. 96 """ 97 98 def __init__(self, reader, fieldname, fieldobj): 99 self.reader = reader 100 self.fieldname = fieldname 101 self.fieldobj = fieldobj 102 103 def _suggestions(self, text, maxdist, prefix): 104 reader = self.reader 105 freq = reader.frequency 106 107 fieldname = self.fieldname 108 fieldobj = reader.schema[fieldname] 109 sugfield = fieldobj.spelling_fieldname(fieldname) 110 111 for sug in reader.terms_within(sugfield, text, maxdist, prefix=prefix): 112 # Higher scores are better, so negate the distance and frequency 113 f = freq(fieldname, sug) or 1 114 score = 0 - (maxdist + (1.0 / f * 0.5)) 115 yield (score, sug) 116 117 118class ListCorrector(Corrector): 119 """ 120 Suggests corrections based on the content of a sorted list of strings. 121 """ 122 123 def __init__(self, wordlist): 124 self.wordlist = wordlist 125 126 def _suggestions(self, text, maxdist, prefix): 127 from whoosh.automata.lev import levenshtein_automaton 128 from whoosh.automata.fsa import find_all_matches 129 130 seen = set() 131 for i in xrange(1, maxdist + 1): 132 dfa = levenshtein_automaton(text, maxdist, prefix).to_dfa() 133 sk = self.Skipper(self.wordlist) 134 for sug in find_all_matches(dfa, sk): 135 if sug not in seen: 136 seen.add(sug) 137 yield (0 - maxdist), sug 138 139 class Skipper(object): 140 def __init__(self, data): 141 self.data = data 142 self.i = 0 143 144 def __call__(self, w): 145 if self.data[self.i] == w: 146 return w 147 self.i += 1 148 pos = bisect_left(self.data, w, self.i) 149 if pos < len(self.data): 150 return self.data[pos] 151 else: 152 return None 153 154 155class MultiCorrector(Corrector): 156 """ 157 Merges suggestions from a list of sub-correctors. 158 """ 159 160 def __init__(self, correctors, op): 161 self.correctors = correctors 162 self.op = op 163 164 def _suggestions(self, text, maxdist, prefix): 165 op = self.op 166 seen = {} 167 for corr in self.correctors: 168 for score, sug in corr._suggestions(text, maxdist, prefix): 169 if sug in seen: 170 seen[sug] = op(seen[sug], score) 171 else: 172 seen[sug] = score 173 return iteritems(seen) 174 175 176# Query correction 177 178class Correction(object): 179 """ 180 Represents the corrected version of a user query string. Has the 181 following attributes: 182 183 ``query`` 184 The corrected :class:`whoosh.query.Query` object. 185 ``string`` 186 The corrected user query string. 187 ``original_query`` 188 The original :class:`whoosh.query.Query` object that was corrected. 189 ``original_string`` 190 The original user query string. 191 ``tokens`` 192 A list of token objects representing the corrected words. 193 194 You can also use the :meth:`Correction.format_string` method to reformat the 195 corrected query string using a :class:`whoosh.highlight.Formatter` class. 196 For example, to display the corrected query string as HTML with the 197 changed words emphasized:: 198 199 from whoosh import highlight 200 201 correction = mysearcher.correct_query(q, qstring) 202 203 hf = highlight.HtmlFormatter(classname="change") 204 html = correction.format_string(hf) 205 """ 206 207 def __init__(self, q, qstring, corr_q, tokens): 208 self.original_query = q 209 self.query = corr_q 210 self.original_string = qstring 211 self.tokens = tokens 212 213 if self.original_string: 214 self.string = self.format_string(highlight.NullFormatter()) 215 else: 216 self.string = '' 217 218 def __repr__(self): 219 return "%s(%r, %r)" % (self.__class__.__name__, self.query, 220 self.string) 221 222 def format_string(self, formatter): 223 """ 224 Highlights the corrected words in the original query string using the 225 given :class:`~whoosh.highlight.Formatter`. 226 227 :param formatter: A :class:`whoosh.highlight.Formatter` instance. 228 :return: the output of the formatter (usually a string). 229 """ 230 231 if not self.original_string: 232 return '' 233 if isinstance(formatter, type): 234 formatter = formatter() 235 236 fragment = highlight.Fragment(self.original_string, self.tokens) 237 return formatter.format_fragment(fragment, replace=True) 238 239 240# QueryCorrector objects 241 242class QueryCorrector(object): 243 """ 244 Base class for objects that correct words in a user query. 245 """ 246 247 def __init__(self, fieldname): 248 self.fieldname = fieldname 249 250 def correct_query(self, q, qstring): 251 """ 252 Returns a :class:`Correction` object representing the corrected 253 form of the given query. 254 255 :param q: the original :class:`whoosh.query.Query` tree to be 256 corrected. 257 :param qstring: the original user query. This may be None if the 258 original query string is not available, in which case the 259 ``Correction.string`` attribute will also be None. 260 :rtype: :class:`Correction` 261 """ 262 263 raise NotImplementedError 264 265 def field(self): 266 return self.fieldname 267 268 269class SimpleQueryCorrector(QueryCorrector): 270 """ 271 A simple query corrector based on a mapping of field names to 272 :class:`Corrector` objects, and a list of ``("fieldname", "text")`` tuples 273 to correct. And terms in the query that appear in list of term tuples are 274 corrected using the appropriate corrector. 275 """ 276 277 def __init__(self, correctors, terms, aliases=None, prefix=0, maxdist=2): 278 """ 279 :param correctors: a dictionary mapping field names to 280 :class:`Corrector` objects. 281 :param terms: a sequence of ``("fieldname", "text")`` tuples 282 representing terms to be corrected. 283 :param aliases: a dictionary mapping field names in the query to 284 field names for spelling suggestions. 285 :param prefix: suggested replacement words must share this number of 286 initial characters with the original word. Increasing this even to 287 just ``1`` can dramatically speed up suggestions, and may be 288 justifiable since spellling mistakes rarely involve the first 289 letter of a word. 290 :param maxdist: the maximum number of "edits" (insertions, deletions, 291 subsitutions, or transpositions of letters) allowed between the 292 original word and any suggestion. Values higher than ``2`` may be 293 slow. 294 """ 295 296 self.correctors = correctors 297 self.aliases = aliases or {} 298 self.termset = frozenset(terms) 299 self.prefix = prefix 300 self.maxdist = maxdist 301 302 def correct_query(self, q, qstring): 303 correctors = self.correctors 304 aliases = self.aliases 305 termset = self.termset 306 prefix = self.prefix 307 maxdist = self.maxdist 308 309 # A list of tokens that were changed by a corrector 310 corrected_tokens = [] 311 312 # The corrected query tree. We don't need to deepcopy the original 313 # because we use Query.replace() to find-and-replace the corrected 314 # words and it returns a copy of the query tree. 315 corrected_q = q 316 317 # For every word in the original query... 318 # Note we can't put these in a set, because we must preserve WHERE 319 # in the query each token occured so we can format them later 320 for token in q.all_tokens(): 321 fname = token.fieldname 322 aname = aliases.get(fname, fname) 323 324 # If this is one of the words we're supposed to correct... 325 if (fname, token.text) in termset: 326 c = correctors[aname] 327 sugs = c.suggest(token.text, prefix=prefix, maxdist=maxdist) 328 if sugs: 329 # This is a "simple" corrector, so we just pick the first 330 # suggestion :/ 331 sug = sugs[0] 332 333 # Return a new copy of the original query with this word 334 # replaced by the correction 335 corrected_q = corrected_q.replace(token.fieldname, 336 token.text, sug) 337 # Add the token to the list of corrected tokens (for the 338 # formatter to use later) 339 token.original = token.text 340 token.text = sug 341 corrected_tokens.append(token) 342 343 return Correction(q, qstring, corrected_q, corrected_tokens) 344