1 /** @file flint_spelling.cc
2 * @brief Spelling correction data for a flint database.
3 */
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <config.h>
22
23 #include <xapian/error.h>
24 #include <xapian/types.h>
25
26 #include "expandweight.h"
27 #include "flint_spelling.h"
28 #include "flint_utils.h"
29 #include "omassert.h"
30 #include "ortermlist.h"
31
32 #include "../prefix_compressed_strings.h"
33
34 #include <algorithm>
35 #include <map>
36 #include <queue>
37 #include <vector>
38 #include <set>
39 #include <string>
40
41 using namespace std;
42
43 void
merge_changes()44 FlintSpellingTable::merge_changes()
45 {
46 map<F_fragment, set<string> >::const_iterator i;
47 for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
48 string key = i->first;
49 const set<string> & changes = i->second;
50
51 set<string>::const_iterator d = changes.begin();
52 if (d == changes.end()) continue;
53
54 string updated;
55 string current;
56 PrefixCompressedStringWriter out(updated);
57 if (get_exact_entry(key, current)) {
58 PrefixCompressedStringItor in(current);
59 updated.reserve(current.size()); // FIXME plus some?
60 while (!in.at_end() && d != changes.end()) {
61 const string & word = *in;
62 Assert(d != changes.end());
63 int cmp = word.compare(*d);
64 if (cmp < 0) {
65 out.append(word);
66 ++in;
67 } else if (cmp > 0) {
68 out.append(*d);
69 ++d;
70 } else {
71 // If an existing entry is in the changes list, that means
72 // we should remove it.
73 ++in;
74 ++d;
75 }
76 }
77 if (!in.at_end()) {
78 // FIXME : easy to optimise this to a fix-up and substring copy.
79 while (!in.at_end()) {
80 out.append(*in++);
81 }
82 }
83 }
84 while (d != changes.end()) {
85 out.append(*d++);
86 }
87 if (!updated.empty()) {
88 add(key, updated);
89 } else {
90 del(key);
91 }
92 }
93 termlist_deltas.clear();
94
95 map<string, Xapian::termcount>::const_iterator j;
96 for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
97 string key = "W" + j->first;
98 if (j->second) {
99 add(key, F_pack_uint_last(j->second));
100 } else {
101 del(key);
102 }
103 }
104 wordfreq_changes.clear();
105 }
106
107 void
toggle_fragment(F_fragment frag,const string & word)108 FlintSpellingTable::toggle_fragment(F_fragment frag, const string & word)
109 {
110 map<F_fragment, set<string> >::iterator i = termlist_deltas.find(frag);
111 if (i == termlist_deltas.end()) {
112 i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
113 }
114 // The commonest case is that we're adding lots of words, so try insert
115 // first and if that reports that the word already exists, remove it.
116 pair<set<string>::iterator, bool> res = i->second.insert(word);
117 if (!res.second) {
118 // word is already in the set, so remove it.
119 i->second.erase(res.first);
120 }
121 }
122
123 void
add_word(const string & word,Xapian::termcount freqinc)124 FlintSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
125 {
126 if (word.size() <= 1) return;
127
128 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
129 if (i != wordfreq_changes.end()) {
130 // Word "word" already exists and has been modified.
131 if (i->second) {
132 i->second += freqinc;
133 return;
134 }
135 // If "word" is currently modified such that it no longer exists, so
136 // we need to execute the code below to re-add trigrams for it.
137 i->second = freqinc;
138 } else {
139 string key = "W" + word;
140 string data;
141 if (get_exact_entry(key, data)) {
142 // Word "word" already exists, so increment its count.
143 Xapian::termcount freq;
144 const char * p = data.data();
145 if (!F_unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
146 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
147 }
148 wordfreq_changes[word] = freq + freqinc;
149 return;
150 }
151 wordfreq_changes[word] = freqinc;
152 }
153
154 // Add trigrams for word.
155 toggle_word(word);
156 }
157
158 void
remove_word(const string & word,Xapian::termcount freqdec)159 FlintSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
160 {
161 if (word.size() <= 1) return;
162
163 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
164 if (i != wordfreq_changes.end()) {
165 if (i->second == 0) {
166 // Word has already been deleted.
167 return;
168 }
169 // Word "word" exists and has been modified.
170 if (freqdec < i->second) {
171 i->second -= freqdec;
172 return;
173 }
174
175 // Mark word as deleted.
176 i->second = 0;
177 } else {
178 string key = "W" + word;
179 string data;
180 if (!get_exact_entry(key, data)) {
181 // This word doesn't exist.
182 return;
183 }
184
185 Xapian::termcount freq;
186 const char *p = data.data();
187 if (!F_unpack_uint_last(&p, p + data.size(), &freq)) {
188 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
189 }
190 if (freqdec < freq) {
191 wordfreq_changes[word] = freq - freqdec;
192 return;
193 }
194 // Mark word as deleted.
195 wordfreq_changes[word] = 0;
196 }
197
198 // Remove trigrams for word.
199 toggle_word(word);
200 }
201
202 void
toggle_word(const string & word)203 FlintSpellingTable::toggle_word(const string & word)
204 {
205 F_fragment buf;
206 // Head:
207 buf[0] = 'H';
208 buf[1] = word[0];
209 buf[2] = word[1];
210 buf[3] = '\0';
211 toggle_fragment(buf, word);
212
213 // Tail:
214 buf[0] = 'T';
215 buf[1] = word[word.size() - 2];
216 buf[2] = word[word.size() - 1];
217 buf[3] = '\0';
218 toggle_fragment(buf, word);
219
220 if (word.size() <= 4) {
221 // We also generate 'bookends' for two, three, and four character
222 // terms so we can handle transposition of the middle two characters
223 // of a four character word, substitution or deletion of the middle
224 // character of a three character word, or insertion in the middle of a
225 // two character word.
226 // 'Bookends':
227 buf[0] = 'B';
228 buf[1] = word[0];
229 buf[3] = '\0';
230 toggle_fragment(buf, word);
231 }
232 if (word.size() > 2) {
233 set<F_fragment> done;
234 // Middles:
235 buf[0] = 'M';
236 for (size_t start = 0; start <= word.size() - 3; ++start) {
237 memcpy(buf.data + 1, word.data() + start, 3);
238 // Don't toggle the same F_fragment twice or it will cancel out.
239 // Bug fixed in 1.2.6.
240 if (done.insert(buf).second)
241 toggle_fragment(buf, word);
242 }
243 }
244 }
245
246 struct TermListGreaterApproxSize {
operator ()TermListGreaterApproxSize247 bool operator()(const TermList *a, const TermList *b) {
248 return a->get_approx_size() > b->get_approx_size();
249 }
250 };
251
252 TermList *
open_termlist(const string & word)253 FlintSpellingTable::open_termlist(const string & word)
254 {
255 // This should have been handled by Database::get_spelling_suggestion().
256 AssertRel(word.size(),>,1);
257
258 // Merge any pending changes to disk, but don't call commit() so they
259 // won't be switched live.
260 if (!wordfreq_changes.empty()) merge_changes();
261
262 // Build a priority queue of TermList objects which returns those of
263 // greatest approximate size first.
264 priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
265 try {
266 string data;
267 F_fragment buf;
268
269 // Head:
270 buf[0] = 'H';
271 buf[1] = word[0];
272 buf[2] = word[1];
273 if (get_exact_entry(string(buf), data))
274 pq.push(new FlintSpellingTermList(data));
275
276 // Tail:
277 buf[0] = 'T';
278 buf[1] = word[word.size() - 2];
279 buf[2] = word[word.size() - 1];
280 if (get_exact_entry(string(buf), data))
281 pq.push(new FlintSpellingTermList(data));
282
283 if (word.size() <= 4) {
284 // We also generate 'bookends' for two, three, and four character
285 // terms so we can handle transposition of the middle two
286 // characters of a four character word, substitution or deletion of
287 // the middle character of a three character word, or insertion in
288 // the middle of a two character word.
289 buf[0] = 'B';
290 buf[1] = word[0];
291 buf[3] = '\0';
292 if (get_exact_entry(string(buf), data))
293 pq.push(new FlintSpellingTermList(data));
294 }
295 if (word.size() > 2) {
296 // Middles:
297 buf[0] = 'M';
298 for (size_t start = 0; start <= word.size() - 3; ++start) {
299 memcpy(buf.data + 1, word.data() + start, 3);
300 if (get_exact_entry(string(buf), data))
301 pq.push(new FlintSpellingTermList(data));
302 }
303
304 if (word.size() == 3) {
305 // For three letter words, we generate the two "single
306 // transposition" forms too, so that we can produce good
307 // spelling suggestions.
308 // ABC -> BAC
309 buf[1] = word[1];
310 buf[2] = word[0];
311 if (get_exact_entry(string(buf), data))
312 pq.push(new FlintSpellingTermList(data));
313 // ABC -> ACB
314 buf[1] = word[0];
315 buf[2] = word[2];
316 buf[3] = word[1];
317 if (get_exact_entry(string(buf), data))
318 pq.push(new FlintSpellingTermList(data));
319 }
320 } else {
321 Assert(word.size() == 2);
322 // For two letter words, we generate H and T terms for the
323 // transposed form so that we can produce good spelling
324 // suggestions.
325 // AB -> BA
326 buf[0] = 'H';
327 buf[1] = word[1];
328 buf[2] = word[0];
329 if (get_exact_entry(string(buf), data))
330 pq.push(new FlintSpellingTermList(data));
331 buf[0] = 'T';
332 if (get_exact_entry(string(buf), data))
333 pq.push(new FlintSpellingTermList(data));
334 }
335
336 if (pq.empty()) return NULL;
337
338 // Build up an OrTermList tree by combine leaves and/or branches in
339 // pairs. The tree is balanced by the approximated sizes of the leaf
340 // FlintSpellingTermList objects - the way the tree is built are very
341 // similar to how an optimal Huffman code is often constructed.
342 //
343 // Balancing the tree like this should tend to minimise the amount of
344 // work done.
345 while (pq.size() > 1) {
346 // Build the tree such that left is always >= right so that
347 // OrTermList can rely on this when trying to minimise work.
348 TermList * termlist = pq.top();
349 pq.pop();
350
351 termlist = new OrTermList(pq.top(), termlist);
352 pq.pop();
353 pq.push(termlist);
354 }
355
356 return pq.top();
357 } catch (...) {
358 // Make sure we delete all the TermList objects to avoid leaking
359 // memory.
360 while (!pq.empty()) {
361 delete pq.top();
362 pq.pop();
363 }
364 throw;
365 }
366 }
367
368 Xapian::doccount
get_word_frequency(const string & word) const369 FlintSpellingTable::get_word_frequency(const string & word) const
370 {
371 map<string, Xapian::termcount>::const_iterator i;
372 i = wordfreq_changes.find(word);
373 if (i != wordfreq_changes.end()) {
374 // Modified frequency for word:
375 return i->second;
376 }
377
378 string key = "W" + word;
379 string data;
380 if (get_exact_entry(key, data)) {
381 // Word "word" already exists.
382 Xapian::termcount freq;
383 const char *p = data.data();
384 if (!F_unpack_uint_last(&p, p + data.size(), &freq)) {
385 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
386 }
387 return freq;
388 }
389
390 return 0;
391 }
392
393 ///////////////////////////////////////////////////////////////////////////
394
395 Xapian::termcount
get_approx_size() const396 FlintSpellingTermList::get_approx_size() const
397 {
398 // This is only used to decide how to build a OR-tree of TermList objects
399 // so we just need to return "sizes" which are ordered roughly correctly.
400 return data.size();
401 }
402
403 std::string
get_termname() const404 FlintSpellingTermList::get_termname() const
405 {
406 return current_term;
407 }
408
409 Xapian::termcount
get_wdf() const410 FlintSpellingTermList::get_wdf() const
411 {
412 return 1;
413 }
414
415 Xapian::doccount
get_termfreq() const416 FlintSpellingTermList::get_termfreq() const
417 {
418 return 1;
419 }
420
421 Xapian::termcount
get_collection_freq() const422 FlintSpellingTermList::get_collection_freq() const
423 {
424 return 1;
425 }
426
427 TermList *
next()428 FlintSpellingTermList::next()
429 {
430 if (p == data.size()) {
431 p = 0;
432 data.resize(0);
433 return NULL;
434 }
435 if (!current_term.empty()) {
436 if (p == data.size())
437 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
438 current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
439 }
440 unsigned add;
441 if (p == data.size() ||
442 (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
443 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
444 current_term.append(data.data() + p + 1, add);
445 p += add + 1;
446 return NULL;
447 }
448
449 TermList *
skip_to(const string & term)450 FlintSpellingTermList::skip_to(const string & term)
451 {
452 while (!data.empty() && current_term < term) {
453 (void)FlintSpellingTermList::next();
454 }
455 return NULL;
456 }
457
458 bool
at_end() const459 FlintSpellingTermList::at_end() const
460 {
461 return data.empty();
462 }
463
464 Xapian::termcount
positionlist_count() const465 FlintSpellingTermList::positionlist_count() const
466 {
467 throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_count() not implemented");
468 }
469
470 Xapian::PositionIterator
positionlist_begin() const471 FlintSpellingTermList::positionlist_begin() const
472 {
473 throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_begin() not implemented");
474 }
475