1 /** @file flint_spelling.cc
2  * @brief Spelling correction data for a flint database.
3  */
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/error.h>
24 #include <xapian/types.h>
25 
26 #include "expandweight.h"
27 #include "flint_spelling.h"
28 #include "flint_utils.h"
29 #include "omassert.h"
30 #include "ortermlist.h"
31 
32 #include "../prefix_compressed_strings.h"
33 
34 #include <algorithm>
35 #include <map>
36 #include <queue>
37 #include <vector>
38 #include <set>
39 #include <string>
40 
41 using namespace std;
42 
43 void
merge_changes()44 FlintSpellingTable::merge_changes()
45 {
46     map<F_fragment, set<string> >::const_iterator i;
47     for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
48 	string key = i->first;
49 	const set<string> & changes = i->second;
50 
51 	set<string>::const_iterator d = changes.begin();
52 	if (d == changes.end()) continue;
53 
54 	string updated;
55 	string current;
56 	PrefixCompressedStringWriter out(updated);
57 	if (get_exact_entry(key, current)) {
58 	    PrefixCompressedStringItor in(current);
59 	    updated.reserve(current.size()); // FIXME plus some?
60 	    while (!in.at_end() && d != changes.end()) {
61 		const string & word = *in;
62 		Assert(d != changes.end());
63 		int cmp = word.compare(*d);
64 		if (cmp < 0) {
65 		    out.append(word);
66 		    ++in;
67 		} else if (cmp > 0) {
68 		    out.append(*d);
69 		    ++d;
70 		} else {
71 		    // If an existing entry is in the changes list, that means
72 		    // we should remove it.
73 		    ++in;
74 		    ++d;
75 		}
76 	    }
77 	    if (!in.at_end()) {
78 		// FIXME : easy to optimise this to a fix-up and substring copy.
79 		while (!in.at_end()) {
80 		    out.append(*in++);
81 		}
82 	    }
83 	}
84 	while (d != changes.end()) {
85 	    out.append(*d++);
86 	}
87 	if (!updated.empty()) {
88 	    add(key, updated);
89 	} else {
90 	    del(key);
91 	}
92     }
93     termlist_deltas.clear();
94 
95     map<string, Xapian::termcount>::const_iterator j;
96     for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
97 	string key = "W" + j->first;
98 	if (j->second) {
99 	    add(key, F_pack_uint_last(j->second));
100 	} else {
101 	    del(key);
102 	}
103     }
104     wordfreq_changes.clear();
105 }
106 
107 void
toggle_fragment(F_fragment frag,const string & word)108 FlintSpellingTable::toggle_fragment(F_fragment frag, const string & word)
109 {
110     map<F_fragment, set<string> >::iterator i = termlist_deltas.find(frag);
111     if (i == termlist_deltas.end()) {
112 	i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
113     }
114     // The commonest case is that we're adding lots of words, so try insert
115     // first and if that reports that the word already exists, remove it.
116     pair<set<string>::iterator, bool> res = i->second.insert(word);
117     if (!res.second) {
118 	// word is already in the set, so remove it.
119 	i->second.erase(res.first);
120     }
121 }
122 
123 void
add_word(const string & word,Xapian::termcount freqinc)124 FlintSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
125 {
126     if (word.size() <= 1) return;
127 
128     map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
129     if (i != wordfreq_changes.end()) {
130 	// Word "word" already exists and has been modified.
131 	if (i->second) {
132 	    i->second += freqinc;
133 	    return;
134 	}
135 	// If "word" is currently modified such that it no longer exists, so
136 	// we need to execute the code below to re-add trigrams for it.
137 	i->second = freqinc;
138     } else {
139 	string key = "W" + word;
140 	string data;
141 	if (get_exact_entry(key, data)) {
142 	    // Word "word" already exists, so increment its count.
143 	    Xapian::termcount freq;
144 	    const char * p = data.data();
145 	    if (!F_unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
146 		throw Xapian::DatabaseCorruptError("Bad spelling word freq");
147 	    }
148 	    wordfreq_changes[word] = freq + freqinc;
149 	    return;
150 	}
151 	wordfreq_changes[word] = freqinc;
152     }
153 
154     // Add trigrams for word.
155     toggle_word(word);
156 }
157 
158 void
remove_word(const string & word,Xapian::termcount freqdec)159 FlintSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
160 {
161     if (word.size() <= 1) return;
162 
163     map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
164     if (i != wordfreq_changes.end()) {
165 	if (i->second == 0) {
166 	    // Word has already been deleted.
167 	    return;
168 	}
169 	// Word "word" exists and has been modified.
170 	if (freqdec < i->second) {
171 	    i->second -= freqdec;
172 	    return;
173 	}
174 
175 	// Mark word as deleted.
176 	i->second = 0;
177     } else {
178 	string key = "W" + word;
179 	string data;
180 	if (!get_exact_entry(key, data)) {
181 	    // This word doesn't exist.
182 	    return;
183 	}
184 
185 	Xapian::termcount freq;
186 	const char *p = data.data();
187 	if (!F_unpack_uint_last(&p, p + data.size(), &freq)) {
188 	    throw Xapian::DatabaseCorruptError("Bad spelling word freq");
189 	}
190 	if (freqdec < freq) {
191 	    wordfreq_changes[word] = freq - freqdec;
192 	    return;
193 	}
194 	// Mark word as deleted.
195 	wordfreq_changes[word] = 0;
196     }
197 
198     // Remove trigrams for word.
199     toggle_word(word);
200 }
201 
202 void
toggle_word(const string & word)203 FlintSpellingTable::toggle_word(const string & word)
204 {
205     F_fragment buf;
206     // Head:
207     buf[0] = 'H';
208     buf[1] = word[0];
209     buf[2] = word[1];
210     buf[3] = '\0';
211     toggle_fragment(buf, word);
212 
213     // Tail:
214     buf[0] = 'T';
215     buf[1] = word[word.size() - 2];
216     buf[2] = word[word.size() - 1];
217     buf[3] = '\0';
218     toggle_fragment(buf, word);
219 
220     if (word.size() <= 4) {
221 	// We also generate 'bookends' for two, three, and four character
222 	// terms so we can handle transposition of the middle two characters
223 	// of a four character word, substitution or deletion of the middle
224 	// character of a three character word, or insertion in the middle of a
225 	// two character word.
226 	// 'Bookends':
227 	buf[0] = 'B';
228 	buf[1] = word[0];
229 	buf[3] = '\0';
230 	toggle_fragment(buf, word);
231     }
232     if (word.size() > 2) {
233 	set<F_fragment> done;
234 	// Middles:
235 	buf[0] = 'M';
236 	for (size_t start = 0; start <= word.size() - 3; ++start) {
237 	    memcpy(buf.data + 1, word.data() + start, 3);
238 	    // Don't toggle the same F_fragment twice or it will cancel out.
239 	    // Bug fixed in 1.2.6.
240 	    if (done.insert(buf).second)
241 		toggle_fragment(buf, word);
242 	}
243     }
244 }
245 
246 struct TermListGreaterApproxSize {
operator ()TermListGreaterApproxSize247     bool operator()(const TermList *a, const TermList *b) {
248 	return a->get_approx_size() > b->get_approx_size();
249     }
250 };
251 
252 TermList *
open_termlist(const string & word)253 FlintSpellingTable::open_termlist(const string & word)
254 {
255     // This should have been handled by Database::get_spelling_suggestion().
256     AssertRel(word.size(),>,1);
257 
258     // Merge any pending changes to disk, but don't call commit() so they
259     // won't be switched live.
260     if (!wordfreq_changes.empty()) merge_changes();
261 
262     // Build a priority queue of TermList objects which returns those of
263     // greatest approximate size first.
264     priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
265     try {
266 	string data;
267 	F_fragment buf;
268 
269 	// Head:
270 	buf[0] = 'H';
271 	buf[1] = word[0];
272 	buf[2] = word[1];
273 	if (get_exact_entry(string(buf), data))
274 	    pq.push(new FlintSpellingTermList(data));
275 
276 	// Tail:
277 	buf[0] = 'T';
278 	buf[1] = word[word.size() - 2];
279 	buf[2] = word[word.size() - 1];
280 	if (get_exact_entry(string(buf), data))
281 	    pq.push(new FlintSpellingTermList(data));
282 
283 	if (word.size() <= 4) {
284 	    // We also generate 'bookends' for two, three, and four character
285 	    // terms so we can handle transposition of the middle two
286 	    // characters of a four character word, substitution or deletion of
287 	    // the middle character of a three character word, or insertion in
288 	    // the middle of a two character word.
289 	    buf[0] = 'B';
290 	    buf[1] = word[0];
291 	    buf[3] = '\0';
292 	    if (get_exact_entry(string(buf), data))
293 		pq.push(new FlintSpellingTermList(data));
294 	}
295 	if (word.size() > 2) {
296 	    // Middles:
297 	    buf[0] = 'M';
298 	    for (size_t start = 0; start <= word.size() - 3; ++start) {
299 		memcpy(buf.data + 1, word.data() + start, 3);
300 		if (get_exact_entry(string(buf), data))
301 		    pq.push(new FlintSpellingTermList(data));
302 	    }
303 
304 	    if (word.size() == 3) {
305 		// For three letter words, we generate the two "single
306 		// transposition" forms too, so that we can produce good
307 		// spelling suggestions.
308 		// ABC -> BAC
309 		buf[1] = word[1];
310 		buf[2] = word[0];
311 		if (get_exact_entry(string(buf), data))
312 		    pq.push(new FlintSpellingTermList(data));
313 		// ABC -> ACB
314 		buf[1] = word[0];
315 		buf[2] = word[2];
316 		buf[3] = word[1];
317 		if (get_exact_entry(string(buf), data))
318 		    pq.push(new FlintSpellingTermList(data));
319 	    }
320 	} else {
321 	    Assert(word.size() == 2);
322 	    // For two letter words, we generate H and T terms for the
323 	    // transposed form so that we can produce good spelling
324 	    // suggestions.
325 	    // AB -> BA
326 	    buf[0] = 'H';
327 	    buf[1] = word[1];
328 	    buf[2] = word[0];
329 	    if (get_exact_entry(string(buf), data))
330 		pq.push(new FlintSpellingTermList(data));
331 	    buf[0] = 'T';
332 	    if (get_exact_entry(string(buf), data))
333 		pq.push(new FlintSpellingTermList(data));
334 	}
335 
336 	if (pq.empty()) return NULL;
337 
338 	// Build up an OrTermList tree by combine leaves and/or branches in
339 	// pairs.  The tree is balanced by the approximated sizes of the leaf
340 	// FlintSpellingTermList objects - the way the tree is built are very
341 	// similar to how an optimal Huffman code is often constructed.
342 	//
343 	// Balancing the tree like this should tend to minimise the amount of
344 	// work done.
345 	while (pq.size() > 1) {
346 	    // Build the tree such that left is always >= right so that
347 	    // OrTermList can rely on this when trying to minimise work.
348 	    TermList * termlist = pq.top();
349 	    pq.pop();
350 
351 	    termlist = new OrTermList(pq.top(), termlist);
352 	    pq.pop();
353 	    pq.push(termlist);
354 	}
355 
356 	return pq.top();
357     } catch (...) {
358 	// Make sure we delete all the TermList objects to avoid leaking
359 	// memory.
360 	while (!pq.empty()) {
361 	    delete pq.top();
362 	    pq.pop();
363 	}
364 	throw;
365     }
366 }
367 
368 Xapian::doccount
get_word_frequency(const string & word) const369 FlintSpellingTable::get_word_frequency(const string & word) const
370 {
371     map<string, Xapian::termcount>::const_iterator i;
372     i = wordfreq_changes.find(word);
373     if (i != wordfreq_changes.end()) {
374 	// Modified frequency for word:
375 	return i->second;
376     }
377 
378     string key = "W" + word;
379     string data;
380     if (get_exact_entry(key, data)) {
381 	// Word "word" already exists.
382 	Xapian::termcount freq;
383 	const char *p = data.data();
384 	if (!F_unpack_uint_last(&p, p + data.size(), &freq)) {
385 	    throw Xapian::DatabaseCorruptError("Bad spelling word freq");
386 	}
387 	return freq;
388     }
389 
390     return 0;
391 }
392 
393 ///////////////////////////////////////////////////////////////////////////
394 
395 Xapian::termcount
get_approx_size() const396 FlintSpellingTermList::get_approx_size() const
397 {
398     // This is only used to decide how to build a OR-tree of TermList objects
399     // so we just need to return "sizes" which are ordered roughly correctly.
400     return data.size();
401 }
402 
403 std::string
get_termname() const404 FlintSpellingTermList::get_termname() const
405 {
406     return current_term;
407 }
408 
409 Xapian::termcount
get_wdf() const410 FlintSpellingTermList::get_wdf() const
411 {
412     return 1;
413 }
414 
415 Xapian::doccount
get_termfreq() const416 FlintSpellingTermList::get_termfreq() const
417 {
418     return 1;
419 }
420 
421 Xapian::termcount
get_collection_freq() const422 FlintSpellingTermList::get_collection_freq() const
423 {
424     return 1;
425 }
426 
427 TermList *
next()428 FlintSpellingTermList::next()
429 {
430     if (p == data.size()) {
431 	p = 0;
432 	data.resize(0);
433 	return NULL;
434     }
435     if (!current_term.empty()) {
436 	if (p == data.size())
437 	    throw Xapian::DatabaseCorruptError("Bad spelling termlist");
438 	current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
439     }
440     unsigned add;
441     if (p == data.size() ||
442 	(add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
443 	throw Xapian::DatabaseCorruptError("Bad spelling termlist");
444     current_term.append(data.data() + p + 1, add);
445     p += add + 1;
446     return NULL;
447 }
448 
449 TermList *
skip_to(const string & term)450 FlintSpellingTermList::skip_to(const string & term)
451 {
452     while (!data.empty() && current_term < term) {
453 	(void)FlintSpellingTermList::next();
454     }
455     return NULL;
456 }
457 
458 bool
at_end() const459 FlintSpellingTermList::at_end() const
460 {
461     return data.empty();
462 }
463 
464 Xapian::termcount
positionlist_count() const465 FlintSpellingTermList::positionlist_count() const
466 {
467     throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_count() not implemented");
468 }
469 
470 Xapian::PositionIterator
positionlist_begin() const471 FlintSpellingTermList::positionlist_begin() const
472 {
473     throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_begin() not implemented");
474 }
475