1 /* omdatabase.cc: External interface for running queries
2  *
3  * Copyright 1999,2000,2001 BrightStation PLC
4  * Copyright 2001,2002 Ananova Ltd
5  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2016 Olly Betts
6  * Copyright 2006,2008 Lemur Consulting Ltd
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
21  * USA
22  */
23 
24 #include <config.h>
25 
26 #include "autoptr.h"
27 
28 #include <xapian/constants.h>
29 #include <xapian/error.h>
30 #include <xapian/positioniterator.h>
31 #include <xapian/postingiterator.h>
32 #include <xapian/termiterator.h>
33 #include <xapian/unicode.h>
34 
35 #include "omassert.h"
36 #include "debuglog.h"
37 #include "backends/alltermslist.h"
38 #include "backends/multi/multi_alltermslist.h"
39 #include "backends/multi/multi_postlist.h"
40 #include "backends/multi/multi_termlist.h"
41 #include "backends/multivaluelist.h"
42 #include "backends/database.h"
43 #include "editdistance.h"
44 #include "expand/ortermlist.h"
45 #include "internaltypes.h"
46 #include "noreturn.h"
47 #include "pack.h"
48 
49 #include <algorithm>
50 #include <cstdlib> // For abs().
51 #include <cstring>
52 #include <vector>
53 
54 using namespace std;
55 using Xapian::Internal::intrusive_ptr;
56 
57 XAPIAN_NORETURN(static void docid_zero_invalid());
docid_zero_invalid()58 static void docid_zero_invalid()
59 {
60     throw Xapian::InvalidArgumentError("Document ID 0 is invalid");
61 }
62 
63 XAPIAN_NORETURN(static void no_subdatabases());
no_subdatabases()64 static void no_subdatabases()
65 {
66     throw Xapian::InvalidOperationError("No subdatabases");
67 }
68 
69 XAPIAN_NORETURN(static void empty_metadata_key());
empty_metadata_key()70 static void empty_metadata_key()
71 {
72     throw Xapian::InvalidArgumentError("Empty metadata keys are invalid");
73 }
74 
75 inline size_t
sub_db(Xapian::docid did,size_t n_dbs)76 sub_db(Xapian::docid did, size_t n_dbs)
77 {
78     return (did - 1) % n_dbs;
79 }
80 
81 inline size_t
sub_docid(Xapian::docid did,size_t n_dbs)82 sub_docid(Xapian::docid did, size_t n_dbs)
83 {
84     return (did - 1) / n_dbs + 1;
85 }
86 
87 namespace Xapian {
88 
// Move construction uses the compiler-generated member-wise implementation.
Database::Database(Database&&) = default;

// Move assignment uses the compiler-generated member-wise implementation.
Database&
Database::operator=(Database&&) = default;
93 
// Default constructor: only emits a debug log entry here, so `internal` is
// left default-constructed.
Database::Database()
{
    LOGCALL_CTOR(API, "Database", NO_ARGS);
}
98 
Database(Database::Internal * internal_)99 Database::Database(Database::Internal *internal_)
100 {
101     LOGCALL_CTOR(API, "Database", internal_);
102     intrusive_ptr<Database::Internal> newi(internal_);
103     internal.push_back(newi);
104 }
105 
// Copy constructor: the new object gets a copy of other's vector of
// sub-database pointers.
//
// The vector is copy-constructed in the member initializer list rather than
// being default-constructed and then assigned to.
Database::Database(const Database &other)
    : internal(other.internal)
{
    LOGCALL_CTOR(API, "Database", other);
}
111 
// Copy assignment: replaces this object's vector of sub-database pointers
// with a copy of other's.  Note that this returns void rather than a
// reference to *this.
void
Database::operator=(const Database &other)
{
    LOGCALL_VOID(API, "Database::operator=", other);
    internal = other.internal;
}
118 
// Destructor: only emits a debug log entry; members clean themselves up.
Database::~Database()
{
    LOGCALL_DTOR(API, "Database");
}
123 
124 bool
reopen()125 Database::reopen()
126 {
127     LOGCALL(API, bool, "Database::reopen", NO_ARGS);
128     bool maybe_changed = false;
129     vector<intrusive_ptr<Database::Internal> >::iterator i;
130     for (i = internal.begin(); i != internal.end(); ++i) {
131 	if ((*i)->reopen())
132 	    maybe_changed = true;
133     }
134     RETURN(maybe_changed);
135 }
136 
137 void
close()138 Database::close()
139 {
140     LOGCALL_VOID(API, "Database::close", NO_ARGS);
141     vector<intrusive_ptr<Database::Internal> >::iterator i;
142     for (i = internal.begin(); i != internal.end(); ++i) {
143 	(*i)->close();
144     }
145 }
146 
147 void
add_database(const Database & database)148 Database::add_database(const Database & database)
149 {
150     LOGCALL_VOID(API, "Database::add_database", database);
151     if (this == &database) {
152 	LOGLINE(API, "Database added to itself");
153 	throw Xapian::InvalidArgumentError("Can't add a Database to itself");
154     }
155     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
156     for (i = database.internal.begin(); i != database.internal.end(); ++i) {
157 	internal.push_back(*i);
158     }
159 }
160 
161 PostingIterator
postlist_begin(const string & tname) const162 Database::postlist_begin(const string &tname) const
163 {
164     LOGCALL(API, PostingIterator, "Database::postlist_begin", tname);
165 
166     // Don't bother checking that the term exists first.  If it does, we
167     // just end up doing more work, and if it doesn't, we save very little
168     // work.
169 
170     // Handle the common case of a single database specially.
171     if (internal.size() == 1)
172 	RETURN(PostingIterator(internal[0]->open_post_list(tname)));
173 
174     if (rare(internal.empty()))
175 	RETURN(PostingIterator());
176 
177     vector<LeafPostList *> pls;
178     try {
179 	vector<intrusive_ptr<Database::Internal> >::const_iterator i;
180 	for (i = internal.begin(); i != internal.end(); ++i) {
181 	    pls.push_back((*i)->open_post_list(tname));
182 	    pls.back()->next();
183 	}
184 	Assert(pls.begin() != pls.end());
185     } catch (...) {
186 	vector<LeafPostList *>::iterator i;
187 	for (i = pls.begin(); i != pls.end(); ++i) {
188 	    delete *i;
189 	    *i = 0;
190 	}
191 	throw;
192     }
193 
194     RETURN(PostingIterator(new MultiPostList(pls, *this)));
195 }
196 
197 TermIterator
termlist_begin(Xapian::docid did) const198 Database::termlist_begin(Xapian::docid did) const
199 {
200     LOGCALL(API, TermIterator, "Database::termlist_begin", did);
201     if (did == 0)
202 	docid_zero_invalid();
203 
204     unsigned int multiplier = internal.size();
205     if (rare(multiplier == 0))
206 	no_subdatabases();
207     TermList *tl;
208     if (multiplier == 1) {
209 	// There's no need for the MultiTermList wrapper in the common case
210 	// where we're only dealing with a single database.
211 	tl = internal[0]->open_term_list(did);
212     } else {
213 	Assert(multiplier != 0);
214 	Xapian::doccount n = (did - 1) % multiplier; // which actual database
215 	Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
216 
217 	tl = new MultiTermList(internal[n]->open_term_list(m), *this, n);
218     }
219     RETURN(TermIterator(tl));
220 }
221 
222 TermIterator
allterms_begin(const std::string & prefix) const223 Database::allterms_begin(const std::string & prefix) const
224 {
225     LOGCALL(API, TermIterator, "Database::allterms_begin", NO_ARGS);
226     TermList * tl;
227     if (rare(internal.size() == 0)) {
228 	tl = NULL;
229     } else if (internal.size() == 1) {
230 	tl = internal[0]->open_allterms(prefix);
231     } else {
232 	tl = new MultiAllTermsList(internal, prefix);
233     }
234     RETURN(TermIterator(tl));
235 }
236 
237 bool
has_positions() const238 Database::has_positions() const
239 {
240     LOGCALL(API, bool, "Database::has_positions", NO_ARGS);
241     // If any sub-database has positions, the combined database does.
242     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
243     for (i = internal.begin(); i != internal.end(); ++i) {
244 	if ((*i)->has_positions()) RETURN(true);
245     }
246     RETURN(false);
247 }
248 
249 PositionIterator
positionlist_begin(Xapian::docid did,const string & tname) const250 Database::positionlist_begin(Xapian::docid did, const string &tname) const
251 {
252     LOGCALL(API, PositionIterator, "Database::positionlist_begin", did | tname);
253     if (tname.empty())
254 	throw InvalidArgumentError("Zero length terms are invalid");
255     if (did == 0)
256 	docid_zero_invalid();
257 
258     unsigned int multiplier = internal.size();
259     if (rare(multiplier == 0))
260 	no_subdatabases();
261     Xapian::doccount n = (did - 1) % multiplier; // which actual database
262     Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
263     RETURN(PositionIterator(internal[n]->open_position_list(m, tname)));
264 }
265 
266 Xapian::doccount
get_doccount() const267 Database::get_doccount() const
268 {
269     LOGCALL(API, Xapian::doccount, "Database::get_doccount", NO_ARGS);
270     Xapian::doccount docs = 0;
271     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
272     for (i = internal.begin(); i != internal.end(); ++i) {
273 	docs += (*i)->get_doccount();
274     }
275     RETURN(docs);
276 }
277 
278 Xapian::docid
get_lastdocid() const279 Database::get_lastdocid() const
280 {
281     LOGCALL(API, Xapian::docid, "Database::get_lastdocid", NO_ARGS);
282     Xapian::docid did = 0;
283 
284     unsigned int multiplier = internal.size();
285     for (Xapian::doccount i = 0; i < multiplier; ++i) {
286 	Xapian::docid did_i = internal[i]->get_lastdocid();
287 	if (did_i) did = std::max(did, (did_i - 1) * multiplier + i + 1);
288     }
289     RETURN(did);
290 }
291 
292 Xapian::doclength
get_avlength() const293 Database::get_avlength() const
294 {
295     LOGCALL(API, Xapian::doclength, "Database::get_avlength", NO_ARGS);
296     Xapian::doccount docs = 0;
297     Xapian::totallength totlen = 0;
298 
299     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
300     for (i = internal.begin(); i != internal.end(); ++i) {
301 	docs += (*i)->get_doccount();
302 	totlen += (*i)->get_total_length();
303     }
304     LOGLINE(UNKNOWN, "get_avlength() = " << totlen << " / " << docs <<
305 	    " (from " << internal.size() << " dbs)");
306 
307     if (docs == 0) RETURN(0.0);
308     RETURN(totlen / double(docs));
309 }
310 
311 Xapian::totallength
get_total_length() const312 Database::get_total_length() const
313 {
314     LOGCALL(API, Xapian::totallength, "Database::get_total_length", NO_ARGS);
315     Xapian::totallength total_length = 0;
316     for (auto&& sub_db : internal) {
317 	total_length += sub_db->get_total_length();
318     }
319     RETURN(total_length);
320 }
321 
322 Xapian::doccount
get_termfreq(const string & tname) const323 Database::get_termfreq(const string & tname) const
324 {
325     LOGCALL(API, Xapian::doccount, "Database::get_termfreq", tname);
326     if (tname.empty()) RETURN(get_doccount());
327 
328     Xapian::doccount tf = 0;
329     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
330     for (i = internal.begin(); i != internal.end(); ++i) {
331 	Xapian::doccount sub_tf;
332 	(*i)->get_freqs(tname, &sub_tf, NULL);
333 	tf += sub_tf;
334     }
335     RETURN(tf);
336 }
337 
338 Xapian::termcount
get_collection_freq(const string & tname) const339 Database::get_collection_freq(const string & tname) const
340 {
341     LOGCALL(API, Xapian::termcount, "Database::get_collection_freq", tname);
342     if (tname.empty()) RETURN(get_doccount());
343 
344     Xapian::termcount cf = 0;
345     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
346     for (i = internal.begin(); i != internal.end(); ++i) {
347 	Xapian::termcount sub_cf;
348 	(*i)->get_freqs(tname, NULL, &sub_cf);
349 	cf += sub_cf;
350     }
351     RETURN(cf);
352 }
353 
354 Xapian::doccount
get_value_freq(Xapian::valueno slot) const355 Database::get_value_freq(Xapian::valueno slot) const
356 {
357     LOGCALL(API, Xapian::doccount, "Database::get_value_freq", slot);
358 
359     Xapian::doccount vf = 0;
360     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
361     for (i = internal.begin(); i != internal.end(); ++i) {
362 	vf += (*i)->get_value_freq(slot);
363     }
364     RETURN(vf);
365 }
366 
367 string
get_value_lower_bound(Xapian::valueno slot) const368 Database::get_value_lower_bound(Xapian::valueno slot) const
369 {
370     LOGCALL(API, string, "Database::get_value_lower_bound", slot);
371 
372     if (rare(internal.empty())) RETURN(string());
373 
374     string full_lb;
375     for (auto&& subdb : internal) {
376 	string lb = subdb->get_value_lower_bound(slot);
377 	if (lb.empty())
378 	    continue;
379 	if (full_lb.empty() || lb < full_lb)
380 	    full_lb = std::move(lb);
381     }
382     RETURN(full_lb);
383 }
384 
385 std::string
get_value_upper_bound(Xapian::valueno slot) const386 Database::get_value_upper_bound(Xapian::valueno slot) const
387 {
388     LOGCALL(API, std::string, "Database::get_value_upper_bound", slot);
389 
390     std::string full_ub;
391     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
392     for (i = internal.begin(); i != internal.end(); ++i) {
393 	std::string ub = (*i)->get_value_upper_bound(slot);
394 	if (ub > full_ub)
395 	    full_ub = ub;
396     }
397     RETURN(full_ub);
398 }
399 
400 Xapian::termcount
get_doclength_lower_bound() const401 Database::get_doclength_lower_bound() const
402 {
403     LOGCALL(API, Xapian::termcount, "Database::get_doclength_lower_bound", NO_ARGS);
404 
405     if (rare(internal.empty())) RETURN(0);
406 
407     Xapian::termcount full_lb = 0;
408     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
409     for (i = internal.begin(); i != internal.end(); ++i) {
410 	// Skip sub-databases which are empty or only contain documents with
411 	// doclen==0.
412 	if ((*i)->get_total_length() != 0) {
413 	    Xapian::termcount lb = (*i)->get_doclength_lower_bound();
414 	    if (full_lb == 0 || lb < full_lb) full_lb = lb;
415 	}
416     }
417     RETURN(full_lb);
418 }
419 
420 Xapian::termcount
get_doclength_upper_bound() const421 Database::get_doclength_upper_bound() const
422 {
423     LOGCALL(API, Xapian::termcount, "Database::get_doclength_upper_bound", NO_ARGS);
424 
425     Xapian::termcount full_ub = 0;
426     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
427     for (i = internal.begin(); i != internal.end(); ++i) {
428 	Xapian::termcount ub = (*i)->get_doclength_upper_bound();
429 	if (ub > full_ub) full_ub = ub;
430     }
431     RETURN(full_ub);
432 }
433 
434 Xapian::termcount
get_wdf_upper_bound(const string & term) const435 Database::get_wdf_upper_bound(const string & term) const
436 {
437     LOGCALL(API, Xapian::termcount, "Database::get_wdf_upper_bound", term);
438     if (term.empty()) RETURN(0);
439 
440     Xapian::termcount full_ub = 0;
441     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
442     for (i = internal.begin(); i != internal.end(); ++i) {
443 	Xapian::termcount ub = (*i)->get_wdf_upper_bound(term);
444 	if (ub > full_ub) full_ub = ub;
445     }
446     RETURN(full_ub);
447 }
448 
449 ValueIterator
valuestream_begin(Xapian::valueno slot) const450 Database::valuestream_begin(Xapian::valueno slot) const
451 {
452     LOGCALL(API, ValueIterator, "Database::valuestream_begin", slot);
453     if (internal.size() == 0)
454 	RETURN(ValueIterator());
455     if (internal.size() != 1)
456 	RETURN(ValueIterator(new MultiValueList(internal, slot)));
457     RETURN(ValueIterator(internal[0]->open_value_list(slot)));
458 }
459 
460 Xapian::termcount
get_doclength(Xapian::docid did) const461 Database::get_doclength(Xapian::docid did) const
462 {
463     LOGCALL(API, Xapian::termcount, "Database::get_doclength", did);
464     if (did == 0)
465 	docid_zero_invalid();
466 
467     unsigned int multiplier = internal.size();
468     if (rare(multiplier == 0))
469 	no_subdatabases();
470     Xapian::doccount n = (did - 1) % multiplier; // which actual database
471     Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
472     RETURN(internal[n]->get_doclength(m));
473 }
474 
475 Xapian::termcount
get_unique_terms(Xapian::docid did) const476 Database::get_unique_terms(Xapian::docid did) const
477 {
478     LOGCALL(API, Xapian::termcount, "Database::get_unique_terms", did);
479     if (did == 0)
480 	docid_zero_invalid();
481     unsigned int multiplier = internal.size();
482     if (rare(multiplier == 0))
483 	no_subdatabases();
484     Xapian::doccount n = (did - 1) % multiplier; // which actual database
485     Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
486     RETURN(internal[n]->get_unique_terms(m));
487 }
488 
489 Document
get_document(Xapian::docid did) const490 Database::get_document(Xapian::docid did) const
491 {
492     LOGCALL(API, Document, "Database::get_document", did);
493     if (did == 0)
494 	docid_zero_invalid();
495 
496     unsigned int multiplier = internal.size();
497     if (rare(multiplier == 0))
498 	no_subdatabases();
499     Xapian::doccount n = (did - 1) % multiplier; // which actual database
500     Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
501 
502     // Open non-lazily so we throw DocNotFoundError if the doc doesn't exist.
503     RETURN(Document(internal[n]->open_document(m, false)));
504 }
505 
506 Document
get_document(Xapian::docid did,unsigned flags) const507 Database::get_document(Xapian::docid did, unsigned flags) const
508 {
509     LOGCALL(API, Document, "Database::get_document", did|flags);
510     if (did == 0)
511 	docid_zero_invalid();
512 
513     unsigned int multiplier = internal.size();
514     if (rare(multiplier == 0))
515 	no_subdatabases();
516     Xapian::doccount n = (did - 1) % multiplier; // which actual database
517     Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
518 
519     bool assume_valid = flags & Xapian::DOC_ASSUME_VALID;
520     RETURN(Document(internal[n]->open_document(m, assume_valid)));
521 }
522 
523 bool
term_exists(const string & tname) const524 Database::term_exists(const string & tname) const
525 {
526     LOGCALL(API, bool, "Database::term_exists", tname);
527     if (tname.empty()) {
528 	RETURN(get_doccount() != 0);
529     }
530     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
531     for (i = internal.begin(); i != internal.end(); ++i) {
532 	if ((*i)->term_exists(tname)) RETURN(true);
533     }
534     RETURN(false);
535 }
536 
537 void
keep_alive()538 Database::keep_alive()
539 {
540     LOGCALL_VOID(API, "Database::keep_alive", NO_ARGS);
541     vector<intrusive_ptr<Database::Internal> >::const_iterator i;
542     for (i = internal.begin(); i != internal.end(); ++i) {
543 	(*i)->keep_alive();
544     }
545 }
546 
// Return a description of this object.  Currently this is always the fixed
// string "Database()".
string
Database::get_description() const
{
    /// @todo display contents of the database
    return "Database()";
}
553 
554 // We sum the character frequency histogram absolute differences to compute a
555 // lower bound on the edit distance.  Rather than counting each Unicode code
556 // point uniquely, we use an array with VEC_SIZE elements and tally code points
557 // modulo VEC_SIZE which can only reduce the bound we calculate.
558 //
// There will be a trade-off between how good the bound is and how large an
// array is used (a larger array takes more time to clear and sum over).  The
561 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite
562 // but that may not reflect real world performance.  FIXME: profile and tune.
563 
#define VEC_SIZE 64

/** Compute a lower bound on the edit distance between two code point
 *  sequences from their character frequency histograms.
 *
 *  Code points are tallied modulo VEC_SIZE: each element of @a a adds one to
 *  its bucket and each element of @a b subtracts one.
 *
 *  @param a  First sequence of code points.
 *  @param b  Second sequence of code points.
 *
 *  @return  Half the sum of the absolute bucket values, rounded up.
 */
static int
freq_edit_lower_bound(const std::vector<unsigned> & a,
		      const std::vector<unsigned> & b)
{
    // Zero-initialised signed tallies, one per bucket.
    int tally[VEC_SIZE] = {};
    for (unsigned ch : a) {
	++tally[ch % VEC_SIZE];
    }
    for (unsigned ch : b) {
	--tally[ch % VEC_SIZE];
    }
    unsigned int total = 0;
    for (int count : tally) {
	total += std::abs(count);
    }
    // Each insertion or deletion adds at most 1 to total.  Each transposition
    // doesn't change it at all.  But each substitution can change it by 2 so
    // we need to divide it by 2.  Rounding up is OK, since the odd change must
    // be due to an actual edit.
    return (total + 1) / 2;
}
588 
589 // Word must have a trigram score at least this close to the best score seen
590 // so far.
591 #define TRIGRAM_SCORE_THRESHOLD 2
592 
593 string
get_spelling_suggestion(const string & word,unsigned max_edit_distance) const594 Database::get_spelling_suggestion(const string &word,
595 				  unsigned max_edit_distance) const
596 {
597     LOGCALL(API, string, "Database::get_spelling_suggestion", word | max_edit_distance);
598     if (word.size() <= 1) return string();
599     AutoPtr<TermList> merger;
600     for (size_t i = 0; i < internal.size(); ++i) {
601 	TermList * tl = internal[i]->open_spelling_termlist(word);
602 	LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl);
603 	if (tl) {
604 	    if (merger.get()) {
605 		merger.reset(new OrTermList(merger.release(), tl));
606 	    } else {
607 		merger.reset(tl);
608 	    }
609 	}
610     }
611     if (!merger.get()) RETURN(string());
612 
613     // Convert word to UTF-32.
614     // Extra brackets needed to avoid this being misparsed as a function
615     // prototype.
616     vector<unsigned> utf32_word((Utf8Iterator(word)), Utf8Iterator());
617 
618     vector<unsigned> utf32_term;
619 
620     Xapian::termcount best = 1;
621     string result;
622     int edist_best = max_edit_distance;
623     Xapian::doccount freq_best = 0;
624     Xapian::doccount freq_exact = 0;
625     while (true) {
626 	TermList *ret = merger->next();
627 	if (ret) merger.reset(ret);
628 
629 	if (merger->at_end()) break;
630 
631 	string term = merger->get_termname();
632 	Xapian::termcount score = merger->get_wdf();
633 
634 	LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score);
635 	if (score + TRIGRAM_SCORE_THRESHOLD >= best) {
636 	    if (score > best) best = score;
637 
638 	    // There's no point considering a word where the difference
639 	    // in length is greater than the smallest number of edits we've
640 	    // found so far.
641 
642 	    // First check the length of the encoded UTF-8 version of term.
643 	    // Each UTF-32 character is 1-4 bytes in UTF-8.
644 	    if (abs(long(term.size()) - long(word.size())) > edist_best * 4) {
645 		LOGLINE(SPELLING, "Lengths much too different");
646 		continue;
647 	    }
648 
649 	    // Now convert to UTF-32, and compare the true lengths more
650 	    // strictly.
651 	    utf32_term.assign(Utf8Iterator(term), Utf8Iterator());
652 
653 	    if (abs(long(utf32_term.size()) - long(utf32_word.size()))
654 		    > edist_best) {
655 		LOGLINE(SPELLING, "Lengths too different");
656 		continue;
657 	    }
658 
659 	    if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) {
660 		LOGLINE(SPELLING, "Rejected by character frequency test");
661 		continue;
662 	    }
663 
664 	    int edist = edit_distance_unsigned(&utf32_term[0],
665 					       int(utf32_term.size()),
666 					       &utf32_word[0],
667 					       int(utf32_word.size()),
668 					       edist_best);
669 	    LOGLINE(SPELLING, "Edit distance " << edist);
670 
671 	    if (edist <= edist_best) {
672 		Xapian::doccount freq = 0;
673 		for (size_t j = 0; j < internal.size(); ++j)
674 		    freq += internal[j]->get_spelling_frequency(term);
675 
676 		LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best);
677 		// Even if we have an exact match, there may be a much more
678 		// frequent potential correction which will still be
679 		// interesting.
680 		if (edist == 0) {
681 		    freq_exact = freq;
682 		    continue;
683 		}
684 
685 		if (edist < edist_best || freq > freq_best) {
686 		    LOGLINE(SPELLING, "Best so far: \"" << term <<
687 				      "\" edist " << edist << " freq " << freq);
688 		    result = term;
689 		    edist_best = edist;
690 		    freq_best = freq;
691 		}
692 	    }
693 	}
694     }
695     if (freq_best < freq_exact)
696 	RETURN(string());
697     RETURN(result);
698 }
699 
700 TermIterator
spellings_begin() const701 Database::spellings_begin() const
702 {
703     LOGCALL(API, TermIterator, "Database::spellings_begin", NO_ARGS);
704     AutoPtr<TermList> merger;
705     for (size_t i = 0; i < internal.size(); ++i) {
706 	TermList * tl = internal[i]->open_spelling_wordlist();
707 	if (tl) {
708 	    if (merger.get()) {
709 		merger.reset(new FreqAdderOrTermList(merger.release(), tl));
710 	    } else {
711 		merger.reset(tl);
712 	    }
713 	}
714     }
715     RETURN(TermIterator(merger.release()));
716 }
717 
718 TermIterator
synonyms_begin(const std::string & term) const719 Database::synonyms_begin(const std::string &term) const
720 {
721     LOGCALL(API, TermIterator, "Database::synonyms_begin", term);
722     AutoPtr<TermList> merger;
723     for (size_t i = 0; i < internal.size(); ++i) {
724 	TermList * tl = internal[i]->open_synonym_termlist(term);
725 	if (tl) {
726 	    if (merger.get()) {
727 		merger.reset(new OrTermList(merger.release(), tl));
728 	    } else {
729 		merger.reset(tl);
730 	    }
731 	}
732     }
733     RETURN(TermIterator(merger.release()));
734 }
735 
736 TermIterator
synonym_keys_begin(const std::string & prefix) const737 Database::synonym_keys_begin(const std::string &prefix) const
738 {
739     LOGCALL(API, TermIterator, "Database::synonym_keys_begin", prefix);
740     AutoPtr<TermList> merger;
741     for (size_t i = 0; i < internal.size(); ++i) {
742 	TermList * tl = internal[i]->open_synonym_keylist(prefix);
743 	if (tl) {
744 	    if (merger.get()) {
745 		merger.reset(new OrTermList(merger.release(), tl));
746 	    } else {
747 		merger.reset(tl);
748 	    }
749 	}
750     }
751     RETURN(TermIterator(merger.release()));
752 }
753 
754 string
get_metadata(const string & key) const755 Database::get_metadata(const string & key) const
756 {
757     LOGCALL(API, string, "Database::get_metadata", key);
758     if (rare(key.empty()))
759 	empty_metadata_key();
760     if (internal.empty()) RETURN(std::string());
761     RETURN(internal[0]->get_metadata(key));
762 }
763 
764 Xapian::TermIterator
metadata_keys_begin(const std::string & prefix) const765 Database::metadata_keys_begin(const std::string &prefix) const
766 {
767     LOGCALL(API, Xapian::TermIterator, "Database::metadata_keys_begin", NO_ARGS);
768     if (internal.empty()) RETURN(TermIterator());
769     RETURN(TermIterator(internal[0]->open_metadata_keylist(prefix)));
770 }
771 
772 std::string
get_uuid() const773 Database::get_uuid() const
774 {
775     LOGCALL(API, std::string, "Database::get_uuid", NO_ARGS);
776     string uuid;
777     for (size_t i = 0; i < internal.size(); ++i) {
778 	string sub_uuid = internal[i]->get_uuid();
779 	// If any of the sub-databases have no uuid, we can't make a uuid for
780 	// the combined database.
781 	if (sub_uuid.empty())
782 	    RETURN(sub_uuid);
783 	if (!uuid.empty()) uuid += ':';
784 	uuid += sub_uuid;
785     }
786     RETURN(uuid);
787 }
788 
789 bool
locked() const790 Database::locked() const
791 {
792     LOGCALL(API, bool, "Database::locked", NO_ARGS);
793     for (const auto & subdb : internal) {
794 	// If any of the sub-databases is locked, return true.
795 	if (subdb->locked())
796 	    RETURN(true);
797     }
798     RETURN(false);
799 }
800 
801 Xapian::rev
get_revision() const802 Database::get_revision() const
803 {
804     LOGCALL(API, Xapian::rev, "Database::get_revision", NO_ARGS);
805     size_t n_dbs = internal.size();
806     if (rare(n_dbs != 1)) {
807 	if (n_dbs == 0)
808 	    return 0;
809 	throw Xapian::InvalidOperationError("Database::get_revision() requires "
810 					    "exactly one subdatabase");
811     }
812     const string& s = internal[0]->get_revision_info();
813     const char* p = s.data();
814     const char* end = p + s.size();
815     Xapian::rev revision;
816     if (!unpack_uint(&p, end, &revision))
817 	throw Xapian::UnimplementedError("Database::get_revision() only "
818 					 "supported for chert and glass");
819     return revision;
820 }
821 
822 ///////////////////////////////////////////////////////////////////////////
823 
// Default constructor: default-constructs the Database base and emits a
// debug log entry.
WritableDatabase::WritableDatabase() : Database()
{
    LOGCALL_CTOR(API, "WritableDatabase", NO_ARGS);
}
828 
// Construct from a backend object by forwarding internal_ to the
// Database(Database::Internal*) constructor.
WritableDatabase::WritableDatabase(Database::Internal *internal_)
	: Database(internal_)
{
    LOGCALL_CTOR(API, "WritableDatabase", internal_);
}
834 
// Copy constructor: forwards to the Database copy constructor.
WritableDatabase::WritableDatabase(const WritableDatabase &other)
	: Database(other)
{
    LOGCALL_CTOR(API, "WritableDatabase", other);
}
840 
// Copy assignment: forwards to Database::operator=.  Note that this returns
// void rather than a reference to *this.
void
WritableDatabase::operator=(const WritableDatabase &other)
{
    LOGCALL_VOID(API, "WritableDatabase::operator=", other);
    Database::operator=(other);
}
847 
// Destructor: only emits a debug log entry.
WritableDatabase::~WritableDatabase()
{
    LOGCALL_DTOR(API, "WritableDatabase");
}
852 
853 void
commit()854 WritableDatabase::commit()
855 {
856     LOGCALL_VOID(API, "WritableDatabase::commit", NO_ARGS);
857     size_t n_dbs = internal.size();
858     if (rare(n_dbs == 0))
859 	no_subdatabases();
860     for (size_t i = 0; i != n_dbs; ++i)
861 	internal[i]->commit();
862 }
863 
864 void
begin_transaction(bool flushed)865 WritableDatabase::begin_transaction(bool flushed)
866 {
867     LOGCALL_VOID(API, "WritableDatabase::begin_transaction", flushed);
868     size_t n_dbs = internal.size();
869     if (rare(n_dbs == 0))
870 	no_subdatabases();
871     for (size_t i = 0; i != n_dbs; ++i)
872 	internal[i]->begin_transaction(flushed);
873 }
874 
875 void
commit_transaction()876 WritableDatabase::commit_transaction()
877 {
878     LOGCALL_VOID(API, "WritableDatabase::commit_transaction", NO_ARGS);
879     size_t n_dbs = internal.size();
880     if (rare(n_dbs == 0))
881 	no_subdatabases();
882     for (size_t i = 0; i != n_dbs; ++i)
883 	internal[i]->commit_transaction();
884 }
885 
886 void
cancel_transaction()887 WritableDatabase::cancel_transaction()
888 {
889     LOGCALL_VOID(API, "WritableDatabase::cancel_transaction", NO_ARGS);
890     size_t n_dbs = internal.size();
891     if (rare(n_dbs == 0))
892 	no_subdatabases();
893     for (size_t i = 0; i != n_dbs; ++i)
894 	internal[i]->cancel_transaction();
895 }
896 
897 
898 Xapian::docid
add_document(const Document & document)899 WritableDatabase::add_document(const Document & document)
900 {
901     LOGCALL(API, Xapian::docid, "WritableDatabase::add_document", document);
902     size_t n_dbs = internal.size();
903     if (rare(n_dbs == 0))
904 	no_subdatabases();
905     if (n_dbs == 1)
906 	RETURN(internal[0]->add_document(document));
907 
908     // Which database will the next never used docid be in?
909     Xapian::docid did = get_lastdocid() + 1;
910     if (rare(did == 0)) {
911 	throw Xapian::DatabaseError("Run out of docids - you'll have to use copydatabase to eliminate any gaps before you can add more documents");
912     }
913     // We want exactly did to be used, not a lower docid if that subdb isn't
914     // using the docid before it, so call replace_document() not
915     // add_document().
916     size_t i = sub_db(did, n_dbs);
917     internal[i]->replace_document(sub_docid(did, n_dbs), document);
918     RETURN(did);
919 }
920 
921 void
delete_document(Xapian::docid did)922 WritableDatabase::delete_document(Xapian::docid did)
923 {
924     LOGCALL_VOID(API, "WritableDatabase::delete_document", did);
925     if (rare(did == 0))
926 	docid_zero_invalid();
927 
928     size_t n_dbs = internal.size();
929     if (rare(n_dbs == 0))
930 	no_subdatabases();
931     size_t i = sub_db(did, n_dbs);
932     internal[i]->delete_document(sub_docid(did, n_dbs));
933 }
934 
935 void
delete_document(const std::string & unique_term)936 WritableDatabase::delete_document(const std::string & unique_term)
937 {
938     LOGCALL_VOID(API, "WritableDatabase::delete_document", unique_term);
939     if (unique_term.empty())
940 	throw InvalidArgumentError("Empty termnames are invalid");
941     size_t n_dbs = internal.size();
942     if (rare(n_dbs == 0))
943 	no_subdatabases();
944     for (size_t i = 0; i != n_dbs; ++i)
945 	internal[i]->delete_document(unique_term);
946 }
947 
948 void
replace_document(Xapian::docid did,const Document & document)949 WritableDatabase::replace_document(Xapian::docid did, const Document & document)
950 {
951     LOGCALL_VOID(API, "WritableDatabase::replace_document", did | document);
952     if (did == 0)
953 	docid_zero_invalid();
954     size_t n_dbs = internal.size();
955     if (rare(n_dbs == 0))
956 	no_subdatabases();
957     size_t i = sub_db(did, n_dbs);
958     internal[i]->replace_document(sub_docid(did, n_dbs), document);
959 }
960 
961 Xapian::docid
replace_document(const std::string & unique_term,const Document & document)962 WritableDatabase::replace_document(const std::string & unique_term,
963 				   const Document & document)
964 {
965     LOGCALL(API, Xapian::docid, "WritableDatabase::replace_document", unique_term | document);
966     if (unique_term.empty())
967 	throw InvalidArgumentError("Empty termnames are invalid");
968     size_t n_dbs = internal.size();
969     if (rare(n_dbs == 0))
970 	no_subdatabases();
971     if (n_dbs == 1)
972 	RETURN(internal[0]->replace_document(unique_term, document));
973 
974     Xapian::PostingIterator postit = postlist_begin(unique_term);
975     // If no unique_term in the database, this is just an add_document().
976     if (postit == postlist_end(unique_term)) {
977 	// Which database will the next never used docid be in?
978 	Xapian::docid did = get_lastdocid() + 1;
979 	if (rare(did == 0)) {
980 	    throw Xapian::DatabaseError("Run out of docids - you'll have to use copydatabase to eliminate any gaps before you can add more documents");
981 	}
982 	size_t i = sub_db(did, n_dbs);
983 	RETURN(internal[i]->add_document(document));
984     }
985 
986     Xapian::docid retval = *postit;
987     size_t i = sub_db(retval, n_dbs);
988     internal[i]->replace_document(sub_docid(retval, n_dbs), document);
989 
990     // Delete any other occurrences of unique_term.
991     while (++postit != postlist_end(unique_term)) {
992 	Xapian::docid did = *postit;
993 	i = sub_db(did, n_dbs);
994 	internal[i]->delete_document(sub_docid(did, n_dbs));
995     }
996 
997     return retval;
998 }
999 
1000 void
add_spelling(const std::string & word,Xapian::termcount freqinc) const1001 WritableDatabase::add_spelling(const std::string & word,
1002 			       Xapian::termcount freqinc) const
1003 {
1004     LOGCALL_VOID(API, "WritableDatabase::add_spelling", word | freqinc);
1005     if (rare(internal.empty()))
1006 	no_subdatabases();
1007     // FIXME: Is adding to the first subdatabase sensible?
1008     internal[0]->add_spelling(word, freqinc);
1009 }
1010 
1011 void
remove_spelling(const std::string & word,Xapian::termcount freqdec) const1012 WritableDatabase::remove_spelling(const std::string & word,
1013 				  Xapian::termcount freqdec) const
1014 {
1015     LOGCALL_VOID(API, "WritableDatabase::remove_spelling", word | freqdec);
1016     size_t n_dbs = internal.size();
1017     if (rare(n_dbs == 0))
1018 	no_subdatabases();
1019     for (size_t i = 0; i < n_dbs; ++i) {
1020 	internal[i]->remove_spelling(word, freqdec);
1021     }
1022 }
1023 
1024 void
add_synonym(const std::string & term,const std::string & synonym) const1025 WritableDatabase::add_synonym(const std::string & term,
1026 			      const std::string & synonym) const
1027 {
1028     LOGCALL_VOID(API, "WritableDatabase::add_synonym", term | synonym);
1029     if (rare(internal.empty()))
1030 	no_subdatabases();
1031     // FIXME: Is adding to the first subdatabase sensible?
1032     internal[0]->add_synonym(term, synonym);
1033 }
1034 
1035 void
remove_synonym(const std::string & term,const std::string & synonym) const1036 WritableDatabase::remove_synonym(const std::string & term,
1037 				 const std::string & synonym) const
1038 {
1039     LOGCALL_VOID(API, "WritableDatabase::remove_synonym", term | synonym);
1040     size_t n_dbs = internal.size();
1041     if (rare(n_dbs == 0))
1042 	no_subdatabases();
1043     for (size_t i = 0; i < n_dbs; ++i) {
1044 	internal[i]->remove_synonym(term, synonym);
1045     }
1046 }
1047 
1048 void
clear_synonyms(const std::string & term) const1049 WritableDatabase::clear_synonyms(const std::string & term) const
1050 {
1051     LOGCALL_VOID(API, "WritableDatabase::clear_synonyms", term);
1052     size_t n_dbs = internal.size();
1053     if (rare(n_dbs == 0))
1054 	no_subdatabases();
1055     for (size_t i = 0; i < n_dbs; ++i) {
1056 	internal[i]->clear_synonyms(term);
1057     }
1058 }
1059 
1060 void
set_metadata(const string & key,const string & value)1061 WritableDatabase::set_metadata(const string & key, const string & value)
1062 {
1063     LOGCALL_VOID(API, "WritableDatabase::set_metadata", key | value);
1064     if (rare(key.empty()))
1065 	empty_metadata_key();
1066     if (rare(internal.empty()))
1067 	no_subdatabases();
1068     internal[0]->set_metadata(key, value);
1069 }
1070 
1071 string
get_description() const1072 WritableDatabase::get_description() const
1073 {
1074     /// @todo display contents of the writable database
1075     return "WritableDatabase()";
1076 }
1077 
1078 }
1079