1 /* omdatabase.cc: External interface for running queries
2 *
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001,2002 Ananova Ltd
5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2016 Olly Betts
6 * Copyright 2006,2008 Lemur Consulting Ltd
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
22 */
23
24 #include <config.h>
25
26 #include "autoptr.h"
27
28 #include <xapian/constants.h>
29 #include <xapian/error.h>
30 #include <xapian/positioniterator.h>
31 #include <xapian/postingiterator.h>
32 #include <xapian/termiterator.h>
33 #include <xapian/unicode.h>
34
35 #include "omassert.h"
36 #include "debuglog.h"
37 #include "backends/alltermslist.h"
38 #include "backends/multi/multi_alltermslist.h"
39 #include "backends/multi/multi_postlist.h"
40 #include "backends/multi/multi_termlist.h"
41 #include "backends/multivaluelist.h"
42 #include "backends/database.h"
43 #include "editdistance.h"
44 #include "expand/ortermlist.h"
45 #include "internaltypes.h"
46 #include "noreturn.h"
47 #include "pack.h"
48
49 #include <algorithm>
50 #include <cstdlib> // For abs().
51 #include <cstring>
52 #include <vector>
53
54 using namespace std;
55 using Xapian::Internal::intrusive_ptr;
56
57 XAPIAN_NORETURN(static void docid_zero_invalid());
docid_zero_invalid()58 static void docid_zero_invalid()
59 {
60 throw Xapian::InvalidArgumentError("Document ID 0 is invalid");
61 }
62
63 XAPIAN_NORETURN(static void no_subdatabases());
no_subdatabases()64 static void no_subdatabases()
65 {
66 throw Xapian::InvalidOperationError("No subdatabases");
67 }
68
69 XAPIAN_NORETURN(static void empty_metadata_key());
empty_metadata_key()70 static void empty_metadata_key()
71 {
72 throw Xapian::InvalidArgumentError("Empty metadata keys are invalid");
73 }
74
75 inline size_t
sub_db(Xapian::docid did,size_t n_dbs)76 sub_db(Xapian::docid did, size_t n_dbs)
77 {
78 return (did - 1) % n_dbs;
79 }
80
81 inline size_t
sub_docid(Xapian::docid did,size_t n_dbs)82 sub_docid(Xapian::docid did, size_t n_dbs)
83 {
84 return (did - 1) / n_dbs + 1;
85 }
86
87 namespace Xapian {
88
89 Database::Database(Database&&) = default;
90
91 Database&
92 Database::operator=(Database&&) = default;
93
Database()94 Database::Database()
95 {
96 LOGCALL_CTOR(API, "Database", NO_ARGS);
97 }
98
Database(Database::Internal * internal_)99 Database::Database(Database::Internal *internal_)
100 {
101 LOGCALL_CTOR(API, "Database", internal_);
102 intrusive_ptr<Database::Internal> newi(internal_);
103 internal.push_back(newi);
104 }
105
/// Copy constructor: the copy refers to the same sub-databases as @a other.
Database::Database(const Database &other)
{
    LOGCALL_CTOR(API, "Database", other);
    // Copies the reference-counted pointers, not the backends themselves.
    internal = other.internal;
}
111
112 void
operator =(const Database & other)113 Database::operator=(const Database &other)
114 {
115 LOGCALL_VOID(API, "Database::operator=", other);
116 internal = other.internal;
117 }
118
~Database()119 Database::~Database()
120 {
121 LOGCALL_DTOR(API, "Database");
122 }
123
124 bool
reopen()125 Database::reopen()
126 {
127 LOGCALL(API, bool, "Database::reopen", NO_ARGS);
128 bool maybe_changed = false;
129 vector<intrusive_ptr<Database::Internal> >::iterator i;
130 for (i = internal.begin(); i != internal.end(); ++i) {
131 if ((*i)->reopen())
132 maybe_changed = true;
133 }
134 RETURN(maybe_changed);
135 }
136
137 void
close()138 Database::close()
139 {
140 LOGCALL_VOID(API, "Database::close", NO_ARGS);
141 vector<intrusive_ptr<Database::Internal> >::iterator i;
142 for (i = internal.begin(); i != internal.end(); ++i) {
143 (*i)->close();
144 }
145 }
146
147 void
add_database(const Database & database)148 Database::add_database(const Database & database)
149 {
150 LOGCALL_VOID(API, "Database::add_database", database);
151 if (this == &database) {
152 LOGLINE(API, "Database added to itself");
153 throw Xapian::InvalidArgumentError("Can't add a Database to itself");
154 }
155 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
156 for (i = database.internal.begin(); i != database.internal.end(); ++i) {
157 internal.push_back(*i);
158 }
159 }
160
161 PostingIterator
postlist_begin(const string & tname) const162 Database::postlist_begin(const string &tname) const
163 {
164 LOGCALL(API, PostingIterator, "Database::postlist_begin", tname);
165
166 // Don't bother checking that the term exists first. If it does, we
167 // just end up doing more work, and if it doesn't, we save very little
168 // work.
169
170 // Handle the common case of a single database specially.
171 if (internal.size() == 1)
172 RETURN(PostingIterator(internal[0]->open_post_list(tname)));
173
174 if (rare(internal.empty()))
175 RETURN(PostingIterator());
176
177 vector<LeafPostList *> pls;
178 try {
179 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
180 for (i = internal.begin(); i != internal.end(); ++i) {
181 pls.push_back((*i)->open_post_list(tname));
182 pls.back()->next();
183 }
184 Assert(pls.begin() != pls.end());
185 } catch (...) {
186 vector<LeafPostList *>::iterator i;
187 for (i = pls.begin(); i != pls.end(); ++i) {
188 delete *i;
189 *i = 0;
190 }
191 throw;
192 }
193
194 RETURN(PostingIterator(new MultiPostList(pls, *this)));
195 }
196
197 TermIterator
termlist_begin(Xapian::docid did) const198 Database::termlist_begin(Xapian::docid did) const
199 {
200 LOGCALL(API, TermIterator, "Database::termlist_begin", did);
201 if (did == 0)
202 docid_zero_invalid();
203
204 unsigned int multiplier = internal.size();
205 if (rare(multiplier == 0))
206 no_subdatabases();
207 TermList *tl;
208 if (multiplier == 1) {
209 // There's no need for the MultiTermList wrapper in the common case
210 // where we're only dealing with a single database.
211 tl = internal[0]->open_term_list(did);
212 } else {
213 Assert(multiplier != 0);
214 Xapian::doccount n = (did - 1) % multiplier; // which actual database
215 Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
216
217 tl = new MultiTermList(internal[n]->open_term_list(m), *this, n);
218 }
219 RETURN(TermIterator(tl));
220 }
221
222 TermIterator
allterms_begin(const std::string & prefix) const223 Database::allterms_begin(const std::string & prefix) const
224 {
225 LOGCALL(API, TermIterator, "Database::allterms_begin", NO_ARGS);
226 TermList * tl;
227 if (rare(internal.size() == 0)) {
228 tl = NULL;
229 } else if (internal.size() == 1) {
230 tl = internal[0]->open_allterms(prefix);
231 } else {
232 tl = new MultiAllTermsList(internal, prefix);
233 }
234 RETURN(TermIterator(tl));
235 }
236
237 bool
has_positions() const238 Database::has_positions() const
239 {
240 LOGCALL(API, bool, "Database::has_positions", NO_ARGS);
241 // If any sub-database has positions, the combined database does.
242 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
243 for (i = internal.begin(); i != internal.end(); ++i) {
244 if ((*i)->has_positions()) RETURN(true);
245 }
246 RETURN(false);
247 }
248
249 PositionIterator
positionlist_begin(Xapian::docid did,const string & tname) const250 Database::positionlist_begin(Xapian::docid did, const string &tname) const
251 {
252 LOGCALL(API, PositionIterator, "Database::positionlist_begin", did | tname);
253 if (tname.empty())
254 throw InvalidArgumentError("Zero length terms are invalid");
255 if (did == 0)
256 docid_zero_invalid();
257
258 unsigned int multiplier = internal.size();
259 if (rare(multiplier == 0))
260 no_subdatabases();
261 Xapian::doccount n = (did - 1) % multiplier; // which actual database
262 Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
263 RETURN(PositionIterator(internal[n]->open_position_list(m, tname)));
264 }
265
266 Xapian::doccount
get_doccount() const267 Database::get_doccount() const
268 {
269 LOGCALL(API, Xapian::doccount, "Database::get_doccount", NO_ARGS);
270 Xapian::doccount docs = 0;
271 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
272 for (i = internal.begin(); i != internal.end(); ++i) {
273 docs += (*i)->get_doccount();
274 }
275 RETURN(docs);
276 }
277
278 Xapian::docid
get_lastdocid() const279 Database::get_lastdocid() const
280 {
281 LOGCALL(API, Xapian::docid, "Database::get_lastdocid", NO_ARGS);
282 Xapian::docid did = 0;
283
284 unsigned int multiplier = internal.size();
285 for (Xapian::doccount i = 0; i < multiplier; ++i) {
286 Xapian::docid did_i = internal[i]->get_lastdocid();
287 if (did_i) did = std::max(did, (did_i - 1) * multiplier + i + 1);
288 }
289 RETURN(did);
290 }
291
292 Xapian::doclength
get_avlength() const293 Database::get_avlength() const
294 {
295 LOGCALL(API, Xapian::doclength, "Database::get_avlength", NO_ARGS);
296 Xapian::doccount docs = 0;
297 Xapian::totallength totlen = 0;
298
299 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
300 for (i = internal.begin(); i != internal.end(); ++i) {
301 docs += (*i)->get_doccount();
302 totlen += (*i)->get_total_length();
303 }
304 LOGLINE(UNKNOWN, "get_avlength() = " << totlen << " / " << docs <<
305 " (from " << internal.size() << " dbs)");
306
307 if (docs == 0) RETURN(0.0);
308 RETURN(totlen / double(docs));
309 }
310
311 Xapian::totallength
get_total_length() const312 Database::get_total_length() const
313 {
314 LOGCALL(API, Xapian::totallength, "Database::get_total_length", NO_ARGS);
315 Xapian::totallength total_length = 0;
316 for (auto&& sub_db : internal) {
317 total_length += sub_db->get_total_length();
318 }
319 RETURN(total_length);
320 }
321
322 Xapian::doccount
get_termfreq(const string & tname) const323 Database::get_termfreq(const string & tname) const
324 {
325 LOGCALL(API, Xapian::doccount, "Database::get_termfreq", tname);
326 if (tname.empty()) RETURN(get_doccount());
327
328 Xapian::doccount tf = 0;
329 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
330 for (i = internal.begin(); i != internal.end(); ++i) {
331 Xapian::doccount sub_tf;
332 (*i)->get_freqs(tname, &sub_tf, NULL);
333 tf += sub_tf;
334 }
335 RETURN(tf);
336 }
337
338 Xapian::termcount
get_collection_freq(const string & tname) const339 Database::get_collection_freq(const string & tname) const
340 {
341 LOGCALL(API, Xapian::termcount, "Database::get_collection_freq", tname);
342 if (tname.empty()) RETURN(get_doccount());
343
344 Xapian::termcount cf = 0;
345 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
346 for (i = internal.begin(); i != internal.end(); ++i) {
347 Xapian::termcount sub_cf;
348 (*i)->get_freqs(tname, NULL, &sub_cf);
349 cf += sub_cf;
350 }
351 RETURN(cf);
352 }
353
354 Xapian::doccount
get_value_freq(Xapian::valueno slot) const355 Database::get_value_freq(Xapian::valueno slot) const
356 {
357 LOGCALL(API, Xapian::doccount, "Database::get_value_freq", slot);
358
359 Xapian::doccount vf = 0;
360 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
361 for (i = internal.begin(); i != internal.end(); ++i) {
362 vf += (*i)->get_value_freq(slot);
363 }
364 RETURN(vf);
365 }
366
367 string
get_value_lower_bound(Xapian::valueno slot) const368 Database::get_value_lower_bound(Xapian::valueno slot) const
369 {
370 LOGCALL(API, string, "Database::get_value_lower_bound", slot);
371
372 if (rare(internal.empty())) RETURN(string());
373
374 string full_lb;
375 for (auto&& subdb : internal) {
376 string lb = subdb->get_value_lower_bound(slot);
377 if (lb.empty())
378 continue;
379 if (full_lb.empty() || lb < full_lb)
380 full_lb = std::move(lb);
381 }
382 RETURN(full_lb);
383 }
384
385 std::string
get_value_upper_bound(Xapian::valueno slot) const386 Database::get_value_upper_bound(Xapian::valueno slot) const
387 {
388 LOGCALL(API, std::string, "Database::get_value_upper_bound", slot);
389
390 std::string full_ub;
391 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
392 for (i = internal.begin(); i != internal.end(); ++i) {
393 std::string ub = (*i)->get_value_upper_bound(slot);
394 if (ub > full_ub)
395 full_ub = ub;
396 }
397 RETURN(full_ub);
398 }
399
400 Xapian::termcount
get_doclength_lower_bound() const401 Database::get_doclength_lower_bound() const
402 {
403 LOGCALL(API, Xapian::termcount, "Database::get_doclength_lower_bound", NO_ARGS);
404
405 if (rare(internal.empty())) RETURN(0);
406
407 Xapian::termcount full_lb = 0;
408 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
409 for (i = internal.begin(); i != internal.end(); ++i) {
410 // Skip sub-databases which are empty or only contain documents with
411 // doclen==0.
412 if ((*i)->get_total_length() != 0) {
413 Xapian::termcount lb = (*i)->get_doclength_lower_bound();
414 if (full_lb == 0 || lb < full_lb) full_lb = lb;
415 }
416 }
417 RETURN(full_lb);
418 }
419
420 Xapian::termcount
get_doclength_upper_bound() const421 Database::get_doclength_upper_bound() const
422 {
423 LOGCALL(API, Xapian::termcount, "Database::get_doclength_upper_bound", NO_ARGS);
424
425 Xapian::termcount full_ub = 0;
426 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
427 for (i = internal.begin(); i != internal.end(); ++i) {
428 Xapian::termcount ub = (*i)->get_doclength_upper_bound();
429 if (ub > full_ub) full_ub = ub;
430 }
431 RETURN(full_ub);
432 }
433
434 Xapian::termcount
get_wdf_upper_bound(const string & term) const435 Database::get_wdf_upper_bound(const string & term) const
436 {
437 LOGCALL(API, Xapian::termcount, "Database::get_wdf_upper_bound", term);
438 if (term.empty()) RETURN(0);
439
440 Xapian::termcount full_ub = 0;
441 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
442 for (i = internal.begin(); i != internal.end(); ++i) {
443 Xapian::termcount ub = (*i)->get_wdf_upper_bound(term);
444 if (ub > full_ub) full_ub = ub;
445 }
446 RETURN(full_ub);
447 }
448
449 ValueIterator
valuestream_begin(Xapian::valueno slot) const450 Database::valuestream_begin(Xapian::valueno slot) const
451 {
452 LOGCALL(API, ValueIterator, "Database::valuestream_begin", slot);
453 if (internal.size() == 0)
454 RETURN(ValueIterator());
455 if (internal.size() != 1)
456 RETURN(ValueIterator(new MultiValueList(internal, slot)));
457 RETURN(ValueIterator(internal[0]->open_value_list(slot)));
458 }
459
460 Xapian::termcount
get_doclength(Xapian::docid did) const461 Database::get_doclength(Xapian::docid did) const
462 {
463 LOGCALL(API, Xapian::termcount, "Database::get_doclength", did);
464 if (did == 0)
465 docid_zero_invalid();
466
467 unsigned int multiplier = internal.size();
468 if (rare(multiplier == 0))
469 no_subdatabases();
470 Xapian::doccount n = (did - 1) % multiplier; // which actual database
471 Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
472 RETURN(internal[n]->get_doclength(m));
473 }
474
475 Xapian::termcount
get_unique_terms(Xapian::docid did) const476 Database::get_unique_terms(Xapian::docid did) const
477 {
478 LOGCALL(API, Xapian::termcount, "Database::get_unique_terms", did);
479 if (did == 0)
480 docid_zero_invalid();
481 unsigned int multiplier = internal.size();
482 if (rare(multiplier == 0))
483 no_subdatabases();
484 Xapian::doccount n = (did - 1) % multiplier; // which actual database
485 Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
486 RETURN(internal[n]->get_unique_terms(m));
487 }
488
489 Document
get_document(Xapian::docid did) const490 Database::get_document(Xapian::docid did) const
491 {
492 LOGCALL(API, Document, "Database::get_document", did);
493 if (did == 0)
494 docid_zero_invalid();
495
496 unsigned int multiplier = internal.size();
497 if (rare(multiplier == 0))
498 no_subdatabases();
499 Xapian::doccount n = (did - 1) % multiplier; // which actual database
500 Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
501
502 // Open non-lazily so we throw DocNotFoundError if the doc doesn't exist.
503 RETURN(Document(internal[n]->open_document(m, false)));
504 }
505
506 Document
get_document(Xapian::docid did,unsigned flags) const507 Database::get_document(Xapian::docid did, unsigned flags) const
508 {
509 LOGCALL(API, Document, "Database::get_document", did|flags);
510 if (did == 0)
511 docid_zero_invalid();
512
513 unsigned int multiplier = internal.size();
514 if (rare(multiplier == 0))
515 no_subdatabases();
516 Xapian::doccount n = (did - 1) % multiplier; // which actual database
517 Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
518
519 bool assume_valid = flags & Xapian::DOC_ASSUME_VALID;
520 RETURN(Document(internal[n]->open_document(m, assume_valid)));
521 }
522
523 bool
term_exists(const string & tname) const524 Database::term_exists(const string & tname) const
525 {
526 LOGCALL(API, bool, "Database::term_exists", tname);
527 if (tname.empty()) {
528 RETURN(get_doccount() != 0);
529 }
530 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
531 for (i = internal.begin(); i != internal.end(); ++i) {
532 if ((*i)->term_exists(tname)) RETURN(true);
533 }
534 RETURN(false);
535 }
536
537 void
keep_alive()538 Database::keep_alive()
539 {
540 LOGCALL_VOID(API, "Database::keep_alive", NO_ARGS);
541 vector<intrusive_ptr<Database::Internal> >::const_iterator i;
542 for (i = internal.begin(); i != internal.end(); ++i) {
543 (*i)->keep_alive();
544 }
545 }
546
547 string
get_description() const548 Database::get_description() const
549 {
550 /// @todo display contents of the database
551 return "Database()";
552 }
553
554 // We sum the character frequency histogram absolute differences to compute a
555 // lower bound on the edit distance. Rather than counting each Unicode code
556 // point uniquely, we use an array with VEC_SIZE elements and tally code points
557 // modulo VEC_SIZE which can only reduce the bound we calculate.
558 //
// There will be a trade-off between how good the bound is and how large an
// array is used (a larger array takes more time to clear and sum over). The
561 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite
562 // but that may not reflect real world performance. FIXME: profile and tune.
563
#define VEC_SIZE 64

/** Compute a lower bound on the edit distance between @a a and @a b.
 *
 *  The elements of each sequence are tallied into VEC_SIZE buckets (element
 *  value modulo VEC_SIZE), counting up for @a a and down for @a b.  The sum
 *  of the absolute bucket values, halved and rounded up, is returned.
 */
static int
freq_edit_lower_bound(const std::vector<unsigned> & a,
                      const std::vector<unsigned> & b)
{
    // Signed tallies, all starting at zero.
    int buckets[VEC_SIZE] = {};
    for (unsigned ch : a) {
        ++buckets[ch % VEC_SIZE];
    }
    for (unsigned ch : b) {
        --buckets[ch % VEC_SIZE];
    }
    unsigned int mismatch = 0;
    for (int tally : buckets) {
        mismatch += std::abs(tally);
    }
    // Each insertion or deletion adds at most 1 to the sum.  Each
    // transposition doesn't change it at all.  But each substitution can
    // change it by 2 so we need to divide it by 2.  Rounding up is OK, since
    // the odd change must be due to an actual edit.
    return (mismatch + 1) / 2;
}
588
589 // Word must have a trigram score at least this close to the best score seen
590 // so far.
591 #define TRIGRAM_SCORE_THRESHOLD 2
592
593 string
get_spelling_suggestion(const string & word,unsigned max_edit_distance) const594 Database::get_spelling_suggestion(const string &word,
595 unsigned max_edit_distance) const
596 {
597 LOGCALL(API, string, "Database::get_spelling_suggestion", word | max_edit_distance);
598 if (word.size() <= 1) return string();
599 AutoPtr<TermList> merger;
600 for (size_t i = 0; i < internal.size(); ++i) {
601 TermList * tl = internal[i]->open_spelling_termlist(word);
602 LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl);
603 if (tl) {
604 if (merger.get()) {
605 merger.reset(new OrTermList(merger.release(), tl));
606 } else {
607 merger.reset(tl);
608 }
609 }
610 }
611 if (!merger.get()) RETURN(string());
612
613 // Convert word to UTF-32.
614 // Extra brackets needed to avoid this being misparsed as a function
615 // prototype.
616 vector<unsigned> utf32_word((Utf8Iterator(word)), Utf8Iterator());
617
618 vector<unsigned> utf32_term;
619
620 Xapian::termcount best = 1;
621 string result;
622 int edist_best = max_edit_distance;
623 Xapian::doccount freq_best = 0;
624 Xapian::doccount freq_exact = 0;
625 while (true) {
626 TermList *ret = merger->next();
627 if (ret) merger.reset(ret);
628
629 if (merger->at_end()) break;
630
631 string term = merger->get_termname();
632 Xapian::termcount score = merger->get_wdf();
633
634 LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score);
635 if (score + TRIGRAM_SCORE_THRESHOLD >= best) {
636 if (score > best) best = score;
637
638 // There's no point considering a word where the difference
639 // in length is greater than the smallest number of edits we've
640 // found so far.
641
642 // First check the length of the encoded UTF-8 version of term.
643 // Each UTF-32 character is 1-4 bytes in UTF-8.
644 if (abs(long(term.size()) - long(word.size())) > edist_best * 4) {
645 LOGLINE(SPELLING, "Lengths much too different");
646 continue;
647 }
648
649 // Now convert to UTF-32, and compare the true lengths more
650 // strictly.
651 utf32_term.assign(Utf8Iterator(term), Utf8Iterator());
652
653 if (abs(long(utf32_term.size()) - long(utf32_word.size()))
654 > edist_best) {
655 LOGLINE(SPELLING, "Lengths too different");
656 continue;
657 }
658
659 if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) {
660 LOGLINE(SPELLING, "Rejected by character frequency test");
661 continue;
662 }
663
664 int edist = edit_distance_unsigned(&utf32_term[0],
665 int(utf32_term.size()),
666 &utf32_word[0],
667 int(utf32_word.size()),
668 edist_best);
669 LOGLINE(SPELLING, "Edit distance " << edist);
670
671 if (edist <= edist_best) {
672 Xapian::doccount freq = 0;
673 for (size_t j = 0; j < internal.size(); ++j)
674 freq += internal[j]->get_spelling_frequency(term);
675
676 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best);
677 // Even if we have an exact match, there may be a much more
678 // frequent potential correction which will still be
679 // interesting.
680 if (edist == 0) {
681 freq_exact = freq;
682 continue;
683 }
684
685 if (edist < edist_best || freq > freq_best) {
686 LOGLINE(SPELLING, "Best so far: \"" << term <<
687 "\" edist " << edist << " freq " << freq);
688 result = term;
689 edist_best = edist;
690 freq_best = freq;
691 }
692 }
693 }
694 }
695 if (freq_best < freq_exact)
696 RETURN(string());
697 RETURN(result);
698 }
699
700 TermIterator
spellings_begin() const701 Database::spellings_begin() const
702 {
703 LOGCALL(API, TermIterator, "Database::spellings_begin", NO_ARGS);
704 AutoPtr<TermList> merger;
705 for (size_t i = 0; i < internal.size(); ++i) {
706 TermList * tl = internal[i]->open_spelling_wordlist();
707 if (tl) {
708 if (merger.get()) {
709 merger.reset(new FreqAdderOrTermList(merger.release(), tl));
710 } else {
711 merger.reset(tl);
712 }
713 }
714 }
715 RETURN(TermIterator(merger.release()));
716 }
717
718 TermIterator
synonyms_begin(const std::string & term) const719 Database::synonyms_begin(const std::string &term) const
720 {
721 LOGCALL(API, TermIterator, "Database::synonyms_begin", term);
722 AutoPtr<TermList> merger;
723 for (size_t i = 0; i < internal.size(); ++i) {
724 TermList * tl = internal[i]->open_synonym_termlist(term);
725 if (tl) {
726 if (merger.get()) {
727 merger.reset(new OrTermList(merger.release(), tl));
728 } else {
729 merger.reset(tl);
730 }
731 }
732 }
733 RETURN(TermIterator(merger.release()));
734 }
735
736 TermIterator
synonym_keys_begin(const std::string & prefix) const737 Database::synonym_keys_begin(const std::string &prefix) const
738 {
739 LOGCALL(API, TermIterator, "Database::synonym_keys_begin", prefix);
740 AutoPtr<TermList> merger;
741 for (size_t i = 0; i < internal.size(); ++i) {
742 TermList * tl = internal[i]->open_synonym_keylist(prefix);
743 if (tl) {
744 if (merger.get()) {
745 merger.reset(new OrTermList(merger.release(), tl));
746 } else {
747 merger.reset(tl);
748 }
749 }
750 }
751 RETURN(TermIterator(merger.release()));
752 }
753
754 string
get_metadata(const string & key) const755 Database::get_metadata(const string & key) const
756 {
757 LOGCALL(API, string, "Database::get_metadata", key);
758 if (rare(key.empty()))
759 empty_metadata_key();
760 if (internal.empty()) RETURN(std::string());
761 RETURN(internal[0]->get_metadata(key));
762 }
763
764 Xapian::TermIterator
metadata_keys_begin(const std::string & prefix) const765 Database::metadata_keys_begin(const std::string &prefix) const
766 {
767 LOGCALL(API, Xapian::TermIterator, "Database::metadata_keys_begin", NO_ARGS);
768 if (internal.empty()) RETURN(TermIterator());
769 RETURN(TermIterator(internal[0]->open_metadata_keylist(prefix)));
770 }
771
772 std::string
get_uuid() const773 Database::get_uuid() const
774 {
775 LOGCALL(API, std::string, "Database::get_uuid", NO_ARGS);
776 string uuid;
777 for (size_t i = 0; i < internal.size(); ++i) {
778 string sub_uuid = internal[i]->get_uuid();
779 // If any of the sub-databases have no uuid, we can't make a uuid for
780 // the combined database.
781 if (sub_uuid.empty())
782 RETURN(sub_uuid);
783 if (!uuid.empty()) uuid += ':';
784 uuid += sub_uuid;
785 }
786 RETURN(uuid);
787 }
788
789 bool
locked() const790 Database::locked() const
791 {
792 LOGCALL(API, bool, "Database::locked", NO_ARGS);
793 for (const auto & subdb : internal) {
794 // If any of the sub-databases is locked, return true.
795 if (subdb->locked())
796 RETURN(true);
797 }
798 RETURN(false);
799 }
800
801 Xapian::rev
get_revision() const802 Database::get_revision() const
803 {
804 LOGCALL(API, Xapian::rev, "Database::get_revision", NO_ARGS);
805 size_t n_dbs = internal.size();
806 if (rare(n_dbs != 1)) {
807 if (n_dbs == 0)
808 return 0;
809 throw Xapian::InvalidOperationError("Database::get_revision() requires "
810 "exactly one subdatabase");
811 }
812 const string& s = internal[0]->get_revision_info();
813 const char* p = s.data();
814 const char* end = p + s.size();
815 Xapian::rev revision;
816 if (!unpack_uint(&p, end, &revision))
817 throw Xapian::UnimplementedError("Database::get_revision() only "
818 "supported for chert and glass");
819 return revision;
820 }
821
822 ///////////////////////////////////////////////////////////////////////////
823
WritableDatabase()824 WritableDatabase::WritableDatabase() : Database()
825 {
826 LOGCALL_CTOR(API, "WritableDatabase", NO_ARGS);
827 }
828
WritableDatabase(Database::Internal * internal_)829 WritableDatabase::WritableDatabase(Database::Internal *internal_)
830 : Database(internal_)
831 {
832 LOGCALL_CTOR(API, "WritableDatabase", internal_);
833 }
834
/// Copy constructor: delegates to Database's copy constructor.
WritableDatabase::WritableDatabase(const WritableDatabase &other)
    : Database(other)
{
    LOGCALL_CTOR(API, "WritableDatabase", other);
}
840
841 void
operator =(const WritableDatabase & other)842 WritableDatabase::operator=(const WritableDatabase &other)
843 {
844 LOGCALL_VOID(API, "WritableDatabase::operator=", other);
845 Database::operator=(other);
846 }
847
~WritableDatabase()848 WritableDatabase::~WritableDatabase()
849 {
850 LOGCALL_DTOR(API, "WritableDatabase");
851 }
852
853 void
commit()854 WritableDatabase::commit()
855 {
856 LOGCALL_VOID(API, "WritableDatabase::commit", NO_ARGS);
857 size_t n_dbs = internal.size();
858 if (rare(n_dbs == 0))
859 no_subdatabases();
860 for (size_t i = 0; i != n_dbs; ++i)
861 internal[i]->commit();
862 }
863
864 void
begin_transaction(bool flushed)865 WritableDatabase::begin_transaction(bool flushed)
866 {
867 LOGCALL_VOID(API, "WritableDatabase::begin_transaction", flushed);
868 size_t n_dbs = internal.size();
869 if (rare(n_dbs == 0))
870 no_subdatabases();
871 for (size_t i = 0; i != n_dbs; ++i)
872 internal[i]->begin_transaction(flushed);
873 }
874
875 void
commit_transaction()876 WritableDatabase::commit_transaction()
877 {
878 LOGCALL_VOID(API, "WritableDatabase::commit_transaction", NO_ARGS);
879 size_t n_dbs = internal.size();
880 if (rare(n_dbs == 0))
881 no_subdatabases();
882 for (size_t i = 0; i != n_dbs; ++i)
883 internal[i]->commit_transaction();
884 }
885
886 void
cancel_transaction()887 WritableDatabase::cancel_transaction()
888 {
889 LOGCALL_VOID(API, "WritableDatabase::cancel_transaction", NO_ARGS);
890 size_t n_dbs = internal.size();
891 if (rare(n_dbs == 0))
892 no_subdatabases();
893 for (size_t i = 0; i != n_dbs; ++i)
894 internal[i]->cancel_transaction();
895 }
896
897
898 Xapian::docid
add_document(const Document & document)899 WritableDatabase::add_document(const Document & document)
900 {
901 LOGCALL(API, Xapian::docid, "WritableDatabase::add_document", document);
902 size_t n_dbs = internal.size();
903 if (rare(n_dbs == 0))
904 no_subdatabases();
905 if (n_dbs == 1)
906 RETURN(internal[0]->add_document(document));
907
908 // Which database will the next never used docid be in?
909 Xapian::docid did = get_lastdocid() + 1;
910 if (rare(did == 0)) {
911 throw Xapian::DatabaseError("Run out of docids - you'll have to use copydatabase to eliminate any gaps before you can add more documents");
912 }
913 // We want exactly did to be used, not a lower docid if that subdb isn't
914 // using the docid before it, so call replace_document() not
915 // add_document().
916 size_t i = sub_db(did, n_dbs);
917 internal[i]->replace_document(sub_docid(did, n_dbs), document);
918 RETURN(did);
919 }
920
921 void
delete_document(Xapian::docid did)922 WritableDatabase::delete_document(Xapian::docid did)
923 {
924 LOGCALL_VOID(API, "WritableDatabase::delete_document", did);
925 if (rare(did == 0))
926 docid_zero_invalid();
927
928 size_t n_dbs = internal.size();
929 if (rare(n_dbs == 0))
930 no_subdatabases();
931 size_t i = sub_db(did, n_dbs);
932 internal[i]->delete_document(sub_docid(did, n_dbs));
933 }
934
935 void
delete_document(const std::string & unique_term)936 WritableDatabase::delete_document(const std::string & unique_term)
937 {
938 LOGCALL_VOID(API, "WritableDatabase::delete_document", unique_term);
939 if (unique_term.empty())
940 throw InvalidArgumentError("Empty termnames are invalid");
941 size_t n_dbs = internal.size();
942 if (rare(n_dbs == 0))
943 no_subdatabases();
944 for (size_t i = 0; i != n_dbs; ++i)
945 internal[i]->delete_document(unique_term);
946 }
947
948 void
replace_document(Xapian::docid did,const Document & document)949 WritableDatabase::replace_document(Xapian::docid did, const Document & document)
950 {
951 LOGCALL_VOID(API, "WritableDatabase::replace_document", did | document);
952 if (did == 0)
953 docid_zero_invalid();
954 size_t n_dbs = internal.size();
955 if (rare(n_dbs == 0))
956 no_subdatabases();
957 size_t i = sub_db(did, n_dbs);
958 internal[i]->replace_document(sub_docid(did, n_dbs), document);
959 }
960
961 Xapian::docid
replace_document(const std::string & unique_term,const Document & document)962 WritableDatabase::replace_document(const std::string & unique_term,
963 const Document & document)
964 {
965 LOGCALL(API, Xapian::docid, "WritableDatabase::replace_document", unique_term | document);
966 if (unique_term.empty())
967 throw InvalidArgumentError("Empty termnames are invalid");
968 size_t n_dbs = internal.size();
969 if (rare(n_dbs == 0))
970 no_subdatabases();
971 if (n_dbs == 1)
972 RETURN(internal[0]->replace_document(unique_term, document));
973
974 Xapian::PostingIterator postit = postlist_begin(unique_term);
975 // If no unique_term in the database, this is just an add_document().
976 if (postit == postlist_end(unique_term)) {
977 // Which database will the next never used docid be in?
978 Xapian::docid did = get_lastdocid() + 1;
979 if (rare(did == 0)) {
980 throw Xapian::DatabaseError("Run out of docids - you'll have to use copydatabase to eliminate any gaps before you can add more documents");
981 }
982 size_t i = sub_db(did, n_dbs);
983 RETURN(internal[i]->add_document(document));
984 }
985
986 Xapian::docid retval = *postit;
987 size_t i = sub_db(retval, n_dbs);
988 internal[i]->replace_document(sub_docid(retval, n_dbs), document);
989
990 // Delete any other occurrences of unique_term.
991 while (++postit != postlist_end(unique_term)) {
992 Xapian::docid did = *postit;
993 i = sub_db(did, n_dbs);
994 internal[i]->delete_document(sub_docid(did, n_dbs));
995 }
996
997 return retval;
998 }
999
1000 void
add_spelling(const std::string & word,Xapian::termcount freqinc) const1001 WritableDatabase::add_spelling(const std::string & word,
1002 Xapian::termcount freqinc) const
1003 {
1004 LOGCALL_VOID(API, "WritableDatabase::add_spelling", word | freqinc);
1005 if (rare(internal.empty()))
1006 no_subdatabases();
1007 // FIXME: Is adding to the first subdatabase sensible?
1008 internal[0]->add_spelling(word, freqinc);
1009 }
1010
1011 void
remove_spelling(const std::string & word,Xapian::termcount freqdec) const1012 WritableDatabase::remove_spelling(const std::string & word,
1013 Xapian::termcount freqdec) const
1014 {
1015 LOGCALL_VOID(API, "WritableDatabase::remove_spelling", word | freqdec);
1016 size_t n_dbs = internal.size();
1017 if (rare(n_dbs == 0))
1018 no_subdatabases();
1019 for (size_t i = 0; i < n_dbs; ++i) {
1020 internal[i]->remove_spelling(word, freqdec);
1021 }
1022 }
1023
1024 void
add_synonym(const std::string & term,const std::string & synonym) const1025 WritableDatabase::add_synonym(const std::string & term,
1026 const std::string & synonym) const
1027 {
1028 LOGCALL_VOID(API, "WritableDatabase::add_synonym", term | synonym);
1029 if (rare(internal.empty()))
1030 no_subdatabases();
1031 // FIXME: Is adding to the first subdatabase sensible?
1032 internal[0]->add_synonym(term, synonym);
1033 }
1034
1035 void
remove_synonym(const std::string & term,const std::string & synonym) const1036 WritableDatabase::remove_synonym(const std::string & term,
1037 const std::string & synonym) const
1038 {
1039 LOGCALL_VOID(API, "WritableDatabase::remove_synonym", term | synonym);
1040 size_t n_dbs = internal.size();
1041 if (rare(n_dbs == 0))
1042 no_subdatabases();
1043 for (size_t i = 0; i < n_dbs; ++i) {
1044 internal[i]->remove_synonym(term, synonym);
1045 }
1046 }
1047
1048 void
clear_synonyms(const std::string & term) const1049 WritableDatabase::clear_synonyms(const std::string & term) const
1050 {
1051 LOGCALL_VOID(API, "WritableDatabase::clear_synonyms", term);
1052 size_t n_dbs = internal.size();
1053 if (rare(n_dbs == 0))
1054 no_subdatabases();
1055 for (size_t i = 0; i < n_dbs; ++i) {
1056 internal[i]->clear_synonyms(term);
1057 }
1058 }
1059
1060 void
set_metadata(const string & key,const string & value)1061 WritableDatabase::set_metadata(const string & key, const string & value)
1062 {
1063 LOGCALL_VOID(API, "WritableDatabase::set_metadata", key | value);
1064 if (rare(key.empty()))
1065 empty_metadata_key();
1066 if (rare(internal.empty()))
1067 no_subdatabases();
1068 internal[0]->set_metadata(key, value);
1069 }
1070
1071 string
get_description() const1072 WritableDatabase::get_description() const
1073 {
1074 /// @todo display contents of the writable database
1075 return "WritableDatabase()";
1076 }
1077
1078 }
1079