1 /* omdocument.cc: class for performing a match
2 *
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002 Ananova Ltd
5 * Copyright 2003,2004,2006,2007,2008,2009,2011,2013,2014,2018 Olly Betts
6 * Copyright 2009 Lemur Consulting Ltd
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
22 */
23
24 #include <config.h>
25
26 #include <xapian/document.h>
27
28 #include "backends/document.h"
29 #include "documentvaluelist.h"
30 #include "maptermlist.h"
31 #include "net/serialise.h"
32 #include "overflow.h"
33 #include "str.h"
34 #include "unicode/description_append.h"
35
36 #include <xapian/error.h>
37 #include <xapian/types.h>
38 #include <xapian/valueiterator.h>
39
40 #include <algorithm>
41 #include <limits>
42 #include <string>
43
44 using namespace std;
45
46 namespace Xapian {
47
48 // implementation of Document
49
Document(Document::Internal * internal_)50 Document::Document(Document::Internal *internal_) : internal(internal_)
51 {
52 }
53
54 Document::Document(Document&&) = default;
55
56 Document&
57 Document::operator=(Document&&) = default;
58
Document()59 Document::Document() : internal(new Xapian::Document::Internal)
60 {
61 }
62
63 string
get_value(Xapian::valueno slot) const64 Document::get_value(Xapian::valueno slot) const
65 {
66 LOGCALL(API, string, "Document::get_value", slot);
67 RETURN(internal->get_value(slot));
68 }
69
70 string
get_data() const71 Document::get_data() const
72 {
73 LOGCALL(API, string, "Document::get_data", NO_ARGS);
74 RETURN(internal->get_data());
75 }
76
77 void
set_data(const string & data)78 Document::set_data(const string &data)
79 {
80 LOGCALL_VOID(API, "Document::set_data", data);
81 internal->set_data(data);
82 }
83
84 void
operator =(const Document & other)85 Document::operator=(const Document &other)
86 {
87 // pointers are reference counted.
88 internal = other.internal;
89 }
90
Document(const Document & other)91 Document::Document(const Document &other)
92 : internal(other.internal)
93 {
94 }
95
~Document()96 Document::~Document()
97 {
98 }
99
100 string
get_description() const101 Document::get_description() const
102 {
103 return internal->get_description();
104 }
105
106 void
add_value(Xapian::valueno slot,const string & value)107 Document::add_value(Xapian::valueno slot, const string &value)
108 {
109 LOGCALL_VOID(API, "Document::add_value", slot | value);
110 internal->add_value(slot, value);
111 }
112
113 void
remove_value(Xapian::valueno slot)114 Document::remove_value(Xapian::valueno slot)
115 {
116 LOGCALL_VOID(API, "Document::remove_value", slot);
117 internal->remove_value(slot);
118 }
119
120 void
clear_values()121 Document::clear_values()
122 {
123 LOGCALL_VOID(API, "Document::clear_values", NO_ARGS);
124 internal->clear_values();
125 }
126
127 void
add_posting(const string & tname,Xapian::termpos tpos,Xapian::termcount wdfinc)128 Document::add_posting(const string & tname,
129 Xapian::termpos tpos,
130 Xapian::termcount wdfinc)
131 {
132 LOGCALL_VOID(API, "Document::add_posting", tname | tpos | wdfinc);
133 if (tname.empty()) {
134 throw InvalidArgumentError("Empty termnames aren't allowed.");
135 }
136 internal->add_posting(tname, tpos, wdfinc);
137 }
138
139 void
add_term(const string & tname,Xapian::termcount wdfinc)140 Document::add_term(const string & tname, Xapian::termcount wdfinc)
141 {
142 LOGCALL_VOID(API, "Document::add_term", tname | wdfinc);
143 if (tname.empty()) {
144 throw InvalidArgumentError("Empty termnames aren't allowed.");
145 }
146 internal->add_term(tname, wdfinc);
147 }
148
149 void
remove_posting(const string & tname,Xapian::termpos tpos,Xapian::termcount wdfdec)150 Document::remove_posting(const string & tname, Xapian::termpos tpos,
151 Xapian::termcount wdfdec)
152 {
153 LOGCALL_VOID(API, "Document::remove_posting", tname | tpos | wdfdec);
154 if (tname.empty()) {
155 throw InvalidArgumentError("Empty termnames aren't allowed.");
156 }
157 internal->remove_posting(tname, tpos, wdfdec);
158 }
159
160 Xapian::termpos
remove_postings(const string & term,Xapian::termpos termpos_first,Xapian::termpos termpos_last,Xapian::termcount wdf_dec)161 Document::remove_postings(const string& term,
162 Xapian::termpos termpos_first,
163 Xapian::termpos termpos_last,
164 Xapian::termcount wdf_dec)
165 {
166 if (term.empty()) {
167 throw InvalidArgumentError("Empty termnames aren't allowed.");
168 }
169 if (rare(termpos_first > termpos_last)) {
170 return 0;
171 }
172 return internal->remove_postings(term, termpos_first, termpos_last,
173 wdf_dec);
174 }
175
176 void
remove_term(const string & tname)177 Document::remove_term(const string & tname)
178 {
179 LOGCALL_VOID(API, "Document::remove_term", tname);
180 internal->remove_term(tname);
181 }
182
183 void
clear_terms()184 Document::clear_terms()
185 {
186 LOGCALL_VOID(API, "Document::clear_terms", NO_ARGS);
187 internal->clear_terms();
188 }
189
190 Xapian::termcount
termlist_count() const191 Document::termlist_count() const {
192 LOGCALL(API, Xapian::termcount, "Document::termlist_count", NO_ARGS);
193 RETURN(internal->termlist_count());
194 }
195
196 TermIterator
termlist_begin() const197 Document::termlist_begin() const
198 {
199 LOGCALL(API, TermIterator, "Document::termlist_begin", NO_ARGS);
200 RETURN(TermIterator(internal->open_term_list()));
201 }
202
203 Xapian::termcount
values_count() const204 Document::values_count() const {
205 LOGCALL(API, Xapian::termcount, "Document::values_count", NO_ARGS);
206 RETURN(internal->values_count());
207 }
208
209 ValueIterator
values_begin() const210 Document::values_begin() const
211 {
212 LOGCALL(API, ValueIterator, "Document::values_begin", NO_ARGS);
213 // Calling values_count() has the side effect of making sure that they have
214 // been read into the std::map "values" member of internal.
215 if (internal->values_count() == 0) RETURN(ValueIterator());
216 RETURN(ValueIterator(new DocumentValueList(internal)));
217 }
218
219 docid
get_docid() const220 Document::get_docid() const
221 {
222 LOGCALL(API, docid, "Document::get_docid", NO_ARGS);
223 RETURN(internal->get_docid());
224 }
225
226 std::string
serialise() const227 Document::serialise() const
228 {
229 LOGCALL(API, std::string, "Document::serialise", NO_ARGS);
230 RETURN(serialise_document(*this));
231 }
232
233 Document
unserialise(const std::string & s)234 Document::unserialise(const std::string &s)
235 {
236 LOGCALL_STATIC(API, Document, "Document::unserialise", s);
237 RETURN(unserialise_document(s));
238 }
239
240 }
241
242 /////////////////////////////////////////////////////////////////////////////
243
244 void
merge() const245 OmDocumentTerm::merge() const
246 {
247 Assert(!is_deleted());
248 inplace_merge(positions.begin(),
249 positions.begin() + split,
250 positions.end());
251 split = 0;
252 }
253
254 bool
add_position(Xapian::termcount wdf_inc,Xapian::termpos tpos)255 OmDocumentTerm::add_position(Xapian::termcount wdf_inc, Xapian::termpos tpos)
256 {
257 LOGCALL(DB, bool, "OmDocumentTerm::add_position", wdf_inc | tpos);
258
259 if (rare(is_deleted())) {
260 wdf = wdf_inc;
261 split = 0;
262 positions.push_back(tpos);
263 return true;
264 }
265
266 wdf += wdf_inc;
267
268 // Optimise the common case of adding positions in ascending order.
269 if (positions.empty()) {
270 positions.push_back(tpos);
271 return false;
272 }
273 if (tpos > positions.back()) {
274 if (split) {
275 // Check for duplicate before split.
276 auto i = lower_bound(positions.cbegin(),
277 positions.cbegin() + split,
278 tpos);
279 if (i != positions.cbegin() + split && *i == tpos)
280 return false;
281 }
282 positions.push_back(tpos);
283 return false;
284 }
285
286 if (tpos == positions.back()) {
287 // Duplicate of last entry.
288 return false;
289 }
290
291 if (split > 0) {
292 // We could merge in the new entry at the same time, but that seems to
293 // make things much more complex for minor gains.
294 merge();
295 }
296
297 // Search for the position the term occurs at. Use binary chop to
298 // search, since this is a sorted list.
299 vector<Xapian::termpos>::iterator i;
300 i = lower_bound(positions.begin(), positions.end(), tpos);
301 if (i == positions.end() || *i != tpos) {
302 auto new_split = positions.size();
303 if (sizeof(split) < sizeof(Xapian::termpos)) {
304 if (rare(new_split > numeric_limits<decltype(split)>::max())) {
305 // The split point would be beyond the size of the type used to
306 // hold it, which is really unlikely if that type is 32-bit.
307 // Just insert the old way in this case.
308 positions.insert(i, tpos);
309 return false;
310 }
311 } else {
312 // This assertion should always be true because we shouldn't have
313 // duplicate entries and the split point can't be after the final
314 // entry.
315 AssertRel(new_split, <=, numeric_limits<decltype(split)>::max());
316 }
317 split = new_split;
318 positions.push_back(tpos);
319 }
320 return false;
321 }
322
323 void
remove_position(Xapian::termpos tpos)324 OmDocumentTerm::remove_position(Xapian::termpos tpos)
325 {
326 LOGCALL_VOID(DB, "OmDocumentTerm::remove_position", tpos);
327
328 Assert(!is_deleted());
329
330 if (rare(positions.empty())) {
331 not_there:
332 throw Xapian::InvalidArgumentError("Position " + str(tpos) +
333 " not in list, can't remove");
334 }
335
336 // Special case removing the final position, which we can handle in O(1).
337 if (positions.back() == tpos) {
338 positions.pop_back();
339 if (split == positions.size()) {
340 split = 0;
341 // We removed the only entry from after the split.
342 }
343 return;
344 }
345
346 if (split > 0) {
347 // We could remove the requested entry at the same time, but that seems
348 // fiddly to do.
349 merge();
350 }
351
352 // We keep positions sorted, so use lower_bound() which can binary chop to
353 // find the entry.
354 auto i = lower_bound(positions.begin(), positions.end(), tpos);
355 if (i == positions.end() || *i != tpos) {
356 goto not_there;
357 }
358 positions.erase(i);
359 }
360
361 Xapian::termpos
remove_positions(Xapian::termpos termpos_first,Xapian::termpos termpos_last)362 OmDocumentTerm::remove_positions(Xapian::termpos termpos_first,
363 Xapian::termpos termpos_last)
364 {
365 LOGCALL(DB, Xapian::termpos, "OmDocumentTerm::remove_position", termpos_first | termpos_last);
366
367 Assert(!is_deleted());
368
369 if (split > 0) {
370 // We could remove the requested entries at the same time, but that
371 // seems fiddly to do.
372 merge();
373 }
374
375 // Find the range [i, j) that the specified termpos range maps to. Use
376 // binary chop to search, since this is a sorted list.
377 auto i = lower_bound(positions.begin(), positions.end(), termpos_first);
378 if (i == positions.end() || *i > termpos_last) {
379 return 0;
380 }
381 auto j = upper_bound(i, positions.end(), termpos_last);
382 size_t size_before = positions.size();
383 positions.erase(i, j);
384 return Xapian::termpos(size_before - positions.size());
385 }
386
387 string
get_description() const388 OmDocumentTerm::get_description() const
389 {
390 string description;
391 description = "OmDocumentTerm(wdf = ";
392 description += str(wdf);
393 description += ", positions[";
394 description += str(positions.size());
395 description += "])";
396 return description;
397 }
398
399 string
get_value(Xapian::valueno slot) const400 Xapian::Document::Internal::get_value(Xapian::valueno slot) const
401 {
402 if (values_here) {
403 map<Xapian::valueno, string>::const_iterator i;
404 i = values.find(slot);
405 if (i == values.end()) return string();
406 return i->second;
407 }
408 if (!database.get()) return string();
409 return do_get_value(slot);
410 }
411
412 string
get_data() const413 Xapian::Document::Internal::get_data() const
414 {
415 LOGCALL(DB, string, "Xapian::Document::Internal::get_data", NO_ARGS);
416 if (data_here) RETURN(data);
417 if (!database.get()) RETURN(string());
418 RETURN(do_get_data());
419 }
420
421 void
set_data(const string & data_)422 Xapian::Document::Internal::set_data(const string &data_)
423 {
424 data = data_;
425 data_here = true;
426 }
427
428 TermList *
open_term_list() const429 Xapian::Document::Internal::open_term_list() const
430 {
431 LOGCALL(DB, TermList *, "Document::Internal::open_term_list", NO_ARGS);
432 if (terms_here) {
433 RETURN(new MapTermList(terms.begin(), terms.end()));
434 }
435 if (!database.get()) RETURN(NULL);
436 RETURN(database->open_term_list(did));
437 }
438
439 void
add_value(Xapian::valueno slot,const string & value)440 Xapian::Document::Internal::add_value(Xapian::valueno slot, const string &value)
441 {
442 need_values();
443 if (!value.empty()) {
444 values[slot] = value;
445 } else {
446 // Empty values aren't stored, but replace any existing value by
447 // removing it.
448 values.erase(slot);
449 }
450 }
451
452 void
remove_value(Xapian::valueno slot)453 Xapian::Document::Internal::remove_value(Xapian::valueno slot)
454 {
455 need_values();
456 map<Xapian::valueno, string>::iterator i = values.find(slot);
457 if (i == values.end()) {
458 throw Xapian::InvalidArgumentError("Value #" + str(slot) +
459 " is not present in document, in "
460 "Xapian::Document::Internal::remove_value()");
461 }
462 values.erase(i);
463 }
464
465 void
clear_values()466 Xapian::Document::Internal::clear_values()
467 {
468 values.clear();
469 values_here = true;
470 }
471
472 void
add_posting(const string & tname,Xapian::termpos tpos,Xapian::termcount wdfinc)473 Xapian::Document::Internal::add_posting(const string & tname, Xapian::termpos tpos,
474 Xapian::termcount wdfinc)
475 {
476 need_terms();
477 positions_modified = true;
478
479 map<string, OmDocumentTerm>::iterator i;
480 i = terms.find(tname);
481 if (i == terms.end()) {
482 ++termlist_size;
483 OmDocumentTerm newterm(wdfinc);
484 newterm.append_position(tpos);
485 terms.insert(make_pair(tname, std::move(newterm)));
486 } else {
487 if (i->second.add_position(wdfinc, tpos))
488 ++termlist_size;
489 }
490 }
491
492 void
add_term(const string & tname,Xapian::termcount wdfinc)493 Xapian::Document::Internal::add_term(const string & tname, Xapian::termcount wdfinc)
494 {
495 need_terms();
496
497 map<string, OmDocumentTerm>::iterator i;
498 i = terms.find(tname);
499 if (i == terms.end()) {
500 ++termlist_size;
501 OmDocumentTerm newterm(wdfinc);
502 terms.insert(make_pair(tname, std::move(newterm)));
503 } else {
504 if (i->second.increase_wdf(wdfinc))
505 ++termlist_size;
506 }
507 }
508
509 void
remove_posting(const string & tname,Xapian::termpos tpos,Xapian::termcount wdfdec)510 Xapian::Document::Internal::remove_posting(const string & tname,
511 Xapian::termpos tpos,
512 Xapian::termcount wdfdec)
513 {
514 need_terms();
515
516 map<string, OmDocumentTerm>::iterator i;
517 i = terms.find(tname);
518 if (i == terms.end() || i->second.is_deleted()) {
519 throw Xapian::InvalidArgumentError("Term '" + tname +
520 "' is not present in document, in "
521 "Xapian::Document::Internal::remove_posting()");
522 }
523 i->second.remove_position(tpos);
524 if (wdfdec) i->second.decrease_wdf(wdfdec);
525 positions_modified = true;
526 }
527
528 Xapian::termpos
remove_postings(const string & term,Xapian::termpos termpos_first,Xapian::termpos termpos_last,Xapian::termcount wdf_dec)529 Xapian::Document::Internal::remove_postings(const string& term,
530 Xapian::termpos termpos_first,
531 Xapian::termpos termpos_last,
532 Xapian::termcount wdf_dec)
533 {
534 need_terms();
535
536 auto i = terms.find(term);
537 if (i == terms.end() || i->second.is_deleted()) {
538 throw Xapian::InvalidArgumentError("Term '" + term +
539 "' is not present in document, in "
540 "Xapian::Document::Internal::remove_postings()");
541 }
542 auto n_removed = i->second.remove_positions(termpos_first, termpos_last);
543 if (n_removed) {
544 positions_modified = true;
545 if (wdf_dec) {
546 Xapian::termcount wdf_delta;
547 if (mul_overflows(n_removed, wdf_dec, wdf_delta)) {
548 // Decreasing by the maximum value will zero the wdf.
549 wdf_delta = numeric_limits<Xapian::termcount>::max();
550 }
551 i->second.decrease_wdf(wdf_delta);
552 }
553 }
554 return n_removed;
555 }
556
557 void
remove_term(const string & tname)558 Xapian::Document::Internal::remove_term(const string & tname)
559 {
560 need_terms();
561 map<string, OmDocumentTerm>::iterator i;
562 i = terms.find(tname);
563 if (i == terms.end() || i->second.is_deleted()) {
564 throw Xapian::InvalidArgumentError("Term '" + tname +
565 "' is not present in document, in "
566 "Xapian::Document::Internal::remove_term()");
567 }
568 --termlist_size;
569 if (!positions_modified) {
570 positions_modified = (i->second.positionlist_count() > 0);
571 }
572 i->second.remove();
573 }
574
575 void
clear_terms()576 Xapian::Document::Internal::clear_terms()
577 {
578 terms.clear();
579 termlist_size = 0;
580 terms_here = true;
581 // Assume there was a term with positions for now.
582 // FIXME: may be worth checking...
583 positions_modified = true;
584 }
585
586 Xapian::termcount
termlist_count() const587 Xapian::Document::Internal::termlist_count() const
588 {
589 if (!terms_here) {
590 // How equivalent is this line to the rest?
591 // return database.get() ? database->open_term_list(did)->get_approx_size() : 0;
592 need_terms();
593 }
594 Assert(terms_here);
595 return termlist_size;
596 }
597
598 void
need_terms() const599 Xapian::Document::Internal::need_terms() const
600 {
601 if (terms_here) return;
602 if (database.get()) {
603 Xapian::TermIterator t(database->open_term_list(did));
604 Xapian::TermIterator tend(NULL);
605 for ( ; t != tend; ++t) {
606 Xapian::PositionIterator p = t.positionlist_begin();
607 OmDocumentTerm term(t.get_wdf());
608 for ( ; p != t.positionlist_end(); ++p) {
609 term.append_position(*p);
610 }
611 terms.insert(terms.end(), make_pair(*t, std::move(term)));
612 }
613 }
614 termlist_size = terms.size();
615 terms_here = true;
616 }
617
618 Xapian::valueno
values_count() const619 Xapian::Document::Internal::values_count() const
620 {
621 LOGCALL(DB, Xapian::valueno, "Document::Internal::values_count", NO_ARGS);
622 need_values();
623 Assert(values_here);
624 RETURN(values.size());
625 }
626
627 string
get_description() const628 Xapian::Document::Internal::get_description() const
629 {
630 string desc = "Document(";
631
632 // description_append ?
633 if (data_here) {
634 desc += "data='";
635 description_append(desc, data);
636 desc += "'";
637 }
638
639 if (values_here) {
640 if (data_here) desc += ", ";
641 desc += "values[";
642 desc += str(values.size());
643 desc += ']';
644 }
645
646 if (terms_here) {
647 if (data_here || values_here) desc += ", ";
648 desc += "terms[";
649 desc += str(termlist_size);
650 desc += ']';
651 }
652
653 if (database.get()) {
654 if (data_here || values_here || terms_here) desc += ", ";
655 // database->get_description() if/when that returns a non-generic
656 // result.
657 desc += "db:yes";
658 }
659
660 desc += ')';
661
662 return desc;
663 }
664
665 void
need_values() const666 Xapian::Document::Internal::need_values() const
667 {
668 if (!values_here) {
669 if (database.get()) {
670 Assert(values.empty());
671 do_get_all_values(values);
672 }
673 values_here = true;
674 }
675 }
676
~Internal()677 Xapian::Document::Internal::~Internal()
678 {
679 if (database.get())
680 database->invalidate_doc_object(this);
681 }
682