1 /* omdocument.cc: class for performing a match
2  *
3  * Copyright 1999,2000,2001 BrightStation PLC
4  * Copyright 2002 Ananova Ltd
5  * Copyright 2003,2004,2006,2007,2008,2009,2011,2013,2014,2018 Olly Betts
6  * Copyright 2009 Lemur Consulting Ltd
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
21  * USA
22  */
23 
24 #include <config.h>
25 
26 #include <xapian/document.h>
27 
28 #include "backends/document.h"
29 #include "documentvaluelist.h"
30 #include "maptermlist.h"
31 #include "net/serialise.h"
32 #include "overflow.h"
33 #include "str.h"
34 #include "unicode/description_append.h"
35 
36 #include <xapian/error.h>
37 #include <xapian/types.h>
38 #include <xapian/valueiterator.h>
39 
40 #include <algorithm>
41 #include <limits>
42 #include <string>
43 
44 using namespace std;
45 
46 namespace Xapian {
47 
48 // implementation of Document
49 
Document(Document::Internal * internal_)50 Document::Document(Document::Internal *internal_) : internal(internal_)
51 {
52 }
53 
54 Document::Document(Document&&) = default;
55 
56 Document&
57 Document::operator=(Document&&) = default;
58 
Document()59 Document::Document() : internal(new Xapian::Document::Internal)
60 {
61 }
62 
63 string
get_value(Xapian::valueno slot) const64 Document::get_value(Xapian::valueno slot) const
65 {
66     LOGCALL(API, string, "Document::get_value", slot);
67     RETURN(internal->get_value(slot));
68 }
69 
70 string
get_data() const71 Document::get_data() const
72 {
73     LOGCALL(API, string, "Document::get_data", NO_ARGS);
74     RETURN(internal->get_data());
75 }
76 
77 void
set_data(const string & data)78 Document::set_data(const string &data)
79 {
80     LOGCALL_VOID(API, "Document::set_data", data);
81     internal->set_data(data);
82 }
83 
84 void
operator =(const Document & other)85 Document::operator=(const Document &other)
86 {
87     // pointers are reference counted.
88     internal = other.internal;
89 }
90 
Document(const Document & other)91 Document::Document(const Document &other)
92 	: internal(other.internal)
93 {
94 }
95 
~Document()96 Document::~Document()
97 {
98 }
99 
100 string
get_description() const101 Document::get_description() const
102 {
103     return internal->get_description();
104 }
105 
106 void
add_value(Xapian::valueno slot,const string & value)107 Document::add_value(Xapian::valueno slot, const string &value)
108 {
109     LOGCALL_VOID(API, "Document::add_value", slot | value);
110     internal->add_value(slot, value);
111 }
112 
113 void
remove_value(Xapian::valueno slot)114 Document::remove_value(Xapian::valueno slot)
115 {
116     LOGCALL_VOID(API, "Document::remove_value", slot);
117     internal->remove_value(slot);
118 }
119 
120 void
clear_values()121 Document::clear_values()
122 {
123     LOGCALL_VOID(API, "Document::clear_values", NO_ARGS);
124     internal->clear_values();
125 }
126 
127 void
add_posting(const string & tname,Xapian::termpos tpos,Xapian::termcount wdfinc)128 Document::add_posting(const string & tname,
129 			Xapian::termpos tpos,
130 			Xapian::termcount wdfinc)
131 {
132     LOGCALL_VOID(API, "Document::add_posting", tname | tpos | wdfinc);
133     if (tname.empty()) {
134 	throw InvalidArgumentError("Empty termnames aren't allowed.");
135     }
136     internal->add_posting(tname, tpos, wdfinc);
137 }
138 
139 void
add_term(const string & tname,Xapian::termcount wdfinc)140 Document::add_term(const string & tname, Xapian::termcount wdfinc)
141 {
142     LOGCALL_VOID(API, "Document::add_term", tname | wdfinc);
143     if (tname.empty()) {
144 	throw InvalidArgumentError("Empty termnames aren't allowed.");
145     }
146     internal->add_term(tname, wdfinc);
147 }
148 
149 void
remove_posting(const string & tname,Xapian::termpos tpos,Xapian::termcount wdfdec)150 Document::remove_posting(const string & tname, Xapian::termpos tpos,
151 			 Xapian::termcount wdfdec)
152 {
153     LOGCALL_VOID(API, "Document::remove_posting", tname | tpos | wdfdec);
154     if (tname.empty()) {
155 	throw InvalidArgumentError("Empty termnames aren't allowed.");
156     }
157     internal->remove_posting(tname, tpos, wdfdec);
158 }
159 
160 Xapian::termpos
remove_postings(const string & term,Xapian::termpos termpos_first,Xapian::termpos termpos_last,Xapian::termcount wdf_dec)161 Document::remove_postings(const string& term,
162 			  Xapian::termpos termpos_first,
163 			  Xapian::termpos termpos_last,
164 			  Xapian::termcount wdf_dec)
165 {
166     if (term.empty()) {
167 	throw InvalidArgumentError("Empty termnames aren't allowed.");
168     }
169     if (rare(termpos_first > termpos_last)) {
170 	return 0;
171     }
172     return internal->remove_postings(term, termpos_first, termpos_last,
173 				     wdf_dec);
174 }
175 
176 void
remove_term(const string & tname)177 Document::remove_term(const string & tname)
178 {
179     LOGCALL_VOID(API, "Document::remove_term", tname);
180     internal->remove_term(tname);
181 }
182 
183 void
clear_terms()184 Document::clear_terms()
185 {
186     LOGCALL_VOID(API, "Document::clear_terms", NO_ARGS);
187     internal->clear_terms();
188 }
189 
190 Xapian::termcount
termlist_count() const191 Document::termlist_count() const {
192     LOGCALL(API, Xapian::termcount, "Document::termlist_count", NO_ARGS);
193     RETURN(internal->termlist_count());
194 }
195 
196 TermIterator
termlist_begin() const197 Document::termlist_begin() const
198 {
199     LOGCALL(API, TermIterator, "Document::termlist_begin", NO_ARGS);
200     RETURN(TermIterator(internal->open_term_list()));
201 }
202 
203 Xapian::termcount
values_count() const204 Document::values_count() const {
205     LOGCALL(API, Xapian::termcount, "Document::values_count", NO_ARGS);
206     RETURN(internal->values_count());
207 }
208 
209 ValueIterator
values_begin() const210 Document::values_begin() const
211 {
212     LOGCALL(API, ValueIterator, "Document::values_begin", NO_ARGS);
213     // Calling values_count() has the side effect of making sure that they have
214     // been read into the std::map "values" member of internal.
215     if (internal->values_count() == 0) RETURN(ValueIterator());
216     RETURN(ValueIterator(new DocumentValueList(internal)));
217 }
218 
219 docid
get_docid() const220 Document::get_docid() const
221 {
222     LOGCALL(API, docid, "Document::get_docid", NO_ARGS);
223     RETURN(internal->get_docid());
224 }
225 
226 std::string
serialise() const227 Document::serialise() const
228 {
229     LOGCALL(API, std::string, "Document::serialise", NO_ARGS);
230     RETURN(serialise_document(*this));
231 }
232 
233 Document
unserialise(const std::string & s)234 Document::unserialise(const std::string &s)
235 {
236     LOGCALL_STATIC(API, Document, "Document::unserialise", s);
237     RETURN(unserialise_document(s));
238 }
239 
240 }
241 
242 /////////////////////////////////////////////////////////////////////////////
243 
244 void
merge() const245 OmDocumentTerm::merge() const
246 {
247     Assert(!is_deleted());
248     inplace_merge(positions.begin(),
249 		  positions.begin() + split,
250 		  positions.end());
251     split = 0;
252 }
253 
254 bool
add_position(Xapian::termcount wdf_inc,Xapian::termpos tpos)255 OmDocumentTerm::add_position(Xapian::termcount wdf_inc, Xapian::termpos tpos)
256 {
257     LOGCALL(DB, bool, "OmDocumentTerm::add_position", wdf_inc | tpos);
258 
259     if (rare(is_deleted())) {
260 	wdf = wdf_inc;
261 	split = 0;
262 	positions.push_back(tpos);
263 	return true;
264     }
265 
266     wdf += wdf_inc;
267 
268     // Optimise the common case of adding positions in ascending order.
269     if (positions.empty()) {
270 	positions.push_back(tpos);
271 	return false;
272     }
273     if (tpos > positions.back()) {
274 	if (split) {
275 	    // Check for duplicate before split.
276 	    auto i = lower_bound(positions.cbegin(),
277 				 positions.cbegin() + split,
278 				 tpos);
279 	    if (i != positions.cbegin() + split && *i == tpos)
280 		return false;
281 	}
282 	positions.push_back(tpos);
283 	return false;
284     }
285 
286     if (tpos == positions.back()) {
287 	// Duplicate of last entry.
288 	return false;
289     }
290 
291     if (split > 0) {
292 	// We could merge in the new entry at the same time, but that seems to
293 	// make things much more complex for minor gains.
294 	merge();
295     }
296 
297     // Search for the position the term occurs at.  Use binary chop to
298     // search, since this is a sorted list.
299     vector<Xapian::termpos>::iterator i;
300     i = lower_bound(positions.begin(), positions.end(), tpos);
301     if (i == positions.end() || *i != tpos) {
302 	auto new_split = positions.size();
303 	if (sizeof(split) < sizeof(Xapian::termpos)) {
304 	    if (rare(new_split > numeric_limits<decltype(split)>::max())) {
305 		// The split point would be beyond the size of the type used to
306 		// hold it, which is really unlikely if that type is 32-bit.
307 		// Just insert the old way in this case.
308 		positions.insert(i, tpos);
309 		return false;
310 	    }
311 	} else {
312 	    // This assertion should always be true because we shouldn't have
313 	    // duplicate entries and the split point can't be after the final
314 	    // entry.
315 	    AssertRel(new_split, <=, numeric_limits<decltype(split)>::max());
316 	}
317 	split = new_split;
318 	positions.push_back(tpos);
319     }
320     return false;
321 }
322 
323 void
remove_position(Xapian::termpos tpos)324 OmDocumentTerm::remove_position(Xapian::termpos tpos)
325 {
326     LOGCALL_VOID(DB, "OmDocumentTerm::remove_position", tpos);
327 
328     Assert(!is_deleted());
329 
330     if (rare(positions.empty())) {
331 not_there:
332 	throw Xapian::InvalidArgumentError("Position " + str(tpos) +
333 					   " not in list, can't remove");
334     }
335 
336     // Special case removing the final position, which we can handle in O(1).
337     if (positions.back() == tpos) {
338 	positions.pop_back();
339 	if (split == positions.size()) {
340 	    split = 0;
341 	    // We removed the only entry from after the split.
342 	}
343 	return;
344     }
345 
346     if (split > 0) {
347 	// We could remove the requested entry at the same time, but that seems
348 	// fiddly to do.
349 	merge();
350     }
351 
352     // We keep positions sorted, so use lower_bound() which can binary chop to
353     // find the entry.
354     auto i = lower_bound(positions.begin(), positions.end(), tpos);
355     if (i == positions.end() || *i != tpos) {
356 	goto not_there;
357     }
358     positions.erase(i);
359 }
360 
361 Xapian::termpos
remove_positions(Xapian::termpos termpos_first,Xapian::termpos termpos_last)362 OmDocumentTerm::remove_positions(Xapian::termpos termpos_first,
363 				 Xapian::termpos termpos_last)
364 {
365     LOGCALL(DB, Xapian::termpos, "OmDocumentTerm::remove_position", termpos_first | termpos_last);
366 
367     Assert(!is_deleted());
368 
369     if (split > 0) {
370 	// We could remove the requested entries at the same time, but that
371 	// seems fiddly to do.
372 	merge();
373     }
374 
375     // Find the range [i, j) that the specified termpos range maps to.  Use
376     // binary chop to search, since this is a sorted list.
377     auto i = lower_bound(positions.begin(), positions.end(), termpos_first);
378     if (i == positions.end() || *i > termpos_last) {
379 	return 0;
380     }
381     auto j = upper_bound(i, positions.end(), termpos_last);
382     size_t size_before = positions.size();
383     positions.erase(i, j);
384     return Xapian::termpos(size_before - positions.size());
385 }
386 
387 string
get_description() const388 OmDocumentTerm::get_description() const
389 {
390     string description;
391     description = "OmDocumentTerm(wdf = ";
392     description += str(wdf);
393     description += ", positions[";
394     description += str(positions.size());
395     description += "])";
396     return description;
397 }
398 
399 string
get_value(Xapian::valueno slot) const400 Xapian::Document::Internal::get_value(Xapian::valueno slot) const
401 {
402     if (values_here) {
403 	map<Xapian::valueno, string>::const_iterator i;
404 	i = values.find(slot);
405 	if (i == values.end()) return string();
406 	return i->second;
407     }
408     if (!database.get()) return string();
409     return do_get_value(slot);
410 }
411 
412 string
get_data() const413 Xapian::Document::Internal::get_data() const
414 {
415     LOGCALL(DB, string, "Xapian::Document::Internal::get_data", NO_ARGS);
416     if (data_here) RETURN(data);
417     if (!database.get()) RETURN(string());
418     RETURN(do_get_data());
419 }
420 
421 void
set_data(const string & data_)422 Xapian::Document::Internal::set_data(const string &data_)
423 {
424     data = data_;
425     data_here = true;
426 }
427 
428 TermList *
open_term_list() const429 Xapian::Document::Internal::open_term_list() const
430 {
431     LOGCALL(DB, TermList *, "Document::Internal::open_term_list", NO_ARGS);
432     if (terms_here) {
433 	RETURN(new MapTermList(terms.begin(), terms.end()));
434     }
435     if (!database.get()) RETURN(NULL);
436     RETURN(database->open_term_list(did));
437 }
438 
439 void
add_value(Xapian::valueno slot,const string & value)440 Xapian::Document::Internal::add_value(Xapian::valueno slot, const string &value)
441 {
442     need_values();
443     if (!value.empty()) {
444 	values[slot] = value;
445     } else {
446 	// Empty values aren't stored, but replace any existing value by
447 	// removing it.
448 	values.erase(slot);
449     }
450 }
451 
452 void
remove_value(Xapian::valueno slot)453 Xapian::Document::Internal::remove_value(Xapian::valueno slot)
454 {
455     need_values();
456     map<Xapian::valueno, string>::iterator i = values.find(slot);
457     if (i == values.end()) {
458 	throw Xapian::InvalidArgumentError("Value #" + str(slot) +
459 		" is not present in document, in "
460 		"Xapian::Document::Internal::remove_value()");
461     }
462     values.erase(i);
463 }
464 
465 void
clear_values()466 Xapian::Document::Internal::clear_values()
467 {
468     values.clear();
469     values_here = true;
470 }
471 
472 void
add_posting(const string & tname,Xapian::termpos tpos,Xapian::termcount wdfinc)473 Xapian::Document::Internal::add_posting(const string & tname, Xapian::termpos tpos,
474 			      Xapian::termcount wdfinc)
475 {
476     need_terms();
477     positions_modified = true;
478 
479     map<string, OmDocumentTerm>::iterator i;
480     i = terms.find(tname);
481     if (i == terms.end()) {
482 	++termlist_size;
483 	OmDocumentTerm newterm(wdfinc);
484 	newterm.append_position(tpos);
485 	terms.insert(make_pair(tname, std::move(newterm)));
486     } else {
487 	if (i->second.add_position(wdfinc, tpos))
488 	    ++termlist_size;
489     }
490 }
491 
492 void
add_term(const string & tname,Xapian::termcount wdfinc)493 Xapian::Document::Internal::add_term(const string & tname, Xapian::termcount wdfinc)
494 {
495     need_terms();
496 
497     map<string, OmDocumentTerm>::iterator i;
498     i = terms.find(tname);
499     if (i == terms.end()) {
500 	++termlist_size;
501 	OmDocumentTerm newterm(wdfinc);
502 	terms.insert(make_pair(tname, std::move(newterm)));
503     } else {
504 	if (i->second.increase_wdf(wdfinc))
505 	    ++termlist_size;
506     }
507 }
508 
509 void
remove_posting(const string & tname,Xapian::termpos tpos,Xapian::termcount wdfdec)510 Xapian::Document::Internal::remove_posting(const string & tname,
511 					   Xapian::termpos tpos,
512 					   Xapian::termcount wdfdec)
513 {
514     need_terms();
515 
516     map<string, OmDocumentTerm>::iterator i;
517     i = terms.find(tname);
518     if (i == terms.end() || i->second.is_deleted()) {
519 	throw Xapian::InvalidArgumentError("Term '" + tname +
520 		"' is not present in document, in "
521 		"Xapian::Document::Internal::remove_posting()");
522     }
523     i->second.remove_position(tpos);
524     if (wdfdec) i->second.decrease_wdf(wdfdec);
525     positions_modified = true;
526 }
527 
528 Xapian::termpos
remove_postings(const string & term,Xapian::termpos termpos_first,Xapian::termpos termpos_last,Xapian::termcount wdf_dec)529 Xapian::Document::Internal::remove_postings(const string& term,
530 					    Xapian::termpos termpos_first,
531 					    Xapian::termpos termpos_last,
532 					    Xapian::termcount wdf_dec)
533 {
534     need_terms();
535 
536     auto i = terms.find(term);
537     if (i == terms.end() || i->second.is_deleted()) {
538 	throw Xapian::InvalidArgumentError("Term '" + term +
539 		"' is not present in document, in "
540 		"Xapian::Document::Internal::remove_postings()");
541     }
542     auto n_removed = i->second.remove_positions(termpos_first, termpos_last);
543     if (n_removed) {
544 	positions_modified = true;
545 	if (wdf_dec) {
546 	    Xapian::termcount wdf_delta;
547 	    if (mul_overflows(n_removed, wdf_dec, wdf_delta)) {
548 		// Decreasing by the maximum value will zero the wdf.
549 		wdf_delta = numeric_limits<Xapian::termcount>::max();
550 	    }
551 	    i->second.decrease_wdf(wdf_delta);
552 	}
553     }
554     return n_removed;
555 }
556 
557 void
remove_term(const string & tname)558 Xapian::Document::Internal::remove_term(const string & tname)
559 {
560     need_terms();
561     map<string, OmDocumentTerm>::iterator i;
562     i = terms.find(tname);
563     if (i == terms.end() || i->second.is_deleted()) {
564 	throw Xapian::InvalidArgumentError("Term '" + tname +
565 		"' is not present in document, in "
566 		"Xapian::Document::Internal::remove_term()");
567     }
568     --termlist_size;
569     if (!positions_modified) {
570 	positions_modified = (i->second.positionlist_count() > 0);
571     }
572     i->second.remove();
573 }
574 
575 void
clear_terms()576 Xapian::Document::Internal::clear_terms()
577 {
578     terms.clear();
579     termlist_size = 0;
580     terms_here = true;
581     // Assume there was a term with positions for now.
582     // FIXME: may be worth checking...
583     positions_modified = true;
584 }
585 
586 Xapian::termcount
termlist_count() const587 Xapian::Document::Internal::termlist_count() const
588 {
589     if (!terms_here) {
590 	// How equivalent is this line to the rest?
591 	// return database.get() ? database->open_term_list(did)->get_approx_size() : 0;
592 	need_terms();
593     }
594     Assert(terms_here);
595     return termlist_size;
596 }
597 
598 void
need_terms() const599 Xapian::Document::Internal::need_terms() const
600 {
601     if (terms_here) return;
602     if (database.get()) {
603 	Xapian::TermIterator t(database->open_term_list(did));
604 	Xapian::TermIterator tend(NULL);
605 	for ( ; t != tend; ++t) {
606 	    Xapian::PositionIterator p = t.positionlist_begin();
607 	    OmDocumentTerm term(t.get_wdf());
608 	    for ( ; p != t.positionlist_end(); ++p) {
609 		term.append_position(*p);
610 	    }
611 	    terms.insert(terms.end(), make_pair(*t, std::move(term)));
612 	}
613     }
614     termlist_size = terms.size();
615     terms_here = true;
616 }
617 
618 Xapian::valueno
values_count() const619 Xapian::Document::Internal::values_count() const
620 {
621     LOGCALL(DB, Xapian::valueno, "Document::Internal::values_count", NO_ARGS);
622     need_values();
623     Assert(values_here);
624     RETURN(values.size());
625 }
626 
627 string
get_description() const628 Xapian::Document::Internal::get_description() const
629 {
630     string desc = "Document(";
631 
632     // description_append ?
633     if (data_here) {
634 	desc += "data='";
635 	description_append(desc, data);
636 	desc += "'";
637     }
638 
639     if (values_here) {
640 	if (data_here) desc += ", ";
641 	desc += "values[";
642 	desc += str(values.size());
643 	desc += ']';
644     }
645 
646     if (terms_here) {
647 	if (data_here || values_here) desc += ", ";
648 	desc += "terms[";
649 	desc += str(termlist_size);
650 	desc += ']';
651     }
652 
653     if (database.get()) {
654 	if (data_here || values_here || terms_here) desc += ", ";
655 	// database->get_description() if/when that returns a non-generic
656 	// result.
657 	desc += "db:yes";
658     }
659 
660     desc += ')';
661 
662     return desc;
663 }
664 
665 void
need_values() const666 Xapian::Document::Internal::need_values() const
667 {
668     if (!values_here) {
669 	if (database.get()) {
670 	    Assert(values.empty());
671 	    do_get_all_values(values);
672 	}
673 	values_here = true;
674     }
675 }
676 
~Internal()677 Xapian::Document::Internal::~Internal()
678 {
679     if (database.get())
680 	database->invalidate_doc_object(this);
681 }
682