1 /** @file flint_termlisttable.cc
2  * @brief Subclass of FlintTable which holds termlists.
3  */
4 /* Copyright (C) 2007,2008 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/document.h>
24 #include <xapian/error.h>
25 #include <xapian/termiterator.h>
26 
27 #include "flint_termlisttable.h"
28 #include "flint_utils.h"
29 #include "debuglog.h"
30 #include "omassert.h"
31 #include "str.h"
32 #include "stringutils.h"
33 
34 #include <string>
35 
36 using namespace std;
37 
38 void
set_termlist(Xapian::docid did,const Xapian::Document & doc,flint_doclen_t doclen)39 FlintTermListTable::set_termlist(Xapian::docid did,
40 				 const Xapian::Document & doc,
41 				 flint_doclen_t doclen)
42 {
43     LOGCALL_VOID(DB, "FlintTermListTable::set_termlist", did | doc | doclen);
44 
45     Xapian::doccount termlist_size = doc.termlist_count();
46     if (termlist_size == 0) {
47 	// doclen is sum(wdf) so should be zero if there are no terms.
48 	Assert(doclen == 0);
49 	Assert(doc.termlist_begin() == doc.termlist_end());
50 	add(flint_docid_to_key(did), string());
51 	return;
52     }
53 
54     string tag = F_pack_uint(doclen);
55 
56     Xapian::TermIterator t = doc.termlist_begin();
57     if (t != doc.termlist_end()) {
58 	tag += F_pack_uint(termlist_size);
59 	string prev_term = *t;
60 
61 	// Previous database versions encoded a boolean here, which was
62 	// always false (and F_pack_bool() encodes false as a '0').  We can
63 	// just omit this and successfully read old and new termlists
64 	// except in the case where the next byte is a '0' - in this case
65 	// we need keep the '0' so that the decoder can just skip any '0'
66 	// it sees in this position (this shouldn't be a common case - 48
67 	// character terms aren't very common, and the first term
68 	// alphabetically is likely to be shorter than average).
69 	if (prev_term.size() == '0') tag += '0';
70 
71 	tag += char(prev_term.size());
72 	tag += prev_term;
73 	tag += F_pack_uint(t.get_wdf());
74 	--termlist_size;
75 
76 	while (++t != doc.termlist_end()) {
77 	    const string & term = *t;
78 	    // If there's a shared prefix with the previous term, we don't
79 	    // store it explicitly, but just store the length of the shared
80 	    // prefix.  In general, this is a big win.
81 	    size_t reuse = common_prefix_length(prev_term, term);
82 
83 	    // reuse must be <= prev_term.size(), and we know that value while
84 	    // decoding.  So if the wdf is small enough that we can multiply it
85 	    // by (prev_term.size() + 1), add reuse and fit the result in a
86 	    // byte, then we can pack reuse and the wdf into a single byte and
87 	    // save ourselves a byte.  We actually need to add one to the wdf
88 	    // before multiplying so that a wdf of 0 can be detected by the
89 	    // decoder.
90 	    size_t packed = 0;
91 	    Xapian::termcount wdf = t.get_wdf();
92 	    // If wdf >= 128, then we aren't going to be able to pack it in so
93 	    // don't even try to avoid the calculation overflowing and making
94 	    // us think we can.
95 	    if (wdf < 127)
96 		packed = (wdf + 1) * (prev_term.size() + 1) + reuse;
97 
98 	    if (packed && packed < 256) {
99 		// We can pack the wdf into the same byte.
100 		tag += char(packed);
101 		tag += char(term.size() - reuse);
102 		tag.append(term.data() + reuse, term.size() - reuse);
103 	    } else {
104 		tag += char(reuse);
105 		tag += char(term.size() - reuse);
106 		tag.append(term.data() + reuse, term.size() - reuse);
107 		// FIXME: pack wdf after reuse next time we rejig the format
108 		// incompatibly.
109 		tag += F_pack_uint(wdf);
110 	    }
111 
112 	    prev_term = *t;
113 	    --termlist_size;
114 	}
115     }
116     Assert(termlist_size == 0);
117     add(flint_docid_to_key(did), tag);
118 }
119 
120 flint_doclen_t
get_doclength(Xapian::docid did) const121 FlintTermListTable::get_doclength(Xapian::docid did) const
122 {
123     LOGCALL(DB, flint_doclen_t, "FlintTermListTable::get_doclength", did);
124 
125     string tag;
126     if (!get_exact_entry(flint_docid_to_key(did), tag))
127 	throw Xapian::DocNotFoundError("No termlist found for document " +
128 				       str(did));
129 
130     if (tag.empty()) RETURN(0);
131 
132     const char * pos = tag.data();
133     const char * end = pos + tag.size();
134 
135     flint_doclen_t doclen;
136     if (!F_unpack_uint(&pos, end, &doclen)) {
137 	const char *msg;
138 	if (pos == 0) {
139 	    msg = "Too little data for doclen in termlist";
140 	} else {
141 	    msg = "Overflowed value for doclen in termlist";
142 	}
143 	throw Xapian::DatabaseCorruptError(msg);
144     }
145 
146     RETURN(doclen);
147 }
148