1 /** @file flint_termlisttable.cc
2 * @brief Subclass of FlintTable which holds termlists.
3 */
4 /* Copyright (C) 2007,2008 Olly Betts
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <config.h>
22
23 #include <xapian/document.h>
24 #include <xapian/error.h>
25 #include <xapian/termiterator.h>
26
27 #include "flint_termlisttable.h"
28 #include "flint_utils.h"
29 #include "debuglog.h"
30 #include "omassert.h"
31 #include "str.h"
32 #include "stringutils.h"
33
34 #include <string>
35
36 using namespace std;
37
38 void
set_termlist(Xapian::docid did,const Xapian::Document & doc,flint_doclen_t doclen)39 FlintTermListTable::set_termlist(Xapian::docid did,
40 const Xapian::Document & doc,
41 flint_doclen_t doclen)
42 {
43 LOGCALL_VOID(DB, "FlintTermListTable::set_termlist", did | doc | doclen);
44
45 Xapian::doccount termlist_size = doc.termlist_count();
46 if (termlist_size == 0) {
47 // doclen is sum(wdf) so should be zero if there are no terms.
48 Assert(doclen == 0);
49 Assert(doc.termlist_begin() == doc.termlist_end());
50 add(flint_docid_to_key(did), string());
51 return;
52 }
53
54 string tag = F_pack_uint(doclen);
55
56 Xapian::TermIterator t = doc.termlist_begin();
57 if (t != doc.termlist_end()) {
58 tag += F_pack_uint(termlist_size);
59 string prev_term = *t;
60
61 // Previous database versions encoded a boolean here, which was
62 // always false (and F_pack_bool() encodes false as a '0'). We can
63 // just omit this and successfully read old and new termlists
64 // except in the case where the next byte is a '0' - in this case
65 // we need keep the '0' so that the decoder can just skip any '0'
66 // it sees in this position (this shouldn't be a common case - 48
67 // character terms aren't very common, and the first term
68 // alphabetically is likely to be shorter than average).
69 if (prev_term.size() == '0') tag += '0';
70
71 tag += char(prev_term.size());
72 tag += prev_term;
73 tag += F_pack_uint(t.get_wdf());
74 --termlist_size;
75
76 while (++t != doc.termlist_end()) {
77 const string & term = *t;
78 // If there's a shared prefix with the previous term, we don't
79 // store it explicitly, but just store the length of the shared
80 // prefix. In general, this is a big win.
81 size_t reuse = common_prefix_length(prev_term, term);
82
83 // reuse must be <= prev_term.size(), and we know that value while
84 // decoding. So if the wdf is small enough that we can multiply it
85 // by (prev_term.size() + 1), add reuse and fit the result in a
86 // byte, then we can pack reuse and the wdf into a single byte and
87 // save ourselves a byte. We actually need to add one to the wdf
88 // before multiplying so that a wdf of 0 can be detected by the
89 // decoder.
90 size_t packed = 0;
91 Xapian::termcount wdf = t.get_wdf();
92 // If wdf >= 128, then we aren't going to be able to pack it in so
93 // don't even try to avoid the calculation overflowing and making
94 // us think we can.
95 if (wdf < 127)
96 packed = (wdf + 1) * (prev_term.size() + 1) + reuse;
97
98 if (packed && packed < 256) {
99 // We can pack the wdf into the same byte.
100 tag += char(packed);
101 tag += char(term.size() - reuse);
102 tag.append(term.data() + reuse, term.size() - reuse);
103 } else {
104 tag += char(reuse);
105 tag += char(term.size() - reuse);
106 tag.append(term.data() + reuse, term.size() - reuse);
107 // FIXME: pack wdf after reuse next time we rejig the format
108 // incompatibly.
109 tag += F_pack_uint(wdf);
110 }
111
112 prev_term = *t;
113 --termlist_size;
114 }
115 }
116 Assert(termlist_size == 0);
117 add(flint_docid_to_key(did), tag);
118 }
119
120 flint_doclen_t
get_doclength(Xapian::docid did) const121 FlintTermListTable::get_doclength(Xapian::docid did) const
122 {
123 LOGCALL(DB, flint_doclen_t, "FlintTermListTable::get_doclength", did);
124
125 string tag;
126 if (!get_exact_entry(flint_docid_to_key(did), tag))
127 throw Xapian::DocNotFoundError("No termlist found for document " +
128 str(did));
129
130 if (tag.empty()) RETURN(0);
131
132 const char * pos = tag.data();
133 const char * end = pos + tag.size();
134
135 flint_doclen_t doclen;
136 if (!F_unpack_uint(&pos, end, &doclen)) {
137 const char *msg;
138 if (pos == 0) {
139 msg = "Too little data for doclen in termlist";
140 } else {
141 msg = "Overflowed value for doclen in termlist";
142 }
143 throw Xapian::DatabaseCorruptError(msg);
144 }
145
146 RETURN(doclen);
147 }
148