1 /* dbcheck.cc: test database contents and consistency.
2 *
3 * Copyright 2009 Richard Boulton
4 * Copyright 2010 Olly Betts
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
20 */
21
22 #include <config.h>
23
24 #include "dbcheck.h"
25
26 #include "str.h"
27 #include "testsuite.h"
28
29 using namespace std;
30
31 string
positions_to_string(Xapian::PositionIterator & it,const Xapian::PositionIterator & end,Xapian::termcount * count)32 positions_to_string(Xapian::PositionIterator & it,
33 const Xapian::PositionIterator & end,
34 Xapian::termcount * count)
35 {
36 string result;
37 bool need_comma = false;
38 Xapian::termcount c = 0;
39 while (it != end) {
40 if (need_comma)
41 result += ", ";
42 result += str(*it);
43 need_comma = true;
44 ++it;
45 ++c;
46 }
47 if (count) {
48 *count = c;
49 }
50 return result;
51 }
52
53 string
postlist_to_string(const Xapian::Database & db,const string & tname)54 postlist_to_string(const Xapian::Database & db, const string & tname)
55 {
56 string result;
57 bool need_comma = false;
58
59 for (Xapian::PostingIterator p = db.postlist_begin(tname);
60 p != db.postlist_end(tname);
61 ++p) {
62 if (need_comma)
63 result += ", ";
64
65 Xapian::PositionIterator it(p.positionlist_begin());
66 string posrepr = positions_to_string(it, p.positionlist_end());
67 if (!posrepr.empty()) {
68 posrepr = ", pos=[" + posrepr + "]";
69 }
70
71 result += "(" + str(*p) +
72 ", doclen=" + str(p.get_doclength()) +
73 ", wdf=" + str(p.get_wdf()) +
74 posrepr + ")";
75 need_comma = true;
76 }
77 return result;
78 }
79
80 string
docterms_to_string(const Xapian::Database & db,Xapian::docid did)81 docterms_to_string(const Xapian::Database & db, Xapian::docid did)
82 {
83 string result;
84 bool need_comma = false;
85
86 for (Xapian::TermIterator t = db.termlist_begin(did);
87 t != db.termlist_end(did);
88 ++t) {
89 Xapian::PositionIterator it(t.positionlist_begin());
90 string posrepr = positions_to_string(it, t.positionlist_end());
91 if (!posrepr.empty()) {
92 posrepr = ", pos=[" + posrepr + "]";
93 }
94 if (need_comma)
95 result += ", ";
96 result += "Term(" + *t + ", wdf=" + str(t.get_wdf()) + posrepr + ")";
97 need_comma = true;
98 }
99 return result;
100 }
101
102 string
docstats_to_string(const Xapian::Database & db,Xapian::docid did)103 docstats_to_string(const Xapian::Database & db, Xapian::docid did)
104 {
105 string result;
106
107 result += "len=" + str(db.get_doclength(did));
108
109 return result;
110 }
111
112 string
termstats_to_string(const Xapian::Database & db,const string & term)113 termstats_to_string(const Xapian::Database & db, const string & term)
114 {
115 string result;
116
117 result += "tf=" + str(db.get_termfreq(term));
118 result += ",cf=" + str(db.get_collection_freq(term));
119
120 return result;
121 }
122
123 string
dbstats_to_string(const Xapian::Database & db)124 dbstats_to_string(const Xapian::Database & db)
125 {
126 string result;
127
128 result += "dc=" + str(db.get_doccount());
129 result += ",al=" + str(db.get_avlength());
130 result += ",ld=" + str(db.get_lastdocid());
131
132 return result;
133 }
134
135 void
dbcheck(const Xapian::Database & db,Xapian::doccount expected_doccount,Xapian::docid expected_lastdocid)136 dbcheck(const Xapian::Database & db,
137 Xapian::doccount expected_doccount,
138 Xapian::docid expected_lastdocid)
139 {
140 TEST_EQUAL(db.get_doccount(), expected_doccount);
141 TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
142
143 // Note - may not be a very big type, but we're only expecting to use this
144 // for small databases, so should be fine.
145 unsigned long totlen = 0;
146
147 // A map from term to a representation of the posting list for that term.
148 // We build this up from the documents, and then check it against the
149 // equivalent built up from the posting lists.
150 map<string, string> posting_reprs;
151 map<Xapian::valueno, string> value_reprs;
152
153 Xapian::termcount doclen_lower_bound = Xapian::termcount(-1);
154 Xapian::termcount doclen_upper_bound = 0;
155
156 for (Xapian::PostingIterator dociter = db.postlist_begin(string());
157 dociter != db.postlist_end(string());
158 ++dociter) {
159 Xapian::docid did = *dociter;
160 TEST_EQUAL(dociter.get_wdf(), 1);
161 Xapian::Document doc(db.get_document(did));
162 Xapian::termcount doclen(db.get_doclength(did));
163 if (doclen < doclen_lower_bound)
164 doclen_lower_bound = doclen;
165 if (doclen > doclen_upper_bound)
166 doclen_upper_bound = doclen;
167 totlen += doclen;
168
169 Xapian::termcount found_termcount = 0;
170 Xapian::termcount wdf_sum = 0;
171 Xapian::TermIterator t, t2;
172 for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
173 t != doc.termlist_end();
174 ++t, ++t2) {
175 TEST(t2 != db.termlist_end(did));
176
177 ++found_termcount;
178 wdf_sum += t.get_wdf();
179
180 TEST_EQUAL(*t, *t2);
181 TEST_EQUAL(t.get_wdf(), t2.get_wdf());
182 TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
183 TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
184
185 // Check the position lists are equal.
186 Xapian::termcount tc1, tc2;
187 Xapian::PositionIterator it1(t.positionlist_begin());
188 string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
189 Xapian::PositionIterator it2(t2.positionlist_begin());
190 string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
191 TEST_EQUAL(posrepr, posrepr2);
192 TEST_EQUAL(tc1, tc2);
193 try {
194 TEST_EQUAL(tc1, t.positionlist_count());
195 } catch (const Xapian::UnimplementedError &) {
196 // positionlist_count() isn't implemented for remote databases.
197 }
198
199 // Make a representation of the posting.
200 if (!posrepr.empty()) {
201 posrepr = ",[" + posrepr + "]";
202 }
203 string posting_repr = "(" + str(did) + "," +
204 str(t.get_wdf()) + "/" + str(doclen) +
205 posrepr + ")";
206
207 // Append the representation to the list for the term.
208 map<string, string>::iterator i = posting_reprs.find(*t);
209 if (i == posting_reprs.end()) {
210 posting_reprs[*t] = posting_repr;
211 } else {
212 i->second += "," + posting_repr;
213 }
214 }
215
216 Xapian::termcount vcount = 0;
217 for (Xapian::ValueIterator v = doc.values_begin();
218 v != doc.values_end();
219 ++v, ++vcount) {
220 TEST((*v).size() != 0);
221 string value_repr = "(" + str(did) + "," + *v + ")";
222
223 // Append the values to the value lists.
224 map<Xapian::valueno, string>::iterator i;
225 i = value_reprs.find(v.get_valueno());
226 if (i == value_reprs.end()) {
227 value_reprs[v.get_valueno()] = value_repr;
228 } else {
229 i->second += "," + value_repr;
230 }
231 }
232 TEST_EQUAL(vcount, doc.values_count());
233 TEST(t2 == db.termlist_end(did));
234 Xapian::termcount expected_termcount = doc.termlist_count();
235 TEST_EQUAL(expected_termcount, found_termcount);
236 TEST_EQUAL(doclen, wdf_sum);
237 }
238
239 TEST_REL(doclen_lower_bound, >=, db.get_doclength_lower_bound());
240 TEST_REL(doclen_upper_bound, <=, db.get_doclength_upper_bound());
241
242 Xapian::TermIterator t;
243 map<string, string>::const_iterator i;
244 for (t = db.allterms_begin(), i = posting_reprs.begin();
245 t != db.allterms_end();
246 ++t, ++i) {
247 TEST(db.term_exists(*t));
248 TEST(i != posting_reprs.end());
249 TEST_EQUAL(i->first, *t);
250
251 Xapian::doccount tf_count = 0;
252 Xapian::termcount cf_count = 0;
253 Xapian::termcount wdf_upper_bound = 0;
254 string posting_repr;
255 bool need_comma = false;
256 for (Xapian::PostingIterator p = db.postlist_begin(*t);
257 p != db.postlist_end(*t);
258 ++p) {
259 if (need_comma) {
260 posting_repr += ",";
261 }
262
263 ++tf_count;
264 cf_count += p.get_wdf();
265
266 Xapian::PositionIterator it(p.positionlist_begin());
267 string posrepr = positions_to_string(it, p.positionlist_end());
268 if (!posrepr.empty()) {
269 posrepr = ",[" + posrepr + "]";
270 }
271 posting_repr += "(" + str(*p) + "," +
272 str(p.get_wdf()) + "/" + str(p.get_doclength()) +
273 posrepr + ")";
274 if (wdf_upper_bound < p.get_wdf())
275 wdf_upper_bound = p.get_wdf();
276 need_comma = true;
277 }
278
279 TEST_EQUAL(posting_repr, i->second);
280 TEST_EQUAL(tf_count, t.get_termfreq());
281 TEST_EQUAL(tf_count, db.get_termfreq(*t));
282 TEST_EQUAL(cf_count, db.get_collection_freq(*t));
283 TEST_REL(wdf_upper_bound, <=, db.get_wdf_upper_bound(*t));
284 }
285 TEST(i == posting_reprs.end());
286
287 map<Xapian::valueno, string>::const_iterator j;
288 for (j = value_reprs.begin(); j != value_reprs.end(); ++j) {
289 string value_repr;
290 string value_lower_bound;
291 string value_upper_bound;
292 bool first = true;
293 for (Xapian::ValueIterator v = db.valuestream_begin(j->first);
294 v != db.valuestream_end(j->first); ++v) {
295 if (first) {
296 value_lower_bound = *v;
297 value_upper_bound = *v;
298 first = false;
299 } else {
300 value_repr += ",";
301 if (*v > value_upper_bound) {
302 value_upper_bound = *v;
303 }
304 if (*v < value_lower_bound) {
305 value_lower_bound = *v;
306 }
307 }
308 value_repr += "(" + str(v.get_docid()) + "," + *v + ")";
309 }
310 TEST_EQUAL(value_repr, j->second);
311 try {
312 TEST_REL(value_upper_bound, <=, db.get_value_upper_bound(j->first));
313 TEST_REL(value_lower_bound, >=, db.get_value_lower_bound(j->first));
314 } catch (const Xapian::UnimplementedError &) {
315 // Skip the checks if the methods to get the bounds aren't
316 // implemented for this backend.
317 }
318 }
319
320 if (expected_doccount == 0) {
321 TEST_EQUAL(0, db.get_avlength());
322 } else {
323 TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
324 db.get_avlength());
325 }
326 }
327