1 /* dbcheck.cc: test database contents and consistency.
2  *
3  * Copyright 2009 Richard Boulton
4  * Copyright 2010 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include "dbcheck.h"
25 
26 #include "str.h"
27 #include "testsuite.h"
28 
29 using namespace std;
30 
31 string
positions_to_string(Xapian::PositionIterator & it,const Xapian::PositionIterator & end,Xapian::termcount * count)32 positions_to_string(Xapian::PositionIterator & it,
33 		    const Xapian::PositionIterator & end,
34 		    Xapian::termcount * count)
35 {
36     string result;
37     bool need_comma = false;
38     Xapian::termcount c = 0;
39     while (it != end) {
40 	if (need_comma)
41 	    result += ", ";
42 	result += str(*it);
43 	need_comma = true;
44 	++it;
45 	++c;
46     }
47     if (count) {
48 	*count = c;
49     }
50     return result;
51 }
52 
53 string
postlist_to_string(const Xapian::Database & db,const string & tname)54 postlist_to_string(const Xapian::Database & db, const string & tname)
55 {
56     string result;
57     bool need_comma = false;
58 
59     for (Xapian::PostingIterator p = db.postlist_begin(tname);
60 	 p != db.postlist_end(tname);
61 	 ++p) {
62 	if (need_comma)
63 	    result += ", ";
64 
65 	Xapian::PositionIterator it(p.positionlist_begin());
66 	string posrepr = positions_to_string(it, p.positionlist_end());
67 	if (!posrepr.empty()) {
68 	    posrepr = ", pos=[" + posrepr + "]";
69 	}
70 
71 	result += "(" + str(*p) +
72 		", doclen=" + str(p.get_doclength()) +
73 		", wdf=" + str(p.get_wdf()) +
74 		posrepr + ")";
75 	need_comma = true;
76     }
77     return result;
78 }
79 
80 string
docterms_to_string(const Xapian::Database & db,Xapian::docid did)81 docterms_to_string(const Xapian::Database & db, Xapian::docid did)
82 {
83     string result;
84     bool need_comma = false;
85 
86     for (Xapian::TermIterator t = db.termlist_begin(did);
87 	 t != db.termlist_end(did);
88 	 ++t) {
89 	Xapian::PositionIterator it(t.positionlist_begin());
90 	string posrepr = positions_to_string(it, t.positionlist_end());
91 	if (!posrepr.empty()) {
92 	    posrepr = ", pos=[" + posrepr + "]";
93 	}
94 	if (need_comma)
95 	    result += ", ";
96 	result += "Term(" + *t + ", wdf=" + str(t.get_wdf()) + posrepr + ")";
97 	need_comma = true;
98     }
99     return result;
100 }
101 
102 string
docstats_to_string(const Xapian::Database & db,Xapian::docid did)103 docstats_to_string(const Xapian::Database & db, Xapian::docid did)
104 {
105     string result;
106 
107     result += "len=" + str(db.get_doclength(did));
108 
109     return result;
110 }
111 
112 string
termstats_to_string(const Xapian::Database & db,const string & term)113 termstats_to_string(const Xapian::Database & db, const string & term)
114 {
115     string result;
116 
117     result += "tf=" + str(db.get_termfreq(term));
118     result += ",cf=" + str(db.get_collection_freq(term));
119 
120     return result;
121 }
122 
123 string
dbstats_to_string(const Xapian::Database & db)124 dbstats_to_string(const Xapian::Database & db)
125 {
126     string result;
127 
128     result += "dc=" + str(db.get_doccount());
129     result += ",al=" + str(db.get_avlength());
130     result += ",ld=" + str(db.get_lastdocid());
131 
132     return result;
133 }
134 
135 void
dbcheck(const Xapian::Database & db,Xapian::doccount expected_doccount,Xapian::docid expected_lastdocid)136 dbcheck(const Xapian::Database & db,
137 	Xapian::doccount expected_doccount,
138 	Xapian::docid expected_lastdocid)
139 {
140     TEST_EQUAL(db.get_doccount(), expected_doccount);
141     TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
142 
143     // Note - may not be a very big type, but we're only expecting to use this
144     // for small databases, so should be fine.
145     unsigned long totlen = 0;
146 
147     // A map from term to a representation of the posting list for that term.
148     // We build this up from the documents, and then check it against the
149     // equivalent built up from the posting lists.
150     map<string, string> posting_reprs;
151     map<Xapian::valueno, string> value_reprs;
152 
153     Xapian::termcount doclen_lower_bound = Xapian::termcount(-1);
154     Xapian::termcount doclen_upper_bound = 0;
155 
156     for (Xapian::PostingIterator dociter = db.postlist_begin(string());
157 	 dociter != db.postlist_end(string());
158 	 ++dociter) {
159 	Xapian::docid did = *dociter;
160 	TEST_EQUAL(dociter.get_wdf(), 1);
161 	Xapian::Document doc(db.get_document(did));
162 	Xapian::termcount doclen(db.get_doclength(did));
163 	if (doclen < doclen_lower_bound)
164 	    doclen_lower_bound = doclen;
165 	if (doclen > doclen_upper_bound)
166 	    doclen_upper_bound = doclen;
167 	totlen += doclen;
168 
169 	Xapian::termcount found_termcount = 0;
170 	Xapian::termcount wdf_sum = 0;
171 	Xapian::TermIterator t, t2;
172 	for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
173 	     t != doc.termlist_end();
174 	     ++t, ++t2) {
175 	    TEST(t2 != db.termlist_end(did));
176 
177 	    ++found_termcount;
178 	    wdf_sum += t.get_wdf();
179 
180 	    TEST_EQUAL(*t, *t2);
181 	    TEST_EQUAL(t.get_wdf(), t2.get_wdf());
182 	    TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
183 	    TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
184 
185 	    // Check the position lists are equal.
186 	    Xapian::termcount tc1, tc2;
187 	    Xapian::PositionIterator it1(t.positionlist_begin());
188 	    string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
189 	    Xapian::PositionIterator it2(t2.positionlist_begin());
190 	    string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
191 	    TEST_EQUAL(posrepr, posrepr2);
192 	    TEST_EQUAL(tc1, tc2);
193 	    try {
194 	    	TEST_EQUAL(tc1, t.positionlist_count());
195 	    } catch (const Xapian::UnimplementedError &) {
196 		// positionlist_count() isn't implemented for remote databases.
197 	    }
198 
199 	    // Make a representation of the posting.
200 	    if (!posrepr.empty()) {
201 		posrepr = ",[" + posrepr + "]";
202 	    }
203 	    string posting_repr = "(" + str(did) + "," +
204 		    str(t.get_wdf()) + "/" + str(doclen) +
205 		    posrepr + ")";
206 
207 	    // Append the representation to the list for the term.
208 	    map<string, string>::iterator i = posting_reprs.find(*t);
209 	    if (i == posting_reprs.end()) {
210 		posting_reprs[*t] = posting_repr;
211 	    } else {
212 		i->second += "," + posting_repr;
213 	    }
214 	}
215 
216 	Xapian::termcount vcount = 0;
217 	for (Xapian::ValueIterator v = doc.values_begin();
218 	     v != doc.values_end();
219 	     ++v, ++vcount) {
220 	    TEST((*v).size() != 0);
221 	    string value_repr = "(" + str(did) + "," + *v + ")";
222 
223 	    // Append the values to the value lists.
224 	    map<Xapian::valueno, string>::iterator i;
225 	    i = value_reprs.find(v.get_valueno());
226 	    if (i == value_reprs.end()) {
227 		value_reprs[v.get_valueno()] = value_repr;
228 	    } else {
229 		i->second += "," + value_repr;
230 	    }
231 	}
232 	TEST_EQUAL(vcount, doc.values_count());
233 	TEST(t2 == db.termlist_end(did));
234 	Xapian::termcount expected_termcount = doc.termlist_count();
235 	TEST_EQUAL(expected_termcount, found_termcount);
236 	TEST_EQUAL(doclen, wdf_sum);
237     }
238 
239     TEST_REL(doclen_lower_bound, >=, db.get_doclength_lower_bound());
240     TEST_REL(doclen_upper_bound, <=, db.get_doclength_upper_bound());
241 
242     Xapian::TermIterator t;
243     map<string, string>::const_iterator i;
244     for (t = db.allterms_begin(), i = posting_reprs.begin();
245 	 t != db.allterms_end();
246 	 ++t, ++i) {
247 	TEST(db.term_exists(*t));
248 	TEST(i != posting_reprs.end());
249 	TEST_EQUAL(i->first, *t);
250 
251 	Xapian::doccount tf_count = 0;
252 	Xapian::termcount cf_count = 0;
253 	Xapian::termcount wdf_upper_bound = 0;
254 	string posting_repr;
255 	bool need_comma = false;
256 	for (Xapian::PostingIterator p = db.postlist_begin(*t);
257 	     p != db.postlist_end(*t);
258 	     ++p) {
259 	    if (need_comma) {
260 		posting_repr += ",";
261 	    }
262 
263 	    ++tf_count;
264 	    cf_count += p.get_wdf();
265 
266 	    Xapian::PositionIterator it(p.positionlist_begin());
267 	    string posrepr = positions_to_string(it, p.positionlist_end());
268 	    if (!posrepr.empty()) {
269 		posrepr = ",[" + posrepr + "]";
270 	    }
271 	    posting_repr += "(" + str(*p) + "," +
272 		    str(p.get_wdf()) + "/" + str(p.get_doclength()) +
273 		    posrepr + ")";
274 	    if (wdf_upper_bound < p.get_wdf())
275 		wdf_upper_bound = p.get_wdf();
276 	    need_comma = true;
277 	}
278 
279 	TEST_EQUAL(posting_repr, i->second);
280 	TEST_EQUAL(tf_count, t.get_termfreq());
281 	TEST_EQUAL(tf_count, db.get_termfreq(*t));
282 	TEST_EQUAL(cf_count, db.get_collection_freq(*t));
283 	TEST_REL(wdf_upper_bound, <=, db.get_wdf_upper_bound(*t));
284     }
285     TEST(i == posting_reprs.end());
286 
287     map<Xapian::valueno, string>::const_iterator j;
288     for (j = value_reprs.begin(); j != value_reprs.end(); ++j) {
289 	string value_repr;
290 	string value_lower_bound;
291 	string value_upper_bound;
292 	bool first = true;
293 	for (Xapian::ValueIterator v = db.valuestream_begin(j->first);
294 	     v != db.valuestream_end(j->first); ++v) {
295 	    if (first) {
296 		value_lower_bound = *v;
297 		value_upper_bound = *v;
298 		first = false;
299 	    } else {
300 		value_repr += ",";
301 		if (*v > value_upper_bound) {
302 		    value_upper_bound = *v;
303 		}
304 		if (*v < value_lower_bound) {
305 		    value_lower_bound = *v;
306 		}
307 	    }
308 	    value_repr += "(" + str(v.get_docid()) + "," + *v + ")";
309 	}
310 	TEST_EQUAL(value_repr, j->second);
311 	try {
312 	    TEST_REL(value_upper_bound, <=, db.get_value_upper_bound(j->first));
313 	    TEST_REL(value_lower_bound, >=, db.get_value_lower_bound(j->first));
314 	} catch (const Xapian::UnimplementedError &) {
315 	    // Skip the checks if the methods to get the bounds aren't
316 	    // implemented for this backend.
317 	}
318     }
319 
320     if (expected_doccount == 0) {
321 	TEST_EQUAL(0, db.get_avlength());
322     } else {
323 	TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
324 			  db.get_avlength());
325     }
326 }
327