1 /** @file xapian-check-chert.cc
2  * @brief Check consistency of a chert table.
3  */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2016 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "xapian-check-chert.h"
26 
27 #include "bitstream.h"
28 
29 #include "internaltypes.h"
30 
31 #include "chert_check.h"
32 #include "chert_cursor.h"
33 #include "chert_table.h"
34 #include "chert_types.h"
35 #include "pack.h"
36 #include "valuestats.h"
37 
38 #include <xapian.h>
39 
40 #include "autoptr.h"
41 #include <iostream>
42 
43 using namespace std;
44 
/** Test whether a postlist table key has the user metadata prefix.
 *
 *  @param key	The raw key to test.
 *
 *  @return true if @a key is at least two bytes long and starts with the
 *	    bytes 0x00 0xc0, false otherwise.
 */
static inline bool
is_user_metadata_key(const string & key)
{
    // Too short to hold the two byte prefix.
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
50 
51 struct VStats : public ValueStats {
52     Xapian::doccount freq_real;
53 
VStatsVStats54     VStats() : ValueStats(), freq_real(0) {}
55 };
56 
57 size_t
check_chert_table(const char * tablename,string filename,chert_revision_number_t * rev_ptr,int opts,vector<Xapian::termcount> & doclens,Xapian::doccount doccount,Xapian::docid db_last_docid)58 check_chert_table(const char * tablename, string filename,
59 		  chert_revision_number_t * rev_ptr, int opts,
60 		  vector<Xapian::termcount> & doclens,
61 		  Xapian::doccount doccount, Xapian::docid db_last_docid)
62 {
63     filename += '.';
64 
65     // Check the btree structure.
66     ChertTableCheck::check(tablename, filename, rev_ptr, opts);
67 
68     // Now check the chert structures inside the btree.
69     ChertTable table(tablename, filename, true);
70     if (rev_ptr) {
71 	table.open(*rev_ptr);
72     } else {
73 	table.open();
74     }
75     AutoPtr<ChertCursor> cursor(table.cursor_get());
76 
77     size_t errors = 0;
78 
79     cursor->find_entry(string());
80     cursor->next(); // Skip the empty entry.
81 
82     if (strcmp(tablename, "postlist") == 0) {
83 	// Now check the structure of each postlist in the table.
84 	map<Xapian::valueno, VStats> valuestats;
85 	string current_term;
86 	Xapian::docid lastdid = 0;
87 	Xapian::termcount termfreq = 0, collfreq = 0;
88 	Xapian::termcount tf = 0, cf = 0;
89 	Xapian::doccount num_doclens = 0;
90 	bool have_metainfo_key = false;
91 
92 	// The first key/tag pair should be the METAINFO - though this may be
93 	// missing if the table only contains user-metadata.
94 	if (!cursor->after_end()) {
95 	    if (cursor->current_key == string("", 1)) {
96 		have_metainfo_key = true;
97 		cursor->read_tag();
98 		// Check format of the METAINFO key.
99 		totlen_t total_doclen;
100 		Xapian::docid last_docid;
101 		Xapian::termcount doclen_lbound;
102 		Xapian::termcount doclen_ubound;
103 		Xapian::termcount wdf_ubound;
104 
105 		const char * data = cursor->current_tag.data();
106 		const char * end = data + cursor->current_tag.size();
107 		if (!unpack_uint(&data, end, &last_docid)) {
108 		    cout << "Tag containing meta information is corrupt (couldn't read last_docid)." << endl;
109 		    ++errors;
110 		} else if (!unpack_uint(&data, end, &doclen_lbound)) {
111 		    cout << "Tag containing meta information is corrupt (couldn't read doclen_lbound)." << endl;
112 		    ++errors;
113 		} else if (!unpack_uint(&data, end, &wdf_ubound)) {
114 		    cout << "Tag containing meta information is corrupt (couldn't read wdf_ubound)." << endl;
115 		    ++errors;
116 		} else if (!unpack_uint(&data, end, &doclen_ubound)) {
117 		    cout << "Tag containing meta information is corrupt (couldn't read doclen_ubound)." << endl;
118 		    ++errors;
119 		} else if (!unpack_uint_last(&data, end, &total_doclen)) {
120 		    cout << "Tag containing meta information is corrupt (couldn't read total_doclen)." << endl;
121 		    ++errors;
122 		} else if (data != end) {
123 		    cout << "Tag containing meta information is corrupt (junk at end)." << endl;
124 		    ++errors;
125 		}
126 		cursor->next();
127 	    }
128 	}
129 
130 	bool seen_doclen_initial_chunk = false;
131 	for ( ; !cursor->after_end(); cursor->next()) {
132 	    string & key = cursor->current_key;
133 
134 	    if (is_user_metadata_key(key)) {
135 		// User metadata can be anything, so we can't do any particular
136 		// checks on it other than to check that the tag isn't empty.
137 		cursor->read_tag();
138 		if (cursor->current_tag.empty()) {
139 		    cout << "User metadata item is empty" << endl;
140 		    ++errors;
141 		}
142 		continue;
143 	    }
144 
145 	    if (!have_metainfo_key) {
146 		have_metainfo_key = true;
147 		cout << "METAINFO key missing from postlist table" << endl;
148 		++errors;
149 	    }
150 
151 	    if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
152 		// doclen chunk
153 		const char * pos, * end;
154 		Xapian::docid did = 1;
155 		if (key.size() > 2) {
156 		    // Non-initial chunk.
157 		    if (!seen_doclen_initial_chunk) {
158 			cout << "Doclen initial chunk missing" << endl;
159 			++errors;
160 		    }
161 		    pos = key.data();
162 		    end = pos + key.size();
163 		    pos += 2;
164 		    if (!unpack_uint_preserving_sort(&pos, end, &did)) {
165 			cout << "Error unpacking docid from doclen key" << endl;
166 			++errors;
167 			continue;
168 		    }
169 		}
170 		seen_doclen_initial_chunk = true;
171 
172 		cursor->read_tag();
173 		pos = cursor->current_tag.data();
174 		end = pos + cursor->current_tag.size();
175 		if (key.size() == 2) {
176 		    // Initial chunk.
177 		    if (end - pos < 2 || pos[0] || pos[1]) {
178 			cout << "Initial doclen chunk has nonzero dummy fields" << endl;
179 			++errors;
180 			continue;
181 		    }
182 		    pos += 2;
183 		    if (!unpack_uint(&pos, end, &did)) {
184 			cout << "Failed to unpack firstdid for doclen" << endl;
185 			++errors;
186 			continue;
187 		    }
188 		    ++did;
189 		    if (did <= lastdid) {
190 			cout << "First did in this chunk is <= last in "
191 			    "prev chunk" << endl;
192 			++errors;
193 		    }
194 		}
195 
196 		bool is_last_chunk;
197 		if (!unpack_bool(&pos, end, &is_last_chunk)) {
198 		    cout << "Failed to unpack last chunk flag for doclen" << endl;
199 		    ++errors;
200 		    continue;
201 		}
202 		// Read what the final document ID in this chunk is.
203 		if (!unpack_uint(&pos, end, &lastdid)) {
204 		    cout << "Failed to unpack increase to last" << endl;
205 		    ++errors;
206 		    continue;
207 		}
208 		lastdid += did;
209 		bool bad = false;
210 		while (true) {
211 		    Xapian::termcount doclen;
212 		    if (!unpack_uint(&pos, end, &doclen)) {
213 			cout << "Failed to unpack doclen" << endl;
214 			++errors;
215 			bad = true;
216 			break;
217 		    }
218 
219 		    ++num_doclens;
220 
221 		    if (did > db_last_docid) {
222 			cout << "document id " << did << " in doclen stream "
223 			     << "is larger than get_last_docid() "
224 			     << db_last_docid << endl;
225 			++errors;
226 		    }
227 
228 		    if (!doclens.empty()) {
229 			// In chert, a document without terms doesn't get a
230 			// termlist entry.
231 			Xapian::termcount termlist_doclen = 0;
232 			if (did < doclens.size())
233 			    termlist_doclen = doclens[did];
234 
235 			if (doclen != termlist_doclen) {
236 			    cout << "document id " << did << ": length "
237 				 << doclen << " doesn't match "
238 				 << termlist_doclen << " in the termlist table"
239 				 << endl;
240 			    ++errors;
241 			}
242 		    }
243 
244 		    if (pos == end) break;
245 
246 		    Xapian::docid inc;
247 		    if (!unpack_uint(&pos, end, &inc)) {
248 			cout << "Failed to unpack docid increase" << endl;
249 			++errors;
250 			bad = true;
251 			break;
252 		    }
253 		    ++inc;
254 		    did += inc;
255 		    if (did > lastdid) {
256 			cout << "docid " << did << " > last docid " << lastdid
257 			     << endl;
258 			++errors;
259 		    }
260 		}
261 		if (bad) {
262 		    continue;
263 		}
264 		if (is_last_chunk) {
265 		    if (did != lastdid) {
266 			cout << "lastdid " << lastdid << " != last did " << did
267 			     << endl;
268 			++errors;
269 		    }
270 		}
271 
272 		continue;
273 	    }
274 
275 	    if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
276 		// Value stats.
277 		const char * p = key.data();
278 		const char * end = p + key.length();
279 		p += 2;
280 		Xapian::valueno slot;
281 		if (!unpack_uint_last(&p, end, &slot)) {
282 		    cout << "Bad valuestats key (no slot)" << endl;
283 		    ++errors;
284 		    continue;
285 		}
286 
287 		cursor->read_tag();
288 		p = cursor->current_tag.data();
289 		end = p + cursor->current_tag.size();
290 
291 		VStats & v = valuestats[slot];
292 		if (!unpack_uint(&p, end, &v.freq)) {
293 		    if (*p == 0) {
294 			cout << "Incomplete stats item in value table" << endl;
295 		    } else {
296 			cout << "Frequency statistic in value table is too large" << endl;
297 		    }
298 		    ++errors;
299 		    continue;
300 		}
301 		if (!unpack_string(&p, end, v.lower_bound)) {
302 		    if (*p == 0) {
303 			cout << "Incomplete stats item in value table" << endl;
304 		    } else {
305 			cout << "Lower bound statistic in value table is too large" << endl;
306 		    }
307 		    ++errors;
308 		    continue;
309 		}
310 		size_t len = end - p;
311 		if (len == 0) {
312 		    v.upper_bound = v.lower_bound;
313 		} else {
314 		    v.upper_bound.assign(p, len);
315 		}
316 
317 		continue;
318 	    }
319 
320 	    if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
321 		// Value stream chunk.
322 		const char * p = key.data();
323 		const char * end = p + key.length();
324 		p += 2;
325 		Xapian::valueno slot;
326 		if (!unpack_uint(&p, end, &slot)) {
327 		    cout << "Bad value chunk key (no slot)" << endl;
328 		    ++errors;
329 		    continue;
330 		}
331 		Xapian::docid did;
332 		if (!unpack_uint_preserving_sort(&p, end, &did)) {
333 		    cout << "Bad value chunk key (no docid)" << endl;
334 		    ++errors;
335 		    continue;
336 		}
337 		if (p != end) {
338 		    cout << "Bad value chunk key (trailing junk)" << endl;
339 		    ++errors;
340 		    continue;
341 		}
342 
343 		VStats & v = valuestats[slot];
344 
345 		cursor->read_tag();
346 		p = cursor->current_tag.data();
347 		end = p + cursor->current_tag.size();
348 
349 		while (true) {
350 		    string value;
351 		    if (!unpack_string(&p, end, value)) {
352 			cout << "Failed to unpack value from chunk" << endl;
353 			++errors;
354 			break;
355 		    }
356 
357 		    ++v.freq_real;
358 
359 		    // FIXME: Cross-check that docid did has value slot (and
360 		    // vice versa - that there's a value here if the slot entry
361 		    // says so).
362 
363 		    // FIXME: Check if the bounds are tight?  Or is that better
364 		    // as a separate tool which can also update the bounds?
365 		    if (value < v.lower_bound) {
366 			cout << "Value slot " << slot << " has value below "
367 				"lower bound: '" << value << "' < '"
368 			     << v.lower_bound << "'" << endl;
369 			++errors;
370 		    } else if (value > v.upper_bound) {
371 			cout << "Value slot " << slot << " has value above "
372 				"upper bound: '" << value << "' > '"
373 			     << v.upper_bound << "'" << endl;
374 			++errors;
375 		    }
376 
377 		    if (p == end) break;
378 		    Xapian::docid delta;
379 		    if (!unpack_uint(&p, end, &delta)) {
380 			cout << "Failed to unpack docid delta from chunk" << endl;
381 			++errors;
382 			break;
383 		    }
384 		    Xapian::docid new_did = did + delta + 1;
385 		    if (new_did <= did) {
386 			cout << "docid overflowed in value chunk" << endl;
387 			++errors;
388 			break;
389 		    }
390 		    did = new_did;
391 
392 		    if (did > db_last_docid) {
393 			cout << "document id " << did << " in value chunk "
394 			     << "is larger than get_last_docid() "
395 			     << db_last_docid << endl;
396 			++errors;
397 		    }
398 		}
399 		continue;
400 	    }
401 
402 	    const char * pos, * end;
403 
404 	    // Get term from key.
405 	    pos = key.data();
406 	    end = pos + key.size();
407 
408 	    string term;
409 	    Xapian::docid did;
410 	    if (!unpack_string_preserving_sort(&pos, end, term)) {
411 		cout << "Error unpacking termname from key" << endl;
412 		++errors;
413 		continue;
414 	    }
415 	    if (!current_term.empty() && term != current_term) {
416 		// The term changed unexpectedly.
417 		if (pos == end) {
418 		    cout << "No last chunk for term `" << current_term
419 			 << "'" << endl;
420 		    current_term.resize(0);
421 		} else {
422 		    cout << "Mismatch in follow-on chunk in posting "
423 			"list for term `" << current_term << "' (got `"
424 			<< term << "')" << endl;
425 		    current_term = term;
426 		    tf = cf = 0;
427 		    lastdid = 0;
428 		}
429 		++errors;
430 	    }
431 	    if (pos == end) {
432 		// First chunk.
433 		if (term == current_term) {
434 		    // This probably isn't possible.
435 		    cout << "First posting list chunk for term `"
436 			 << term << "' follows previous chunk for the same "
437 			 "term" << endl;
438 		    ++errors;
439 		}
440 		current_term = term;
441 		tf = cf = 0;
442 
443 		// Unpack extra header from first chunk.
444 		cursor->read_tag();
445 		pos = cursor->current_tag.data();
446 		end = pos + cursor->current_tag.size();
447 		if (!unpack_uint(&pos, end, &termfreq)) {
448 		    cout << "Failed to unpack termfreq for term `" << term
449 			 << "'" << endl;
450 		    ++errors;
451 		    continue;
452 		}
453 		if (!unpack_uint(&pos, end, &collfreq)) {
454 		    cout << "Failed to unpack collfreq for term `" << term
455 			 << "'" << endl;
456 		    ++errors;
457 		    continue;
458 		}
459 		if (!unpack_uint(&pos, end, &did)) {
460 		    cout << "Failed to unpack firstdid for term `" << term
461 			 << "'" << endl;
462 		    ++errors;
463 		    continue;
464 		}
465 		++did;
466 	    } else {
467 		// Continuation chunk.
468 		if (current_term.empty()) {
469 		    cout << "First chunk for term `" << current_term << "' "
470 			 "is a continuation chunk" << endl;
471 		    ++errors;
472 		    current_term = term;
473 		}
474 		AssertEq(current_term, term);
475 		if (!unpack_uint_preserving_sort(&pos, end, &did)) {
476 		    cout << "Failed to unpack did from key" << endl;
477 		    ++errors;
478 		    continue;
479 		}
480 		if (did <= lastdid) {
481 		    cout << "First did in this chunk is <= last in "
482 			"prev chunk" << endl;
483 		    ++errors;
484 		}
485 		cursor->read_tag();
486 		pos = cursor->current_tag.data();
487 		end = pos + cursor->current_tag.size();
488 	    }
489 
490 	    bool is_last_chunk;
491 	    if (!unpack_bool(&pos, end, &is_last_chunk)) {
492 		cout << "Failed to unpack last chunk flag" << endl;
493 		++errors;
494 		continue;
495 	    }
496 	    // Read what the final document ID in this chunk is.
497 	    if (!unpack_uint(&pos, end, &lastdid)) {
498 		cout << "Failed to unpack increase to last" << endl;
499 		++errors;
500 		continue;
501 	    }
502 	    lastdid += did;
503 	    bool bad = false;
504 	    while (true) {
505 		Xapian::termcount wdf;
506 		if (!unpack_uint(&pos, end, &wdf)) {
507 		    cout << "Failed to unpack wdf" << endl;
508 		    ++errors;
509 		    bad = true;
510 		    break;
511 		}
512 		++tf;
513 		cf += wdf;
514 
515 		if (pos == end) break;
516 
517 		Xapian::docid inc;
518 		if (!unpack_uint(&pos, end, &inc)) {
519 		    cout << "Failed to unpack docid increase" << endl;
520 		    ++errors;
521 		    bad = true;
522 		    break;
523 		}
524 		++inc;
525 		did += inc;
526 		if (did > lastdid) {
527 		    cout << "docid " << did << " > last docid " << lastdid
528 			 << endl;
529 		    ++errors;
530 		}
531 	    }
532 	    if (bad) {
533 		continue;
534 	    }
535 	    if (is_last_chunk) {
536 		if (tf != termfreq) {
537 		    cout << "termfreq " << termfreq << " != # of entries "
538 			 << tf << endl;
539 		    ++errors;
540 		}
541 		if (cf != collfreq) {
542 		    cout << "collfreq " << collfreq << " != sum wdf " << cf
543 			 << endl;
544 		    ++errors;
545 		}
546 		if (did != lastdid) {
547 		    cout << "lastdid " << lastdid << " != last did " << did
548 			 << endl;
549 		    ++errors;
550 		}
551 		current_term.resize(0);
552 	    }
553 	}
554 	if (!current_term.empty()) {
555 	    cout << "Last term `" << current_term << "' has no last chunk"
556 		 << endl;
557 	    ++errors;
558 	}
559 
560 	if (num_doclens != doccount && doccount != Xapian::doccount(-1)) {
561 	    cout << "Document length list has " << num_doclens
562 		 << " entries, should be " << doccount << endl;
563 	    ++errors;
564 	}
565 
566 	map<Xapian::valueno, VStats>::const_iterator i;
567 	for (i = valuestats.begin(); i != valuestats.end(); ++i) {
568 	    if (i->second.freq != i->second.freq_real) {
569 		cout << "Value stats frequency for slot " << i->first << " is "
570 		     << i->second.freq << " but recounting gives "
571 		     << i->second.freq_real << endl;
572 		++errors;
573 	    }
574 	}
575     } else if (strcmp(tablename, "record") == 0) {
576 	if (table.get_entry_count() != doccount &&
577 	    doccount != Xapian::doccount(-1)) {
578 	    cout << "Document data entry count (" << table.get_entry_count()
579 		 << ") != get_doccount() (" << doccount << ")" << endl;
580 	    ++errors;
581 	}
582 
583 	// Now check the contents of the record table.  Any data is valid as
584 	// the tag so we don't check the tags.
585 	for ( ; !cursor->after_end(); cursor->next()) {
586 	    string & key = cursor->current_key;
587 
588 	    // Get docid from key.
589 	    const char * pos = key.data();
590 	    const char * end = pos + key.size();
591 
592 	    Xapian::docid did;
593 	    if (!unpack_uint_preserving_sort(&pos, end, &did)) {
594 		cout << "Error unpacking docid from key" << endl;
595 		++errors;
596 	    } else if (pos != end) {
597 		cout << "Extra junk in key" << endl;
598 		++errors;
599 	    } else {
600 		if (did > db_last_docid) {
601 		    cout << "document id " << did << " in docdata table "
602 			    "is larger than get_last_docid() "
603 			 << db_last_docid << endl;
604 		    ++errors;
605 		}
606 	    }
607 	}
608     } else if (strcmp(tablename, "termlist") == 0) {
609 	// Now check the contents of the termlist table.
610 	Xapian::doccount num_termlists = 0;
611 	Xapian::doccount num_slotsused_entries = 0;
612 	for ( ; !cursor->after_end(); cursor->next()) {
613 	    string & key = cursor->current_key;
614 
615 	    // Get docid from key.
616 	    const char * pos = key.data();
617 	    const char * end = pos + key.size();
618 
619 	    Xapian::docid did;
620 	    if (!unpack_uint_preserving_sort(&pos, end, &did)) {
621 		cout << "Error unpacking docid from key" << endl;
622 		++errors;
623 		continue;
624 	    }
625 
626 	    if (did > db_last_docid) {
627 		cout << "document id " << did << " in termlist table "
628 			"is larger than get_last_docid() "
629 		     << db_last_docid << endl;
630 		++errors;
631 	    }
632 
633 	    if (end - pos == 1 && *pos == '\0') {
634 		// Value slots used entry.
635 		++num_slotsused_entries;
636 		cursor->read_tag();
637 
638 		pos = cursor->current_tag.data();
639 		end = pos + cursor->current_tag.size();
640 
641 		if (pos == end) {
642 		    cout << "Empty value slots used tag" << endl;
643 		    ++errors;
644 		    continue;
645 		}
646 
647 		Xapian::valueno prev_slot;
648 		if (!unpack_uint(&pos, end, &prev_slot)) {
649 		    cout << "Value slot encoding corrupt" << endl;
650 		    ++errors;
651 		    continue;
652 		}
653 
654 		while (pos != end) {
655 		    Xapian::valueno slot;
656 		    if (!unpack_uint(&pos, end, &slot)) {
657 			cout << "Value slot encoding corrupt" << endl;
658 			++errors;
659 			break;
660 		    }
661 		    slot += prev_slot + 1;
662 		    if (slot <= prev_slot) {
663 			cout << "Value slot number overflowed (" << prev_slot << " -> " << slot << ")" << endl;
664 			++errors;
665 		    }
666 		    prev_slot = slot;
667 		}
668 		continue;
669 	    }
670 
671 	    if (pos != end) {
672 		cout << "Extra junk in key" << endl;
673 		++errors;
674 		continue;
675 	    }
676 
677 	    ++num_termlists;
678 	    cursor->read_tag();
679 
680 	    pos = cursor->current_tag.data();
681 	    end = pos + cursor->current_tag.size();
682 
683 	    if (pos == end) {
684 		// Empty termlist.
685 		continue;
686 	    }
687 
688 	    Xapian::termcount doclen, termlist_size;
689 
690 	    // Read doclen
691 	    if (!unpack_uint(&pos, end, &doclen)) {
692 		if (pos != 0) {
693 		    cout << "doclen out of range" << endl;
694 		} else {
695 		    cout << "Unexpected end of data when reading doclen" << endl;
696 		}
697 		++errors;
698 		continue;
699 	    }
700 
701 	    // Read termlist_size
702 	    if (!unpack_uint(&pos, end, &termlist_size)) {
703 		if (pos != 0) {
704 		    cout << "termlist_size out of range" << endl;
705 		} else {
706 		    cout << "Unexpected end of data when reading termlist_size" << endl;
707 		}
708 		++errors;
709 		continue;
710 	    }
711 
712 	    Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
713 	    string current_tname;
714 
715 	    bool bad = false;
716 	    while (pos != end) {
717 		Xapian::doccount current_wdf = 0;
718 		bool got_wdf = false;
719 		// If there was a previous term, how much to reuse.
720 		if (!current_tname.empty()) {
721 		    string::size_type len = static_cast<unsigned char>(*pos++);
722 		    if (len > current_tname.length()) {
723 			// The wdf was squeezed into the same byte.
724 			current_wdf = len / (current_tname.length() + 1) - 1;
725 			len %= (current_tname.length() + 1);
726 			got_wdf = true;
727 		    }
728 		    current_tname.resize(len);
729 		}
730 		// What to append (note len must be positive, since just truncating
731 		// always takes us backwards in the sort order)
732 		string::size_type len = static_cast<unsigned char>(*pos++);
733 		current_tname.append(pos, len);
734 		pos += len;
735 
736 		if (!got_wdf) {
737 		    // Read wdf
738 		    if (!unpack_uint(&pos, end, &current_wdf)) {
739 			if (pos == 0) {
740 			    cout << "Unexpected end of data when reading termlist current_wdf" << endl;
741 			} else {
742 			    cout << "Size of wdf out of range, in termlist" << endl;
743 			}
744 			++errors;
745 			bad = true;
746 			break;
747 		    }
748 		}
749 
750 		++actual_termlist_size;
751 		actual_doclen += current_wdf;
752 	    }
753 	    if (bad) {
754 		continue;
755 	    }
756 
757 	    if (termlist_size != actual_termlist_size) {
758 		cout << "termlist_size != # of entries in termlist" << endl;
759 		++errors;
760 	    }
761 	    if (doclen != actual_doclen) {
762 		cout << "doclen != sum(wdf)" << endl;
763 		++errors;
764 	    }
765 
766 	    // + 1 so that did is a valid subscript.
767 	    if (doclens.size() <= did) doclens.resize(did + 1);
768 	    doclens[did] = actual_doclen;
769 	}
770 
771 	if (num_termlists != doccount && doccount != Xapian::doccount(-1)) {
772 	    cout << "Number of termlists (" << num_termlists
773 		 << ") != get_doccount() (" << doccount << ")" << endl;
774 	    ++errors;
775 	}
776 
777 	// chert doesn't store a valueslots used entry if there are no terms,
778 	// so we can only check there aren't more such entries than documents.
779 	if (num_slotsused_entries > doccount &&
780 	    doccount != Xapian::doccount(-1)) {
781 	    cout << "More slots-used entries (" << num_slotsused_entries
782 		 << ") then documents (" << doccount << ")" << endl;
783 	    ++errors;
784 	}
785     } else if (strcmp(tablename, "position") == 0) {
786 	// Now check the contents of the position table.
787 	for ( ; !cursor->after_end(); cursor->next()) {
788 	    string & key = cursor->current_key;
789 
790 	    // Get docid from key.
791 	    const char * pos = key.data();
792 	    const char * end = pos + key.size();
793 
794 	    Xapian::docid did;
795 	    if (!unpack_uint_preserving_sort(&pos, end, &did)) {
796 		cout << "Error unpacking docid from key" << endl;
797 		++errors;
798 		continue;
799 	    }
800 
801 	    if (did > db_last_docid) {
802 		cout << "document id " << did << " in position table "
803 			"is larger than get_last_docid() "
804 		     << db_last_docid << endl;
805 		++errors;
806 	    } else if (!doclens.empty()) {
807 		// In chert, a document without terms doesn't get a
808 		// termlist entry, so we can't tell the difference
809 		// easily.
810 		if (did >= doclens.size() || doclens[did] == 0) {
811 		    cout << "Position list entry for document " << did
812 			 << " which doesn't exist or has no terms" << endl;
813 		    ++errors;
814 		}
815 	    }
816 
817 	    if (pos == end) {
818 		cout << "No termname in key" << endl;
819 		++errors;
820 		continue;
821 	    }
822 
823 	    cursor->read_tag();
824 
825 	    const string & data = cursor->current_tag;
826 	    pos = data.data();
827 	    end = pos + data.size();
828 
829 	    Xapian::termpos pos_last;
830 	    if (!unpack_uint(&pos, end, &pos_last)) {
831 		cout << tablename << " table: Position list data corrupt" << endl;
832 		++errors;
833 		continue;
834 	    }
835 	    if (pos == end) {
836 		// Special case for single entry position list.
837 	    } else {
838 		// Skip the header we just read.
839 		BitReader rd(data, pos - data.data());
840 		Xapian::termpos pos_first = rd.decode(pos_last);
841 		Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
842 		vector<Xapian::termpos> positions;
843 		positions.resize(pos_size);
844 		positions[0] = pos_first;
845 		positions.back() = pos_last;
846 		rd.decode_interpolative(positions, 0, pos_size - 1);
847 		vector<Xapian::termpos>::const_iterator current_pos = positions.begin();
848 		Xapian::termpos lastpos = *current_pos++;
849 		while (current_pos != positions.end()) {
850 		    Xapian::termpos termpos = *current_pos++;
851 		    if (termpos <= lastpos) {
852 			cout << tablename << " table: Positions not strictly monotonically increasing" << endl;
853 			++errors;
854 			break;
855 		    }
856 		    lastpos = termpos;
857 		}
858 	    }
859 	}
860     } else {
861 	cout << tablename << " table: Don't know how to check structure\n" << endl;
862 	return errors;
863     }
864 
865     if (!errors)
866 	cout << tablename << " table structure checked OK\n" << endl;
867     else
868 	cout << tablename << " table errors found: " << errors << "\n" << endl;
869 
870     return errors;
871 }
872