1 /** @file
2  * @brief Check consistency of a glass table.
3  */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2018 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "glass_dbcheck.h"
26 
27 #include "bitstream.h"
28 
29 #include "internaltypes.h"
30 
31 #include "glass_check.h"
32 #include "glass_cursor.h"
33 #include "glass_defs.h"
34 #include "glass_table.h"
35 #include "glass_version.h"
36 #include "pack.h"
37 #include "backends/valuestats.h"
38 
39 #include <xapian.h>
40 
41 #include "filetests.h"
42 #include "autoptr.h"
43 #include <ostream>
44 #include <vector>
45 
46 using namespace std;
47 
/** Test whether a postlist table key is a user metadata key.
 *
 *  A key qualifies when it is at least two bytes long and begins with a zero
 *  byte followed by the byte 0xc0.
 *
 *  @param key	The key to classify.
 *
 *  @return true if @a key has the user metadata prefix, false otherwise.
 */
static inline bool
is_user_metadata_key(const string & key)
{
    // Too short to hold the two byte prefix.
    if (key.size() <= 1) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
53 
54 struct VStats : public ValueStats {
55     Xapian::doccount freq_real;
56 
VStatsVStats57     VStats() : ValueStats(), freq_real(0) {}
58 };
59 
60 size_t
check_glass_table(const char * tablename,const string & db_dir,int fd,off_t offset_,const GlassVersion & version_file,int opts,vector<Xapian::termcount> & doclens,ostream * out)61 check_glass_table(const char * tablename, const string &db_dir, int fd,
62 		  off_t offset_,
63 		  const GlassVersion & version_file, int opts,
64 		  vector<Xapian::termcount> & doclens, ostream * out)
65 {
66     Xapian::docid db_last_docid = version_file.get_last_docid();
67     if (out)
68 	*out << tablename << ":\n";
69     if (fd < 0) {
70 	if (strcmp(tablename, "postlist") != 0) {
71 	    // Other filenames are created lazily, so may not exist.
72 	    string filename(db_dir);
73 	    filename += '/';
74 	    filename += tablename;
75 	    filename += "." GLASS_TABLE_EXTENSION;
76 	    if (!file_exists(filename)) {
77 		if (out) {
78 		    if (strcmp(tablename, "termlist") == 0) {
79 			*out << "Not present.\n";
80 		    } else {
81 			*out << "Lazily created, and not yet used.\n";
82 		    }
83 		    *out << endl;
84 		}
85 		return 0;
86 	    }
87 	}
88     }
89 
90     // Check the btree structure.
91     AutoPtr<GlassTable> table(
92 	    GlassTableCheck::check(tablename, db_dir, fd, offset_,
93 				   version_file, opts, out));
94 
95     // Now check the glass structures inside the btree.
96     AutoPtr<GlassCursor> cursor(table->cursor_get());
97 
98     size_t errors = 0;
99 
100     cursor->find_entry(string());
101     cursor->next(); // Skip the empty entry.
102 
103     if (strcmp(tablename, "postlist") == 0) {
104 	// Now check the structure of each postlist in the table.
105 	map<Xapian::valueno, VStats> valuestats;
106 	string current_term;
107 	Xapian::docid lastdid = 0;
108 	Xapian::termcount termfreq = 0, collfreq = 0;
109 	Xapian::termcount tf = 0, cf = 0;
110 	Xapian::doccount num_doclens = 0;
111 
112 	for ( ; !cursor->after_end(); cursor->next()) {
113 	    string & key = cursor->current_key;
114 
115 	    if (is_user_metadata_key(key)) {
116 		// User metadata can be anything, so we can't do any particular
117 		// checks on it other than to check that the tag isn't empty.
118 		cursor->read_tag();
119 		if (cursor->current_tag.empty()) {
120 		    if (out)
121 			*out << "User metadata item is empty" << endl;
122 		    ++errors;
123 		}
124 		continue;
125 	    }
126 
127 	    if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
128 		// doclen chunk
129 		const char * pos, * end;
130 		Xapian::docid did = 1;
131 		if (key.size() > 2) {
132 		    // Non-initial chunk.
133 		    pos = key.data();
134 		    end = pos + key.size();
135 		    pos += 2;
136 		    if (!unpack_uint_preserving_sort(&pos, end, &did)) {
137 			if (out)
138 			    *out << "Error unpacking docid from doclen key" << endl;
139 			++errors;
140 			continue;
141 		    }
142 		    if (did <= lastdid) {
143 			if (out)
144 			    *out << "First did in this doclen chunk is <= last in "
145 				    "prev chunk" << endl;
146 			++errors;
147 		    }
148 		}
149 
150 		cursor->read_tag();
151 		pos = cursor->current_tag.data();
152 		end = pos + cursor->current_tag.size();
153 		if (key.size() == 2) {
154 		    // Initial chunk.
155 		    if (end - pos < 2 || pos[0] || pos[1]) {
156 			if (out)
157 			    *out << "Initial doclen chunk has nonzero dummy fields" << endl;
158 			++errors;
159 			continue;
160 		    }
161 		    pos += 2;
162 		    if (!unpack_uint(&pos, end, &did)) {
163 			if (out)
164 			    *out << "Failed to unpack firstdid for doclen" << endl;
165 			++errors;
166 			continue;
167 		    }
168 		    ++did;
169 		}
170 
171 		bool is_last_chunk;
172 		if (!unpack_bool(&pos, end, &is_last_chunk)) {
173 		    if (out)
174 			*out << "Failed to unpack last chunk flag for doclen" << endl;
175 		    ++errors;
176 		    continue;
177 		}
178 		// Read what the final document ID in this chunk is.
179 		if (!unpack_uint(&pos, end, &lastdid)) {
180 		    if (out)
181 			*out << "Failed to unpack increase to last" << endl;
182 		    ++errors;
183 		    continue;
184 		}
185 		lastdid += did;
186 		bool bad = false;
187 		while (true) {
188 		    Xapian::termcount doclen;
189 		    if (!unpack_uint(&pos, end, &doclen)) {
190 			if (out)
191 			    *out << "Failed to unpack doclen" << endl;
192 			++errors;
193 			bad = true;
194 			break;
195 		    }
196 
197 		    ++num_doclens;
198 
199 		    if (did > db_last_docid) {
200 			if (out)
201 			    *out << "document id " << did << " in doclen "
202 				    "stream is larger than get_last_docid() "
203 				 << db_last_docid << endl;
204 			++errors;
205 		    }
206 
207 		    if (!doclens.empty()) {
208 			// In glass, a document without terms doesn't get a
209 			// termlist entry.
210 			Xapian::termcount termlist_doclen = 0;
211 			if (did < doclens.size())
212 			    termlist_doclen = doclens[did];
213 
214 			if (doclen != termlist_doclen) {
215 			    if (out)
216 				*out << "document id " << did << ": length "
217 				     << doclen << " doesn't match "
218 				     << termlist_doclen << " in the termlist "
219 					"table" << endl;
220 			    ++errors;
221 			}
222 		    }
223 
224 		    if (pos == end) break;
225 
226 		    Xapian::docid inc;
227 		    if (!unpack_uint(&pos, end, &inc)) {
228 			if (out)
229 			    *out << "Failed to unpack docid increase" << endl;
230 			++errors;
231 			bad = true;
232 			break;
233 		    }
234 		    ++inc;
235 		    did += inc;
236 		    if (did > lastdid) {
237 			if (out)
238 			    *out << "docid " << did << " > last docid "
239 				 << lastdid << endl;
240 			++errors;
241 		    }
242 		}
243 		if (bad) {
244 		    continue;
245 		}
246 		if (is_last_chunk) {
247 		    if (did != lastdid) {
248 			if (out)
249 			    *out << "lastdid " << lastdid << " != last did "
250 				 << did << endl;
251 			++errors;
252 		    }
253 		}
254 
255 		continue;
256 	    }
257 
258 	    if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
259 		// Value stats.
260 		const char * p = key.data();
261 		const char * end = p + key.length();
262 		p += 2;
263 		Xapian::valueno slot;
264 		if (!unpack_uint_last(&p, end, &slot)) {
265 		    if (out)
266 			*out << "Bad valuestats key (no slot)" << endl;
267 		    ++errors;
268 		    continue;
269 		}
270 
271 		cursor->read_tag();
272 		p = cursor->current_tag.data();
273 		end = p + cursor->current_tag.size();
274 
275 		VStats & v = valuestats[slot];
276 		if (!unpack_uint(&p, end, &v.freq)) {
277 		    if (out) {
278 			if (*p == 0) {
279 			    *out << "Incomplete stats item in value table";
280 			} else {
281 			    *out << "Frequency statistic in value table is too large";
282 			}
283 			*out << endl;
284 		    }
285 		    ++errors;
286 		    continue;
287 		}
288 		if (!unpack_string(&p, end, v.lower_bound)) {
289 		    if (out) {
290 			if (*p == 0) {
291 			    *out << "Incomplete stats item in value table";
292 			} else {
293 			    *out << "Lower bound statistic in value table is too large";
294 			}
295 			*out << endl;
296 		    }
297 		    ++errors;
298 		    continue;
299 		}
300 		size_t len = end - p;
301 		if (len == 0) {
302 		    v.upper_bound = v.lower_bound;
303 		} else {
304 		    v.upper_bound.assign(p, len);
305 		}
306 
307 		continue;
308 	    }
309 
310 	    if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
311 		// Value stream chunk.
312 		const char * p = key.data();
313 		const char * end = p + key.length();
314 		p += 2;
315 		Xapian::valueno slot;
316 		if (!unpack_uint(&p, end, &slot)) {
317 		    if (out)
318 			*out << "Bad value chunk key (no slot)" << endl;
319 		    ++errors;
320 		    continue;
321 		}
322 		Xapian::docid did;
323 		if (!unpack_uint_preserving_sort(&p, end, &did)) {
324 		    if (out)
325 			*out << "Bad value chunk key (no docid)" << endl;
326 		    ++errors;
327 		    continue;
328 		}
329 		if (p != end) {
330 		    if (out)
331 			*out << "Bad value chunk key (trailing junk)" << endl;
332 		    ++errors;
333 		    continue;
334 		}
335 
336 		VStats & v = valuestats[slot];
337 
338 		cursor->read_tag();
339 		p = cursor->current_tag.data();
340 		end = p + cursor->current_tag.size();
341 
342 		while (true) {
343 		    string value;
344 		    if (!unpack_string(&p, end, value)) {
345 			if (out)
346 			    *out << "Failed to unpack value from chunk" << endl;
347 			++errors;
348 			break;
349 		    }
350 
351 		    ++v.freq_real;
352 
353 		    // FIXME: Cross-check that docid did has value slot (and
354 		    // vice versa - that there's a value here if the slot entry
355 		    // says so).
356 
357 		    // FIXME: Check if the bounds are tight?  Or is that better
358 		    // as a separate tool which can also update the bounds?
359 		    if (value < v.lower_bound) {
360 			if (out)
361 			    *out << "Value slot " << slot << " has value "
362 				    "below lower bound: '" << value << "' < '"
363 				 << v.lower_bound << "'" << endl;
364 			++errors;
365 		    } else if (value > v.upper_bound) {
366 			if (out)
367 			    *out << "Value slot " << slot << " has value "
368 				    "above upper bound: '" << value << "' > '"
369 				 << v.upper_bound << "'" << endl;
370 			++errors;
371 		    }
372 
373 		    if (p == end) break;
374 		    Xapian::docid delta;
375 		    if (!unpack_uint(&p, end, &delta)) {
376 			if (out)
377 			    *out << "Failed to unpack docid delta from chunk"
378 				 << endl;
379 			++errors;
380 			break;
381 		    }
382 		    Xapian::docid new_did = did + delta + 1;
383 		    if (new_did <= did) {
384 			if (out)
385 			    *out << "docid overflowed in value chunk" << endl;
386 			++errors;
387 			break;
388 		    }
389 		    did = new_did;
390 
391 		    if (did > db_last_docid) {
392 			if (out)
393 			    *out << "document id " << did << " in value chunk "
394 				    "is larger than get_last_docid() "
395 				 << db_last_docid << endl;
396 			++errors;
397 		    }
398 		}
399 		continue;
400 	    }
401 
402 	    const char * pos, * end;
403 
404 	    // Get term from key.
405 	    pos = key.data();
406 	    end = pos + key.size();
407 
408 	    string term;
409 	    Xapian::docid did;
410 	    if (!unpack_string_preserving_sort(&pos, end, term)) {
411 		if (out)
412 		    *out << "Error unpacking termname from key" << endl;
413 		++errors;
414 		continue;
415 	    }
416 	    if (!current_term.empty() && term != current_term) {
417 		// The term changed unexpectedly.
418 		if (pos == end) {
419 		    if (out)
420 			*out << "No last chunk for term '" << current_term
421 			     << "'" << endl;
422 		    current_term.resize(0);
423 		} else {
424 		    if (out)
425 			*out << "Mismatch in follow-on chunk in posting list "
426 				"for term '" << current_term << "' (got '"
427 			     << term << "')" << endl;
428 		    current_term = term;
429 		    tf = cf = 0;
430 		    lastdid = 0;
431 		}
432 		++errors;
433 	    }
434 	    if (pos == end) {
435 		// First chunk.
436 		if (term == current_term) {
437 		    // This probably isn't possible.
438 		    if (out)
439 			*out << "First posting list chunk for term '" << term
440 			     << "' follows previous chunk for the same term"
441 			     << endl;
442 		    ++errors;
443 		}
444 		current_term = term;
445 		tf = cf = 0;
446 
447 		// Unpack extra header from first chunk.
448 		cursor->read_tag();
449 		pos = cursor->current_tag.data();
450 		end = pos + cursor->current_tag.size();
451 		if (!unpack_uint(&pos, end, &termfreq)) {
452 		    if (out)
453 			*out << "Failed to unpack termfreq for term '" << term
454 			     << "'" << endl;
455 		    ++errors;
456 		    continue;
457 		}
458 		if (!unpack_uint(&pos, end, &collfreq)) {
459 		    if (out)
460 			*out << "Failed to unpack collfreq for term '" << term
461 			     << "'" << endl;
462 		    ++errors;
463 		    continue;
464 		}
465 		if (!unpack_uint(&pos, end, &did)) {
466 		    if (out)
467 			*out << "Failed to unpack firstdid for term '" << term
468 			     << "'" << endl;
469 		    ++errors;
470 		    continue;
471 		}
472 		++did;
473 	    } else {
474 		// Continuation chunk.
475 		if (current_term.empty()) {
476 		    if (out)
477 			*out << "First chunk for term '" << current_term
478 			     << "' is a continuation chunk" << endl;
479 		    ++errors;
480 		    current_term = term;
481 		}
482 		AssertEq(current_term, term);
483 		if (!unpack_uint_preserving_sort(&pos, end, &did)) {
484 		    if (out)
485 			*out << "Failed to unpack did from key" << endl;
486 		    ++errors;
487 		    continue;
488 		}
489 		if (did <= lastdid) {
490 		    if (out)
491 			*out << "First did in this chunk is <= last in "
492 				"prev chunk" << endl;
493 		    ++errors;
494 		}
495 		cursor->read_tag();
496 		pos = cursor->current_tag.data();
497 		end = pos + cursor->current_tag.size();
498 	    }
499 
500 	    bool is_last_chunk;
501 	    if (!unpack_bool(&pos, end, &is_last_chunk)) {
502 		if (out)
503 		    *out << "Failed to unpack last chunk flag" << endl;
504 		++errors;
505 		continue;
506 	    }
507 	    // Read what the final document ID in this chunk is.
508 	    if (!unpack_uint(&pos, end, &lastdid)) {
509 		if (out)
510 		    *out << "Failed to unpack increase to last" << endl;
511 		++errors;
512 		continue;
513 	    }
514 	    lastdid += did;
515 	    bool bad = false;
516 	    while (true) {
517 		Xapian::termcount wdf;
518 		if (!unpack_uint(&pos, end, &wdf)) {
519 		    if (out)
520 			*out << "Failed to unpack wdf" << endl;
521 		    ++errors;
522 		    bad = true;
523 		    break;
524 		}
525 		++tf;
526 		cf += wdf;
527 
528 		if (pos == end) break;
529 
530 		Xapian::docid inc;
531 		if (!unpack_uint(&pos, end, &inc)) {
532 		    if (out)
533 			*out << "Failed to unpack docid increase" << endl;
534 		    ++errors;
535 		    bad = true;
536 		    break;
537 		}
538 		++inc;
539 		did += inc;
540 		if (did > lastdid) {
541 		    if (out)
542 			*out << "docid " << did << " > last docid " << lastdid
543 			     << endl;
544 		    ++errors;
545 		}
546 	    }
547 	    if (bad) {
548 		continue;
549 	    }
550 	    if (is_last_chunk) {
551 		if (tf != termfreq) {
552 		    if (out)
553 			*out << "termfreq " << termfreq << " != # of entries "
554 			     << tf << endl;
555 		    ++errors;
556 		}
557 		if (cf != collfreq) {
558 		    if (out)
559 			*out << "collfreq " << collfreq << " != sum wdf " << cf
560 			     << endl;
561 		    ++errors;
562 		}
563 		if (did != lastdid) {
564 		    if (out)
565 			*out << "lastdid " << lastdid << " != last did " << did
566 			     << endl;
567 		    ++errors;
568 		}
569 		current_term.resize(0);
570 	    }
571 	}
572 	if (!current_term.empty()) {
573 	    if (out)
574 		*out << "Last term '" << current_term << "' has no last chunk"
575 		     << endl;
576 	    ++errors;
577 	}
578 
579 	Xapian::doccount doccount = version_file.get_doccount();
580 	if (num_doclens != doccount) {
581 	    if (out)
582 		*out << "Document length list has " << num_doclens
583 		     << " entries, should be " << doccount << endl;
584 	    ++errors;
585 	}
586 
587 	map<Xapian::valueno, VStats>::const_iterator i;
588 	for (i = valuestats.begin(); i != valuestats.end(); ++i) {
589 	    if (i->second.freq != i->second.freq_real) {
590 		if (out)
591 		    *out << "Value stats frequency for slot " << i->first
592 			 << " is " << i->second.freq << " but recounting "
593 			    "gives " << i->second.freq_real << endl;
594 		++errors;
595 	    }
596 	}
597     } else if (strcmp(tablename, "docdata") == 0) {
598 	// glass doesn't store a docdata entry if the document data is empty,
599 	// so we can only check there aren't more docdata entries than
600 	// documents.
601 	Xapian::doccount doccount = version_file.get_doccount();
602 	if (table->get_entry_count() > doccount) {
603 	    if (out)
604 		*out << "More document data (" << table->get_entry_count()
605 		     << ") then documents (" << doccount << ")" << endl;
606 	    ++errors;
607 	}
608 
609 	// Now check the contents of the docdata table.
610 	for ( ; !cursor->after_end(); cursor->next()) {
611 	    string & key = cursor->current_key;
612 
613 	    // Get docid from key.
614 	    const char * pos = key.data();
615 	    const char * end = pos + key.size();
616 
617 	    Xapian::docid did;
618 	    if (!unpack_uint_preserving_sort(&pos, end, &did)) {
619 		if (out)
620 		    *out << "Error unpacking docid from key" << endl;
621 		++errors;
622 	    } else if (pos != end) {
623 		if (out)
624 		    *out << "Extra junk in key" << endl;
625 		++errors;
626 	    } else {
627 		if (did > db_last_docid) {
628 		    if (out)
629 			*out << "document id " << did << " in docdata table "
630 				"is larger than get_last_docid() "
631 			     << db_last_docid << endl;
632 		    ++errors;
633 		}
634 	    }
635 
636 	    // Fetch and decompress the document data to catch problems with
637 	    // the splitting into multiple items, corruption of the compressed
638 	    // data, etc.
639 	    cursor->read_tag();
640 	    if (cursor->current_tag.empty()) {
641 		// We shouldn't store empty document data.
642 		if (out)
643 		    *out << "Empty document data explicitly stored for "
644 			    "document id " << did << endl;
645 		++errors;
646 	    }
647 	}
648     } else if (strcmp(tablename, "termlist") == 0) {
649 	// Now check the contents of the termlist table.
650 	Xapian::doccount num_termlists = 0;
651 	Xapian::doccount num_slotsused_entries = 0;
652 	for ( ; !cursor->after_end(); cursor->next()) {
653 	    string & key = cursor->current_key;
654 
655 	    // Get docid from key.
656 	    const char * pos = key.data();
657 	    const char * end = pos + key.size();
658 
659 	    Xapian::docid did;
660 	    if (!unpack_uint_preserving_sort(&pos, end, &did)) {
661 		if (out)
662 		    *out << "Error unpacking docid from key" << endl;
663 		++errors;
664 		continue;
665 	    }
666 
667 	    if (did > db_last_docid) {
668 		if (out)
669 		    *out << "document id " << did << " in termlist table "
670 			    "is larger than get_last_docid() "
671 			 << db_last_docid << endl;
672 		++errors;
673 	    }
674 
675 	    if (end - pos == 1 && *pos == '\0') {
676 		// Value slots used entry.
677 		++num_slotsused_entries;
678 		cursor->read_tag();
679 
680 		pos = cursor->current_tag.data();
681 		end = pos + cursor->current_tag.size();
682 
683 		if (pos == end) {
684 		    if (out)
685 			*out << "Empty value slots used tag" << endl;
686 		    ++errors;
687 		    continue;
688 		}
689 
690 		Xapian::valueno prev_slot;
691 		if (!unpack_uint(&pos, end, &prev_slot)) {
692 		    if (out)
693 			*out << "Value slot encoding corrupt" << endl;
694 		    ++errors;
695 		    continue;
696 		}
697 
698 		while (pos != end) {
699 		    Xapian::valueno slot;
700 		    if (!unpack_uint(&pos, end, &slot)) {
701 			if (out)
702 			    *out << "Value slot encoding corrupt" << endl;
703 			++errors;
704 			break;
705 		    }
706 		    slot += prev_slot + 1;
707 		    if (slot <= prev_slot) {
708 			if (out)
709 			    *out << "Value slot number overflowed ("
710 				 << prev_slot << " -> " << slot << ")" << endl;
711 			++errors;
712 		    }
713 		    prev_slot = slot;
714 		}
715 		continue;
716 	    }
717 
718 	    if (pos != end) {
719 		if (out)
720 		    *out << "Extra junk in key" << endl;
721 		++errors;
722 		continue;
723 	    }
724 
725 	    ++num_termlists;
726 	    cursor->read_tag();
727 
728 	    pos = cursor->current_tag.data();
729 	    end = pos + cursor->current_tag.size();
730 
731 	    if (pos == end) {
732 		// Empty termlist.
733 		continue;
734 	    }
735 
736 	    Xapian::termcount doclen, termlist_size;
737 
738 	    // Read doclen
739 	    if (!unpack_uint(&pos, end, &doclen)) {
740 		if (out) {
741 		    if (pos != 0) {
742 			*out << "doclen out of range";
743 		    } else {
744 			*out << "Unexpected end of data when reading doclen";
745 		    }
746 		    *out << endl;
747 		}
748 		++errors;
749 		continue;
750 	    }
751 
752 	    // Check doclen with doclen lower and upper bounds
753 	    if (doclen > version_file.get_doclength_upper_bound()) {
754 		if (out)
755 		    *out << "doclen " << doclen << " > upper bound "
756 			 << version_file.get_doclength_upper_bound() << endl;
757 		++errors;
758 	    } else if (doclen < version_file.get_doclength_lower_bound() &&
759 		       doclen != 0) {
760 		if (out)
761 		    *out << "doclen " << doclen << " < lower bound "
762 			 << version_file.get_doclength_lower_bound() << endl;
763 		++errors;
764 	    }
765 
766 	    // Read termlist_size
767 	    if (!unpack_uint(&pos, end, &termlist_size)) {
768 		if (out) {
769 		    if (pos != 0) {
770 			*out << "termlist_size out of range";
771 		    } else {
772 			*out << "Unexpected end of data when reading "
773 				"termlist_size";
774 		    }
775 		    *out << endl;
776 		}
777 		++errors;
778 		continue;
779 	    }
780 
781 	    Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
782 	    string current_tname;
783 
784 	    bool bad = false;
785 	    while (pos != end) {
786 		Xapian::doccount current_wdf = 0;
787 		bool got_wdf = false;
788 		// If there was a previous term, how much to reuse.
789 		if (!current_tname.empty()) {
790 		    string::size_type len = static_cast<unsigned char>(*pos++);
791 		    if (len > current_tname.length()) {
792 			// The wdf was squeezed into the same byte.
793 			current_wdf = len / (current_tname.length() + 1) - 1;
794 			len %= (current_tname.length() + 1);
795 			got_wdf = true;
796 		    }
797 		    current_tname.resize(len);
798 		}
799 		// What to append (note len must be positive, since just truncating
800 		// always takes us backwards in the sort order)
801 		string::size_type len = static_cast<unsigned char>(*pos++);
802 		current_tname.append(pos, len);
803 		pos += len;
804 
805 		if (!got_wdf) {
806 		    // Read wdf
807 		    if (!unpack_uint(&pos, end, &current_wdf)) {
808 			if (out) {
809 			    if (pos == 0) {
810 				*out << "Unexpected end of data when reading "
811 					"termlist current_wdf";
812 			    } else {
813 				*out << "Size of wdf out of range in termlist";
814 			    }
815 			    *out << endl;
816 			}
817 			++errors;
818 			bad = true;
819 			break;
820 		    }
821 		}
822 
823 		++actual_termlist_size;
824 		actual_doclen += current_wdf;
825 	    }
826 	    if (bad) {
827 		continue;
828 	    }
829 
830 	    if (termlist_size != actual_termlist_size) {
831 		if (out)
832 		    *out << "termlist_size != # of entries in termlist" << endl;
833 		++errors;
834 	    }
835 	    if (doclen != actual_doclen) {
836 		if (out)
837 		    *out << "doclen != sum(wdf)" << endl;
838 		++errors;
839 	    }
840 
841 	    // + 1 so that did is a valid subscript.
842 	    if (doclens.size() <= did) doclens.resize(did + 1);
843 	    doclens[did] = actual_doclen;
844 	}
845 
846 	Xapian::doccount doccount = version_file.get_doccount();
847 
848 	// glass doesn't store a termlist entry if there are no terms, so we
849 	// can only check there aren't more termlists than documents.
850 	if (num_termlists > doccount) {
851 	    if (out)
852 		*out << "More termlists (" << num_termlists
853 		     << ") then documents (" << doccount << ")" << endl;
854 	    ++errors;
855 	}
856 
857 	// glass doesn't store a valueslots used entry if there are no terms,
858 	// so we can only check there aren't more such entries than documents.
859 	if (num_slotsused_entries > doccount) {
860 	    if (out)
861 		*out << "More slots-used entries (" << num_slotsused_entries
862 		     << ") then documents (" << doccount << ")" << endl;
863 	    ++errors;
864 	}
865     } else if (strcmp(tablename, "position") == 0) {
866 	// Now check the contents of the position table.
867 	for ( ; !cursor->after_end(); cursor->next()) {
868 	    string & key = cursor->current_key;
869 
870 	    // Get docid from key.
871 	    const char * pos = key.data();
872 	    const char * end = pos + key.size();
873 
874 	    string term;
875 	    if (!unpack_string_preserving_sort(&pos, end, term)) {
876 		if (out)
877 		    *out << "Error unpacking term from key" << endl;
878 		++errors;
879 		continue;
880 	    }
881 
882 	    Xapian::docid did;
883 	    if (!unpack_uint_preserving_sort(&pos, end, &did)) {
884 		if (out)
885 		    *out << "Error unpacking docid from key" << endl;
886 		++errors;
887 		continue;
888 	    }
889 
890 	    if (pos != end) {
891 		if (out)
892 		    *out << "Extra junk in key with docid " << did << endl;
893 		++errors;
894 		continue;
895 	    }
896 
897 	    if (did > db_last_docid) {
898 		if (out)
899 		    *out << "document id " << did << " in position table "
900 			    "is larger than get_last_docid() "
901 			 << db_last_docid << endl;
902 		++errors;
903 	    } else if (!doclens.empty()) {
904 		// In glass, a document without terms doesn't get a
905 		// termlist entry, so we can't tell the difference
906 		// easily.
907 		if (did >= doclens.size() || doclens[did] == 0) {
908 		    if (out)
909 			*out << "Position list entry for document " << did
910 			     << " which doesn't exist or has no terms" << endl;
911 		    ++errors;
912 		}
913 	    }
914 
915 	    cursor->read_tag();
916 
917 	    const string & data = cursor->current_tag;
918 	    pos = data.data();
919 	    end = pos + data.size();
920 
921 	    Xapian::termpos pos_last;
922 	    if (!unpack_uint(&pos, end, &pos_last)) {
923 		if (out)
924 		    *out << tablename << " table: Position list data corrupt"
925 			 << endl;
926 		++errors;
927 		continue;
928 	    }
929 	    if (pos == end) {
930 		// Special case for single entry position list.
931 	    } else {
932 		// Skip the header we just read.
933 		BitReader rd(data, pos - data.data());
934 		Xapian::termpos pos_first = rd.decode(pos_last);
935 		Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
936 		rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
937 		Xapian::termpos p = rd.decode_interpolative_next();
938 		bool ok = true;
939 		while (p != pos_last) {
940 		    Xapian::termpos pos_prev = p;
941 		    p = rd.decode_interpolative_next();
942 		    if (p <= pos_prev) {
943 			if (out)
944 			    *out << tablename << " table: Positions not "
945 				    "strictly monotonically increasing" << endl;
946 			++errors;
947 			ok = false;
948 			break;
949 		    }
950 		}
951 		if (ok && !rd.check_all_gone()) {
952 		    if (out)
953 			*out << tablename << " table: Junk after position data"
954 			     << endl;
955 		    ++errors;
956 		}
957 	    }
958 	}
959     } else {
960 	if (out)
961 	    *out << tablename << " table: Don't know how to check structure\n"
962 		 << endl;
963 	return errors;
964     }
965 
966     if (out) {
967 	if (!errors)
968 	    *out << tablename << " table structure checked OK\n";
969 	else
970 	    *out << tablename << " table errors found: " << errors << "\n";
971 	*out << endl;
972     }
973 
974     return errors;
975 }
976