1 /** @file xapian-check-chert.cc
2 * @brief Check consistency of a chert table.
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2016 Olly Betts
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
21 */
22
23 #include <config.h>
24
25 #include "xapian-check-chert.h"
26
27 #include "bitstream.h"
28
29 #include "internaltypes.h"
30
31 #include "chert_check.h"
32 #include "chert_cursor.h"
33 #include "chert_table.h"
34 #include "chert_types.h"
35 #include "pack.h"
36 #include "valuestats.h"
37
38 #include <xapian.h>
39
40 #include "autoptr.h"
41 #include <iostream>
42
43 using namespace std;
44
/** Test whether a postlist table key belongs to a user metadata entry.
 *
 *  Such keys are recognised purely by their first two bytes: a zero byte
 *  followed by 0xc0.  Keys shorter than two bytes never match.
 *
 *  @param key  The raw key to classify.
 *
 *  @return true if @a key starts with the user metadata prefix.
 */
static inline bool
is_user_metadata_key(const string & key)
{
    // Too short to hold the two byte prefix.
    if (key.size() < 2)
        return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
50
51 struct VStats : public ValueStats {
52 Xapian::doccount freq_real;
53
VStatsVStats54 VStats() : ValueStats(), freq_real(0) {}
55 };
56
57 size_t
check_chert_table(const char * tablename,string filename,chert_revision_number_t * rev_ptr,int opts,vector<Xapian::termcount> & doclens,Xapian::doccount doccount,Xapian::docid db_last_docid)58 check_chert_table(const char * tablename, string filename,
59 chert_revision_number_t * rev_ptr, int opts,
60 vector<Xapian::termcount> & doclens,
61 Xapian::doccount doccount, Xapian::docid db_last_docid)
62 {
63 filename += '.';
64
65 // Check the btree structure.
66 ChertTableCheck::check(tablename, filename, rev_ptr, opts);
67
68 // Now check the chert structures inside the btree.
69 ChertTable table(tablename, filename, true);
70 if (rev_ptr) {
71 table.open(*rev_ptr);
72 } else {
73 table.open();
74 }
75 AutoPtr<ChertCursor> cursor(table.cursor_get());
76
77 size_t errors = 0;
78
79 cursor->find_entry(string());
80 cursor->next(); // Skip the empty entry.
81
82 if (strcmp(tablename, "postlist") == 0) {
83 // Now check the structure of each postlist in the table.
84 map<Xapian::valueno, VStats> valuestats;
85 string current_term;
86 Xapian::docid lastdid = 0;
87 Xapian::termcount termfreq = 0, collfreq = 0;
88 Xapian::termcount tf = 0, cf = 0;
89 Xapian::doccount num_doclens = 0;
90 bool have_metainfo_key = false;
91
92 // The first key/tag pair should be the METAINFO - though this may be
93 // missing if the table only contains user-metadata.
94 if (!cursor->after_end()) {
95 if (cursor->current_key == string("", 1)) {
96 have_metainfo_key = true;
97 cursor->read_tag();
98 // Check format of the METAINFO key.
99 totlen_t total_doclen;
100 Xapian::docid last_docid;
101 Xapian::termcount doclen_lbound;
102 Xapian::termcount doclen_ubound;
103 Xapian::termcount wdf_ubound;
104
105 const char * data = cursor->current_tag.data();
106 const char * end = data + cursor->current_tag.size();
107 if (!unpack_uint(&data, end, &last_docid)) {
108 cout << "Tag containing meta information is corrupt (couldn't read last_docid)." << endl;
109 ++errors;
110 } else if (!unpack_uint(&data, end, &doclen_lbound)) {
111 cout << "Tag containing meta information is corrupt (couldn't read doclen_lbound)." << endl;
112 ++errors;
113 } else if (!unpack_uint(&data, end, &wdf_ubound)) {
114 cout << "Tag containing meta information is corrupt (couldn't read wdf_ubound)." << endl;
115 ++errors;
116 } else if (!unpack_uint(&data, end, &doclen_ubound)) {
117 cout << "Tag containing meta information is corrupt (couldn't read doclen_ubound)." << endl;
118 ++errors;
119 } else if (!unpack_uint_last(&data, end, &total_doclen)) {
120 cout << "Tag containing meta information is corrupt (couldn't read total_doclen)." << endl;
121 ++errors;
122 } else if (data != end) {
123 cout << "Tag containing meta information is corrupt (junk at end)." << endl;
124 ++errors;
125 }
126 cursor->next();
127 }
128 }
129
130 bool seen_doclen_initial_chunk = false;
131 for ( ; !cursor->after_end(); cursor->next()) {
132 string & key = cursor->current_key;
133
134 if (is_user_metadata_key(key)) {
135 // User metadata can be anything, so we can't do any particular
136 // checks on it other than to check that the tag isn't empty.
137 cursor->read_tag();
138 if (cursor->current_tag.empty()) {
139 cout << "User metadata item is empty" << endl;
140 ++errors;
141 }
142 continue;
143 }
144
145 if (!have_metainfo_key) {
146 have_metainfo_key = true;
147 cout << "METAINFO key missing from postlist table" << endl;
148 ++errors;
149 }
150
151 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
152 // doclen chunk
153 const char * pos, * end;
154 Xapian::docid did = 1;
155 if (key.size() > 2) {
156 // Non-initial chunk.
157 if (!seen_doclen_initial_chunk) {
158 cout << "Doclen initial chunk missing" << endl;
159 ++errors;
160 }
161 pos = key.data();
162 end = pos + key.size();
163 pos += 2;
164 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
165 cout << "Error unpacking docid from doclen key" << endl;
166 ++errors;
167 continue;
168 }
169 }
170 seen_doclen_initial_chunk = true;
171
172 cursor->read_tag();
173 pos = cursor->current_tag.data();
174 end = pos + cursor->current_tag.size();
175 if (key.size() == 2) {
176 // Initial chunk.
177 if (end - pos < 2 || pos[0] || pos[1]) {
178 cout << "Initial doclen chunk has nonzero dummy fields" << endl;
179 ++errors;
180 continue;
181 }
182 pos += 2;
183 if (!unpack_uint(&pos, end, &did)) {
184 cout << "Failed to unpack firstdid for doclen" << endl;
185 ++errors;
186 continue;
187 }
188 ++did;
189 if (did <= lastdid) {
190 cout << "First did in this chunk is <= last in "
191 "prev chunk" << endl;
192 ++errors;
193 }
194 }
195
196 bool is_last_chunk;
197 if (!unpack_bool(&pos, end, &is_last_chunk)) {
198 cout << "Failed to unpack last chunk flag for doclen" << endl;
199 ++errors;
200 continue;
201 }
202 // Read what the final document ID in this chunk is.
203 if (!unpack_uint(&pos, end, &lastdid)) {
204 cout << "Failed to unpack increase to last" << endl;
205 ++errors;
206 continue;
207 }
208 lastdid += did;
209 bool bad = false;
210 while (true) {
211 Xapian::termcount doclen;
212 if (!unpack_uint(&pos, end, &doclen)) {
213 cout << "Failed to unpack doclen" << endl;
214 ++errors;
215 bad = true;
216 break;
217 }
218
219 ++num_doclens;
220
221 if (did > db_last_docid) {
222 cout << "document id " << did << " in doclen stream "
223 << "is larger than get_last_docid() "
224 << db_last_docid << endl;
225 ++errors;
226 }
227
228 if (!doclens.empty()) {
229 // In chert, a document without terms doesn't get a
230 // termlist entry.
231 Xapian::termcount termlist_doclen = 0;
232 if (did < doclens.size())
233 termlist_doclen = doclens[did];
234
235 if (doclen != termlist_doclen) {
236 cout << "document id " << did << ": length "
237 << doclen << " doesn't match "
238 << termlist_doclen << " in the termlist table"
239 << endl;
240 ++errors;
241 }
242 }
243
244 if (pos == end) break;
245
246 Xapian::docid inc;
247 if (!unpack_uint(&pos, end, &inc)) {
248 cout << "Failed to unpack docid increase" << endl;
249 ++errors;
250 bad = true;
251 break;
252 }
253 ++inc;
254 did += inc;
255 if (did > lastdid) {
256 cout << "docid " << did << " > last docid " << lastdid
257 << endl;
258 ++errors;
259 }
260 }
261 if (bad) {
262 continue;
263 }
264 if (is_last_chunk) {
265 if (did != lastdid) {
266 cout << "lastdid " << lastdid << " != last did " << did
267 << endl;
268 ++errors;
269 }
270 }
271
272 continue;
273 }
274
275 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
276 // Value stats.
277 const char * p = key.data();
278 const char * end = p + key.length();
279 p += 2;
280 Xapian::valueno slot;
281 if (!unpack_uint_last(&p, end, &slot)) {
282 cout << "Bad valuestats key (no slot)" << endl;
283 ++errors;
284 continue;
285 }
286
287 cursor->read_tag();
288 p = cursor->current_tag.data();
289 end = p + cursor->current_tag.size();
290
291 VStats & v = valuestats[slot];
292 if (!unpack_uint(&p, end, &v.freq)) {
293 if (*p == 0) {
294 cout << "Incomplete stats item in value table" << endl;
295 } else {
296 cout << "Frequency statistic in value table is too large" << endl;
297 }
298 ++errors;
299 continue;
300 }
301 if (!unpack_string(&p, end, v.lower_bound)) {
302 if (*p == 0) {
303 cout << "Incomplete stats item in value table" << endl;
304 } else {
305 cout << "Lower bound statistic in value table is too large" << endl;
306 }
307 ++errors;
308 continue;
309 }
310 size_t len = end - p;
311 if (len == 0) {
312 v.upper_bound = v.lower_bound;
313 } else {
314 v.upper_bound.assign(p, len);
315 }
316
317 continue;
318 }
319
320 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
321 // Value stream chunk.
322 const char * p = key.data();
323 const char * end = p + key.length();
324 p += 2;
325 Xapian::valueno slot;
326 if (!unpack_uint(&p, end, &slot)) {
327 cout << "Bad value chunk key (no slot)" << endl;
328 ++errors;
329 continue;
330 }
331 Xapian::docid did;
332 if (!unpack_uint_preserving_sort(&p, end, &did)) {
333 cout << "Bad value chunk key (no docid)" << endl;
334 ++errors;
335 continue;
336 }
337 if (p != end) {
338 cout << "Bad value chunk key (trailing junk)" << endl;
339 ++errors;
340 continue;
341 }
342
343 VStats & v = valuestats[slot];
344
345 cursor->read_tag();
346 p = cursor->current_tag.data();
347 end = p + cursor->current_tag.size();
348
349 while (true) {
350 string value;
351 if (!unpack_string(&p, end, value)) {
352 cout << "Failed to unpack value from chunk" << endl;
353 ++errors;
354 break;
355 }
356
357 ++v.freq_real;
358
359 // FIXME: Cross-check that docid did has value slot (and
360 // vice versa - that there's a value here if the slot entry
361 // says so).
362
363 // FIXME: Check if the bounds are tight? Or is that better
364 // as a separate tool which can also update the bounds?
365 if (value < v.lower_bound) {
366 cout << "Value slot " << slot << " has value below "
367 "lower bound: '" << value << "' < '"
368 << v.lower_bound << "'" << endl;
369 ++errors;
370 } else if (value > v.upper_bound) {
371 cout << "Value slot " << slot << " has value above "
372 "upper bound: '" << value << "' > '"
373 << v.upper_bound << "'" << endl;
374 ++errors;
375 }
376
377 if (p == end) break;
378 Xapian::docid delta;
379 if (!unpack_uint(&p, end, &delta)) {
380 cout << "Failed to unpack docid delta from chunk" << endl;
381 ++errors;
382 break;
383 }
384 Xapian::docid new_did = did + delta + 1;
385 if (new_did <= did) {
386 cout << "docid overflowed in value chunk" << endl;
387 ++errors;
388 break;
389 }
390 did = new_did;
391
392 if (did > db_last_docid) {
393 cout << "document id " << did << " in value chunk "
394 << "is larger than get_last_docid() "
395 << db_last_docid << endl;
396 ++errors;
397 }
398 }
399 continue;
400 }
401
402 const char * pos, * end;
403
404 // Get term from key.
405 pos = key.data();
406 end = pos + key.size();
407
408 string term;
409 Xapian::docid did;
410 if (!unpack_string_preserving_sort(&pos, end, term)) {
411 cout << "Error unpacking termname from key" << endl;
412 ++errors;
413 continue;
414 }
415 if (!current_term.empty() && term != current_term) {
416 // The term changed unexpectedly.
417 if (pos == end) {
418 cout << "No last chunk for term `" << current_term
419 << "'" << endl;
420 current_term.resize(0);
421 } else {
422 cout << "Mismatch in follow-on chunk in posting "
423 "list for term `" << current_term << "' (got `"
424 << term << "')" << endl;
425 current_term = term;
426 tf = cf = 0;
427 lastdid = 0;
428 }
429 ++errors;
430 }
431 if (pos == end) {
432 // First chunk.
433 if (term == current_term) {
434 // This probably isn't possible.
435 cout << "First posting list chunk for term `"
436 << term << "' follows previous chunk for the same "
437 "term" << endl;
438 ++errors;
439 }
440 current_term = term;
441 tf = cf = 0;
442
443 // Unpack extra header from first chunk.
444 cursor->read_tag();
445 pos = cursor->current_tag.data();
446 end = pos + cursor->current_tag.size();
447 if (!unpack_uint(&pos, end, &termfreq)) {
448 cout << "Failed to unpack termfreq for term `" << term
449 << "'" << endl;
450 ++errors;
451 continue;
452 }
453 if (!unpack_uint(&pos, end, &collfreq)) {
454 cout << "Failed to unpack collfreq for term `" << term
455 << "'" << endl;
456 ++errors;
457 continue;
458 }
459 if (!unpack_uint(&pos, end, &did)) {
460 cout << "Failed to unpack firstdid for term `" << term
461 << "'" << endl;
462 ++errors;
463 continue;
464 }
465 ++did;
466 } else {
467 // Continuation chunk.
468 if (current_term.empty()) {
469 cout << "First chunk for term `" << current_term << "' "
470 "is a continuation chunk" << endl;
471 ++errors;
472 current_term = term;
473 }
474 AssertEq(current_term, term);
475 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
476 cout << "Failed to unpack did from key" << endl;
477 ++errors;
478 continue;
479 }
480 if (did <= lastdid) {
481 cout << "First did in this chunk is <= last in "
482 "prev chunk" << endl;
483 ++errors;
484 }
485 cursor->read_tag();
486 pos = cursor->current_tag.data();
487 end = pos + cursor->current_tag.size();
488 }
489
490 bool is_last_chunk;
491 if (!unpack_bool(&pos, end, &is_last_chunk)) {
492 cout << "Failed to unpack last chunk flag" << endl;
493 ++errors;
494 continue;
495 }
496 // Read what the final document ID in this chunk is.
497 if (!unpack_uint(&pos, end, &lastdid)) {
498 cout << "Failed to unpack increase to last" << endl;
499 ++errors;
500 continue;
501 }
502 lastdid += did;
503 bool bad = false;
504 while (true) {
505 Xapian::termcount wdf;
506 if (!unpack_uint(&pos, end, &wdf)) {
507 cout << "Failed to unpack wdf" << endl;
508 ++errors;
509 bad = true;
510 break;
511 }
512 ++tf;
513 cf += wdf;
514
515 if (pos == end) break;
516
517 Xapian::docid inc;
518 if (!unpack_uint(&pos, end, &inc)) {
519 cout << "Failed to unpack docid increase" << endl;
520 ++errors;
521 bad = true;
522 break;
523 }
524 ++inc;
525 did += inc;
526 if (did > lastdid) {
527 cout << "docid " << did << " > last docid " << lastdid
528 << endl;
529 ++errors;
530 }
531 }
532 if (bad) {
533 continue;
534 }
535 if (is_last_chunk) {
536 if (tf != termfreq) {
537 cout << "termfreq " << termfreq << " != # of entries "
538 << tf << endl;
539 ++errors;
540 }
541 if (cf != collfreq) {
542 cout << "collfreq " << collfreq << " != sum wdf " << cf
543 << endl;
544 ++errors;
545 }
546 if (did != lastdid) {
547 cout << "lastdid " << lastdid << " != last did " << did
548 << endl;
549 ++errors;
550 }
551 current_term.resize(0);
552 }
553 }
554 if (!current_term.empty()) {
555 cout << "Last term `" << current_term << "' has no last chunk"
556 << endl;
557 ++errors;
558 }
559
560 if (num_doclens != doccount && doccount != Xapian::doccount(-1)) {
561 cout << "Document length list has " << num_doclens
562 << " entries, should be " << doccount << endl;
563 ++errors;
564 }
565
566 map<Xapian::valueno, VStats>::const_iterator i;
567 for (i = valuestats.begin(); i != valuestats.end(); ++i) {
568 if (i->second.freq != i->second.freq_real) {
569 cout << "Value stats frequency for slot " << i->first << " is "
570 << i->second.freq << " but recounting gives "
571 << i->second.freq_real << endl;
572 ++errors;
573 }
574 }
575 } else if (strcmp(tablename, "record") == 0) {
576 if (table.get_entry_count() != doccount &&
577 doccount != Xapian::doccount(-1)) {
578 cout << "Document data entry count (" << table.get_entry_count()
579 << ") != get_doccount() (" << doccount << ")" << endl;
580 ++errors;
581 }
582
583 // Now check the contents of the record table. Any data is valid as
584 // the tag so we don't check the tags.
585 for ( ; !cursor->after_end(); cursor->next()) {
586 string & key = cursor->current_key;
587
588 // Get docid from key.
589 const char * pos = key.data();
590 const char * end = pos + key.size();
591
592 Xapian::docid did;
593 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
594 cout << "Error unpacking docid from key" << endl;
595 ++errors;
596 } else if (pos != end) {
597 cout << "Extra junk in key" << endl;
598 ++errors;
599 } else {
600 if (did > db_last_docid) {
601 cout << "document id " << did << " in docdata table "
602 "is larger than get_last_docid() "
603 << db_last_docid << endl;
604 ++errors;
605 }
606 }
607 }
608 } else if (strcmp(tablename, "termlist") == 0) {
609 // Now check the contents of the termlist table.
610 Xapian::doccount num_termlists = 0;
611 Xapian::doccount num_slotsused_entries = 0;
612 for ( ; !cursor->after_end(); cursor->next()) {
613 string & key = cursor->current_key;
614
615 // Get docid from key.
616 const char * pos = key.data();
617 const char * end = pos + key.size();
618
619 Xapian::docid did;
620 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
621 cout << "Error unpacking docid from key" << endl;
622 ++errors;
623 continue;
624 }
625
626 if (did > db_last_docid) {
627 cout << "document id " << did << " in termlist table "
628 "is larger than get_last_docid() "
629 << db_last_docid << endl;
630 ++errors;
631 }
632
633 if (end - pos == 1 && *pos == '\0') {
634 // Value slots used entry.
635 ++num_slotsused_entries;
636 cursor->read_tag();
637
638 pos = cursor->current_tag.data();
639 end = pos + cursor->current_tag.size();
640
641 if (pos == end) {
642 cout << "Empty value slots used tag" << endl;
643 ++errors;
644 continue;
645 }
646
647 Xapian::valueno prev_slot;
648 if (!unpack_uint(&pos, end, &prev_slot)) {
649 cout << "Value slot encoding corrupt" << endl;
650 ++errors;
651 continue;
652 }
653
654 while (pos != end) {
655 Xapian::valueno slot;
656 if (!unpack_uint(&pos, end, &slot)) {
657 cout << "Value slot encoding corrupt" << endl;
658 ++errors;
659 break;
660 }
661 slot += prev_slot + 1;
662 if (slot <= prev_slot) {
663 cout << "Value slot number overflowed (" << prev_slot << " -> " << slot << ")" << endl;
664 ++errors;
665 }
666 prev_slot = slot;
667 }
668 continue;
669 }
670
671 if (pos != end) {
672 cout << "Extra junk in key" << endl;
673 ++errors;
674 continue;
675 }
676
677 ++num_termlists;
678 cursor->read_tag();
679
680 pos = cursor->current_tag.data();
681 end = pos + cursor->current_tag.size();
682
683 if (pos == end) {
684 // Empty termlist.
685 continue;
686 }
687
688 Xapian::termcount doclen, termlist_size;
689
690 // Read doclen
691 if (!unpack_uint(&pos, end, &doclen)) {
692 if (pos != 0) {
693 cout << "doclen out of range" << endl;
694 } else {
695 cout << "Unexpected end of data when reading doclen" << endl;
696 }
697 ++errors;
698 continue;
699 }
700
701 // Read termlist_size
702 if (!unpack_uint(&pos, end, &termlist_size)) {
703 if (pos != 0) {
704 cout << "termlist_size out of range" << endl;
705 } else {
706 cout << "Unexpected end of data when reading termlist_size" << endl;
707 }
708 ++errors;
709 continue;
710 }
711
712 Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
713 string current_tname;
714
715 bool bad = false;
716 while (pos != end) {
717 Xapian::doccount current_wdf = 0;
718 bool got_wdf = false;
719 // If there was a previous term, how much to reuse.
720 if (!current_tname.empty()) {
721 string::size_type len = static_cast<unsigned char>(*pos++);
722 if (len > current_tname.length()) {
723 // The wdf was squeezed into the same byte.
724 current_wdf = len / (current_tname.length() + 1) - 1;
725 len %= (current_tname.length() + 1);
726 got_wdf = true;
727 }
728 current_tname.resize(len);
729 }
730 // What to append (note len must be positive, since just truncating
731 // always takes us backwards in the sort order)
732 string::size_type len = static_cast<unsigned char>(*pos++);
733 current_tname.append(pos, len);
734 pos += len;
735
736 if (!got_wdf) {
737 // Read wdf
738 if (!unpack_uint(&pos, end, ¤t_wdf)) {
739 if (pos == 0) {
740 cout << "Unexpected end of data when reading termlist current_wdf" << endl;
741 } else {
742 cout << "Size of wdf out of range, in termlist" << endl;
743 }
744 ++errors;
745 bad = true;
746 break;
747 }
748 }
749
750 ++actual_termlist_size;
751 actual_doclen += current_wdf;
752 }
753 if (bad) {
754 continue;
755 }
756
757 if (termlist_size != actual_termlist_size) {
758 cout << "termlist_size != # of entries in termlist" << endl;
759 ++errors;
760 }
761 if (doclen != actual_doclen) {
762 cout << "doclen != sum(wdf)" << endl;
763 ++errors;
764 }
765
766 // + 1 so that did is a valid subscript.
767 if (doclens.size() <= did) doclens.resize(did + 1);
768 doclens[did] = actual_doclen;
769 }
770
771 if (num_termlists != doccount && doccount != Xapian::doccount(-1)) {
772 cout << "Number of termlists (" << num_termlists
773 << ") != get_doccount() (" << doccount << ")" << endl;
774 ++errors;
775 }
776
777 // chert doesn't store a valueslots used entry if there are no terms,
778 // so we can only check there aren't more such entries than documents.
779 if (num_slotsused_entries > doccount &&
780 doccount != Xapian::doccount(-1)) {
781 cout << "More slots-used entries (" << num_slotsused_entries
782 << ") then documents (" << doccount << ")" << endl;
783 ++errors;
784 }
785 } else if (strcmp(tablename, "position") == 0) {
786 // Now check the contents of the position table.
787 for ( ; !cursor->after_end(); cursor->next()) {
788 string & key = cursor->current_key;
789
790 // Get docid from key.
791 const char * pos = key.data();
792 const char * end = pos + key.size();
793
794 Xapian::docid did;
795 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
796 cout << "Error unpacking docid from key" << endl;
797 ++errors;
798 continue;
799 }
800
801 if (did > db_last_docid) {
802 cout << "document id " << did << " in position table "
803 "is larger than get_last_docid() "
804 << db_last_docid << endl;
805 ++errors;
806 } else if (!doclens.empty()) {
807 // In chert, a document without terms doesn't get a
808 // termlist entry, so we can't tell the difference
809 // easily.
810 if (did >= doclens.size() || doclens[did] == 0) {
811 cout << "Position list entry for document " << did
812 << " which doesn't exist or has no terms" << endl;
813 ++errors;
814 }
815 }
816
817 if (pos == end) {
818 cout << "No termname in key" << endl;
819 ++errors;
820 continue;
821 }
822
823 cursor->read_tag();
824
825 const string & data = cursor->current_tag;
826 pos = data.data();
827 end = pos + data.size();
828
829 Xapian::termpos pos_last;
830 if (!unpack_uint(&pos, end, &pos_last)) {
831 cout << tablename << " table: Position list data corrupt" << endl;
832 ++errors;
833 continue;
834 }
835 if (pos == end) {
836 // Special case for single entry position list.
837 } else {
838 // Skip the header we just read.
839 BitReader rd(data, pos - data.data());
840 Xapian::termpos pos_first = rd.decode(pos_last);
841 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
842 vector<Xapian::termpos> positions;
843 positions.resize(pos_size);
844 positions[0] = pos_first;
845 positions.back() = pos_last;
846 rd.decode_interpolative(positions, 0, pos_size - 1);
847 vector<Xapian::termpos>::const_iterator current_pos = positions.begin();
848 Xapian::termpos lastpos = *current_pos++;
849 while (current_pos != positions.end()) {
850 Xapian::termpos termpos = *current_pos++;
851 if (termpos <= lastpos) {
852 cout << tablename << " table: Positions not strictly monotonically increasing" << endl;
853 ++errors;
854 break;
855 }
856 lastpos = termpos;
857 }
858 }
859 }
860 } else {
861 cout << tablename << " table: Don't know how to check structure\n" << endl;
862 return errors;
863 }
864
865 if (!errors)
866 cout << tablename << " table structure checked OK\n" << endl;
867 else
868 cout << tablename << " table errors found: " << errors << "\n" << endl;
869
870 return errors;
871 }
872