1 /** @file
2 * @brief Check consistency of a glass table.
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2018 Olly Betts
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
21 */
22
23 #include <config.h>
24
25 #include "glass_dbcheck.h"
26
27 #include "bitstream.h"
28
29 #include "internaltypes.h"
30
31 #include "glass_check.h"
32 #include "glass_cursor.h"
33 #include "glass_defs.h"
34 #include "glass_table.h"
35 #include "glass_version.h"
36 #include "pack.h"
37 #include "backends/valuestats.h"
38
39 #include <xapian.h>
40
41 #include "filetests.h"
42 #include "autoptr.h"
43 #include <ostream>
44 #include <vector>
45
46 using namespace std;
47
/** Report whether @a key is a user metadata key.
 *
 *  Such keys are at least two bytes long and begin with the byte pair
 *  0x00 0xc0.
 */
static inline bool
is_user_metadata_key(const string & key)
{
    // Too short to hold the two byte prefix.
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
53
54 struct VStats : public ValueStats {
55 Xapian::doccount freq_real;
56
VStatsVStats57 VStats() : ValueStats(), freq_real(0) {}
58 };
59
60 size_t
check_glass_table(const char * tablename,const string & db_dir,int fd,off_t offset_,const GlassVersion & version_file,int opts,vector<Xapian::termcount> & doclens,ostream * out)61 check_glass_table(const char * tablename, const string &db_dir, int fd,
62 off_t offset_,
63 const GlassVersion & version_file, int opts,
64 vector<Xapian::termcount> & doclens, ostream * out)
65 {
66 Xapian::docid db_last_docid = version_file.get_last_docid();
67 if (out)
68 *out << tablename << ":\n";
69 if (fd < 0) {
70 if (strcmp(tablename, "postlist") != 0) {
71 // Other filenames are created lazily, so may not exist.
72 string filename(db_dir);
73 filename += '/';
74 filename += tablename;
75 filename += "." GLASS_TABLE_EXTENSION;
76 if (!file_exists(filename)) {
77 if (out) {
78 if (strcmp(tablename, "termlist") == 0) {
79 *out << "Not present.\n";
80 } else {
81 *out << "Lazily created, and not yet used.\n";
82 }
83 *out << endl;
84 }
85 return 0;
86 }
87 }
88 }
89
90 // Check the btree structure.
91 AutoPtr<GlassTable> table(
92 GlassTableCheck::check(tablename, db_dir, fd, offset_,
93 version_file, opts, out));
94
95 // Now check the glass structures inside the btree.
96 AutoPtr<GlassCursor> cursor(table->cursor_get());
97
98 size_t errors = 0;
99
100 cursor->find_entry(string());
101 cursor->next(); // Skip the empty entry.
102
103 if (strcmp(tablename, "postlist") == 0) {
104 // Now check the structure of each postlist in the table.
105 map<Xapian::valueno, VStats> valuestats;
106 string current_term;
107 Xapian::docid lastdid = 0;
108 Xapian::termcount termfreq = 0, collfreq = 0;
109 Xapian::termcount tf = 0, cf = 0;
110 Xapian::doccount num_doclens = 0;
111
112 for ( ; !cursor->after_end(); cursor->next()) {
113 string & key = cursor->current_key;
114
115 if (is_user_metadata_key(key)) {
116 // User metadata can be anything, so we can't do any particular
117 // checks on it other than to check that the tag isn't empty.
118 cursor->read_tag();
119 if (cursor->current_tag.empty()) {
120 if (out)
121 *out << "User metadata item is empty" << endl;
122 ++errors;
123 }
124 continue;
125 }
126
127 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
128 // doclen chunk
129 const char * pos, * end;
130 Xapian::docid did = 1;
131 if (key.size() > 2) {
132 // Non-initial chunk.
133 pos = key.data();
134 end = pos + key.size();
135 pos += 2;
136 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
137 if (out)
138 *out << "Error unpacking docid from doclen key" << endl;
139 ++errors;
140 continue;
141 }
142 if (did <= lastdid) {
143 if (out)
144 *out << "First did in this doclen chunk is <= last in "
145 "prev chunk" << endl;
146 ++errors;
147 }
148 }
149
150 cursor->read_tag();
151 pos = cursor->current_tag.data();
152 end = pos + cursor->current_tag.size();
153 if (key.size() == 2) {
154 // Initial chunk.
155 if (end - pos < 2 || pos[0] || pos[1]) {
156 if (out)
157 *out << "Initial doclen chunk has nonzero dummy fields" << endl;
158 ++errors;
159 continue;
160 }
161 pos += 2;
162 if (!unpack_uint(&pos, end, &did)) {
163 if (out)
164 *out << "Failed to unpack firstdid for doclen" << endl;
165 ++errors;
166 continue;
167 }
168 ++did;
169 }
170
171 bool is_last_chunk;
172 if (!unpack_bool(&pos, end, &is_last_chunk)) {
173 if (out)
174 *out << "Failed to unpack last chunk flag for doclen" << endl;
175 ++errors;
176 continue;
177 }
178 // Read what the final document ID in this chunk is.
179 if (!unpack_uint(&pos, end, &lastdid)) {
180 if (out)
181 *out << "Failed to unpack increase to last" << endl;
182 ++errors;
183 continue;
184 }
185 lastdid += did;
186 bool bad = false;
187 while (true) {
188 Xapian::termcount doclen;
189 if (!unpack_uint(&pos, end, &doclen)) {
190 if (out)
191 *out << "Failed to unpack doclen" << endl;
192 ++errors;
193 bad = true;
194 break;
195 }
196
197 ++num_doclens;
198
199 if (did > db_last_docid) {
200 if (out)
201 *out << "document id " << did << " in doclen "
202 "stream is larger than get_last_docid() "
203 << db_last_docid << endl;
204 ++errors;
205 }
206
207 if (!doclens.empty()) {
208 // In glass, a document without terms doesn't get a
209 // termlist entry.
210 Xapian::termcount termlist_doclen = 0;
211 if (did < doclens.size())
212 termlist_doclen = doclens[did];
213
214 if (doclen != termlist_doclen) {
215 if (out)
216 *out << "document id " << did << ": length "
217 << doclen << " doesn't match "
218 << termlist_doclen << " in the termlist "
219 "table" << endl;
220 ++errors;
221 }
222 }
223
224 if (pos == end) break;
225
226 Xapian::docid inc;
227 if (!unpack_uint(&pos, end, &inc)) {
228 if (out)
229 *out << "Failed to unpack docid increase" << endl;
230 ++errors;
231 bad = true;
232 break;
233 }
234 ++inc;
235 did += inc;
236 if (did > lastdid) {
237 if (out)
238 *out << "docid " << did << " > last docid "
239 << lastdid << endl;
240 ++errors;
241 }
242 }
243 if (bad) {
244 continue;
245 }
246 if (is_last_chunk) {
247 if (did != lastdid) {
248 if (out)
249 *out << "lastdid " << lastdid << " != last did "
250 << did << endl;
251 ++errors;
252 }
253 }
254
255 continue;
256 }
257
258 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
259 // Value stats.
260 const char * p = key.data();
261 const char * end = p + key.length();
262 p += 2;
263 Xapian::valueno slot;
264 if (!unpack_uint_last(&p, end, &slot)) {
265 if (out)
266 *out << "Bad valuestats key (no slot)" << endl;
267 ++errors;
268 continue;
269 }
270
271 cursor->read_tag();
272 p = cursor->current_tag.data();
273 end = p + cursor->current_tag.size();
274
275 VStats & v = valuestats[slot];
276 if (!unpack_uint(&p, end, &v.freq)) {
277 if (out) {
278 if (*p == 0) {
279 *out << "Incomplete stats item in value table";
280 } else {
281 *out << "Frequency statistic in value table is too large";
282 }
283 *out << endl;
284 }
285 ++errors;
286 continue;
287 }
288 if (!unpack_string(&p, end, v.lower_bound)) {
289 if (out) {
290 if (*p == 0) {
291 *out << "Incomplete stats item in value table";
292 } else {
293 *out << "Lower bound statistic in value table is too large";
294 }
295 *out << endl;
296 }
297 ++errors;
298 continue;
299 }
300 size_t len = end - p;
301 if (len == 0) {
302 v.upper_bound = v.lower_bound;
303 } else {
304 v.upper_bound.assign(p, len);
305 }
306
307 continue;
308 }
309
310 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
311 // Value stream chunk.
312 const char * p = key.data();
313 const char * end = p + key.length();
314 p += 2;
315 Xapian::valueno slot;
316 if (!unpack_uint(&p, end, &slot)) {
317 if (out)
318 *out << "Bad value chunk key (no slot)" << endl;
319 ++errors;
320 continue;
321 }
322 Xapian::docid did;
323 if (!unpack_uint_preserving_sort(&p, end, &did)) {
324 if (out)
325 *out << "Bad value chunk key (no docid)" << endl;
326 ++errors;
327 continue;
328 }
329 if (p != end) {
330 if (out)
331 *out << "Bad value chunk key (trailing junk)" << endl;
332 ++errors;
333 continue;
334 }
335
336 VStats & v = valuestats[slot];
337
338 cursor->read_tag();
339 p = cursor->current_tag.data();
340 end = p + cursor->current_tag.size();
341
342 while (true) {
343 string value;
344 if (!unpack_string(&p, end, value)) {
345 if (out)
346 *out << "Failed to unpack value from chunk" << endl;
347 ++errors;
348 break;
349 }
350
351 ++v.freq_real;
352
353 // FIXME: Cross-check that docid did has value slot (and
354 // vice versa - that there's a value here if the slot entry
355 // says so).
356
357 // FIXME: Check if the bounds are tight? Or is that better
358 // as a separate tool which can also update the bounds?
359 if (value < v.lower_bound) {
360 if (out)
361 *out << "Value slot " << slot << " has value "
362 "below lower bound: '" << value << "' < '"
363 << v.lower_bound << "'" << endl;
364 ++errors;
365 } else if (value > v.upper_bound) {
366 if (out)
367 *out << "Value slot " << slot << " has value "
368 "above upper bound: '" << value << "' > '"
369 << v.upper_bound << "'" << endl;
370 ++errors;
371 }
372
373 if (p == end) break;
374 Xapian::docid delta;
375 if (!unpack_uint(&p, end, &delta)) {
376 if (out)
377 *out << "Failed to unpack docid delta from chunk"
378 << endl;
379 ++errors;
380 break;
381 }
382 Xapian::docid new_did = did + delta + 1;
383 if (new_did <= did) {
384 if (out)
385 *out << "docid overflowed in value chunk" << endl;
386 ++errors;
387 break;
388 }
389 did = new_did;
390
391 if (did > db_last_docid) {
392 if (out)
393 *out << "document id " << did << " in value chunk "
394 "is larger than get_last_docid() "
395 << db_last_docid << endl;
396 ++errors;
397 }
398 }
399 continue;
400 }
401
402 const char * pos, * end;
403
404 // Get term from key.
405 pos = key.data();
406 end = pos + key.size();
407
408 string term;
409 Xapian::docid did;
410 if (!unpack_string_preserving_sort(&pos, end, term)) {
411 if (out)
412 *out << "Error unpacking termname from key" << endl;
413 ++errors;
414 continue;
415 }
416 if (!current_term.empty() && term != current_term) {
417 // The term changed unexpectedly.
418 if (pos == end) {
419 if (out)
420 *out << "No last chunk for term '" << current_term
421 << "'" << endl;
422 current_term.resize(0);
423 } else {
424 if (out)
425 *out << "Mismatch in follow-on chunk in posting list "
426 "for term '" << current_term << "' (got '"
427 << term << "')" << endl;
428 current_term = term;
429 tf = cf = 0;
430 lastdid = 0;
431 }
432 ++errors;
433 }
434 if (pos == end) {
435 // First chunk.
436 if (term == current_term) {
437 // This probably isn't possible.
438 if (out)
439 *out << "First posting list chunk for term '" << term
440 << "' follows previous chunk for the same term"
441 << endl;
442 ++errors;
443 }
444 current_term = term;
445 tf = cf = 0;
446
447 // Unpack extra header from first chunk.
448 cursor->read_tag();
449 pos = cursor->current_tag.data();
450 end = pos + cursor->current_tag.size();
451 if (!unpack_uint(&pos, end, &termfreq)) {
452 if (out)
453 *out << "Failed to unpack termfreq for term '" << term
454 << "'" << endl;
455 ++errors;
456 continue;
457 }
458 if (!unpack_uint(&pos, end, &collfreq)) {
459 if (out)
460 *out << "Failed to unpack collfreq for term '" << term
461 << "'" << endl;
462 ++errors;
463 continue;
464 }
465 if (!unpack_uint(&pos, end, &did)) {
466 if (out)
467 *out << "Failed to unpack firstdid for term '" << term
468 << "'" << endl;
469 ++errors;
470 continue;
471 }
472 ++did;
473 } else {
474 // Continuation chunk.
475 if (current_term.empty()) {
476 if (out)
477 *out << "First chunk for term '" << current_term
478 << "' is a continuation chunk" << endl;
479 ++errors;
480 current_term = term;
481 }
482 AssertEq(current_term, term);
483 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
484 if (out)
485 *out << "Failed to unpack did from key" << endl;
486 ++errors;
487 continue;
488 }
489 if (did <= lastdid) {
490 if (out)
491 *out << "First did in this chunk is <= last in "
492 "prev chunk" << endl;
493 ++errors;
494 }
495 cursor->read_tag();
496 pos = cursor->current_tag.data();
497 end = pos + cursor->current_tag.size();
498 }
499
500 bool is_last_chunk;
501 if (!unpack_bool(&pos, end, &is_last_chunk)) {
502 if (out)
503 *out << "Failed to unpack last chunk flag" << endl;
504 ++errors;
505 continue;
506 }
507 // Read what the final document ID in this chunk is.
508 if (!unpack_uint(&pos, end, &lastdid)) {
509 if (out)
510 *out << "Failed to unpack increase to last" << endl;
511 ++errors;
512 continue;
513 }
514 lastdid += did;
515 bool bad = false;
516 while (true) {
517 Xapian::termcount wdf;
518 if (!unpack_uint(&pos, end, &wdf)) {
519 if (out)
520 *out << "Failed to unpack wdf" << endl;
521 ++errors;
522 bad = true;
523 break;
524 }
525 ++tf;
526 cf += wdf;
527
528 if (pos == end) break;
529
530 Xapian::docid inc;
531 if (!unpack_uint(&pos, end, &inc)) {
532 if (out)
533 *out << "Failed to unpack docid increase" << endl;
534 ++errors;
535 bad = true;
536 break;
537 }
538 ++inc;
539 did += inc;
540 if (did > lastdid) {
541 if (out)
542 *out << "docid " << did << " > last docid " << lastdid
543 << endl;
544 ++errors;
545 }
546 }
547 if (bad) {
548 continue;
549 }
550 if (is_last_chunk) {
551 if (tf != termfreq) {
552 if (out)
553 *out << "termfreq " << termfreq << " != # of entries "
554 << tf << endl;
555 ++errors;
556 }
557 if (cf != collfreq) {
558 if (out)
559 *out << "collfreq " << collfreq << " != sum wdf " << cf
560 << endl;
561 ++errors;
562 }
563 if (did != lastdid) {
564 if (out)
565 *out << "lastdid " << lastdid << " != last did " << did
566 << endl;
567 ++errors;
568 }
569 current_term.resize(0);
570 }
571 }
572 if (!current_term.empty()) {
573 if (out)
574 *out << "Last term '" << current_term << "' has no last chunk"
575 << endl;
576 ++errors;
577 }
578
579 Xapian::doccount doccount = version_file.get_doccount();
580 if (num_doclens != doccount) {
581 if (out)
582 *out << "Document length list has " << num_doclens
583 << " entries, should be " << doccount << endl;
584 ++errors;
585 }
586
587 map<Xapian::valueno, VStats>::const_iterator i;
588 for (i = valuestats.begin(); i != valuestats.end(); ++i) {
589 if (i->second.freq != i->second.freq_real) {
590 if (out)
591 *out << "Value stats frequency for slot " << i->first
592 << " is " << i->second.freq << " but recounting "
593 "gives " << i->second.freq_real << endl;
594 ++errors;
595 }
596 }
597 } else if (strcmp(tablename, "docdata") == 0) {
598 // glass doesn't store a docdata entry if the document data is empty,
599 // so we can only check there aren't more docdata entries than
600 // documents.
601 Xapian::doccount doccount = version_file.get_doccount();
602 if (table->get_entry_count() > doccount) {
603 if (out)
604 *out << "More document data (" << table->get_entry_count()
605 << ") then documents (" << doccount << ")" << endl;
606 ++errors;
607 }
608
609 // Now check the contents of the docdata table.
610 for ( ; !cursor->after_end(); cursor->next()) {
611 string & key = cursor->current_key;
612
613 // Get docid from key.
614 const char * pos = key.data();
615 const char * end = pos + key.size();
616
617 Xapian::docid did;
618 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
619 if (out)
620 *out << "Error unpacking docid from key" << endl;
621 ++errors;
622 } else if (pos != end) {
623 if (out)
624 *out << "Extra junk in key" << endl;
625 ++errors;
626 } else {
627 if (did > db_last_docid) {
628 if (out)
629 *out << "document id " << did << " in docdata table "
630 "is larger than get_last_docid() "
631 << db_last_docid << endl;
632 ++errors;
633 }
634 }
635
636 // Fetch and decompress the document data to catch problems with
637 // the splitting into multiple items, corruption of the compressed
638 // data, etc.
639 cursor->read_tag();
640 if (cursor->current_tag.empty()) {
641 // We shouldn't store empty document data.
642 if (out)
643 *out << "Empty document data explicitly stored for "
644 "document id " << did << endl;
645 ++errors;
646 }
647 }
648 } else if (strcmp(tablename, "termlist") == 0) {
649 // Now check the contents of the termlist table.
650 Xapian::doccount num_termlists = 0;
651 Xapian::doccount num_slotsused_entries = 0;
652 for ( ; !cursor->after_end(); cursor->next()) {
653 string & key = cursor->current_key;
654
655 // Get docid from key.
656 const char * pos = key.data();
657 const char * end = pos + key.size();
658
659 Xapian::docid did;
660 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
661 if (out)
662 *out << "Error unpacking docid from key" << endl;
663 ++errors;
664 continue;
665 }
666
667 if (did > db_last_docid) {
668 if (out)
669 *out << "document id " << did << " in termlist table "
670 "is larger than get_last_docid() "
671 << db_last_docid << endl;
672 ++errors;
673 }
674
675 if (end - pos == 1 && *pos == '\0') {
676 // Value slots used entry.
677 ++num_slotsused_entries;
678 cursor->read_tag();
679
680 pos = cursor->current_tag.data();
681 end = pos + cursor->current_tag.size();
682
683 if (pos == end) {
684 if (out)
685 *out << "Empty value slots used tag" << endl;
686 ++errors;
687 continue;
688 }
689
690 Xapian::valueno prev_slot;
691 if (!unpack_uint(&pos, end, &prev_slot)) {
692 if (out)
693 *out << "Value slot encoding corrupt" << endl;
694 ++errors;
695 continue;
696 }
697
698 while (pos != end) {
699 Xapian::valueno slot;
700 if (!unpack_uint(&pos, end, &slot)) {
701 if (out)
702 *out << "Value slot encoding corrupt" << endl;
703 ++errors;
704 break;
705 }
706 slot += prev_slot + 1;
707 if (slot <= prev_slot) {
708 if (out)
709 *out << "Value slot number overflowed ("
710 << prev_slot << " -> " << slot << ")" << endl;
711 ++errors;
712 }
713 prev_slot = slot;
714 }
715 continue;
716 }
717
718 if (pos != end) {
719 if (out)
720 *out << "Extra junk in key" << endl;
721 ++errors;
722 continue;
723 }
724
725 ++num_termlists;
726 cursor->read_tag();
727
728 pos = cursor->current_tag.data();
729 end = pos + cursor->current_tag.size();
730
731 if (pos == end) {
732 // Empty termlist.
733 continue;
734 }
735
736 Xapian::termcount doclen, termlist_size;
737
738 // Read doclen
739 if (!unpack_uint(&pos, end, &doclen)) {
740 if (out) {
741 if (pos != 0) {
742 *out << "doclen out of range";
743 } else {
744 *out << "Unexpected end of data when reading doclen";
745 }
746 *out << endl;
747 }
748 ++errors;
749 continue;
750 }
751
752 // Check doclen with doclen lower and upper bounds
753 if (doclen > version_file.get_doclength_upper_bound()) {
754 if (out)
755 *out << "doclen " << doclen << " > upper bound "
756 << version_file.get_doclength_upper_bound() << endl;
757 ++errors;
758 } else if (doclen < version_file.get_doclength_lower_bound() &&
759 doclen != 0) {
760 if (out)
761 *out << "doclen " << doclen << " < lower bound "
762 << version_file.get_doclength_lower_bound() << endl;
763 ++errors;
764 }
765
766 // Read termlist_size
767 if (!unpack_uint(&pos, end, &termlist_size)) {
768 if (out) {
769 if (pos != 0) {
770 *out << "termlist_size out of range";
771 } else {
772 *out << "Unexpected end of data when reading "
773 "termlist_size";
774 }
775 *out << endl;
776 }
777 ++errors;
778 continue;
779 }
780
781 Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
782 string current_tname;
783
784 bool bad = false;
785 while (pos != end) {
786 Xapian::doccount current_wdf = 0;
787 bool got_wdf = false;
788 // If there was a previous term, how much to reuse.
789 if (!current_tname.empty()) {
790 string::size_type len = static_cast<unsigned char>(*pos++);
791 if (len > current_tname.length()) {
792 // The wdf was squeezed into the same byte.
793 current_wdf = len / (current_tname.length() + 1) - 1;
794 len %= (current_tname.length() + 1);
795 got_wdf = true;
796 }
797 current_tname.resize(len);
798 }
799 // What to append (note len must be positive, since just truncating
800 // always takes us backwards in the sort order)
801 string::size_type len = static_cast<unsigned char>(*pos++);
802 current_tname.append(pos, len);
803 pos += len;
804
805 if (!got_wdf) {
806 // Read wdf
807 if (!unpack_uint(&pos, end, ¤t_wdf)) {
808 if (out) {
809 if (pos == 0) {
810 *out << "Unexpected end of data when reading "
811 "termlist current_wdf";
812 } else {
813 *out << "Size of wdf out of range in termlist";
814 }
815 *out << endl;
816 }
817 ++errors;
818 bad = true;
819 break;
820 }
821 }
822
823 ++actual_termlist_size;
824 actual_doclen += current_wdf;
825 }
826 if (bad) {
827 continue;
828 }
829
830 if (termlist_size != actual_termlist_size) {
831 if (out)
832 *out << "termlist_size != # of entries in termlist" << endl;
833 ++errors;
834 }
835 if (doclen != actual_doclen) {
836 if (out)
837 *out << "doclen != sum(wdf)" << endl;
838 ++errors;
839 }
840
841 // + 1 so that did is a valid subscript.
842 if (doclens.size() <= did) doclens.resize(did + 1);
843 doclens[did] = actual_doclen;
844 }
845
846 Xapian::doccount doccount = version_file.get_doccount();
847
848 // glass doesn't store a termlist entry if there are no terms, so we
849 // can only check there aren't more termlists than documents.
850 if (num_termlists > doccount) {
851 if (out)
852 *out << "More termlists (" << num_termlists
853 << ") then documents (" << doccount << ")" << endl;
854 ++errors;
855 }
856
857 // glass doesn't store a valueslots used entry if there are no terms,
858 // so we can only check there aren't more such entries than documents.
859 if (num_slotsused_entries > doccount) {
860 if (out)
861 *out << "More slots-used entries (" << num_slotsused_entries
862 << ") then documents (" << doccount << ")" << endl;
863 ++errors;
864 }
865 } else if (strcmp(tablename, "position") == 0) {
866 // Now check the contents of the position table.
867 for ( ; !cursor->after_end(); cursor->next()) {
868 string & key = cursor->current_key;
869
870 // Get docid from key.
871 const char * pos = key.data();
872 const char * end = pos + key.size();
873
874 string term;
875 if (!unpack_string_preserving_sort(&pos, end, term)) {
876 if (out)
877 *out << "Error unpacking term from key" << endl;
878 ++errors;
879 continue;
880 }
881
882 Xapian::docid did;
883 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
884 if (out)
885 *out << "Error unpacking docid from key" << endl;
886 ++errors;
887 continue;
888 }
889
890 if (pos != end) {
891 if (out)
892 *out << "Extra junk in key with docid " << did << endl;
893 ++errors;
894 continue;
895 }
896
897 if (did > db_last_docid) {
898 if (out)
899 *out << "document id " << did << " in position table "
900 "is larger than get_last_docid() "
901 << db_last_docid << endl;
902 ++errors;
903 } else if (!doclens.empty()) {
904 // In glass, a document without terms doesn't get a
905 // termlist entry, so we can't tell the difference
906 // easily.
907 if (did >= doclens.size() || doclens[did] == 0) {
908 if (out)
909 *out << "Position list entry for document " << did
910 << " which doesn't exist or has no terms" << endl;
911 ++errors;
912 }
913 }
914
915 cursor->read_tag();
916
917 const string & data = cursor->current_tag;
918 pos = data.data();
919 end = pos + data.size();
920
921 Xapian::termpos pos_last;
922 if (!unpack_uint(&pos, end, &pos_last)) {
923 if (out)
924 *out << tablename << " table: Position list data corrupt"
925 << endl;
926 ++errors;
927 continue;
928 }
929 if (pos == end) {
930 // Special case for single entry position list.
931 } else {
932 // Skip the header we just read.
933 BitReader rd(data, pos - data.data());
934 Xapian::termpos pos_first = rd.decode(pos_last);
935 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
936 rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
937 Xapian::termpos p = rd.decode_interpolative_next();
938 bool ok = true;
939 while (p != pos_last) {
940 Xapian::termpos pos_prev = p;
941 p = rd.decode_interpolative_next();
942 if (p <= pos_prev) {
943 if (out)
944 *out << tablename << " table: Positions not "
945 "strictly monotonically increasing" << endl;
946 ++errors;
947 ok = false;
948 break;
949 }
950 }
951 if (ok && !rd.check_all_gone()) {
952 if (out)
953 *out << tablename << " table: Junk after position data"
954 << endl;
955 ++errors;
956 }
957 }
958 }
959 } else {
960 if (out)
961 *out << tablename << " table: Don't know how to check structure\n"
962 << endl;
963 return errors;
964 }
965
966 if (out) {
967 if (!errors)
968 *out << tablename << " table structure checked OK\n";
969 else
970 *out << tablename << " table errors found: " << errors << "\n";
971 *out << endl;
972 }
973
974 return errors;
975 }
976