1 /* chert_postlist.cc: Postlists in a chert database
2 *
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002,2003,2004,2005,2007,2008,2009,2011,2014,2015 Olly Betts
5 * Copyright 2007,2008,2009 Lemur Consulting Ltd
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
21 */
22
23 #include <config.h>
24
25 #include "chert_postlist.h"
26
27 #include "chert_cursor.h"
28 #include "chert_database.h"
29 #include "debuglog.h"
30 #include "noreturn.h"
31 #include "pack.h"
32 #include "str.h"
33
34 using Xapian::Internal::intrusive_ptr;
35
36 void
get_freqs(const string & term,Xapian::doccount * termfreq_ptr,Xapian::termcount * collfreq_ptr) const37 ChertPostListTable::get_freqs(const string & term,
38 Xapian::doccount * termfreq_ptr,
39 Xapian::termcount * collfreq_ptr) const
40 {
41 string key = make_key(term);
42 string tag;
43 if (!get_exact_entry(key, tag)) {
44 if (termfreq_ptr)
45 *termfreq_ptr = 0;
46 if (collfreq_ptr)
47 *collfreq_ptr = 0;
48 } else {
49 const char * p = tag.data();
50 ChertPostList::read_number_of_entries(&p, p + tag.size(),
51 termfreq_ptr, collfreq_ptr);
52 }
53 }
54
55 Xapian::termcount
get_doclength(Xapian::docid did,intrusive_ptr<const ChertDatabase> db) const56 ChertPostListTable::get_doclength(Xapian::docid did,
57 intrusive_ptr<const ChertDatabase> db) const {
58 if (!doclen_pl.get()) {
59 // Don't keep a reference back to the database, since this
60 // would make a reference loop.
61 doclen_pl.reset(new ChertPostList(db, string(), false));
62 }
63 if (!doclen_pl->jump_to(did))
64 throw Xapian::DocNotFoundError("Document " + str(did) + " not found");
65 return doclen_pl->get_wdf();
66 }
67
68 bool
document_exists(Xapian::docid did,intrusive_ptr<const ChertDatabase> db) const69 ChertPostListTable::document_exists(Xapian::docid did,
70 intrusive_ptr<const ChertDatabase> db) const
71 {
72 if (!doclen_pl.get()) {
73 // Don't keep a reference back to the database, since this
74 // would make a reference loop.
75 doclen_pl.reset(new ChertPostList(db, string(), false));
76 }
77 return (doclen_pl->jump_to(did));
78 }
79
80 // How big should chunks in the posting list be? (They
81 // will grow slightly bigger than this, but not more than a
82 // few bytes extra) - FIXME: tune this value to try to
83 // maximise how well blocks are used. Or performance.
84 // Or indexing speed. Or something...
85 const unsigned int CHUNKSIZE = 2000;
86
87 /** PostlistChunkWriter is a wrapper which acts roughly as an
88 * output iterator on a postlist chunk, taking care of the
89 * messy details. It's intended to be used with deletion and
90 * replacing of entries, not for adding to the end, when it's
91 * not really needed.
92 */
93 class Chert::PostlistChunkWriter {
94 public:
95 PostlistChunkWriter(const string &orig_key_,
96 bool is_first_chunk_,
97 const string &tname_,
98 bool is_last_chunk_);
99
100 /// Append an entry to this chunk.
101 void append(ChertTable * table, Xapian::docid did,
102 Xapian::termcount wdf);
103
104 /// Append a block of raw entries to this chunk.
raw_append(Xapian::docid first_did_,Xapian::docid current_did_,const string & s)105 void raw_append(Xapian::docid first_did_, Xapian::docid current_did_,
106 const string & s) {
107 Assert(!started);
108 first_did = first_did_;
109 current_did = current_did_;
110 if (!s.empty()) {
111 chunk.append(s);
112 started = true;
113 }
114 }
115
116 /** Flush the chunk to the buffered table. Note: this may write it
117 * with a different key to the original one, if for example the first
118 * entry has changed.
119 */
120 void flush(ChertTable *table);
121
122 private:
123 string orig_key;
124 string tname;
125 bool is_first_chunk;
126 bool is_last_chunk;
127 bool started;
128
129 Xapian::docid first_did;
130 Xapian::docid current_did;
131
132 string chunk;
133 };
134
135 using Chert::PostlistChunkWriter;
136
137 // Static functions
138
139 /// Report an error when reading the posting list.
140 XAPIAN_NORETURN(static void report_read_error(const char * position));
report_read_error(const char * position)141 static void report_read_error(const char * position)
142 {
143 if (position == 0) {
144 // data ran out
145 LOGLINE(DB, "ChertPostList data ran out");
146 throw Xapian::DatabaseCorruptError("Data ran out unexpectedly when reading posting list.");
147 }
148 // overflow
149 LOGLINE(DB, "ChertPostList value too large");
150 throw Xapian::RangeError("Value in posting list too large.");
151 }
152
get_tname_from_key(const char ** src,const char * end,string & tname)153 static inline bool get_tname_from_key(const char **src, const char *end,
154 string &tname)
155 {
156 return unpack_string_preserving_sort(src, end, tname);
157 }
158
159 static inline bool
check_tname_in_key_lite(const char ** keypos,const char * keyend,const string & tname)160 check_tname_in_key_lite(const char **keypos, const char *keyend, const string &tname)
161 {
162 string tname_in_key;
163
164 if (keyend - *keypos >= 2 && (*keypos)[0] == '\0' && (*keypos)[1] == '\xe0') {
165 *keypos += 2;
166 } else {
167 // Read the termname.
168 if (!get_tname_from_key(keypos, keyend, tname_in_key))
169 report_read_error(*keypos);
170 }
171
172 // This should only fail if the postlist doesn't exist at all.
173 return tname_in_key == tname;
174 }
175
176 static inline bool
check_tname_in_key(const char ** keypos,const char * keyend,const string & tname)177 check_tname_in_key(const char **keypos, const char *keyend, const string &tname)
178 {
179 if (*keypos == keyend) return false;
180
181 return check_tname_in_key_lite(keypos, keyend, tname);
182 }
183
184 /// Read the start of the first chunk in the posting list.
185 static Xapian::docid
read_start_of_first_chunk(const char ** posptr,const char * end,Xapian::doccount * number_of_entries_ptr,Xapian::termcount * collection_freq_ptr)186 read_start_of_first_chunk(const char ** posptr,
187 const char * end,
188 Xapian::doccount * number_of_entries_ptr,
189 Xapian::termcount * collection_freq_ptr)
190 {
191 LOGCALL_STATIC(DB, Xapian::docid, "read_start_of_first_chunk", (const void *)posptr | (const void *)end | (void *)number_of_entries_ptr | (void *)collection_freq_ptr);
192
193 ChertPostList::read_number_of_entries(posptr, end,
194 number_of_entries_ptr, collection_freq_ptr);
195 if (number_of_entries_ptr)
196 LOGVALUE(DB, *number_of_entries_ptr);
197 if (collection_freq_ptr)
198 LOGVALUE(DB, *collection_freq_ptr);
199
200 Xapian::docid did;
201 // Read the docid of the first entry in the posting list.
202 if (!unpack_uint(posptr, end, &did))
203 report_read_error(*posptr);
204 ++did;
205 LOGVALUE(DB, did);
206 RETURN(did);
207 }
208
209 static inline void
read_did_increase(const char ** posptr,const char * end,Xapian::docid * did_ptr)210 read_did_increase(const char ** posptr, const char * end,
211 Xapian::docid * did_ptr)
212 {
213 Xapian::docid did_increase;
214 if (!unpack_uint(posptr, end, &did_increase)) report_read_error(*posptr);
215 *did_ptr += did_increase + 1;
216 }
217
218 /// Read the wdf for an entry.
219 static inline void
read_wdf(const char ** posptr,const char * end,Xapian::termcount * wdf_ptr)220 read_wdf(const char ** posptr, const char * end, Xapian::termcount * wdf_ptr)
221 {
222 if (!unpack_uint(posptr, end, wdf_ptr)) report_read_error(*posptr);
223 }
224
225 /// Read the start of a chunk.
226 static Xapian::docid
read_start_of_chunk(const char ** posptr,const char * end,Xapian::docid first_did_in_chunk,bool * is_last_chunk_ptr)227 read_start_of_chunk(const char ** posptr,
228 const char * end,
229 Xapian::docid first_did_in_chunk,
230 bool * is_last_chunk_ptr)
231 {
232 LOGCALL_STATIC(DB, Xapian::docid, "read_start_of_chunk", reinterpret_cast<const void*>(posptr) | reinterpret_cast<const void*>(end) | first_did_in_chunk | reinterpret_cast<const void*>(is_last_chunk_ptr));
233 Assert(is_last_chunk_ptr);
234
235 // Read whether this is the last chunk
236 if (!unpack_bool(posptr, end, is_last_chunk_ptr))
237 report_read_error(*posptr);
238 LOGVALUE(DB, *is_last_chunk_ptr);
239
240 // Read what the final document ID in this chunk is.
241 Xapian::docid increase_to_last;
242 if (!unpack_uint(posptr, end, &increase_to_last))
243 report_read_error(*posptr);
244 Xapian::docid last_did_in_chunk = first_did_in_chunk + increase_to_last;
245 LOGVALUE(DB, last_did_in_chunk);
246 RETURN(last_did_in_chunk);
247 }
248
249 /** PostlistChunkReader is essentially an iterator wrapper
250 * around a postlist chunk. It simply iterates through the
251 * entries in a postlist.
252 */
253 class Chert::PostlistChunkReader {
254 string data;
255
256 const char *pos;
257 const char *end;
258
259 bool at_end;
260
261 Xapian::docid did;
262 Xapian::termcount wdf;
263
264 public:
265 /** Initialise the postlist chunk reader.
266 *
267 * @param first_did First document id in this chunk.
268 * @param data The tag string with the header removed.
269 */
PostlistChunkReader(Xapian::docid first_did,const string & data_)270 PostlistChunkReader(Xapian::docid first_did, const string & data_)
271 : data(data_), pos(data.data()), end(pos + data.length()), at_end(data.empty()), did(first_did)
272 {
273 if (!at_end) read_wdf(&pos, end, &wdf);
274 }
275
get_docid() const276 Xapian::docid get_docid() const {
277 return did;
278 }
get_wdf() const279 Xapian::termcount get_wdf() const {
280 return wdf;
281 }
282
is_at_end() const283 bool is_at_end() const {
284 return at_end;
285 }
286
287 /** Advance to the next entry. Set at_end if we run off the end.
288 */
289 void next();
290 };
291
292 using Chert::PostlistChunkReader;
293
294 void
next()295 PostlistChunkReader::next()
296 {
297 if (pos == end) {
298 at_end = true;
299 } else {
300 read_did_increase(&pos, end, &did);
301 read_wdf(&pos, end, &wdf);
302 }
303 }
304
PostlistChunkWriter(const string & orig_key_,bool is_first_chunk_,const string & tname_,bool is_last_chunk_)305 PostlistChunkWriter::PostlistChunkWriter(const string &orig_key_,
306 bool is_first_chunk_,
307 const string &tname_,
308 bool is_last_chunk_)
309 : orig_key(orig_key_),
310 tname(tname_), is_first_chunk(is_first_chunk_),
311 is_last_chunk(is_last_chunk_),
312 started(false)
313 {
314 LOGCALL_CTOR(DB, "PostlistChunkWriter", orig_key_ | is_first_chunk_ | tname_ | is_last_chunk_);
315 }
316
317 void
append(ChertTable * table,Xapian::docid did,Xapian::termcount wdf)318 PostlistChunkWriter::append(ChertTable * table, Xapian::docid did,
319 Xapian::termcount wdf)
320 {
321 if (!started) {
322 started = true;
323 first_did = did;
324 } else {
325 Assert(did > current_did);
326 // Start a new chunk if this one has grown to the threshold.
327 if (chunk.size() >= CHUNKSIZE) {
328 bool save_is_last_chunk = is_last_chunk;
329 is_last_chunk = false;
330 flush(table);
331 is_last_chunk = save_is_last_chunk;
332 is_first_chunk = false;
333 first_did = did;
334 chunk.resize(0);
335 orig_key = ChertPostListTable::make_key(tname, first_did);
336 } else {
337 pack_uint(chunk, did - current_did - 1);
338 }
339 }
340 current_did = did;
341 pack_uint(chunk, wdf);
342 }
343
344 /** Make the data to go at the start of the very first chunk.
345 */
346 static inline string
make_start_of_first_chunk(Xapian::doccount entries,Xapian::termcount collectionfreq,Xapian::docid new_did)347 make_start_of_first_chunk(Xapian::doccount entries,
348 Xapian::termcount collectionfreq,
349 Xapian::docid new_did)
350 {
351 string chunk;
352 pack_uint(chunk, entries);
353 pack_uint(chunk, collectionfreq);
354 pack_uint(chunk, new_did - 1);
355 return chunk;
356 }
357
358 /** Make the data to go at the start of a standard chunk.
359 */
360 static inline string
make_start_of_chunk(bool new_is_last_chunk,Xapian::docid new_first_did,Xapian::docid new_final_did)361 make_start_of_chunk(bool new_is_last_chunk,
362 Xapian::docid new_first_did,
363 Xapian::docid new_final_did)
364 {
365 Assert(new_final_did >= new_first_did);
366 string chunk;
367 pack_bool(chunk, new_is_last_chunk);
368 pack_uint(chunk, new_final_did - new_first_did);
369 return chunk;
370 }
371
372 static void
write_start_of_chunk(string & chunk,unsigned int start_of_chunk_header,unsigned int end_of_chunk_header,bool is_last_chunk,Xapian::docid first_did_in_chunk,Xapian::docid last_did_in_chunk)373 write_start_of_chunk(string & chunk,
374 unsigned int start_of_chunk_header,
375 unsigned int end_of_chunk_header,
376 bool is_last_chunk,
377 Xapian::docid first_did_in_chunk,
378 Xapian::docid last_did_in_chunk)
379 {
380 Assert((size_t)(end_of_chunk_header - start_of_chunk_header) <= chunk.size());
381
382 chunk.replace(start_of_chunk_header,
383 end_of_chunk_header - start_of_chunk_header,
384 make_start_of_chunk(is_last_chunk, first_did_in_chunk,
385 last_did_in_chunk));
386 }
387
388 void
flush(ChertTable * table)389 PostlistChunkWriter::flush(ChertTable *table)
390 {
391 LOGCALL_VOID(DB, "PostlistChunkWriter::flush", table);
392
393 /* This is one of the more messy parts involved with updating posting
394 * list chunks.
395 *
396 * Depending on circumstances, we may have to delete an entire chunk
397 * or file it under a different key, as well as possibly modifying both
398 * the previous and next chunk of the postlist.
399 */
400
401 if (!started) {
402 /* This chunk is now empty so disappears entirely.
403 *
404 * If this was the last chunk, then the previous chunk
405 * must have its "is_last_chunk" flag updated.
406 *
407 * If this was the first chunk, then the next chunk must
408 * be transformed into the first chunk. Messy!
409 */
410 LOGLINE(DB, "PostlistChunkWriter::flush(): deleting chunk");
411 Assert(!orig_key.empty());
412 if (is_first_chunk) {
413 LOGLINE(DB, "PostlistChunkWriter::flush(): deleting first chunk");
414 if (is_last_chunk) {
415 /* This is the first and the last chunk, ie the only
416 * chunk, so just delete the tag.
417 */
418 table->del(orig_key);
419 return;
420 }
421
422 /* This is the messiest case. The first chunk is to
423 * be removed, and there is at least one chunk after
424 * it. Need to rewrite the next chunk as the first
425 * chunk.
426 */
427 AutoPtr<ChertCursor> cursor(table->cursor_get());
428
429 if (!cursor->find_entry(orig_key)) {
430 throw Xapian::DatabaseCorruptError("The key we're working on has disappeared");
431 }
432
433 // FIXME: Currently the doclen list has a special first chunk too,
434 // which reduces special casing here. The downside is a slightly
435 // larger than necessary first chunk and needless fiddling if the
436 // first chunk is deleted. But really we should look at
437 // redesigning the whole postlist format with an eye to making it
438 // easier to update!
439
440 // Extract existing counts from the first chunk so we can reinsert
441 // them into the block we're renaming.
442 Xapian::doccount num_ent;
443 Xapian::termcount coll_freq;
444 {
445 cursor->read_tag();
446 const char *tagpos = cursor->current_tag.data();
447 const char *tagend = tagpos + cursor->current_tag.size();
448
449 (void)read_start_of_first_chunk(&tagpos, tagend,
450 &num_ent, &coll_freq);
451 }
452
453 // Seek to the next chunk.
454 cursor->next();
455 if (cursor->after_end()) {
456 throw Xapian::DatabaseCorruptError("Expected another key but found none");
457 }
458 const char *kpos = cursor->current_key.data();
459 const char *kend = kpos + cursor->current_key.size();
460 if (!check_tname_in_key(&kpos, kend, tname)) {
461 throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
462 }
463
464 // Read the new first docid
465 Xapian::docid new_first_did;
466 if (!C_unpack_uint_preserving_sort(&kpos, kend, &new_first_did)) {
467 report_read_error(kpos);
468 }
469
470 cursor->read_tag();
471 const char *tagpos = cursor->current_tag.data();
472 const char *tagend = tagpos + cursor->current_tag.size();
473
474 // Read the chunk header
475 bool new_is_last_chunk;
476 Xapian::docid new_last_did_in_chunk =
477 read_start_of_chunk(&tagpos, tagend, new_first_did,
478 &new_is_last_chunk);
479
480 string chunk_data(tagpos, tagend);
481
482 // First remove the renamed tag
483 table->del(cursor->current_key);
484
485 // And now write it as the first chunk
486 string tag;
487 tag = make_start_of_first_chunk(num_ent, coll_freq, new_first_did);
488 tag += make_start_of_chunk(new_is_last_chunk,
489 new_first_did,
490 new_last_did_in_chunk);
491 tag += chunk_data;
492 table->add(orig_key, tag);
493 return;
494 }
495
496 LOGLINE(DB, "PostlistChunkWriter::flush(): deleting secondary chunk");
497 /* This isn't the first chunk. Check whether we're the last chunk. */
498
499 // Delete this chunk
500 table->del(orig_key);
501
502 if (is_last_chunk) {
503 LOGLINE(DB, "PostlistChunkWriter::flush(): deleting secondary last chunk");
504 // Update the previous chunk's is_last_chunk flag.
505 AutoPtr<ChertCursor> cursor(table->cursor_get());
506
507 /* Should not find the key we just deleted, but should
508 * find the previous chunk. */
509 if (cursor->find_entry(orig_key)) {
510 throw Xapian::DatabaseCorruptError("Chert key not deleted as we expected");
511 }
512 // Make sure this is a chunk with the right term attached.
513 const char * keypos = cursor->current_key.data();
514 const char * keyend = keypos + cursor->current_key.size();
515 if (!check_tname_in_key(&keypos, keyend, tname)) {
516 throw Xapian::DatabaseCorruptError("Couldn't find chunk before delete chunk");
517 }
518
519 bool is_prev_first_chunk = (keypos == keyend);
520
521 // Now update the last_chunk
522 cursor->read_tag();
523 string tag = cursor->current_tag;
524
525 const char *tagpos = tag.data();
526 const char *tagend = tagpos + tag.size();
527
528 // Skip first chunk header
529 Xapian::docid first_did_in_chunk;
530 if (is_prev_first_chunk) {
531 first_did_in_chunk = read_start_of_first_chunk(&tagpos, tagend,
532 0, 0);
533 } else {
534 if (!C_unpack_uint_preserving_sort(&keypos, keyend, &first_did_in_chunk))
535 report_read_error(keypos);
536 }
537 bool wrong_is_last_chunk;
538 string::size_type start_of_chunk_header = tagpos - tag.data();
539 Xapian::docid last_did_in_chunk =
540 read_start_of_chunk(&tagpos, tagend, first_did_in_chunk,
541 &wrong_is_last_chunk);
542 string::size_type end_of_chunk_header = tagpos - tag.data();
543
544 // write new is_last flag
545 write_start_of_chunk(tag,
546 start_of_chunk_header,
547 end_of_chunk_header,
548 true, // is_last_chunk
549 first_did_in_chunk,
550 last_did_in_chunk);
551 table->add(cursor->current_key, tag);
552 }
553 } else {
554 LOGLINE(DB, "PostlistChunkWriter::flush(): updating chunk which still has items in it");
555 /* The chunk still has some items in it. Two major subcases:
556 * a) This is the first chunk.
557 * b) This isn't the first chunk.
558 *
559 * The subcases just affect the chunk header.
560 */
561 string tag;
562
563 /* First write the header, which depends on whether this is the
564 * first chunk.
565 */
566 if (is_first_chunk) {
567 /* The first chunk. This is the relatively easy case,
568 * and we just have to write this one back to disk.
569 */
570 LOGLINE(DB, "PostlistChunkWriter::flush(): rewriting the first chunk, which still has items in it");
571 string key = ChertPostListTable::make_key(tname);
572 bool ok = table->get_exact_entry(key, tag);
573 (void)ok;
574 Assert(ok);
575 Assert(!tag.empty());
576
577 Xapian::doccount num_ent;
578 Xapian::termcount coll_freq;
579 {
580 const char * tagpos = tag.data();
581 const char * tagend = tagpos + tag.size();
582 (void)read_start_of_first_chunk(&tagpos, tagend,
583 &num_ent, &coll_freq);
584 }
585
586 tag = make_start_of_first_chunk(num_ent, coll_freq, first_did);
587
588 tag += make_start_of_chunk(is_last_chunk, first_did, current_did);
589 tag += chunk;
590 table->add(key, tag);
591 return;
592 }
593
594 LOGLINE(DB, "PostlistChunkWriter::flush(): updating secondary chunk which still has items in it");
595 /* Not the first chunk.
596 *
597 * This has the easy sub-sub-case:
598 * The first entry in the chunk hasn't changed
599 * ...and the hard sub-sub-case:
600 * The first entry in the chunk has changed. This is
601 * harder because the key for the chunk changes, so
602 * we've got to do a switch.
603 */
604
605 // First find out the initial docid
606 const char *keypos = orig_key.data();
607 const char *keyend = keypos + orig_key.size();
608 if (!check_tname_in_key(&keypos, keyend, tname)) {
609 throw Xapian::DatabaseCorruptError("Have invalid key writing to postlist");
610 }
611 Xapian::docid initial_did;
612 if (!C_unpack_uint_preserving_sort(&keypos, keyend, &initial_did)) {
613 report_read_error(keypos);
614 }
615 string new_key;
616 if (initial_did != first_did) {
617 /* The fiddlier case:
618 * Create a new tag with the correct key, and replace
619 * the old one.
620 */
621 new_key = ChertPostListTable::make_key(tname, first_did);
622 table->del(orig_key);
623 } else {
624 new_key = orig_key;
625 }
626
627 // ...and write the start of this chunk.
628 tag = make_start_of_chunk(is_last_chunk, first_did, current_did);
629
630 tag += chunk;
631 table->add(new_key, tag);
632 }
633 }
634
635 /** Read the number of entries in the posting list.
636 * This must only be called when *posptr is pointing to the start of
637 * the first chunk of the posting list.
638 */
read_number_of_entries(const char ** posptr,const char * end,Xapian::doccount * number_of_entries_ptr,Xapian::termcount * collection_freq_ptr)639 void ChertPostList::read_number_of_entries(const char ** posptr,
640 const char * end,
641 Xapian::doccount * number_of_entries_ptr,
642 Xapian::termcount * collection_freq_ptr)
643 {
644 if (!unpack_uint(posptr, end, number_of_entries_ptr))
645 report_read_error(*posptr);
646 if (!unpack_uint(posptr, end, collection_freq_ptr))
647 report_read_error(*posptr);
648 }
649
650 /** The format of a postlist is:
651 *
652 * Split into chunks. Key for first chunk is the termname (encoded as
653 * length : name). Key for subsequent chunks is the same, followed by the
654 * document ID of the first document in the chunk (encoded as length of
655 * representation in first byte, and then docid).
656 *
657 * A chunk (except for the first chunk) contains:
658 *
659 * 1) bool - true if this is the last chunk.
660 * 2) difference between final docid in chunk and first docid.
661 * 3) wdf for the first item.
662 * 4) increment in docid to next item, followed by wdf for the item.
663 * 5) (4) repeatedly.
664 *
665 * The first chunk begins with the number of entries, the collection
666 * frequency, then the docid of the first document, then has the header of a
667 * standard chunk.
668 */
ChertPostList(intrusive_ptr<const ChertDatabase> this_db_,const string & term_,bool keep_reference)669 ChertPostList::ChertPostList(intrusive_ptr<const ChertDatabase> this_db_,
670 const string & term_,
671 bool keep_reference)
672 : LeafPostList(term_),
673 this_db(keep_reference ? this_db_ : NULL),
674 have_started(false),
675 is_at_end(false),
676 cursor(this_db_->postlist_table.cursor_get())
677 {
678 LOGCALL_CTOR(DB, "ChertPostList", this_db_.get() | term_ | keep_reference);
679 string key = ChertPostListTable::make_key(term);
680 int found = cursor->find_entry(key);
681 if (!found) {
682 LOGLINE(DB, "postlist for term not found");
683 number_of_entries = 0;
684 is_at_end = true;
685 pos = 0;
686 end = 0;
687 first_did_in_chunk = 0;
688 last_did_in_chunk = 0;
689 return;
690 }
691 cursor->read_tag();
692 pos = cursor->current_tag.data();
693 end = pos + cursor->current_tag.size();
694
695 did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
696 first_did_in_chunk = did;
697 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
698 &is_last_chunk);
699 read_wdf(&pos, end, &wdf);
700 LOGLINE(DB, "Initial docid " << did);
701 }
702
~ChertPostList()703 ChertPostList::~ChertPostList()
704 {
705 LOGCALL_DTOR(DB, "ChertPostList");
706 }
707
708 Xapian::termcount
get_doclength() const709 ChertPostList::get_doclength() const
710 {
711 LOGCALL(DB, Xapian::termcount, "ChertPostList::get_doclength", NO_ARGS);
712 Assert(have_started);
713 Assert(!is_at_end);
714 Assert(this_db.get());
715 RETURN(this_db->get_doclength(did));
716 }
717
718 Xapian::termcount
get_unique_terms() const719 ChertPostList::get_unique_terms() const
720 {
721 LOGCALL(DB, Xapian::termcount, "ChertPostList::get_unique_terms", NO_ARGS);
722 Assert(have_started);
723 Assert(!is_at_end);
724 Assert(this_db.get());
725 RETURN(this_db->get_unique_terms(did));
726 }
727
728 bool
next_in_chunk()729 ChertPostList::next_in_chunk()
730 {
731 LOGCALL(DB, bool, "ChertPostList::next_in_chunk", NO_ARGS);
732 if (pos == end) RETURN(false);
733
734 read_did_increase(&pos, end, &did);
735 read_wdf(&pos, end, &wdf);
736
737 // Either not at last doc in chunk, or pos == end, but not both.
738 Assert(did <= last_did_in_chunk);
739 Assert(did < last_did_in_chunk || pos == end);
740 Assert(pos != end || did == last_did_in_chunk);
741
742 RETURN(true);
743 }
744
745 void
next_chunk()746 ChertPostList::next_chunk()
747 {
748 LOGCALL_VOID(DB, "ChertPostList::next_chunk", NO_ARGS);
749 if (is_last_chunk) {
750 is_at_end = true;
751 return;
752 }
753
754 cursor->next();
755 if (cursor->after_end()) {
756 is_at_end = true;
757 throw Xapian::DatabaseCorruptError("Unexpected end of posting list for '" +
758 term + "'");
759 }
760 const char * keypos = cursor->current_key.data();
761 const char * keyend = keypos + cursor->current_key.size();
762 // Check we're still in same postlist
763 if (!check_tname_in_key_lite(&keypos, keyend, term)) {
764 is_at_end = true;
765 throw Xapian::DatabaseCorruptError("Unexpected end of posting list for '" +
766 term + "'");
767 }
768
769 Xapian::docid newdid;
770 if (!C_unpack_uint_preserving_sort(&keypos, keyend, &newdid)) {
771 report_read_error(keypos);
772 }
773 if (newdid <= did) {
774 throw Xapian::DatabaseCorruptError("Document ID in new chunk of postlist (" +
775 str(newdid) +
776 ") is not greater than final document ID in previous chunk (" +
777 str(did) + ")");
778 }
779 did = newdid;
780
781 cursor->read_tag();
782 pos = cursor->current_tag.data();
783 end = pos + cursor->current_tag.size();
784
785 first_did_in_chunk = did;
786 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
787 &is_last_chunk);
788 read_wdf(&pos, end, &wdf);
789 }
790
791 PositionList *
read_position_list()792 ChertPostList::read_position_list()
793 {
794 LOGCALL(DB, PositionList *, "ChertPostList::read_position_list", NO_ARGS);
795 Assert(this_db.get());
796 positionlist.read_data(&this_db->position_table, did, term);
797 RETURN(&positionlist);
798 }
799
800 PositionList *
open_position_list() const801 ChertPostList::open_position_list() const
802 {
803 LOGCALL(DB, PositionList *, "ChertPostList::open_position_list", NO_ARGS);
804 Assert(this_db.get());
805 RETURN(new ChertPositionList(&this_db->position_table, did, term));
806 }
807
808 PostList *
next(double w_min)809 ChertPostList::next(double w_min)
810 {
811 LOGCALL(DB, PostList *, "ChertPostList::next", w_min);
812 (void)w_min; // no warning
813
814 if (!have_started) {
815 have_started = true;
816 } else {
817 if (!next_in_chunk()) next_chunk();
818 }
819
820 if (is_at_end) {
821 LOGLINE(DB, "Moved to end");
822 } else {
823 LOGLINE(DB, "Moved to docid " << did << ", wdf = " << wdf);
824 }
825
826 RETURN(NULL);
827 }
828
829 bool
current_chunk_contains(Xapian::docid desired_did)830 ChertPostList::current_chunk_contains(Xapian::docid desired_did)
831 {
832 LOGCALL(DB, bool, "ChertPostList::current_chunk_contains", desired_did);
833 if (desired_did >= first_did_in_chunk &&
834 desired_did <= last_did_in_chunk) {
835 RETURN(true);
836 }
837 RETURN(false);
838 }
839
840 void
move_to_chunk_containing(Xapian::docid desired_did)841 ChertPostList::move_to_chunk_containing(Xapian::docid desired_did)
842 {
843 LOGCALL_VOID(DB, "ChertPostList::move_to_chunk_containing", desired_did);
844 (void)cursor->find_entry(ChertPostListTable::make_key(term, desired_did));
845 Assert(!cursor->after_end());
846
847 const char * keypos = cursor->current_key.data();
848 const char * keyend = keypos + cursor->current_key.size();
849 // Check we're still in same postlist
850 if (!check_tname_in_key_lite(&keypos, keyend, term)) {
851 // This should only happen if the postlist doesn't exist at all.
852 is_at_end = true;
853 is_last_chunk = true;
854 return;
855 }
856 is_at_end = false;
857
858 cursor->read_tag();
859 pos = cursor->current_tag.data();
860 end = pos + cursor->current_tag.size();
861
862 if (keypos == keyend) {
863 // In first chunk
864 #ifdef XAPIAN_ASSERTIONS
865 Xapian::doccount old_number_of_entries = number_of_entries;
866 did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
867 Assert(old_number_of_entries == number_of_entries);
868 #else
869 did = read_start_of_first_chunk(&pos, end, NULL, NULL);
870 #endif
871 } else {
872 // In normal chunk
873 if (!C_unpack_uint_preserving_sort(&keypos, keyend, &did)) {
874 report_read_error(keypos);
875 }
876 }
877
878 first_did_in_chunk = did;
879 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
880 &is_last_chunk);
881 read_wdf(&pos, end, &wdf);
882
883 // Possible, since desired_did might be after end of this chunk and before
884 // the next.
885 if (desired_did > last_did_in_chunk) next_chunk();
886 }
887
888 bool
move_forward_in_chunk_to_at_least(Xapian::docid desired_did)889 ChertPostList::move_forward_in_chunk_to_at_least(Xapian::docid desired_did)
890 {
891 LOGCALL(DB, bool, "ChertPostList::move_forward_in_chunk_to_at_least", desired_did);
892 if (did >= desired_did)
893 RETURN(true);
894
895 if (desired_did <= last_did_in_chunk) {
896 while (pos != end) {
897 read_did_increase(&pos, end, &did);
898 if (did >= desired_did) {
899 read_wdf(&pos, end, &wdf);
900 RETURN(true);
901 }
902 // It's faster to just skip over the wdf than to decode it.
903 read_wdf(&pos, end, NULL);
904 }
905
906 // If we hit the end of the chunk then last_did_in_chunk must be wrong.
907 Assert(false);
908 }
909
910 pos = end;
911 RETURN(false);
912 }
913
914 PostList *
skip_to(Xapian::docid desired_did,double w_min)915 ChertPostList::skip_to(Xapian::docid desired_did, double w_min)
916 {
917 LOGCALL(DB, PostList *, "ChertPostList::skip_to", desired_did | w_min);
918 (void)w_min; // no warning
919 // We've started now - if we hadn't already, we're already positioned
920 // at start so there's no need to actually do anything.
921 have_started = true;
922
923 // Don't skip back, and don't need to do anything if already there.
924 if (is_at_end || desired_did <= did) RETURN(NULL);
925
926 // Move to correct chunk
927 if (!current_chunk_contains(desired_did)) {
928 move_to_chunk_containing(desired_did);
929 // Might be at_end now, so we need to check before trying to move
930 // forward in chunk.
931 if (is_at_end) RETURN(NULL);
932 }
933
934 // Move to correct position in chunk
935 bool have_document = move_forward_in_chunk_to_at_least(desired_did);
936 (void)have_document;
937 Assert(have_document);
938
939 if (is_at_end) {
940 LOGLINE(DB, "Skipped to end");
941 } else {
942 LOGLINE(DB, "Skipped to docid " << did << ", wdf = " << wdf);
943 }
944
945 RETURN(NULL);
946 }
947
948 // Used for doclens.
949 bool
jump_to(Xapian::docid desired_did)950 ChertPostList::jump_to(Xapian::docid desired_did)
951 {
952 LOGCALL(DB, bool, "ChertPostList::jump_to", desired_did);
953 // We've started now - if we hadn't already, we're already positioned
954 // at start so there's no need to actually do anything.
955 have_started = true;
956
957 // If the list is empty, give up right away.
958 if (pos == 0) RETURN(false);
959
960 // Move to correct chunk, or reload the current chunk to go backwards in it
961 // (FIXME: perhaps handle the latter case more elegantly, though it won't
962 // happen during sequential access which is most common).
963 if (is_at_end || !current_chunk_contains(desired_did) || desired_did < did) {
964 // Clear is_at_end flag since we can rewind.
965 is_at_end = false;
966
967 move_to_chunk_containing(desired_did);
968 // Might be at_end now, so we need to check before trying to move
969 // forward in chunk.
970 if (is_at_end) RETURN(false);
971 }
972
973 // Move to correct position in chunk.
974 if (!move_forward_in_chunk_to_at_least(desired_did)) RETURN(false);
975 RETURN(desired_did == did);
976 }
977
978 string
get_description() const979 ChertPostList::get_description() const
980 {
981 return term + ":" + str(number_of_entries);
982 }
983
984 // Returns the last did to allow in this chunk.
985 Xapian::docid
get_chunk(const string & tname,Xapian::docid did,bool adding,PostlistChunkReader ** from,PostlistChunkWriter ** to)986 ChertPostListTable::get_chunk(const string &tname,
987 Xapian::docid did, bool adding,
988 PostlistChunkReader ** from, PostlistChunkWriter **to)
989 {
990 LOGCALL(DB, Xapian::docid, "ChertPostListTable::get_chunk", tname | did | adding | from | to);
991 // Get chunk containing entry
992 string key = make_key(tname, did);
993
994 // Find the right chunk
995 AutoPtr<ChertCursor> cursor(cursor_get());
996
997 (void)cursor->find_entry(key);
998 Assert(!cursor->after_end());
999
1000 const char * keypos = cursor->current_key.data();
1001 const char * keyend = keypos + cursor->current_key.size();
1002
1003 if (!check_tname_in_key(&keypos, keyend, tname)) {
1004 // Postlist for this termname doesn't exist.
1005 if (!adding)
1006 throw Xapian::DatabaseCorruptError("Attempted to delete or modify an entry in a non-existent posting list for " + tname);
1007
1008 *from = NULL;
1009 *to = new PostlistChunkWriter(string(), true, tname, true);
1010 RETURN(Xapian::docid(-1));
1011 }
1012
1013 // See if we're appending - if so we can shortcut by just copying
1014 // the data part of the chunk wholesale.
1015 bool is_first_chunk = (keypos == keyend);
1016 LOGVALUE(DB, is_first_chunk);
1017
1018 cursor->read_tag();
1019 const char * pos = cursor->current_tag.data();
1020 const char * end = pos + cursor->current_tag.size();
1021 Xapian::docid first_did_in_chunk;
1022 if (is_first_chunk) {
1023 first_did_in_chunk = read_start_of_first_chunk(&pos, end, NULL, NULL);
1024 } else {
1025 if (!C_unpack_uint_preserving_sort(&keypos, keyend, &first_did_in_chunk)) {
1026 report_read_error(keypos);
1027 }
1028 }
1029
1030 bool is_last_chunk;
1031 Xapian::docid last_did_in_chunk;
1032 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, &is_last_chunk);
1033 *to = new PostlistChunkWriter(cursor->current_key, is_first_chunk, tname,
1034 is_last_chunk);
1035 if (did > last_did_in_chunk) {
1036 // This is the shortcut. Not very pretty, but I'll leave refactoring
1037 // until I've a clearer picture of everything which needs to be done.
1038 // (FIXME)
1039 *from = NULL;
1040 (*to)->raw_append(first_did_in_chunk, last_did_in_chunk,
1041 string(pos, end));
1042 } else {
1043 *from = new PostlistChunkReader(first_did_in_chunk, string(pos, end));
1044 }
1045 if (is_last_chunk) RETURN(Xapian::docid(-1));
1046
1047 // Find first did of next tag.
1048 cursor->next();
1049 if (cursor->after_end()) {
1050 throw Xapian::DatabaseCorruptError("Expected another key but found none");
1051 }
1052 const char *kpos = cursor->current_key.data();
1053 const char *kend = kpos + cursor->current_key.size();
1054 if (!check_tname_in_key(&kpos, kend, tname)) {
1055 throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
1056 }
1057
1058 // Read the new first docid
1059 Xapian::docid first_did_of_next_chunk;
1060 if (!C_unpack_uint_preserving_sort(&kpos, kend, &first_did_of_next_chunk)) {
1061 report_read_error(kpos);
1062 }
1063 RETURN(first_did_of_next_chunk - 1);
1064 }
1065
1066 void
merge_changes(const map<string,map<Xapian::docid,pair<char,Xapian::termcount>>> & mod_plists,const map<Xapian::docid,Xapian::termcount> & doclens,const map<string,pair<Xapian::termcount_diff,Xapian::termcount_diff>> & freq_deltas)1067 ChertPostListTable::merge_changes(
1068 const map<string, map<Xapian::docid, pair<char, Xapian::termcount> > > & mod_plists,
1069 const map<Xapian::docid, Xapian::termcount> & doclens,
1070 const map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> > & freq_deltas)
1071 {
1072 LOGCALL_VOID(DB, "ChertPostListTable::merge_changes", mod_plists | doclens | freq_deltas);
1073
1074 // The cursor in the doclen_pl will no longer be valid, so reset it.
1075 doclen_pl.reset(0);
1076
1077 LOGVALUE(DB, doclens.size());
1078 if (!doclens.empty()) {
1079 // Ensure there's a first chunk.
1080 string current_key = make_key(string());
1081 if (!key_exists(current_key)) {
1082 LOGLINE(DB, "Adding dummy first chunk");
1083 string newtag = make_start_of_first_chunk(0, 0, 0);
1084 newtag += make_start_of_chunk(true, 0, 0);
1085 add(current_key, newtag);
1086 }
1087
1088 map<Xapian::docid, Xapian::termcount>::const_iterator j;
1089 j = doclens.begin();
1090 Assert(j != doclens.end()); // This case is caught above.
1091
1092 Xapian::docid max_did;
1093 PostlistChunkReader *from;
1094 PostlistChunkWriter *to;
1095 max_did = get_chunk(string(), j->first, true, &from, &to);
1096 LOGVALUE(DB, max_did);
1097 for ( ; j != doclens.end(); ++j) {
1098 Xapian::docid did = j->first;
1099
1100 next_doclen_chunk:
1101 LOGLINE(DB, "Updating doclens, did=" << did);
1102 if (from) while (!from->is_at_end()) {
1103 Xapian::docid copy_did = from->get_docid();
1104 if (copy_did >= did) {
1105 if (copy_did == did) from->next();
1106 break;
1107 }
1108 to->append(this, copy_did, from->get_wdf());
1109 from->next();
1110 }
1111 if ((!from || from->is_at_end()) && did > max_did) {
1112 delete from;
1113 to->flush(this);
1114 delete to;
1115 max_did = get_chunk(string(), did, false, &from, &to);
1116 goto next_doclen_chunk;
1117 }
1118
1119 Xapian::termcount new_doclen = j->second;
1120 if (new_doclen != static_cast<Xapian::termcount>(-1)) {
1121 to->append(this, did, new_doclen);
1122 }
1123 }
1124
1125 if (from) {
1126 while (!from->is_at_end()) {
1127 to->append(this, from->get_docid(), from->get_wdf());
1128 from->next();
1129 }
1130 delete from;
1131 }
1132 to->flush(this);
1133 delete to;
1134 }
1135
1136 map<string, map<Xapian::docid, pair<char, Xapian::termcount> > >::const_iterator i;
1137 for (i = mod_plists.begin(); i != mod_plists.end(); ++i) {
1138 if (i->second.empty()) continue;
1139 string tname = i->first;
1140 {
1141 // Rewrite the first chunk of this posting list with the updated
1142 // termfreq and collfreq.
1143 map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> >::const_iterator deltas = freq_deltas.find(tname);
1144 Assert(deltas != freq_deltas.end());
1145
1146 string current_key = make_key(tname);
1147 string tag;
1148 (void)get_exact_entry(current_key, tag);
1149
1150 // Read start of first chunk to get termfreq and collfreq.
1151 const char *pos = tag.data();
1152 const char *end = pos + tag.size();
1153 Xapian::doccount termfreq;
1154 Xapian::termcount collfreq;
1155 Xapian::docid firstdid, lastdid;
1156 bool islast;
1157 if (pos == end) {
1158 termfreq = 0;
1159 collfreq = 0;
1160 firstdid = 0;
1161 lastdid = 0;
1162 islast = true;
1163 } else {
1164 firstdid = read_start_of_first_chunk(&pos, end,
1165 &termfreq, &collfreq);
1166 // Handle the generic start of chunk header.
1167 lastdid = read_start_of_chunk(&pos, end, firstdid, &islast);
1168 }
1169
1170 termfreq += deltas->second.first;
1171 if (termfreq == 0) {
1172 // All postings deleted! So we can shortcut by zapping the
1173 // posting list.
1174 if (islast) {
1175 // Only one entry for this posting list.
1176 del(current_key);
1177 continue;
1178 }
1179 MutableChertCursor cursor(this);
1180 bool found = cursor.find_entry(current_key);
1181 Assert(found);
1182 if (!found) continue; // Reduce damage!
1183 while (cursor.del()) {
1184 const char *kpos = cursor.current_key.data();
1185 const char *kend = kpos + cursor.current_key.size();
1186 if (!check_tname_in_key_lite(&kpos, kend, tname)) break;
1187 }
1188 continue;
1189 }
1190 collfreq += deltas->second.second;
1191
1192 // Rewrite start of first chunk to update termfreq and collfreq.
1193 string newhdr = make_start_of_first_chunk(termfreq, collfreq, firstdid);
1194 newhdr += make_start_of_chunk(islast, firstdid, lastdid);
1195 if (pos == end) {
1196 add(current_key, newhdr);
1197 } else {
1198 Assert((size_t)(pos - tag.data()) <= tag.size());
1199 tag.replace(0, pos - tag.data(), newhdr);
1200 add(current_key, tag);
1201 }
1202 }
1203 map<Xapian::docid, pair<char, Xapian::termcount> >::const_iterator j;
1204 j = i->second.begin();
1205 Assert(j != i->second.end()); // This case is caught above.
1206
1207 Xapian::docid max_did;
1208 PostlistChunkReader *from;
1209 PostlistChunkWriter *to;
1210 max_did = get_chunk(tname, j->first, j->second.first == 'A',
1211 &from, &to);
1212 for ( ; j != i->second.end(); ++j) {
1213 Xapian::docid did = j->first;
1214
1215 next_chunk:
1216 LOGLINE(DB, "Updating tname=" << tname << ", did=" << did);
1217 if (from) while (!from->is_at_end()) {
1218 Xapian::docid copy_did = from->get_docid();
1219 if (copy_did >= did) {
1220 if (copy_did == did) {
1221 Assert(j->second.first != 'A');
1222 from->next();
1223 }
1224 break;
1225 }
1226 to->append(this, copy_did, from->get_wdf());
1227 from->next();
1228 }
1229 if ((!from || from->is_at_end()) && did > max_did) {
1230 delete from;
1231 to->flush(this);
1232 delete to;
1233 max_did = get_chunk(tname, did, false, &from, &to);
1234 goto next_chunk;
1235 }
1236
1237 if (j->second.first != 'D') {
1238 Xapian::termcount new_wdf = j->second.second;
1239 to->append(this, did, new_wdf);
1240 }
1241 }
1242
1243 if (from) {
1244 while (!from->is_at_end()) {
1245 to->append(this, from->get_docid(), from->get_wdf());
1246 from->next();
1247 }
1248 delete from;
1249 }
1250 to->flush(this);
1251 delete to;
1252 }
1253 }
1254
1255 void
get_used_docid_range(Xapian::docid & first,Xapian::docid & last) const1256 ChertPostListTable::get_used_docid_range(Xapian::docid & first,
1257 Xapian::docid & last) const
1258 {
1259 LOGCALL(DB, Xapian::docid, "ChertPostList::get_used_docid_range", "&first, &used");
1260 AutoPtr<ChertCursor> cur(cursor_get());
1261 if (!cur->find_entry(pack_chert_postlist_key(string()))) {
1262 // Empty database.
1263 first = last = 0;
1264 return;
1265 }
1266
1267 cur->read_tag();
1268 const char * p = cur->current_tag.data();
1269 const char * e = p + cur->current_tag.size();
1270
1271 first = read_start_of_first_chunk(&p, e, NULL, NULL);
1272
1273 (void)cur->find_entry(pack_glass_postlist_key(string(), CHERT_MAX_DOCID));
1274 Assert(!cur->after_end());
1275
1276 const char * keypos = cur->current_key.data();
1277 const char * keyend = keypos + cur->current_key.size();
1278 // Check we're still in same postlist
1279 if (!check_tname_in_key_lite(&keypos, keyend, string())) {
1280 // Shouldn't happen - we already handled the empty database case above.
1281 Assert(false);
1282 first = last = 0;
1283 return;
1284 }
1285
1286 cur->read_tag();
1287 p = cur->current_tag.data();
1288 e = p + cur->current_tag.size();
1289
1290 Xapian::docid start_of_last_chunk;
1291 if (keypos == keyend) {
1292 start_of_last_chunk = first;
1293 first = read_start_of_first_chunk(&p, e, NULL, NULL);
1294 } else {
1295 // In normal chunk
1296 if (!C_unpack_uint_preserving_sort(&keypos, keyend,
1297 &start_of_last_chunk)) {
1298 report_read_error(keypos);
1299 }
1300 }
1301
1302 bool dummy;
1303 last = read_start_of_chunk(&p, e, start_of_last_chunk, &dummy);
1304 }
1305