1 /* brass_postlist.cc: Postlists in a brass database
2 *
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002,2003,2004,2005,2007,2008,2009,2011 Olly Betts
5 * Copyright 2007,2008,2009 Lemur Consulting Ltd
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
21 */
22
23 #include <config.h>
24
25 #include "brass_postlist.h"
26
27 #include "brass_cursor.h"
28 #include "brass_database.h"
29 #include "debuglog.h"
30 #include "noreturn.h"
31 #include "pack.h"
32 #include "str.h"
33
34 Xapian::doccount
get_termfreq(const string & term) const35 BrassPostListTable::get_termfreq(const string & term) const
36 {
37 string key = make_key(term);
38 string tag;
39 if (!get_exact_entry(key, tag)) return 0;
40
41 Xapian::doccount termfreq;
42 const char * p = tag.data();
43 BrassPostList::read_number_of_entries(&p, p + tag.size(), &termfreq, NULL);
44 return termfreq;
45 }
46
47 Xapian::termcount
get_collection_freq(const string & term) const48 BrassPostListTable::get_collection_freq(const string & term) const
49 {
50 string key = make_key(term);
51 string tag;
52 if (!get_exact_entry(key, tag)) return 0;
53
54 Xapian::termcount collfreq;
55 const char * p = tag.data();
56 BrassPostList::read_number_of_entries(&p, p + tag.size(), NULL, &collfreq);
57 return collfreq;
58 }
59
60 Xapian::termcount
get_doclength(Xapian::docid did,Xapian::Internal::RefCntPtr<const BrassDatabase> db) const61 BrassPostListTable::get_doclength(Xapian::docid did,
62 Xapian::Internal::RefCntPtr<const BrassDatabase> db) const {
63 if (!doclen_pl.get()) {
64 // Don't keep a reference back to the database, since this
65 // would make a reference loop.
66 doclen_pl.reset(new BrassPostList(db, string(), false));
67 }
68 if (!doclen_pl->jump_to(did))
69 throw Xapian::DocNotFoundError("Document " + str(did) + " not found");
70 return doclen_pl->get_wdf();
71 }
72
73 bool
document_exists(Xapian::docid did,Xapian::Internal::RefCntPtr<const BrassDatabase> db) const74 BrassPostListTable::document_exists(Xapian::docid did,
75 Xapian::Internal::RefCntPtr<const BrassDatabase> db) const
76 {
77 if (!doclen_pl.get()) {
78 // Don't keep a reference back to the database, since this
79 // would make a reference loop.
80 doclen_pl.reset(new BrassPostList(db, string(), false));
81 }
82 return (doclen_pl->jump_to(did));
83 }
84
85 // How big should chunks in the posting list be? (They
86 // will grow slightly bigger than this, but not more than a
87 // few bytes extra) - FIXME: tune this value to try to
88 // maximise how well blocks are used. Or performance.
89 // Or indexing speed. Or something...
90 const unsigned int CHUNKSIZE = 2000;
91
92 /** PostlistChunkWriter is a wrapper which acts roughly as an
93 * output iterator on a postlist chunk, taking care of the
94 * messy details. It's intended to be used with deletion and
95 * replacing of entries, not for adding to the end, when it's
96 * not really needed.
97 */
98 class Brass::PostlistChunkWriter {
99 public:
100 PostlistChunkWriter(const string &orig_key_,
101 bool is_first_chunk_,
102 const string &tname_,
103 bool is_last_chunk_);
104
105 /// Append an entry to this chunk.
106 void append(BrassTable * table, Xapian::docid did,
107 Xapian::termcount wdf);
108
109 /// Append a block of raw entries to this chunk.
raw_append(Xapian::docid first_did_,Xapian::docid current_did_,const string & s)110 void raw_append(Xapian::docid first_did_, Xapian::docid current_did_,
111 const string & s) {
112 Assert(!started);
113 first_did = first_did_;
114 current_did = current_did_;
115 if (!s.empty()) {
116 chunk.append(s);
117 started = true;
118 }
119 }
120
121 /** Flush the chunk to the buffered table. Note: this may write it
122 * with a different key to the original one, if for example the first
123 * entry has changed.
124 */
125 void flush(BrassTable *table);
126
127 private:
128 string orig_key;
129 string tname;
130 bool is_first_chunk;
131 bool is_last_chunk;
132 bool started;
133
134 Xapian::docid first_did;
135 Xapian::docid current_did;
136
137 string chunk;
138 };
139
140 using Brass::PostlistChunkWriter;
141
142 // Static functions
143
144 /// Report an error when reading the posting list.
145 XAPIAN_NORETURN(static void report_read_error(const char * position));
report_read_error(const char * position)146 static void report_read_error(const char * position)
147 {
148 if (position == 0) {
149 // data ran out
150 LOGLINE(DB, "BrassPostList data ran out");
151 throw Xapian::DatabaseCorruptError("Data ran out unexpectedly when reading posting list.");
152 }
153 // overflow
154 LOGLINE(DB, "BrassPostList value too large");
155 throw Xapian::RangeError("Value in posting list too large.");
156 }
157
get_tname_from_key(const char ** src,const char * end,string & tname)158 static inline bool get_tname_from_key(const char **src, const char *end,
159 string &tname)
160 {
161 return unpack_string_preserving_sort(src, end, tname);
162 }
163
164 static inline bool
check_tname_in_key_lite(const char ** keypos,const char * keyend,const string & tname)165 check_tname_in_key_lite(const char **keypos, const char *keyend, const string &tname)
166 {
167 string tname_in_key;
168
169 if (keyend - *keypos >= 2 && (*keypos)[0] == '\0' && (*keypos)[1] == '\xe0') {
170 *keypos += 2;
171 } else {
172 // Read the termname.
173 if (!get_tname_from_key(keypos, keyend, tname_in_key))
174 report_read_error(*keypos);
175 }
176
177 // This should only fail if the postlist doesn't exist at all.
178 return tname_in_key == tname;
179 }
180
181 static inline bool
check_tname_in_key(const char ** keypos,const char * keyend,const string & tname)182 check_tname_in_key(const char **keypos, const char *keyend, const string &tname)
183 {
184 if (*keypos == keyend) return false;
185
186 return check_tname_in_key_lite(keypos, keyend, tname);
187 }
188
189 /// Read the start of the first chunk in the posting list.
190 static Xapian::docid
read_start_of_first_chunk(const char ** posptr,const char * end,Xapian::doccount * number_of_entries_ptr,Xapian::termcount * collection_freq_ptr)191 read_start_of_first_chunk(const char ** posptr,
192 const char * end,
193 Xapian::doccount * number_of_entries_ptr,
194 Xapian::termcount * collection_freq_ptr)
195 {
196 LOGCALL_STATIC(DB, Xapian::docid, "read_start_of_first_chunk", (const void *)posptr | (const void *)end | (void *)number_of_entries_ptr | (void *)collection_freq_ptr);
197
198 BrassPostList::read_number_of_entries(posptr, end,
199 number_of_entries_ptr, collection_freq_ptr);
200 if (number_of_entries_ptr)
201 LOGVALUE(DB, *number_of_entries_ptr);
202 if (collection_freq_ptr)
203 LOGVALUE(DB, *collection_freq_ptr);
204
205 Xapian::docid did;
206 // Read the docid of the first entry in the posting list.
207 if (!unpack_uint(posptr, end, &did))
208 report_read_error(*posptr);
209 ++did;
210 LOGVALUE(DB, did);
211 RETURN(did);
212 }
213
214 static inline void
read_did_increase(const char ** posptr,const char * end,Xapian::docid * did_ptr)215 read_did_increase(const char ** posptr, const char * end,
216 Xapian::docid * did_ptr)
217 {
218 Xapian::docid did_increase;
219 if (!unpack_uint(posptr, end, &did_increase)) report_read_error(*posptr);
220 *did_ptr += did_increase + 1;
221 }
222
223 /// Read the wdf for an entry.
224 static inline void
read_wdf(const char ** posptr,const char * end,Xapian::termcount * wdf_ptr)225 read_wdf(const char ** posptr, const char * end, Xapian::termcount * wdf_ptr)
226 {
227 if (!unpack_uint(posptr, end, wdf_ptr)) report_read_error(*posptr);
228 }
229
230 /// Read the start of a chunk.
231 static Xapian::docid
read_start_of_chunk(const char ** posptr,const char * end,Xapian::docid first_did_in_chunk,bool * is_last_chunk_ptr)232 read_start_of_chunk(const char ** posptr,
233 const char * end,
234 Xapian::docid first_did_in_chunk,
235 bool * is_last_chunk_ptr)
236 {
237 LOGCALL_STATIC(DB, Xapian::docid, "read_start_of_chunk", reinterpret_cast<const void*>(posptr) | reinterpret_cast<const void*>(end) | first_did_in_chunk | reinterpret_cast<const void*>(is_last_chunk_ptr));
238 Assert(is_last_chunk_ptr);
239
240 // Read whether this is the last chunk
241 if (!unpack_bool(posptr, end, is_last_chunk_ptr))
242 report_read_error(*posptr);
243 LOGVALUE(DB, *is_last_chunk_ptr);
244
245 // Read what the final document ID in this chunk is.
246 Xapian::docid increase_to_last;
247 if (!unpack_uint(posptr, end, &increase_to_last))
248 report_read_error(*posptr);
249 Xapian::docid last_did_in_chunk = first_did_in_chunk + increase_to_last;
250 LOGVALUE(DB, last_did_in_chunk);
251 RETURN(last_did_in_chunk);
252 }
253
254 /** PostlistChunkReader is essentially an iterator wrapper
255 * around a postlist chunk. It simply iterates through the
256 * entries in a postlist.
257 */
258 class Brass::PostlistChunkReader {
259 string data;
260
261 const char *pos;
262 const char *end;
263
264 bool at_end;
265
266 Xapian::docid did;
267 Xapian::termcount wdf;
268
269 public:
270 /** Initialise the postlist chunk reader.
271 *
272 * @param first_did First document id in this chunk.
273 * @param data The tag string with the header removed.
274 */
PostlistChunkReader(Xapian::docid first_did,const string & data_)275 PostlistChunkReader(Xapian::docid first_did, const string & data_)
276 : data(data_), pos(data.data()), end(pos + data.length()), at_end(data.empty()), did(first_did)
277 {
278 if (!at_end) read_wdf(&pos, end, &wdf);
279 }
280
get_docid() const281 Xapian::docid get_docid() const {
282 return did;
283 }
get_wdf() const284 Xapian::termcount get_wdf() const {
285 return wdf;
286 }
287
is_at_end() const288 bool is_at_end() const {
289 return at_end;
290 }
291
292 /** Advance to the next entry. Set at_end if we run off the end.
293 */
294 void next();
295 };
296
297 using Brass::PostlistChunkReader;
298
299 void
next()300 PostlistChunkReader::next()
301 {
302 if (pos == end) {
303 at_end = true;
304 } else {
305 read_did_increase(&pos, end, &did);
306 read_wdf(&pos, end, &wdf);
307 }
308 }
309
PostlistChunkWriter(const string & orig_key_,bool is_first_chunk_,const string & tname_,bool is_last_chunk_)310 PostlistChunkWriter::PostlistChunkWriter(const string &orig_key_,
311 bool is_first_chunk_,
312 const string &tname_,
313 bool is_last_chunk_)
314 : orig_key(orig_key_),
315 tname(tname_), is_first_chunk(is_first_chunk_),
316 is_last_chunk(is_last_chunk_),
317 started(false)
318 {
319 LOGCALL_VOID(DB, "PostlistChunkWriter::PostlistChunkWriter", orig_key_ | is_first_chunk_ | tname_ | is_last_chunk_);
320 }
321
322 void
append(BrassTable * table,Xapian::docid did,Xapian::termcount wdf)323 PostlistChunkWriter::append(BrassTable * table, Xapian::docid did,
324 Xapian::termcount wdf)
325 {
326 if (!started) {
327 started = true;
328 first_did = did;
329 } else {
330 Assert(did > current_did);
331 // Start a new chunk if this one has grown to the threshold.
332 if (chunk.size() >= CHUNKSIZE) {
333 bool save_is_last_chunk = is_last_chunk;
334 is_last_chunk = false;
335 flush(table);
336 is_last_chunk = save_is_last_chunk;
337 is_first_chunk = false;
338 first_did = did;
339 chunk.resize(0);
340 orig_key = BrassPostListTable::make_key(tname, first_did);
341 } else {
342 pack_uint(chunk, did - current_did - 1);
343 }
344 }
345 current_did = did;
346 pack_uint(chunk, wdf);
347 }
348
349 /** Make the data to go at the start of the very first chunk.
350 */
351 static inline string
make_start_of_first_chunk(Xapian::doccount entries,Xapian::termcount collectionfreq,Xapian::docid new_did)352 make_start_of_first_chunk(Xapian::doccount entries,
353 Xapian::termcount collectionfreq,
354 Xapian::docid new_did)
355 {
356 string chunk;
357 pack_uint(chunk, entries);
358 pack_uint(chunk, collectionfreq);
359 pack_uint(chunk, new_did - 1);
360 return chunk;
361 }
362
363 /** Make the data to go at the start of a standard chunk.
364 */
365 static inline string
make_start_of_chunk(bool new_is_last_chunk,Xapian::docid new_first_did,Xapian::docid new_final_did)366 make_start_of_chunk(bool new_is_last_chunk,
367 Xapian::docid new_first_did,
368 Xapian::docid new_final_did)
369 {
370 Assert(new_final_did >= new_first_did);
371 string chunk;
372 pack_bool(chunk, new_is_last_chunk);
373 pack_uint(chunk, new_final_did - new_first_did);
374 return chunk;
375 }
376
377 static void
write_start_of_chunk(string & chunk,unsigned int start_of_chunk_header,unsigned int end_of_chunk_header,bool is_last_chunk,Xapian::docid first_did_in_chunk,Xapian::docid last_did_in_chunk)378 write_start_of_chunk(string & chunk,
379 unsigned int start_of_chunk_header,
380 unsigned int end_of_chunk_header,
381 bool is_last_chunk,
382 Xapian::docid first_did_in_chunk,
383 Xapian::docid last_did_in_chunk)
384 {
385 Assert((size_t)(end_of_chunk_header - start_of_chunk_header) <= chunk.size());
386
387 chunk.replace(start_of_chunk_header,
388 end_of_chunk_header - start_of_chunk_header,
389 make_start_of_chunk(is_last_chunk, first_did_in_chunk,
390 last_did_in_chunk));
391 }
392
393 void
flush(BrassTable * table)394 PostlistChunkWriter::flush(BrassTable *table)
395 {
396 LOGCALL_VOID(DB, "PostlistChunkWriter::flush", table);
397
398 /* This is one of the more messy parts involved with updating posting
399 * list chunks.
400 *
401 * Depending on circumstances, we may have to delete an entire chunk
402 * or file it under a different key, as well as possibly modifying both
403 * the previous and next chunk of the postlist.
404 */
405
406 if (!started) {
407 /* This chunk is now empty so disappears entirely.
408 *
409 * If this was the last chunk, then the previous chunk
410 * must have its "is_last_chunk" flag updated.
411 *
412 * If this was the first chunk, then the next chunk must
413 * be transformed into the first chunk. Messy!
414 */
415 LOGLINE(DB, "PostlistChunkWriter::flush(): deleting chunk");
416 Assert(!orig_key.empty());
417 if (is_first_chunk) {
418 LOGLINE(DB, "PostlistChunkWriter::flush(): deleting first chunk");
419 if (is_last_chunk) {
420 /* This is the first and the last chunk, ie the only
421 * chunk, so just delete the tag.
422 */
423 table->del(orig_key);
424 return;
425 }
426
427 /* This is the messiest case. The first chunk is to
428 * be removed, and there is at least one chunk after
429 * it. Need to rewrite the next chunk as the first
430 * chunk.
431 */
432 AutoPtr<BrassCursor> cursor(table->cursor_get());
433
434 if (!cursor->find_entry(orig_key)) {
435 throw Xapian::DatabaseCorruptError("The key we're working on has disappeared");
436 }
437
438 // FIXME: Currently the doclen list has a special first chunk too,
439 // which reduces special casing here. The downside is a slightly
440 // larger than necessary first chunk and needless fiddling if the
441 // first chunk is deleted. But really we should look at
442 // redesigning the whole postlist format with an eye to making it
443 // easier to update!
444
445 // Extract existing counts from the first chunk so we can reinsert
446 // them into the block we're renaming.
447 Xapian::doccount num_ent;
448 Xapian::termcount coll_freq;
449 {
450 cursor->read_tag();
451 const char *tagpos = cursor->current_tag.data();
452 const char *tagend = tagpos + cursor->current_tag.size();
453
454 (void)read_start_of_first_chunk(&tagpos, tagend,
455 &num_ent, &coll_freq);
456 }
457
458 // Seek to the next chunk.
459 cursor->next();
460 if (cursor->after_end()) {
461 throw Xapian::DatabaseCorruptError("Expected another key but found none");
462 }
463 const char *kpos = cursor->current_key.data();
464 const char *kend = kpos + cursor->current_key.size();
465 if (!check_tname_in_key(&kpos, kend, tname)) {
466 throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
467 }
468
469 // Read the new first docid
470 Xapian::docid new_first_did;
471 if (!unpack_uint_preserving_sort(&kpos, kend, &new_first_did)) {
472 report_read_error(kpos);
473 }
474
475 cursor->read_tag();
476 const char *tagpos = cursor->current_tag.data();
477 const char *tagend = tagpos + cursor->current_tag.size();
478
479 // Read the chunk header
480 bool new_is_last_chunk;
481 Xapian::docid new_last_did_in_chunk =
482 read_start_of_chunk(&tagpos, tagend, new_first_did,
483 &new_is_last_chunk);
484
485 string chunk_data(tagpos, tagend);
486
487 // First remove the renamed tag
488 table->del(cursor->current_key);
489
490 // And now write it as the first chunk
491 string tag;
492 tag = make_start_of_first_chunk(num_ent, coll_freq, new_first_did);
493 tag += make_start_of_chunk(new_is_last_chunk,
494 new_first_did,
495 new_last_did_in_chunk);
496 tag += chunk_data;
497 table->add(orig_key, tag);
498 return;
499 }
500
501 LOGLINE(DB, "PostlistChunkWriter::flush(): deleting secondary chunk");
502 /* This isn't the first chunk. Check whether we're the last chunk. */
503
504 // Delete this chunk
505 table->del(orig_key);
506
507 if (is_last_chunk) {
508 LOGLINE(DB, "PostlistChunkWriter::flush(): deleting secondary last chunk");
509 // Update the previous chunk's is_last_chunk flag.
510 AutoPtr<BrassCursor> cursor(table->cursor_get());
511
512 /* Should not find the key we just deleted, but should
513 * find the previous chunk. */
514 if (cursor->find_entry(orig_key)) {
515 throw Xapian::DatabaseCorruptError("Brass key not deleted as we expected");
516 }
517 // Make sure this is a chunk with the right term attached.
518 const char * keypos = cursor->current_key.data();
519 const char * keyend = keypos + cursor->current_key.size();
520 if (!check_tname_in_key(&keypos, keyend, tname)) {
521 throw Xapian::DatabaseCorruptError("Couldn't find chunk before delete chunk");
522 }
523
524 bool is_prev_first_chunk = (keypos == keyend);
525
526 // Now update the last_chunk
527 cursor->read_tag();
528 string tag = cursor->current_tag;
529
530 const char *tagpos = tag.data();
531 const char *tagend = tagpos + tag.size();
532
533 // Skip first chunk header
534 Xapian::docid first_did_in_chunk;
535 if (is_prev_first_chunk) {
536 first_did_in_chunk = read_start_of_first_chunk(&tagpos, tagend,
537 0, 0);
538 } else {
539 if (!unpack_uint_preserving_sort(&keypos, keyend, &first_did_in_chunk))
540 report_read_error(keypos);
541 }
542 bool wrong_is_last_chunk;
543 string::size_type start_of_chunk_header = tagpos - tag.data();
544 Xapian::docid last_did_in_chunk =
545 read_start_of_chunk(&tagpos, tagend, first_did_in_chunk,
546 &wrong_is_last_chunk);
547 string::size_type end_of_chunk_header = tagpos - tag.data();
548
549 // write new is_last flag
550 write_start_of_chunk(tag,
551 start_of_chunk_header,
552 end_of_chunk_header,
553 true, // is_last_chunk
554 first_did_in_chunk,
555 last_did_in_chunk);
556 table->add(cursor->current_key, tag);
557 }
558 } else {
559 LOGLINE(DB, "PostlistChunkWriter::flush(): updating chunk which still has items in it");
560 /* The chunk still has some items in it. Two major subcases:
561 * a) This is the first chunk.
562 * b) This isn't the first chunk.
563 *
564 * The subcases just affect the chunk header.
565 */
566 string tag;
567
568 /* First write the header, which depends on whether this is the
569 * first chunk.
570 */
571 if (is_first_chunk) {
572 /* The first chunk. This is the relatively easy case,
573 * and we just have to write this one back to disk.
574 */
575 LOGLINE(DB, "PostlistChunkWriter::flush(): rewriting the first chunk, which still has items in it");
576 string key = BrassPostListTable::make_key(tname);
577 bool ok = table->get_exact_entry(key, tag);
578 (void)ok;
579 Assert(ok);
580 Assert(!tag.empty());
581
582 Xapian::doccount num_ent;
583 Xapian::termcount coll_freq;
584 {
585 const char * tagpos = tag.data();
586 const char * tagend = tagpos + tag.size();
587 (void)read_start_of_first_chunk(&tagpos, tagend,
588 &num_ent, &coll_freq);
589 }
590
591 tag = make_start_of_first_chunk(num_ent, coll_freq, first_did);
592
593 tag += make_start_of_chunk(is_last_chunk, first_did, current_did);
594 tag += chunk;
595 table->add(key, tag);
596 return;
597 }
598
599 LOGLINE(DB, "PostlistChunkWriter::flush(): updating secondary chunk which still has items in it");
600 /* Not the first chunk.
601 *
602 * This has the easy sub-sub-case:
603 * The first entry in the chunk hasn't changed
604 * ...and the hard sub-sub-case:
605 * The first entry in the chunk has changed. This is
606 * harder because the key for the chunk changes, so
607 * we've got to do a switch.
608 */
609
610 // First find out the initial docid
611 const char *keypos = orig_key.data();
612 const char *keyend = keypos + orig_key.size();
613 if (!check_tname_in_key(&keypos, keyend, tname)) {
614 throw Xapian::DatabaseCorruptError("Have invalid key writing to postlist");
615 }
616 Xapian::docid initial_did;
617 if (!unpack_uint_preserving_sort(&keypos, keyend, &initial_did)) {
618 report_read_error(keypos);
619 }
620 string new_key;
621 if (initial_did != first_did) {
622 /* The fiddlier case:
623 * Create a new tag with the correct key, and replace
624 * the old one.
625 */
626 new_key = BrassPostListTable::make_key(tname, first_did);
627 table->del(orig_key);
628 } else {
629 new_key = orig_key;
630 }
631
632 // ...and write the start of this chunk.
633 tag = make_start_of_chunk(is_last_chunk, first_did, current_did);
634
635 tag += chunk;
636 table->add(new_key, tag);
637 }
638 }
639
640 /** Read the number of entries in the posting list.
641 * This must only be called when *posptr is pointing to the start of
642 * the first chunk of the posting list.
643 */
read_number_of_entries(const char ** posptr,const char * end,Xapian::doccount * number_of_entries_ptr,Xapian::termcount * collection_freq_ptr)644 void BrassPostList::read_number_of_entries(const char ** posptr,
645 const char * end,
646 Xapian::doccount * number_of_entries_ptr,
647 Xapian::termcount * collection_freq_ptr)
648 {
649 if (!unpack_uint(posptr, end, number_of_entries_ptr))
650 report_read_error(*posptr);
651 if (!unpack_uint(posptr, end, collection_freq_ptr))
652 report_read_error(*posptr);
653 }
654
655 /** The format of a postlist is:
656 *
657 * Split into chunks. Key for first chunk is the termname (encoded as
658 * length : name). Key for subsequent chunks is the same, followed by the
659 * document ID of the first document in the chunk (encoded as length of
660 * representation in first byte, and then docid).
661 *
662 * A chunk (except for the first chunk) contains:
663 *
664 * 1) bool - true if this is the last chunk.
665 * 2) difference between final docid in chunk and first docid.
666 * 3) wdf for the first item.
667 * 4) increment in docid to next item, followed by wdf for the item.
668 * 5) (4) repeatedly.
669 *
670 * The first chunk begins with the number of entries, the collection
671 * frequency, then the docid of the first document, then has the header of a
672 * standard chunk.
673 */
BrassPostList(Xapian::Internal::RefCntPtr<const BrassDatabase> this_db_,const string & term_,bool keep_reference)674 BrassPostList::BrassPostList(Xapian::Internal::RefCntPtr<const BrassDatabase> this_db_,
675 const string & term_,
676 bool keep_reference)
677 : LeafPostList(term_),
678 this_db(keep_reference ? this_db_ : NULL),
679 have_started(false),
680 is_at_end(false),
681 cursor(this_db_->postlist_table.cursor_get())
682 {
683 LOGCALL_VOID(DB, "BrassPostList::BrassPostList", this_db_.get() | term_ | keep_reference);
684 string key = BrassPostListTable::make_key(term);
685 int found = cursor->find_entry(key);
686 if (!found) {
687 LOGLINE(DB, "postlist for term not found");
688 number_of_entries = 0;
689 is_at_end = true;
690 pos = 0;
691 end = 0;
692 first_did_in_chunk = 0;
693 last_did_in_chunk = 0;
694 return;
695 }
696 cursor->read_tag();
697 pos = cursor->current_tag.data();
698 end = pos + cursor->current_tag.size();
699
700 did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
701 first_did_in_chunk = did;
702 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
703 &is_last_chunk);
704 read_wdf(&pos, end, &wdf);
705 LOGLINE(DB, "Initial docid " << did);
706 }
707
~BrassPostList()708 BrassPostList::~BrassPostList()
709 {
710 LOGCALL_DTOR(DB, "BrassPostList");
711 }
712
713 Xapian::termcount
get_doclength() const714 BrassPostList::get_doclength() const
715 {
716 LOGCALL(DB, Xapian::termcount, "BrassPostList::get_doclength", NO_ARGS);
717 Assert(have_started);
718 Assert(this_db.get());
719 RETURN(this_db->get_doclength(did));
720 }
721
722 bool
next_in_chunk()723 BrassPostList::next_in_chunk()
724 {
725 LOGCALL(DB, bool, "BrassPostList::next_in_chunk", NO_ARGS);
726 if (pos == end) RETURN(false);
727
728 read_did_increase(&pos, end, &did);
729 read_wdf(&pos, end, &wdf);
730
731 // Either not at last doc in chunk, or pos == end, but not both.
732 Assert(did <= last_did_in_chunk);
733 Assert(did < last_did_in_chunk || pos == end);
734 Assert(pos != end || did == last_did_in_chunk);
735
736 RETURN(true);
737 }
738
739 void
next_chunk()740 BrassPostList::next_chunk()
741 {
742 LOGCALL_VOID(DB, "BrassPostList::next_chunk", NO_ARGS);
743 if (is_last_chunk) {
744 is_at_end = true;
745 return;
746 }
747
748 cursor->next();
749 if (cursor->after_end()) {
750 is_at_end = true;
751 throw Xapian::DatabaseCorruptError("Unexpected end of posting list for `" +
752 term + "'");
753 }
754 const char * keypos = cursor->current_key.data();
755 const char * keyend = keypos + cursor->current_key.size();
756 // Check we're still in same postlist
757 if (!check_tname_in_key_lite(&keypos, keyend, term)) {
758 is_at_end = true;
759 throw Xapian::DatabaseCorruptError("Unexpected end of posting list for `" +
760 term + "'");
761 }
762
763 Xapian::docid newdid;
764 if (!unpack_uint_preserving_sort(&keypos, keyend, &newdid)) {
765 report_read_error(keypos);
766 }
767 if (newdid <= did) {
768 throw Xapian::DatabaseCorruptError("Document ID in new chunk of postlist (" +
769 str(newdid) +
770 ") is not greater than final document ID in previous chunk (" +
771 str(did) + ")");
772 }
773 did = newdid;
774
775 cursor->read_tag();
776 pos = cursor->current_tag.data();
777 end = pos + cursor->current_tag.size();
778
779 first_did_in_chunk = did;
780 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
781 &is_last_chunk);
782 read_wdf(&pos, end, &wdf);
783 }
784
785 PositionList *
read_position_list()786 BrassPostList::read_position_list()
787 {
788 LOGCALL(DB, PositionList *, "BrassPostList::read_position_list", NO_ARGS);
789 Assert(this_db.get());
790 positionlist.read_data(&this_db->position_table, did, term);
791 RETURN(&positionlist);
792 }
793
794 PositionList *
open_position_list() const795 BrassPostList::open_position_list() const
796 {
797 LOGCALL(DB, PositionList *, "BrassPostList::open_position_list", NO_ARGS);
798 Assert(this_db.get());
799 RETURN(new BrassPositionList(&this_db->position_table, did, term));
800 }
801
802 PostList *
next(Xapian::weight w_min)803 BrassPostList::next(Xapian::weight w_min)
804 {
805 LOGCALL(DB, PostList *, "BrassPostList::next", w_min);
806 (void)w_min; // no warning
807
808 if (!have_started) {
809 have_started = true;
810 } else {
811 if (!next_in_chunk()) next_chunk();
812 }
813
814 if (is_at_end) {
815 LOGLINE(DB, "Moved to end");
816 } else {
817 LOGLINE(DB, "Moved to docid " << did << ", wdf = " << wdf);
818 }
819
820 RETURN(NULL);
821 }
822
823 bool
current_chunk_contains(Xapian::docid desired_did)824 BrassPostList::current_chunk_contains(Xapian::docid desired_did)
825 {
826 LOGCALL(DB, bool, "BrassPostList::current_chunk_contains", desired_did);
827 if (desired_did >= first_did_in_chunk &&
828 desired_did <= last_did_in_chunk) {
829 RETURN(true);
830 }
831 RETURN(false);
832 }
833
834 void
move_to_chunk_containing(Xapian::docid desired_did)835 BrassPostList::move_to_chunk_containing(Xapian::docid desired_did)
836 {
837 LOGCALL_VOID(DB, "BrassPostList::move_to_chunk_containing", desired_did);
838 (void)cursor->find_entry(BrassPostListTable::make_key(term, desired_did));
839 Assert(!cursor->after_end());
840
841 const char * keypos = cursor->current_key.data();
842 const char * keyend = keypos + cursor->current_key.size();
843 // Check we're still in same postlist
844 if (!check_tname_in_key_lite(&keypos, keyend, term)) {
845 // This should only happen if the postlist doesn't exist at all.
846 is_at_end = true;
847 is_last_chunk = true;
848 return;
849 }
850 is_at_end = false;
851
852 cursor->read_tag();
853 pos = cursor->current_tag.data();
854 end = pos + cursor->current_tag.size();
855
856 if (keypos == keyend) {
857 // In first chunk
858 #ifdef XAPIAN_ASSERTIONS
859 Xapian::doccount old_number_of_entries = number_of_entries;
860 did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
861 Assert(old_number_of_entries == number_of_entries);
862 #else
863 did = read_start_of_first_chunk(&pos, end, NULL, NULL);
864 #endif
865 } else {
866 // In normal chunk
867 if (!unpack_uint_preserving_sort(&keypos, keyend, &did)) {
868 report_read_error(keypos);
869 }
870 }
871
872 first_did_in_chunk = did;
873 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
874 &is_last_chunk);
875 read_wdf(&pos, end, &wdf);
876
877 // Possible, since desired_did might be after end of this chunk and before
878 // the next.
879 if (desired_did > last_did_in_chunk) next_chunk();
880 }
881
882 bool
move_forward_in_chunk_to_at_least(Xapian::docid desired_did)883 BrassPostList::move_forward_in_chunk_to_at_least(Xapian::docid desired_did)
884 {
885 LOGCALL(DB, bool, "BrassPostList::move_forward_in_chunk_to_at_least", desired_did);
886 if (did >= desired_did)
887 RETURN(true);
888
889 if (desired_did <= last_did_in_chunk) {
890 while (pos != end) {
891 read_did_increase(&pos, end, &did);
892 if (did >= desired_did) {
893 read_wdf(&pos, end, &wdf);
894 RETURN(true);
895 }
896 // It's faster to just skip over the wdf than to decode it.
897 read_wdf(&pos, end, NULL);
898 }
899
900 // If we hit the end of the chunk then last_did_in_chunk must be wrong.
901 Assert(false);
902 }
903
904 pos = end;
905 RETURN(false);
906 }
907
908 PostList *
skip_to(Xapian::docid desired_did,Xapian::weight w_min)909 BrassPostList::skip_to(Xapian::docid desired_did, Xapian::weight w_min)
910 {
911 LOGCALL(DB, PostList *, "BrassPostList::skip_to", desired_did | w_min);
912 (void)w_min; // no warning
913 // We've started now - if we hadn't already, we're already positioned
914 // at start so there's no need to actually do anything.
915 have_started = true;
916
917 // Don't skip back, and don't need to do anything if already there.
918 if (is_at_end || desired_did <= did) RETURN(NULL);
919
920 // Move to correct chunk
921 if (!current_chunk_contains(desired_did)) {
922 move_to_chunk_containing(desired_did);
923 // Might be at_end now, so we need to check before trying to move
924 // forward in chunk.
925 if (is_at_end) RETURN(NULL);
926 }
927
928 // Move to correct position in chunk
929 bool have_document = move_forward_in_chunk_to_at_least(desired_did);
930 (void)have_document;
931 Assert(have_document);
932
933 if (is_at_end) {
934 LOGLINE(DB, "Skipped to end");
935 } else {
936 LOGLINE(DB, "Skipped to docid " << did << ", wdf = " << wdf);
937 }
938
939 RETURN(NULL);
940 }
941
942 // Used for doclens.
943 bool
jump_to(Xapian::docid desired_did)944 BrassPostList::jump_to(Xapian::docid desired_did)
945 {
946 LOGCALL(DB, bool, "BrassPostList::jump_to", desired_did);
947 // We've started now - if we hadn't already, we're already positioned
948 // at start so there's no need to actually do anything.
949 have_started = true;
950
951 // If the list is empty, give up right away.
952 if (pos == 0) RETURN(false);
953
954 // Move to correct chunk, or reload the current chunk to go backwards in it
955 // (FIXME: perhaps handle the latter case more elegantly, though it won't
956 // happen during sequential access which is most common).
957 if (is_at_end || !current_chunk_contains(desired_did) || desired_did < did) {
958 // Clear is_at_end flag since we can rewind.
959 is_at_end = false;
960
961 move_to_chunk_containing(desired_did);
962 // Might be at_end now, so we need to check before trying to move
963 // forward in chunk.
964 if (is_at_end) RETURN(false);
965 }
966
967 // Move to correct position in chunk.
968 if (!move_forward_in_chunk_to_at_least(desired_did)) RETURN(false);
969 RETURN(desired_did == did);
970 }
971
972 string
get_description() const973 BrassPostList::get_description() const
974 {
975 return term + ":" + str(number_of_entries);
976 }
977
978 // Returns the last did to allow in this chunk.
979 Xapian::docid
get_chunk(const string & tname,Xapian::docid did,bool adding,PostlistChunkReader ** from,PostlistChunkWriter ** to)980 BrassPostListTable::get_chunk(const string &tname,
981 Xapian::docid did, bool adding,
982 PostlistChunkReader ** from, PostlistChunkWriter **to)
983 {
984 LOGCALL(DB, Xapian::docid, "BrassPostListTable::get_chunk", tname | did | adding | from | to);
985 // Get chunk containing entry
986 string key = make_key(tname, did);
987
988 // Find the right chunk
989 AutoPtr<BrassCursor> cursor(cursor_get());
990
991 (void)cursor->find_entry(key);
992 Assert(!cursor->after_end());
993
994 const char * keypos = cursor->current_key.data();
995 const char * keyend = keypos + cursor->current_key.size();
996
997 if (!check_tname_in_key(&keypos, keyend, tname)) {
998 // Postlist for this termname doesn't exist.
999 //
1000 // NB "adding" will only be true if we are adding, but it may sometimes
1001 // be false in some cases where we are actually adding.
1002 if (!adding)
1003 throw Xapian::DatabaseCorruptError("Attempted to delete or modify an entry in a non-existent posting list for " + tname);
1004
1005 *from = NULL;
1006 *to = new PostlistChunkWriter(string(), true, tname, true);
1007 RETURN(Xapian::docid(-1));
1008 }
1009
1010 // See if we're appending - if so we can shortcut by just copying
1011 // the data part of the chunk wholesale.
1012 bool is_first_chunk = (keypos == keyend);
1013 LOGVALUE(DB, is_first_chunk);
1014
1015 cursor->read_tag();
1016 const char * pos = cursor->current_tag.data();
1017 const char * end = pos + cursor->current_tag.size();
1018 Xapian::docid first_did_in_chunk;
1019 if (is_first_chunk) {
1020 first_did_in_chunk = read_start_of_first_chunk(&pos, end, NULL, NULL);
1021 } else {
1022 if (!unpack_uint_preserving_sort(&keypos, keyend, &first_did_in_chunk)) {
1023 report_read_error(keypos);
1024 }
1025 }
1026
1027 bool is_last_chunk;
1028 Xapian::docid last_did_in_chunk;
1029 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, &is_last_chunk);
1030 *to = new PostlistChunkWriter(cursor->current_key, is_first_chunk, tname,
1031 is_last_chunk);
1032 if (did > last_did_in_chunk) {
1033 // This is the shortcut. Not very pretty, but I'll leave refactoring
1034 // until I've a clearer picture of everything which needs to be done.
1035 // (FIXME)
1036 *from = NULL;
1037 (*to)->raw_append(first_did_in_chunk, last_did_in_chunk,
1038 string(pos, end));
1039 } else {
1040 *from = new PostlistChunkReader(first_did_in_chunk, string(pos, end));
1041 }
1042 if (is_last_chunk) RETURN(Xapian::docid(-1));
1043
1044 // Find first did of next tag.
1045 cursor->next();
1046 if (cursor->after_end()) {
1047 throw Xapian::DatabaseCorruptError("Expected another key but found none");
1048 }
1049 const char *kpos = cursor->current_key.data();
1050 const char *kend = kpos + cursor->current_key.size();
1051 if (!check_tname_in_key(&kpos, kend, tname)) {
1052 throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
1053 }
1054
1055 // Read the new first docid
1056 Xapian::docid first_did_of_next_chunk;
1057 if (!unpack_uint_preserving_sort(&kpos, kend, &first_did_of_next_chunk)) {
1058 report_read_error(kpos);
1059 }
1060 RETURN(first_did_of_next_chunk - 1);
1061 }
1062
1063 void
merge_doclen_changes(const map<Xapian::docid,Xapian::termcount> & doclens)1064 BrassPostListTable::merge_doclen_changes(const map<Xapian::docid, Xapian::termcount> & doclens)
1065 {
1066 LOGCALL_VOID(DB, "BrassPostListTable::merge_doclen_changes", doclens);
1067
1068 // The cursor in the doclen_pl will no longer be valid, so reset it.
1069 doclen_pl.reset(0);
1070
1071 LOGVALUE(DB, doclens.size());
1072 if (doclens.empty()) return;
1073
1074 // Ensure there's a first chunk.
1075 string current_key = make_key(string());
1076 if (!key_exists(current_key)) {
1077 LOGLINE(DB, "Adding dummy first chunk");
1078 string newtag = make_start_of_first_chunk(0, 0, 0);
1079 newtag += make_start_of_chunk(true, 0, 0);
1080 add(current_key, newtag);
1081 }
1082
1083 map<Xapian::docid, Xapian::termcount>::const_iterator j;
1084 j = doclens.begin();
1085 Assert(j != doclens.end()); // This case is caught above.
1086
1087 Xapian::docid max_did;
1088 PostlistChunkReader *from;
1089 PostlistChunkWriter *to;
1090 max_did = get_chunk(string(), j->first, true, &from, &to);
1091 LOGVALUE(DB, max_did);
1092 for ( ; j != doclens.end(); ++j) {
1093 Xapian::docid did = j->first;
1094
1095 next_doclen_chunk:
1096 LOGLINE(DB, "Updating doclens, did=" << did);
1097 if (from) while (!from->is_at_end()) {
1098 Xapian::docid copy_did = from->get_docid();
1099 if (copy_did >= did) {
1100 if (copy_did == did) from->next();
1101 break;
1102 }
1103 to->append(this, copy_did, from->get_wdf());
1104 from->next();
1105 }
1106 if ((!from || from->is_at_end()) && did > max_did) {
1107 delete from;
1108 to->flush(this);
1109 delete to;
1110 max_did = get_chunk(string(), did, false, &from, &to);
1111 goto next_doclen_chunk;
1112 }
1113
1114 Xapian::termcount new_doclen = j->second;
1115 if (new_doclen != static_cast<Xapian::termcount>(-1)) {
1116 to->append(this, did, new_doclen);
1117 }
1118 }
1119
1120 if (from) {
1121 while (!from->is_at_end()) {
1122 to->append(this, from->get_docid(), from->get_wdf());
1123 from->next();
1124 }
1125 delete from;
1126 }
1127 to->flush(this);
1128 delete to;
1129 }
1130
1131 void
merge_changes(const string & term,const Inverter::PostingChanges & changes)1132 BrassPostListTable::merge_changes(const string &term,
1133 const Inverter::PostingChanges & changes)
1134 {
1135 {
1136 // Rewrite the first chunk of this posting list with the updated
1137 // termfreq and collfreq.
1138 string current_key = make_key(term);
1139 string tag;
1140 (void)get_exact_entry(current_key, tag);
1141
1142 // Read start of first chunk to get termfreq and collfreq.
1143 const char *pos = tag.data();
1144 const char *end = pos + tag.size();
1145 Xapian::doccount termfreq;
1146 Xapian::termcount collfreq;
1147 Xapian::docid firstdid, lastdid;
1148 bool islast;
1149 if (pos == end) {
1150 termfreq = 0;
1151 collfreq = 0;
1152 firstdid = 0;
1153 lastdid = 0;
1154 islast = true;
1155 } else {
1156 firstdid = read_start_of_first_chunk(&pos, end,
1157 &termfreq, &collfreq);
1158 // Handle the generic start of chunk header.
1159 lastdid = read_start_of_chunk(&pos, end, firstdid, &islast);
1160 }
1161
1162 termfreq += changes.get_tfdelta();
1163 if (termfreq == 0) {
1164 // All postings deleted! So we can shortcut by zapping the
1165 // posting list.
1166 if (islast) {
1167 // Only one entry for this posting list.
1168 del(current_key);
1169 return;
1170 }
1171 MutableBrassCursor cursor(this);
1172 bool found = cursor.find_entry(current_key);
1173 Assert(found);
1174 if (!found) return; // Reduce damage!
1175 while (cursor.del()) {
1176 const char *kpos = cursor.current_key.data();
1177 const char *kend = kpos + cursor.current_key.size();
1178 if (!check_tname_in_key_lite(&kpos, kend, term)) break;
1179 }
1180 return;
1181 }
1182 collfreq += changes.get_cfdelta();
1183
1184 // Rewrite start of first chunk to update termfreq and collfreq.
1185 string newhdr = make_start_of_first_chunk(termfreq, collfreq, firstdid);
1186 newhdr += make_start_of_chunk(islast, firstdid, lastdid);
1187 if (pos == end) {
1188 add(current_key, newhdr);
1189 } else {
1190 Assert((size_t)(pos - tag.data()) <= tag.size());
1191 tag.replace(0, pos - tag.data(), newhdr);
1192 add(current_key, tag);
1193 }
1194 }
1195 map<Xapian::docid, Xapian::termcount>::const_iterator j;
1196 j = changes.pl_changes.begin();
1197 Assert(j != changes.pl_changes.end()); // This case is caught above.
1198
1199 Xapian::docid max_did;
1200 PostlistChunkReader *from;
1201 PostlistChunkWriter *to;
1202 max_did = get_chunk(term, j->first, false, &from, &to);
1203 for ( ; j != changes.pl_changes.end(); ++j) {
1204 Xapian::docid did = j->first;
1205
1206 next_chunk:
1207 LOGLINE(DB, "Updating term=" << term << ", did=" << did);
1208 if (from) while (!from->is_at_end()) {
1209 Xapian::docid copy_did = from->get_docid();
1210 if (copy_did >= did) {
1211 if (copy_did == did) {
1212 from->next();
1213 }
1214 break;
1215 }
1216 to->append(this, copy_did, from->get_wdf());
1217 from->next();
1218 }
1219 if ((!from || from->is_at_end()) && did > max_did) {
1220 delete from;
1221 to->flush(this);
1222 delete to;
1223 max_did = get_chunk(term, did, false, &from, &to);
1224 goto next_chunk;
1225 }
1226
1227 Xapian::termcount new_wdf = j->second;
1228 if (new_wdf != Xapian::termcount(-1)) {
1229 to->append(this, did, new_wdf);
1230 }
1231 }
1232
1233 if (from) {
1234 while (!from->is_at_end()) {
1235 to->append(this, from->get_docid(), from->get_wdf());
1236 from->next();
1237 }
1238 delete from;
1239 }
1240 to->flush(this);
1241 delete to;
1242 }
1243