1 /*
2 * Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * 3. The name "Carnegie Mellon University" must not be used to
17 * endorse or promote products derived from this software without
18 * prior written permission. For permission or any legal
19 * details, please contact
20 * Carnegie Mellon University
21 * Center for Technology Transfer and Enterprise Creation
22 * 4615 Forbes Avenue
23 * Suite 302
24 * Pittsburgh, PA 15213
25 * (412) 268-7393, fax: (412) 268-7395
26 * innovation@andrew.cmu.edu
27 *
28 * 4. Redistributions of any form whatsoever must retain the following
29 * acknowledgment:
30 * "This product includes software developed by Computing Services
31 * at Carnegie Mellon University (http://www.cmu.edu/computing/)."
32 *
33 * CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
34 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
35 * AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
36 * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
37 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
38 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
39 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
40 */
41
42 /*
43 SQUAT code for building indexes.
44 Robert O'Callahan
45
46 IMPLEMENTATION NOTES:
47
48 The basic strategy here is pretty simple. During the index build
49 process we keep 256 temporary files. Each time we read a source
50 document, we add all its words that start with byte i, along with
51 the document ID, to file #i. Once we've seen all the source
52 documents we proceed through each temporary file #i, one by one,
53 constructing a trie of all the words starting with byte i, and which
54 stores the IDs of the documents that contain each word. When we get
55 to the end of each temporary file, we can write out the trie to the
56 index file and start all over again on the next temporary file.
57
58 This is marvellously scalable! During the document reading phase,
59 we're just dumping data out into temporary files, and the amount of
60 data we dump out is proportional to the total size of the source
61 documents. (In the worst case, with large input files of random
62 data, we write out 3 bytes per input byte into temporary files.)
63 During the trie-building phase, we reread the temporary files and
64 output the final index. In this phase we consume a fair bit of
65 memory, but in the worst case only 8 bytes per document ID per word
66 which starts with the right byte. Even in the very worst case, if
67 there were gigabytes of random data, there are only 2^24 possible
68 such words, and in practice of course there are far fewer.
69
70 In practice performance is dominated by sequential I/O. On my email,
71 I can index half a megabyte of source text per second on a
72 single-disk desktop PC.
73
74 The same trie data structures are used to build tries to record the
75 words used in a particular document (while the source document is
76 being fed in) and to build tries to record the words used in all
77 documents that start with a given byte (while we process each
78 temporary file).
79
80 Each "per document" trie stores all words occurring in the
81 document. We make it a depth 3 trie, and at the leaves we store a
82 bit vector recording which words are present in the document, with a
83 bit set to 1 if a word occurs with its 4th character set to the
84 corresponding byte.
85
86 Each "all document" trie assumes a fixed first word byte, and
87 therefore is only of depth 3. The leaves store the list of document
88 IDs containing the word.
89 */
90
91 #include <config.h>
92
93 #include <stdlib.h>
94 #include <string.h>
95 #include <unistd.h>
96 #include <sys/mman.h>
97
98 #include "squat_internal.h"
99 #include "mailbox.h"
100 #include "message.h"
101
102 #include "assert.h"
103 #include "util.h"
104 #include "index.h"
105 #include "xmalloc.h"
106
/* A simple write-buffering module which avoids copying of the output data. */
typedef struct {
    struct buf buf;         /* The extending malloc'ed buffer holding
                               not-yet-flushed output */
    int fd;                 /* The fd to write to. */
    int total_output_bytes; /* How much data have we written out
                               through this buffer in total (flushed or
                               still pending)? */
} SquatWriteBuffer;
115
/* Prepare 'b' to buffer writes of up to 'buf_size' bytes destined for 'fd'.
   Always returns SQUAT_OK. */
static int init_write_buffer(SquatWriteBuffer *b, int buf_size, int fd)
{
    buf_ensure(&b->buf, buf_size);
    b->total_output_bytes = 0;
    b->fd = fd;

    return SQUAT_OK;
}
124
125 /* Make sure that there is enough space in the buffer to write 'len' bytes.
126 Return a pointer to where the written data should be placed. */
prepare_buffered_write(SquatWriteBuffer * b,int len)127 static char *prepare_buffered_write(SquatWriteBuffer *b, int len)
128 {
129 if (b->buf.len + len >= b->buf.alloc) {
130 if (write(b->fd, b->buf.s, b->buf.len) != (long)b->buf.len) {
131 squat_set_last_error(SQUAT_ERR_SYSERR);
132 return NULL;
133 }
134 buf_reset(&b->buf);
135 buf_ensure(&b->buf, len);
136 }
137
138 return b->buf.s + b->buf.len;
139 }
140
141 /* Signal that data has been written up to the mark 'ptr'.
142 Call this after prepare_buffered_write. */
complete_buffered_write(SquatWriteBuffer * b,char * ptr)143 static void complete_buffered_write(SquatWriteBuffer *b, char *ptr)
144 {
145 int oldbytes = b->buf.len;
146 int newbytes = ptr - b->buf.s;
147 buf_truncate(&b->buf, newbytes);
148 b->total_output_bytes += newbytes - oldbytes;
149 }
150
151 /* Flush the output buffer to the file. Reset the file pointer to the start
152 of the file. */
flush_and_reset_buffered_writes(SquatWriteBuffer * b)153 static int flush_and_reset_buffered_writes(SquatWriteBuffer *b)
154 {
155 if (b->buf.len) {
156 if (write(b->fd, b->buf.s, b->buf.len) != (long)b->buf.len) {
157 squat_set_last_error(SQUAT_ERR_SYSERR);
158 return SQUAT_ERR;
159 }
160 buf_reset(&b->buf);
161 }
162
163 if (lseek(b->fd, 0, SEEK_SET) != 0) {
164 squat_set_last_error(SQUAT_ERR_SYSERR);
165 return SQUAT_ERR;
166 }
167
168 return SQUAT_OK;
169 }
170
/* A circular linked list of document IDs, stored in increasing order
   of document ID. */
typedef struct _WordDocEntry {
    struct _WordDocEntry *next; /* next entry; the last entry links back
                                   to the first, making the list circular */
    int doc_ID;                 /* the document containing the word */
} WordDocEntry;
177
/* These form the leaves of the "all documents" tries. For each of the
   256 words with trailing byte 'i', docs[i] is NULL if the word does
   not occur in any document, otherwise it is the head of a linked
   list of document IDs for the documents which contain the word. */
typedef struct {
    short first_valid_entry;  /* We record the first and last valid
                                 entries in the array below. These could
                                 be computed by just scanning the array,
                                 but it turns out that in practice such
                                 array scanning dominates the CPU
                                 consumption of the indexer. We get
                                 major speedup by maintaining these
                                 entries on the fly. */
    short last_valid_entry;
    WordDocEntry *docs[256];  /* Pointers to the document ID lists for
                                 each of the 256 words rooted at this
                                 part of the trie. Each non-NULL pointer
                                 points to the LAST element of the
                                 linked list (i.e. the entry with the
                                 highest document ID). This means we can
                                 efficiently add to the end of the
                                 linked list, and also efficiently get
                                 to the start of the linked list (the
                                 element with lowest document ID)
                                 (because it's circular). */
} SquatWordTableLeafDocs;
204
/* These form the leaves of the "per document" tries. For each of the
   256 words with trailing byte 'i', presence[i >> 3] & (1 << (i & 7))
   is 1 if the word occurs in the document, otherwise 0. */
typedef struct {
    short first_valid_entry; /* We record the first and last valid
                                entries (bit indices, 0..255) in the bit
                                vector below. These could be computed by
                                just scanning the array, but we get
                                significant speedup by maintaining them
                                here. */
    short last_valid_entry;
    char presence[32];       /* 256-bit presence bitmap */
} SquatWordTableLeafPresence;
217
/* This is an entry in a trie. The variants share storage; which one is
   live depends on the trie's mode and depth, and on the pointer's low
   tag bit (see below). */
typedef union _SquatWordTableEntry {
    struct _SquatWordTable *table; /* This is a branch node */

    /* These variants are used for leaves of "per document" tries.
       They are distinguished by the value of the low bit: a real
       heap-allocated bitmap pointer always has low bit 0. */
    SquatWordTableLeafPresence *leaf_presence; /* low bit is 0 */
    int leaf_presence_singleton; /* low bit is 1; bits 1..8 hold the
                                    single final byte seen so far */

    /* This variant is used for leaves of "all document" tries. */
    SquatWordTableLeafDocs *leaf_docs;
} SquatWordTableEntry;
230
/* This is a trie branch node. */
typedef struct _SquatWordTable {
    short first_valid_entry; /* We record the first and last valid
                                entries in the array below, as in the
                                above data structures. 256/0 means
                                "no valid entries". */
    short last_valid_entry;
    SquatWordTableEntry entries[256]; /* one child per possible byte */
} SquatWordTable;
239
/* Map docIDs in existing index to docIDs in the new index */
struct doc_ID_map {
    int *map;  /* per old document: the docID assigned in the new index,
                  or 0 for documents dropped from the new index */
    int alloc; /* number of entries allocated in 'map' */
    int max;   /* number of entries filled in so far */
    int new;   /* count of documents carried over; also the next new
                  docID value to hand out */
};
247
/* The state of an index build in progress. */
struct _SquatIndex {
    char *tmp_path;                    /* Saved tmp_path option, with
                                          the temporary filename
                                          pattern appended */
    SquatWriteBuffer out;              /* The buffer for the index file itself */
    char *doc_ID_list;                 /* A buffer where we hold the
                                          encoded array that maps from
                                          a document ID to the offset
                                          of the document record within
                                          the index file. */
    int doc_ID_list_size;              /* The allocated size of the
                                          above buffer, measured in
                                          multiples of
                                          sizeof(SquatInt32) (i.e., 4) */
    int current_doc_ID;                /* The current document
                                          ID. Document IDs are numbered
                                          starting at zero and
                                          incremented by 1 every time
                                          we finish processing a source
                                          document. */
    int current_doc_len;               /* The total number of bytes
                                          processed in the current
                                          source document. */
    SquatWordTable *doc_word_table;    /* The root of the trie being
                                          built for the current
                                          document or for the current
                                          initial byte. */
    char runover_buf[SQUAT_WORD_SIZE]; /* holds the last runover_len
                                          bytes of the current source
                                          document */
    int runover_len;
    WordDocEntry *word_doc_allocator;  /* A preallocated buffer of
                                          WordDocEntries; this pointer
                                          is bumped up one every
                                          allocation */
    unsigned char valid_char_bits[32]; /* Saved valid_char_bits option */
    SquatStatsCallback stats_callback; /* Saved stats_callback option */
    void *stats_callback_closure;

    SquatSearchIndex *old_index;       /* Link to old index in incremental */
    struct doc_ID_map doc_ID_map;      /* Map doc_IDs in old index to new */
    SquatDocChooserCallback select_doc; /* Decide whether we want doc in new */
    void *select_doc_closure;          /* Data for handler */

    /* put the big structures at the end */

    SquatWriteBuffer index_buffers[256]; /* Buffers for the temporary
                                            files, one for each first
                                            byte of words occurring in
                                            the source documents */
    int total_num_words[256];          /* total number of words starting with
                                          given char */
    int doc_words[256];                /* number of words in current document
                                          starting with given char */
};
303
304 /* ====================================================================== */
305
306 /* Collection of utility routines to maintain mapping between doc_ID in
307 * the old and new squat files. Not a one to one map as old documents
308 * (old messages in Cyrus) may have been deleted.
309 */
310
311 /* Copy existing document details verbatim from old to new index */
squat_index_copy_document(SquatIndex * index,char const * name,SquatInt64 size)312 static int squat_index_copy_document(SquatIndex *index, char const *name,
313 SquatInt64 size)
314 {
315 char *buf;
316 int r = squat_index_open_document(index, name);
317
318 if (r != SQUAT_OK)
319 return (r);
320
321 squat_set_last_error(SQUAT_ERR_OK);
322 if ((buf = prepare_buffered_write(&index->out, 10)) == NULL) {
323 return SQUAT_ERR;
324 }
325 buf = squat_encode_I(buf, size);
326 complete_buffered_write(&index->out, buf);
327
328 index->current_doc_len = -1;
329 index->current_doc_ID++;
330
331 return SQUAT_OK;
332 }
333
doc_ID_map_init(struct doc_ID_map * doc_ID_map)334 static void doc_ID_map_init(struct doc_ID_map *doc_ID_map)
335 {
336 doc_ID_map->alloc = 50;
337 doc_ID_map->map = xmalloc(doc_ID_map->alloc * sizeof(int));
338 doc_ID_map->max = 0;
339 doc_ID_map->new = 0;
340 }
341
doc_ID_map_free(struct doc_ID_map * doc_ID_map)342 static void doc_ID_map_free(struct doc_ID_map *doc_ID_map)
343 {
344 if (doc_ID_map->map)
345 free(doc_ID_map->map);
346
347 memset(doc_ID_map, 0, sizeof(struct doc_ID_map));
348 }
349
doc_ID_map_add(struct doc_ID_map * doc_ID_map,int exists)350 static void doc_ID_map_add(struct doc_ID_map *doc_ID_map, int exists)
351 {
352 if (doc_ID_map->max == doc_ID_map->alloc) {
353 doc_ID_map->alloc *= 2;
354 doc_ID_map->map =
355 xrealloc(doc_ID_map->map, doc_ID_map->alloc * sizeof(int));
356 }
357 if (exists) {
358 doc_ID_map->map[doc_ID_map->max++] = doc_ID_map->new++;
359 } else {
360 doc_ID_map->map[doc_ID_map->max++] = 0; /* Does not exist in new index */
361 }
362 }
363
/* Translate a docID from the old index into the corresponding docID in
   the new index. Returns 0 when the docID is out of range or the
   document was not carried over.
   NOTE(review): doc_ID_map_add fills map[0..max-1], yet this accepts
   docID == max and indexes map[docID] directly, never reading map[0].
   That looks like an off-by-one unless old docIDs are 1-based with a
   reserved slot — confirm against the old-index docID numbering. */
static int doc_ID_map_lookup(struct doc_ID_map *doc_ID_map, int docID)
{
    if ((docID < 1) || (docID > doc_ID_map->max))
        return (0);

    return (doc_ID_map->map[docID]);
}
371
/* squat_search_list_docs callback: decide, via the caller's chooser,
   whether each old document is kept, and record the outcome in the
   docID map. Kept documents are copied into the new index. */
static int copy_docIDs(void *closure, SquatListDoc const *doc)
{
    SquatIndex *index = (SquatIndex *) closure;

    if ((index->select_doc)(index->select_doc_closure, doc) > 0) {
        doc_ID_map_add(&index->doc_ID_map, 1);
        return squat_index_copy_document(index, doc->doc_name, doc->size);
    }

    /* This docID no longer exists; leave a hole in the map. */
    doc_ID_map_add(&index->doc_ID_map, 0);
    return SQUAT_CALLBACK_CONTINUE;
}
388
389 /* Comes later */
390 static int add_word_to_trie(SquatIndex* index, char const* word_ptr,
391 int doc_ID);
392
add_word_callback(void * closure,char * name,int doc_ID)393 static int add_word_callback(void *closure, char *name, int doc_ID)
394 {
395 SquatIndex *index = (SquatIndex *) closure;
396 struct doc_ID_map *doc_ID_map = &index->doc_ID_map;
397
398 /* Find doc_ID in the new index which corresponds to this old doc_ID */
399 if ((doc_ID = doc_ID_map_lookup(doc_ID_map, doc_ID)) == 0)
400 return SQUAT_ERR;
401
402 add_word_to_trie(index, name + 1, doc_ID);
403
404 return SQUAT_CALLBACK_CONTINUE;
405 }
406
/* Attach an existing index for incremental rebuild. 'select_doc' is
   consulted for every old document; chosen ones are copied into the
   new index being built. */
int squat_index_add_existing(SquatIndex *index,
                             SquatSearchIndex *old_index,
                             SquatDocChooserCallback select_doc,
                             void *select_doc_closure)
{
    index->old_index = old_index;
    index->select_doc_closure = select_doc_closure;
    index->select_doc = select_doc;

    return squat_search_list_docs(old_index, copy_docIDs, index);
}
418
419 /* ====================================================================== */
420
421 /* Initially, before we see a document, there are no words for the document. */
word_table_new(void)422 static SquatWordTable *word_table_new(void)
423 {
424 SquatWordTable *ret =
425 (SquatWordTable *) xzmalloc(sizeof(SquatWordTable));
426
427 /* Initially there are no valid entries. Set things up so that
428 the obvious tests will set first_valid_entry and
429 last_valid_entry correctly. */
430 ret->first_valid_entry = 256;
431 ret->last_valid_entry = 0;
432 return ret;
433 }
434
/* Create a new index builder that writes the index to 'fd'.
   'options' may be NULL; otherwise it can supply a temporary-file
   directory, the set of bytes considered valid in words, and a
   progress-statistics callback. Returns NULL on failure. */
SquatIndex *squat_index_init(int fd, const SquatOptions *options)
{
    SquatIndex *index;
    unsigned i;
    char *buf;
    char const *tmp_path;

    squat_set_last_error(SQUAT_ERR_OK);

    index = (SquatIndex *) xzmalloc(sizeof(SquatIndex));

    /* Copy processed options into the SquatIndex */
    if (options != NULL
        && (options->option_mask & SQUAT_OPTION_TMP_PATH) != 0) {
        tmp_path = options->tmp_path;
    } else {
        tmp_path = "/tmp";
    }
    /* The XXXXXX suffix is filled in by mkstemp() each time a
       temporary file is created (init_write_buffer_to_temp). */
    index->tmp_path = strconcat(tmp_path, "/squatXXXXXX", (char *)NULL);

    if (options != NULL &&
        (options->option_mask & SQUAT_OPTION_VALID_CHARS) != 0) {
        int i;

        /* Build a 256-bit membership set from the NUL-terminated list
           of valid characters. */
        memset(index->valid_char_bits, 0, sizeof(index->valid_char_bits));
        for (i = 0; options->valid_chars[i] != 0; i++) {
            int ch = (unsigned char)options->valid_chars[i];

            index->valid_char_bits[ch >> 3] |= 1 << (ch & 7);
        }
    } else {
        /* No restriction supplied: every byte value is valid. */
        memset(index->valid_char_bits, 255,
               sizeof(index->valid_char_bits));
    }

    if (options != NULL &&
        (options->option_mask & SQUAT_OPTION_STATISTICS) != 0) {
        index->stats_callback = options->stats_callback;
        index->stats_callback_closure = options->stats_callback_closure;
    } else {
        index->stats_callback = NULL;
    }

    /* Finish initializing the SquatIndex */
    /* fd == -1 marks each of the 256 temporary-file buffers as
       "not opened yet". */
    for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
        index->index_buffers[i].fd = -1;
    }

    index->doc_ID_list_size = 1000;
    index->doc_ID_list =
        (char *)xmalloc(index->doc_ID_list_size * sizeof(SquatInt32));

    /* Use a 128K write buffer for the main index file */
    if (init_write_buffer(&index->out, 128 * 1024, fd) != SQUAT_OK) {
        goto cleanup_doc_ID_list;
    }

    /* Write out a dummy header. This will be replaced by the real header at the
       end of the process. */
    buf = prepare_buffered_write(&index->out, sizeof(SquatDiskHeader));
    if (buf == NULL) {
        goto cleanup_out_buffer;
    }
    memset(buf, 0, sizeof(SquatDiskHeader));
    complete_buffered_write(&index->out, buf + sizeof(SquatDiskHeader));

    index->current_doc_ID = 0;
    index->doc_word_table = word_table_new();

    memset(index->total_num_words, 0, sizeof(index->total_num_words));

    index->old_index = NULL;    /* Until we are given one */
    doc_ID_map_init(&index->doc_ID_map);

    return index;

    /* Error paths: unwind everything allocated so far, in reverse. */
  cleanup_out_buffer:
    buf_free(&index->out.buf);

  cleanup_doc_ID_list:
    free(index->doc_ID_list);

    /*cleanup_tmp_path:*/
    free(index->tmp_path);

    /*cleanup_index:*/
    free(index);
    return NULL;
}
524
525 /* Initialize a write buffer for a temporary file. We generate the
526 temporary file name here. The file is unlinked right away so if we
527 crash, the temporary file doesn't need to be cleaned up. */
init_write_buffer_to_temp(SquatIndex * index,SquatWriteBuffer * b)528 static int init_write_buffer_to_temp(SquatIndex *index,
529 SquatWriteBuffer *b)
530 {
531 char *tmp_path = xstrdup(index->tmp_path);
532 int fd = mkstemp(tmp_path);
533
534 if (fd < 0) {
535 free(tmp_path);
536 squat_set_last_error(SQUAT_ERR_SYSERR);
537 return SQUAT_ERR;
538 }
539
540 if (unlink(tmp_path) < 0) {
541 squat_set_last_error(SQUAT_ERR_SYSERR);
542 goto cleanup_fd;
543 }
544
545 if (init_write_buffer(b, 64 * 1024, fd) != SQUAT_OK) {
546 goto cleanup_fd;
547 }
548
549 free(tmp_path);
550 return SQUAT_OK;
551
552 cleanup_fd:
553 close(fd);
554 free(tmp_path);
555 return SQUAT_ERR;
556 }
557
/* Begin indexing a new source document named 'name': record the offset
   of its document record, write the name to the index file, and reset
   the per-document state. Returns SQUAT_OK or SQUAT_ERR. */
int squat_index_open_document(SquatIndex *index, char const *name)
{
    int name_len;
    char *buf;

    squat_set_last_error(SQUAT_ERR_OK);

    /* Grow the document ID array as necessary */
    if (index->current_doc_ID >= index->doc_ID_list_size) {
        index->doc_ID_list_size *= 2;
        index->doc_ID_list =
            (char *)xrealloc(index->doc_ID_list,
                             index->doc_ID_list_size * sizeof(SquatInt32));
    }

    /* Store the offset of the new document record into the array
       (offsets are relative to the end of the disk header). */
    squat_encode_32(index->doc_ID_list + index->current_doc_ID * 4,
                    index->out.total_output_bytes -
                    sizeof(SquatDiskHeader));

    /* Now write the new document name out to the file. Later we will
       write the document length right after this. Nobody writes to the
       file in the interim. */
    name_len = strlen(name) + 1;    /* include the NUL terminator */
    if ((buf = prepare_buffered_write(&index->out, name_len)) == NULL) {
        return SQUAT_ERR;
    }
    strcpy(buf, name);
    complete_buffered_write(&index->out, buf + name_len);

    /* Reset the per-document accumulators. */
    index->current_doc_len = 0;
    index->runover_len = 0;
    memset(index->doc_words, 0, sizeof(index->doc_words));

    return SQUAT_OK;
}
594
595 /* Destroy the SquatWordTable. The leaf data and the internal nodes are free'd. */
word_table_delete(SquatWordTable * t,int depth)596 static void word_table_delete(SquatWordTable *t, int depth)
597 {
598 if (depth > 2) {
599 unsigned i;
600
601 depth--;
602 for (i = 0; i < VECTOR_SIZE(t->entries); i++) {
603 SquatWordTableEntry *e = &(t->entries[i]);
604
605 if (e->table != NULL) {
606 word_table_delete(e->table, depth);
607 }
608 }
609 } else {
610 unsigned i;
611
612 /* this happens to work whether the leaf entries are leaf_presence
613 or leaf_docs. This is ugly but acceptable :-) */
614 for (i = 0; i < VECTOR_SIZE(t->entries); i++) {
615 SquatWordTableEntry *e = &(t->entries[i]);
616
617 if (e->leaf_presence != NULL
618 && ((unsigned long)e->leaf_presence & 1) == 0) {
619 free(e->leaf_presence);
620 }
621 }
622 }
623 free(t);
624 }
625
626 #define SQUAT_ADD_NEW_WORD (SQUAT_LAST_BUILTIN + 1)
627
628 /* Add an entry to the compressed presence set. We maintain
629 first_valid_entry and last_valid_entry.
630 This is faster than scanning to compute them later.
631 We return SQUAT_ADD_NEW_WORD if the bit wasn't already set. */
set_presence_bit(SquatWordTableLeafPresence * p,int ch)632 static int set_presence_bit(SquatWordTableLeafPresence *p, int ch)
633 {
634 int mask = 1 << (ch & 7);
635 char *ptr = p->presence + (ch >> 3);
636
637 if (ch < p->first_valid_entry) {
638 p->first_valid_entry = ch;
639 }
640 if (ch > p->last_valid_entry) {
641 p->last_valid_entry = ch;
642 }
643
644 if ((*ptr & mask) == 0) {
645 *ptr |= mask;
646 return SQUAT_ADD_NEW_WORD;
647 } else {
648 return SQUAT_OK;
649 }
650 }
651
/* Add a word to the SquatWordTable trie.
   If word_entry is NULL then we are in "per document" mode and just record
   the presence or absence of a word, not the actual document.
   We return SQUAT_ADD_NEW_WORD if this is the first occurrence of the
   word in the trie. */
static int add_to_table(SquatIndex *index, char const *data, int data_len,
                        WordDocEntry *word_entry)
{
    SquatWordTable *t = index->doc_word_table;
    int ch;
    SquatWordTableEntry *e;

    /* Descend one branch level per byte until only the last two bytes
       of the word remain; those are handled at the leaf level below. */
    while (data_len > 2) {
        /* Follow the branch node down to the next level of the trie. */
        ch = (unsigned char)data[0];
        /* Maintain the valid_entry variables so that we don't have to
           perform expensive scans of the 256-element arrays
           later. Surprisingly, this optimization really matters! */
        if (ch < t->first_valid_entry) {
            t->first_valid_entry = ch;
        }
        if (ch > t->last_valid_entry) {
            t->last_valid_entry = ch;
        }

        e = t->entries + ch;
        t = e->table;
        /* Allocate the next branch node if it doesn't already exist. */
        if (t == NULL)
            e->table = t = word_table_new();

        data++;
        data_len--;
    }

    /* Follow the branch node down to the leaf level: 'e' is selected by
       the second-to-last byte, 'ch' becomes the final byte. */
    ch = (unsigned char)data[0];
    if (ch < t->first_valid_entry) {
        t->first_valid_entry = ch;
    }
    if (ch > t->last_valid_entry) {
        t->last_valid_entry = ch;
    }
    e = t->entries + ch;

    ch = (unsigned char)data[1];

    if (word_entry == NULL) {
        /* We are in "per document" mode. */
        if (((unsigned long)e->leaf_presence & 1) != 0) {
            /* We currently have a singleton here. */
            int oldch = e->leaf_presence_singleton >> 1;

            /* If the singleton indicates the same word as the current word,
               then we don't have to do anything. */
            if (oldch != ch) {
                /* Otherwise we have to add the new word. This means we have
                   to convert the singleton to a bit vector. */
                SquatWordTableLeafPresence *p;

                /* Make an empty bit vector. */
                p = (SquatWordTableLeafPresence *)
                    xmalloc(sizeof(SquatWordTableLeafPresence));
                p->first_valid_entry = 256;
                p->last_valid_entry = 0;
                memset(p->presence, 0, sizeof(p->presence));
                e->leaf_presence = p;

                /* Update the bit vector */
                set_presence_bit(p, ch);
                return set_presence_bit(p, oldch);      /* will always be SQUAT_ADD_NEW_WORD */
            }
        } else if (e->leaf_presence == NULL) {
            /* There's nothing here. Let's make a singleton. */
            /* this next step might be necessary if sizeof(void*) >
               sizeof(int). We make sure that the low bit of the pointer in
               leaf_presence is definitely 1. */
            e->leaf_presence = (void *)1;
            e->leaf_presence_singleton = (ch << 1) | 1;
            return SQUAT_ADD_NEW_WORD;
        } else {
            /* We already have the bit vector, so let's just set another bit in it. */
            return set_presence_bit(e->leaf_presence, ch);
        }
    } else {
        /* We are in "all documents" mode. */
        SquatWordTableLeafDocs *docs = e->leaf_docs;
        WordDocEntry **entry_ptr;

        /* Make a new leaf table if we don't already have one. */
        if (docs == NULL) {
            docs = (SquatWordTableLeafDocs *)
                xmalloc(sizeof(SquatWordTableLeafDocs));
            docs->first_valid_entry = 256;
            docs->last_valid_entry = 0;
            memset(docs->docs, 0, sizeof(docs->docs));
            e->leaf_docs = docs;
        }

        entry_ptr = docs->docs + ch;

        if (*entry_ptr == NULL) {
            /* Adding a new word, so may need to update the valid_entry markers */
            if (ch < docs->first_valid_entry) {
                docs->first_valid_entry = ch;
            }
            if (ch > docs->last_valid_entry) {
                docs->last_valid_entry = ch;
            }
            /* Create the linked list with the single element 'word_entry'. */
            word_entry->next = word_entry;      /* make it circular */
            *entry_ptr = word_entry;
            return SQUAT_ADD_NEW_WORD;
        } else {
            /* Just add the document to the linked list. word_entry will be
               the new last element since the document IDs are strictly
               increasing as we build the trie from its temporary file. */
            word_entry->next = (*entry_ptr)->next;      /* (*entry_ptr)->next is
                                                           (still) the first
                                                           element of the list */
            (*entry_ptr)->next = word_entry;    /* the old last element's
                                                   next now points to the
                                                   new last element. */
            *entry_ptr = word_entry;    /* save the new last element */
        }
    }

    return SQUAT_OK;
}
781
782 /* Add 'doc_ID' to the list of document IDs for word 'word_ptr'
783 in the "all documents" trie. */
add_word_to_trie(SquatIndex * index,char const * word_ptr,int doc_ID)784 static int add_word_to_trie(SquatIndex * index, char const *word_ptr,
785 int doc_ID)
786 {
787 WordDocEntry *word_entry = index->word_doc_allocator++;
788
789 word_entry->doc_ID = doc_ID;
790 add_to_table(index, word_ptr, SQUAT_WORD_SIZE - 1, word_entry);
791
792 return SQUAT_OK;
793 }
794
795 /* Add the word 'data' to the "per document" trie for the current document. */
add_word_to_table(SquatIndex * index,char const * data)796 static int add_word_to_table(SquatIndex *index, char const *data)
797 {
798 int r;
799 int i;
800
801 /* Just ignore the word if it uses an invalid character. */
802 for (i = 0; i < SQUAT_WORD_SIZE; i++) {
803 int ch = (unsigned char)data[i];
804
805 if ((index->valid_char_bits[ch >> 3] & (1 << (ch & 7))) == 0) {
806 /* this word contains an invalid character and need not be indexed,
807 since search strings will never contain such a character. */
808 return SQUAT_OK;
809 }
810 }
811
812 r = add_to_table(index, data, SQUAT_WORD_SIZE, NULL);
813 if (r == SQUAT_ADD_NEW_WORD) {
814 /* Remember how many unique words in this document started with
815 the given first character. */
816 index->doc_words[(unsigned char)data[0]]++;
817 return SQUAT_OK;
818 } else {
819 return r;
820 }
821 }
822
/* Feed 'data_len' bytes of the current source document into the index.
   Every SQUAT_WORD_SIZE-byte window of the document (including windows
   straddling successive calls, handled via the runover buffer) is added
   to the per-document trie. Returns SQUAT_OK or SQUAT_ERR. */
int squat_index_append_document(SquatIndex *index, char const *data,
                                int data_len)
{
    int i;
    char buf[SQUAT_WORD_SIZE];
    int new_runover;
    int new_runover_data;

    assert(data_len >= 0);

    squat_set_last_error(SQUAT_ERR_OK);

    if (data_len == 0) {
        return SQUAT_OK;
    }

    /* Scan runover: words that start in the previous call's tail and
       finish in this call's data. */
    for (i = 0; i < index->runover_len; i++) {
        /* Check if we can make a whole word starting with runover bytes
           from offset i within the runover buffer and with the remaining
           bytes taken from the new text */
        if (index->runover_len - i + data_len >= SQUAT_WORD_SIZE) {
            /* Yep. Build the complete word into 'buf' and then add it. */
            memcpy(buf, index->runover_buf + i, index->runover_len - i);
            memcpy(buf + index->runover_len - i, data,
                   SQUAT_WORD_SIZE - (index->runover_len - i));
            if (add_word_to_table(index, buf) != SQUAT_OK) {
                return SQUAT_ERR;
            }
        }
    }

    /* Scan main text: every full word lying entirely inside 'data'. */
    for (i = 0; i <= data_len - SQUAT_WORD_SIZE; i++) {
        if (add_word_to_table(index, data + i) != SQUAT_OK) {
            return SQUAT_ERR;
        }
    }

    /* Fill runover. We have to be careful to handle all the cases,
       particularly we just saw less than SQUAT_WORD_SIZE bytes and we
       need to copy some data from the old runover buffer into the new
       runover buffer. */
    new_runover = index->runover_len + data_len;
    if (new_runover > SQUAT_WORD_SIZE) {
        new_runover = SQUAT_WORD_SIZE;
    }
    /* new_runover_data = how many of the runover bytes come from 'data';
       the remainder (if any) are retained old runover bytes. */
    new_runover_data = data_len;
    if (new_runover_data > new_runover) {
        new_runover_data = new_runover;
    }
    /* Copy data from the old runover buffer into its new position in
       the new runover buffer (memmove: source and destination overlap). */
    memmove(index->runover_buf,
            index->runover_buf + index->runover_len -
            (new_runover - new_runover_data),
            new_runover - new_runover_data);
    /* Copy data from the new text into the new runover buffer */
    memcpy(index->runover_buf + new_runover - new_runover_data,
           data + data_len - new_runover_data, new_runover_data);
    index->runover_len = new_runover;

    /* Tracking how much data we've seen for this document in total */
    index->current_doc_len += data_len;

    return SQUAT_OK;
}
890
891 /* Write the word to the given temporary file. Since each temporary
892 file is dedicated to a given initial byte, the word passed to us
893 has the initial byte removed. */
output_word(SquatWriteBuffer * b,char const * word)894 static int output_word(SquatWriteBuffer *b, char const *word)
895 {
896 char *buf = prepare_buffered_write(b, SQUAT_WORD_SIZE - 1);
897
898 if (buf == NULL) {
899 return SQUAT_ERR;
900 }
901 memcpy(buf, word, SQUAT_WORD_SIZE - 1);
902 complete_buffered_write(b, buf + SQUAT_WORD_SIZE - 1);
903
904 return SQUAT_OK;
905 }
906
/* Write the word data from the trie 't' into the temporary file
   accessed through 'b'. Words to write are assembled starting at
   'word'; we assume that 'len' bytes have already been assembled
   leading up to 'word'. This function clears the word data after
   writing it out. This makes it ready to handle the next document
   without reallocating everything. */
static int write_words(SquatIndex *index, SquatWriteBuffer *b,
                       SquatWordTable *t, int len, char *word)
{
    if (len == 2) {
        /* Handle a branch node that refers to leaves. */
        int i;

        for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
            SquatWordTableEntry *e = t->entries + i;

            word[0] = (char)i;

            if (((unsigned long)e->leaf_presence & 1) != 0) {
                /* Got a singleton at this branch point. Just output the single word. */
                word[1] = (char)(e->leaf_presence_singleton >> 1);
                e->leaf_presence = NULL;        /* clear the leaf out */
                /* output_word wants the start of the whole assembled
                   word, SQUAT_WORD_SIZE - 3 bytes before 'word'. */
                if (output_word(b, word - (SQUAT_WORD_SIZE - 3)) !=
                    SQUAT_OK) {
                    return SQUAT_ERR;
                }
            } else if (e->leaf_presence != NULL) {
                /* Got a bit vector array which we have to scan. */
                /* The following code is performance critical. It can dominate
                   the performance of the entire indexer. That's why we need
                   the valid_entry fields! */
                SquatWordTableLeafPresence *p = e->leaf_presence;
                int i;      /* shadows the outer 'i' for the byte scan */
                int last_byte = p->last_valid_entry >> 3;

                for (i = p->first_valid_entry >> 3; i <= last_byte; i++) {
                    if ((unsigned)i >= VECTOR_SIZE(p->presence)) {
                        /* defensive bounds check on the bitmap index */
                        return SQUAT_ERR;
                    } else {
                        int bits = (unsigned char)p->presence[i];
                        int j;

                        for (j = 0; bits > 0; j++, bits >>= 1) {
                            if ((bits & 1) != 0) {
                                /* Output a word for each bit that is set */
                                word[1] = (char)(i * 8 + j);
                                if (output_word
                                    (b,
                                     word - (SQUAT_WORD_SIZE - 3)) !=
                                    SQUAT_OK) {
                                    return SQUAT_ERR;
                                }
                            }
                        }
                    }
                }
                free(p);
                e->leaf_presence = NULL;
            }
        }
    } else {
        /* Handle an interior branch node. A simple matter of recursion. */
        int i;

        for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
            SquatWordTable *new_t = t->entries[i].table;

            if (new_t != NULL) {
                word[0] = (char)i;
                if (write_words(index, b, new_t, len - 1, word + 1)
                    != SQUAT_OK) {
                    return SQUAT_ERR;
                }
            }
        }
    }

    /* This effectively clears the array because we trust these entries. */
    t->first_valid_entry = 256;
    t->last_valid_entry = 0;

    return SQUAT_OK;
}
990
/* Finish the currently-open document: record its total length in the
   index file, report completion to the stats callback, and dump the
   per-document word trie out to the per-initial-byte temporary files.
   Returns SQUAT_OK on success, SQUAT_ERR on any write failure. */
int squat_index_close_document(SquatIndex *index)
{
    char *buf;
    unsigned i;

    squat_set_last_error(SQUAT_ERR_OK);

    /* Write out the length of the current document to the index file,
       just after the document's name. */
    if ((buf = prepare_buffered_write(&index->out, 10)) == NULL) {
        return SQUAT_ERR;
    }
    buf = squat_encode_I(buf, index->current_doc_len);
    complete_buffered_write(&index->out, buf);

    /* Notify the client's stats callback, if one was registered. */
    if (index->stats_callback != NULL) {
        SquatStatsEvent event;

        event.generic.type = SQUAT_STATS_COMPLETED_DOC;
        event.completed_doc.num_unique_words = index->doc_words;
        index->stats_callback(index->stats_callback_closure, &event);
    }

    /* For each byte that started a word in the source document, we need
       to dump all the words that occurred starting with that byte to
       the corresponding temporary file. */
    for (i = 0; i < VECTOR_SIZE(index->doc_words); i++) {
        if (index->doc_words[i] > 0) {
            char *write_ptr;
            char word_buf[SQUAT_WORD_SIZE - 1];
            int cur_offset;

            if (index->index_buffers[i].fd < 0) {
                /* This is the first document that used a word starting with this byte.
                   We need to create the temporary file. */
                if (init_write_buffer_to_temp
                    (index, index->index_buffers + i)
                    != SQUAT_OK) {
                    return SQUAT_ERR;
                }
            }

            index->total_num_words[i] += index->doc_words[i];

            /* Write out the document ID and the number of words in this
               document that start with the initial byte. Then we write out
               the list of words themselves, SQUAT_WORD_SIZE-1 bytes
               each. Very simple format for the temporary files. We could
               compress them more but why bother? */
            write_ptr =
                prepare_buffered_write(index->index_buffers + i, 20);
            if (write_ptr == NULL) {
                return SQUAT_ERR;
            }
            write_ptr = squat_encode_I(write_ptr, index->current_doc_ID);
            write_ptr = squat_encode_I(write_ptr, index->doc_words[i]);
            complete_buffered_write(index->index_buffers + i, write_ptr);

            /* Remember where the word list starts so its size can be
               verified after write_words. */
            cur_offset = index->index_buffers[i].total_output_bytes;
            if (write_words(index, index->index_buffers + i,
                            index->doc_word_table->entries[i].table,
                            SQUAT_WORD_SIZE - 1, word_buf)
                != SQUAT_OK) {
                return SQUAT_ERR;
            }
            /* Make sure that we actually output the exact number of words
               we thought we added to the trie. It's really easy to break
               this invariant with bugs in the above code! */
            assert(index->index_buffers[i].total_output_bytes - cur_offset
                   == (SQUAT_WORD_SIZE - 1) * index->doc_words[i]);
        }
    }

    /* -1 flags "no document open"; index_close_internal checks this. */
    index->current_doc_len = -1;

    index->current_doc_ID++;

    return SQUAT_OK;
}
1070
/* Dump out a branch node of an "all documents" trie to the index
   file. It's dumped as a presence table (telling us which branches
   are non-NULL) followed by a list of relative file offsets in
   I-format pointing to the subtries for the non-NULL branches.
   'offset_buf' holds 256 relative offsets; 0 means the branch is
   absent. Returns SQUAT_OK on success, SQUAT_ERR on write failure. */
static int dump_word_table_offsets(SquatIndex *index, SquatWordTable *t,
                                   int *offset_buf)
{
    int start_present = t->first_valid_entry;
    int end_present = t->last_valid_entry;
    char *buf;
    int present_count; /* We store here the actual number of present branches */

    if (start_present > end_present) {
        /* There are no non-empty branches so just write an empty presence table */
        if ((buf = prepare_buffered_write(&index->out, 2)) == NULL) {
            return SQUAT_ERR;
        } else {
            buf[0] = buf[1] = 0;
            complete_buffered_write(&index->out, buf + 2);
            return SQUAT_OK;
        }
    }

    /* If there is just one valid entry but its index is < 32, then we
       can't use the one-byte representation for a singleton presence
       because it would be mistaken for the first byte of a (count,
       start) presence vector header. A singleton whose index is >= 32
       can be written out without ambiguity. */
    if (end_present == start_present && end_present >= 32) {
        if ((buf = prepare_buffered_write(&index->out, 1)) == NULL) {
            return SQUAT_ERR;
        } else {
            *buf++ = (char)end_present;
            present_count = 1;
        }
    } else {
        /* We're going to use the presence bit vector format. */
        int first_byte = start_present >> 3;
        int byte_count = (end_present >> 3) - first_byte + 1;

        if ((buf =
             prepare_buffered_write(&index->out,
                                    2 + byte_count)) == NULL) {
            return SQUAT_ERR;
        } else {
            int i;

            *buf++ = (char)first_byte;
            *buf++ = (char)byte_count - 1; /* subtract 1 to avoid ambiguity
                                              over the value '32' (we
                                              wouldn't use 0 anyway) */
            /* Clear the vector */
            memset(buf, 0, byte_count);
            present_count = 0;
            for (i = start_present; i <= end_present; i++) {
                /* Only branches with a real (non-zero) offset are present. */
                if (offset_buf[i] > 0) {
                    present_count++;
                    /* Set the bit in the vector. */
                    buf[(i >> 3) - first_byte] |= 1 << (i & 7);
                }
            }
            buf += byte_count;
        }
    }
    complete_buffered_write(&index->out, buf);

    /* Now we write out the actual offset table in I-format.
       Reserve 10 bytes per entry, ample for one I-format value. */
    if ((buf =
         prepare_buffered_write(&index->out,
                                10 * present_count)) == NULL) {
        return SQUAT_ERR;
    } else {
        int i;

        for (i = start_present; i <= end_present; i++) {
            int off = offset_buf[i];

            if (off > 0) {
                buf = squat_encode_I(buf, off);
            }
        }
    }
    complete_buffered_write(&index->out, buf);

    return SQUAT_OK;
}
1157
1158 /* Write out the presence table for an "all documents" trie leaf. */
dump_doc_list_present_bits(SquatIndex * index,SquatWordTableLeafDocs * docs)1159 static int dump_doc_list_present_bits(SquatIndex *index,
1160 SquatWordTableLeafDocs *docs)
1161 {
1162 int start_present = docs->first_valid_entry;
1163 int end_present = docs->last_valid_entry;
1164 char *buf;
1165 int present_count;
1166
1167 /* If the leaf is empty, we should never get here! */
1168 assert(start_present <= end_present);
1169
1170 /* if it's a singleton < 32, then we can't use the one-byte
1171 representation because it would be mistaken for a starting byte */
1172 if (end_present == start_present && end_present >= 32) {
1173 if ((buf = prepare_buffered_write(&index->out, 1)) == NULL) {
1174 return SQUAT_ERR;
1175 } else {
1176 *buf++ = (char)end_present;
1177 present_count = 1;
1178 }
1179 } else {
1180 int first_byte = start_present >> 3;
1181 int byte_count = (end_present >> 3) - first_byte + 1;
1182
1183 if ((buf =
1184 prepare_buffered_write(&index->out,
1185 2 + byte_count)) == NULL) {
1186 return SQUAT_ERR;
1187 } else {
1188 int i;
1189
1190 *buf++ = (char)first_byte;
1191 *buf++ = (char)byte_count - 1;
1192 memset(buf, 0, byte_count);
1193 present_count = 0;
1194 for (i = start_present; i <= end_present; i++) {
1195 if (docs->docs[i] != NULL) {
1196 present_count++;
1197 buf[(i >> 3) - first_byte] |= 1 << (i & 7);
1198 }
1199 }
1200 buf += byte_count;
1201 }
1202 }
1203 complete_buffered_write(&index->out, buf);
1204
1205 return SQUAT_OK;
1206 }
1207
/* Write out the document lists for an "all documents" trie leaf.
   Each word's doclist is a circularly linked list of WordDocEntry
   (docs->docs[i]->next is the first/lowest-ID entry). Document IDs
   are delta-encoded in I-format: a run of consecutive IDs becomes
   the pair (run_length << 1, delta-to-run-start); an isolated ID
   becomes the single value (delta << 1) | 1. A word present in only
   one document is stored as the singleton (doc_ID << 1) | 1 instead
   of a list. The encoding is done in two passes: first count the
   bytes (run_size), then emit. Returns SQUAT_OK or SQUAT_ERR. */
static int dump_doc_list_docs(SquatIndex *index,
                              SquatWordTableLeafDocs *docs)
{
    int i;
    WordDocEntry **doc_list = docs->docs;

    for (i = docs->first_valid_entry; i <= docs->last_valid_entry; i++) {
        if (doc_list[i] != NULL) {
            WordDocEntry *first_doc;
            WordDocEntry *doc;
            int run_size = 0; /* Bytes required to store the doclist for this word */
            int last_doc_ID;
            int run_seq_delta = 0;
            int run_seq_count;
            int doc_count = 0; /* number of documents containing this word */
            char *buf;

            /* doc_list[i] points at the tail; its next is the head. */
            doc = first_doc = doc_list[i]->next;

            last_doc_ID = 0;
            run_seq_count = 0;
            /* First compute the run_size bytes required to store the doclist */
            do {
                if (doc->doc_ID == last_doc_ID + 1 && run_seq_count > 0) {
                    /* Extends the current run of consecutive IDs. */
                    run_seq_count++;
                } else {
                    /* Run broken: account for the run just ended. */
                    if (run_seq_count > 0) {
                        if (run_seq_count > 1) {
                            run_size +=
                                squat_count_encode_I(run_seq_count << 1)
                                + squat_count_encode_I(run_seq_delta);
                        } else {
                            run_size +=
                                squat_count_encode_I((run_seq_delta << 1) |
                                                     1);
                        }
                    }
                    run_seq_count = 1;
                    run_seq_delta = doc->doc_ID - last_doc_ID;
                }
                last_doc_ID = doc->doc_ID;
                doc = doc->next;
                doc_count++;
            } while (doc != first_doc);
            /* Flush the final pending run. */
            if (run_seq_count > 0) {
                if (run_seq_count > 1) {
                    run_size += squat_count_encode_I(run_seq_count << 1)
                        + squat_count_encode_I(run_seq_delta);
                } else {
                    run_size +=
                        squat_count_encode_I((run_seq_delta << 1) | 1);
                }
            }

            /* reserve more than enough space in the buffer */
            if ((buf = prepare_buffered_write(&index->out, 10 + run_size)) == NULL) {
                return SQUAT_ERR;
            }

            /* If there's only one document, use singleton document format */
            if (doc_count == 1) {
                buf = squat_encode_I(buf, (doc->doc_ID << 1) | 1);
            } else {
                /* Store the entire document list, with its size first. */
                buf = squat_encode_I(buf, run_size << 1);

                last_doc_ID = 0;
                run_seq_count = 0;
                /* This logic should mirror the logic above that counts the bytes. */
                do {
                    if (doc->doc_ID == last_doc_ID + 1
                        && run_seq_count > 0) {
                        run_seq_count++;
                    } else {
                        if (run_seq_count > 0) {
                            if (run_seq_count > 1) {
                                buf =
                                    squat_encode_I(buf,
                                                   run_seq_count << 1);
                                buf = squat_encode_I(buf, run_seq_delta);
                            } else {
                                buf =
                                    squat_encode_I(buf,
                                                   (run_seq_delta << 1) |
                                                   1);
                            }
                        }
                        run_seq_count = 1;
                        run_seq_delta = doc->doc_ID - last_doc_ID;
                    }
                    last_doc_ID = doc->doc_ID;
                    doc = doc->next;
                } while (doc != first_doc);
                /* Flush the final pending run, as above. */
                if (run_seq_count > 0) {
                    if (run_seq_count > 1) {
                        buf = squat_encode_I(buf, run_seq_count << 1);
                        buf = squat_encode_I(buf, run_seq_delta);
                    } else {
                        buf =
                            squat_encode_I(buf, (run_seq_delta << 1) | 1);
                    }
                }
            }

            complete_buffered_write(&index->out, buf);
        }
    }

    return SQUAT_OK;
}
1319
/* Write an "all documents" subtrie to the index file.
   'result_offset' is an absolute offset within the file where this
   subtrie was stored. We free the trie leaves as we go.
   Subtries/leaves are written depth-first before this node's own
   branch table, so their offsets are known when the table is dumped.
   Returns SQUAT_OK on success, SQUAT_ERR on any failure. */
static int write_trie_word_data(SquatIndex *index, SquatWordTable *t,
                                int len, int *result_offset)
{
    int i;
    int offsets[256]; /* Collect the offsets of the subtries in this array. */
    int off;
    SquatWordTableEntry *entries = t->entries;
    int r;

    /* Zero the entries before first_valid_entry; the loop below fills
       [first_valid_entry, last_valid_entry] and the tail is zeroed after. */
    memset(offsets, 0, t->first_valid_entry * sizeof(int));
    if (len > 2) {
        /* interior branch */
        for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
            SquatWordTable *new_t = entries[i].table;

            if (new_t != NULL) {
                if (write_trie_word_data
                    (index, new_t, len - 1, offsets + i)
                    != SQUAT_OK) {
                    return SQUAT_ERR;
                }
                free(entries[i].table);
                entries[i].table = NULL;
            } else {
                offsets[i] = 0;
            }
        }
    } else {
        /* Leaf case */
        for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
            SquatWordTableLeafDocs *leaf_docs = entries[i].leaf_docs;

            if (leaf_docs != NULL) {
                /* Record where this leaf's data starts in the file. */
                offsets[i] = index->out.total_output_bytes;

                if (dump_doc_list_present_bits(index, leaf_docs) !=
                    SQUAT_OK
                    || dump_doc_list_docs(index, leaf_docs) != SQUAT_OK) {
                    return SQUAT_ERR;
                }
                free(entries[i].leaf_docs);
                entries[i].leaf_docs = NULL;
            } else {
                offsets[i] = 0;
            }
        }
    }
    /* Zero the tail (i is last_valid_entry + 1 after the loop). */
    memset(offsets + i, 0, (256 - i) * sizeof(int));

    /* Now we've written out our subtries, we know where our branch
       table is going to be. */
    *result_offset = off = index->out.total_output_bytes;

    /* Relativize the offsets. This is just to reduce the probable
       magnitude of the numbers so they will pack better into I-format. */
    for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
        if (offsets[i] != 0) {
            offsets[i] = off - offsets[i];
        }
    }

    r = dump_word_table_offsets(index, t, offsets);

    /* Mark this subtrie as empty. */
    t->first_valid_entry = 256;
    t->last_valid_entry = 0;

    return r;
}
1392
/* Dump out a complete trie for the given initial byte from its temporary file.
   The absolute offset of the trie's root table within the file is
   returned in 'result_offset'.
   When an old index exists (incremental update), its words for this
   initial byte are merged in first so that they get the lowest doc
   IDs, then the temporary file is mmap'd and replayed: per document,
   an I-format (doc_ID, word count) header followed by fixed-size
   words of SQUAT_WORD_SIZE-1 bytes each. */
static int dump_index_trie_words(SquatIndex *index, int first_char,
                                 int *result_offset)
{
    SquatSearchIndex *old_index = index->old_index;
    SquatWriteBuffer *buf = index->index_buffers + first_char;
    int num_words = index->total_num_words[first_char];
    WordDocEntry *doc_table;
    char const *word_list_ptr;
    int r = SQUAT_OK;
    char const *word_ptr;
    int existing = 0;

    if (old_index &&
        squat_count_docs(old_index, first_char, &existing) != SQUAT_OK) {
        return (SQUAT_ERR);
    }

    /* Allocate all the necessary document-ID linked list entries at once. */
    doc_table =
        (WordDocEntry *) xmalloc(sizeof(WordDocEntry) *
                                 (num_words + existing));
    index->word_doc_allocator = doc_table;

    /* Send existing trie across first as those leafs have lowest doc IDs */
    if (old_index) {
        r = squat_scan(old_index, first_char, add_word_callback, index);
        if (r != SQUAT_OK) {
            r = SQUAT_ERR;
            goto cleanup;
        }
    }

    /* mmap the temporary file. */
    word_list_ptr =
        mmap(NULL, buf->total_output_bytes, PROT_READ, MAP_SHARED, buf->fd,
             0);
    if (word_list_ptr == MAP_FAILED) {
        squat_set_last_error(SQUAT_ERR_SYSERR);
        r = SQUAT_ERR;
        goto cleanup;
    }
    word_ptr = word_list_ptr;

    /* Scan through the file */
    while (num_words > 0) {
        /* For each document, add all its words to the trie with this document ID */
        int doc_ID = (int)squat_decode_I(&word_ptr);
        int doc_words = (int)squat_decode_I(&word_ptr);

        num_words -= doc_words;

        while (doc_words > 0) {
            if (add_word_to_trie(index, word_ptr, doc_ID)
                != SQUAT_OK) {
                r = SQUAT_ERR;
                goto cleanup_map;
            }
            word_ptr += SQUAT_WORD_SIZE - 1;
            doc_words--;
        }
    }

    /* Make sure we read all the bytes from the temporary file. */
    assert(word_ptr - word_list_ptr == buf->total_output_bytes);

    /* Now dump the trie to the index file. */
    r = write_trie_word_data(index, index->doc_word_table,
                             SQUAT_WORD_SIZE - 1, result_offset);

  cleanup_map:
    /* Unmap even on success; report the munmap failure only if no
       earlier error already set r. */
    if (munmap((void *)word_list_ptr, buf->total_output_bytes) != 0
        && r == SQUAT_OK) {
        squat_set_last_error(SQUAT_ERR_SYSERR);
        r = SQUAT_ERR;
    }

  cleanup:
    free(doc_table);

    return r;
}
1477
/* Copy an existing (old-index) trie for 'first_char' into the new
   index file when no temporary file was written for that initial
   byte, i.e. no new document used a word starting with it. Only
   meaningful during incremental updates. */
static int dump_index_trie_words_no_file(SquatIndex *index,
                                         int first_char,
                                         int *result_offset)
{
    SquatSearchIndex *old_index = index->old_index;
    WordDocEntry *doc_table;
    int existing = 0;
    int r;

    if (old_index == NULL) {
        /* Should never happen? */
        return (SQUAT_OK);
    }

    if (squat_count_docs(old_index, first_char, &existing) != SQUAT_OK) {
        return (SQUAT_ERR);
    }
    if (existing == 0) {
        return (SQUAT_OK);
    }

    /* Allocate all the necessary document-ID linked list entries at once. */
    doc_table = (WordDocEntry *) xmalloc(sizeof(WordDocEntry) * existing);
    index->word_doc_allocator = doc_table;

    /* Send existing trie across first as those leafs have lowest doc IDs */
    if (squat_scan(old_index, first_char, add_word_callback, index)
        != SQUAT_OK) {
        r = SQUAT_ERR;
    } else if (index->word_doc_allocator > doc_table) {
        /* The scan consumed entries, so there is a trie to dump. */
        r = write_trie_word_data(index, index->doc_word_table,
                                 SQUAT_WORD_SIZE - 1, result_offset);
    } else {
        r = SQUAT_OK;
    }

    free(doc_table);
    return r;
}
1516
/* This does the grunt work of completing the index. If OK is false we
   just take the cleanup path ... this is used by squat_index_destroy.
   On the success path this writes, in order: the doc-ID map, one
   "all documents" trie per initial byte, the root offset table, a
   trailing safety zone of zeroes, and finally the header at the
   start of the file. All index resources are freed either way. */
static int index_close_internal(SquatIndex *index, int OK)
{
    int r = SQUAT_OK;
    int doc_list_offset;
    int doc_ID_list_offset;
    int word_list_offset;
    char *buf;
    unsigned i;
    SquatDiskHeader *header;
    int offset_buf[256];

    squat_set_last_error(SQUAT_ERR_OK);

    if (!OK) {
        goto cleanup;
    }

    /* Close any open document ... this would really be a client bug. */
    if (index->current_doc_len >= 0) {
        squat_index_close_document(index);
    }

    /* Clear the current trie. We are now going to use it to build
       all-documents tries. */
    /* NOTE(review): depth SQUAT_WORD_SIZE here vs SQUAT_WORD_SIZE - 1
       in the cleanup path below — confirm both are intended. */
    word_table_delete(index->doc_word_table, SQUAT_WORD_SIZE);
    index->doc_word_table = word_table_new();

    /* Write out the array that maps document IDs to offsets of the
       document records. */
    doc_list_offset = sizeof(SquatDiskHeader);
    doc_ID_list_offset = index->out.total_output_bytes + 1;
    /* One leading zero byte, 4 bytes per document ID, 4 trailing zero
       bytes, plus the safety zone. */
    if ((buf = prepare_buffered_write(&index->out,
                                      SQUAT_SAFETY_ZONE +
                                      ((index->current_doc_ID +
                                        1) * 4))) == NULL) {
        r = SQUAT_ERR;
        goto cleanup;
    }
    *buf++ = 0;
    memcpy(buf, index->doc_ID_list, index->current_doc_ID * 4);
    buf += index->current_doc_ID * 4;
    memset(buf, 0, 4);
    complete_buffered_write(&index->out, buf + 4);

    /* Now write out the trie for every initial byte that we saw. The
       offsets are collected in 'offset_buf'. */
    memset(offset_buf, 0, sizeof(offset_buf));
    for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
        if (index->stats_callback != NULL) {
            SquatStatsEvent event;

            event.generic.type = SQUAT_STATS_COMPLETED_INITIAL_CHAR;
            event.completed_initial_char.completed_char = i;
            event.completed_initial_char.num_words =
                index->total_num_words[i];
            if (index->index_buffers[i].fd >= 0) {
                event.completed_initial_char.temp_file_size =
                    index->index_buffers[i].total_output_bytes;
            } else {
                event.completed_initial_char.temp_file_size = 0;
            }
            index->stats_callback(index->stats_callback_closure, &event);
        }

        if (index->index_buffers[i].fd >= 0) {
            /* We have to flush the temporary file output buffer before we try to use
               the temporary file. */
            if (flush_and_reset_buffered_writes(index->index_buffers + i)
                != SQUAT_OK
                || dump_index_trie_words(index, i,
                                         offset_buf + i) != SQUAT_OK) {
                r = SQUAT_ERR;
                goto cleanup;
            }
            /* Close files and free memory as we go. This could be important
               if disk space is low and we're generating a huge index. */
            if (close(index->index_buffers[i].fd) < 0) {
                squat_set_last_error(SQUAT_ERR_SYSERR);
                r = SQUAT_ERR;
            }
            index->index_buffers[i].fd = -1;
            buf_free(&index->index_buffers[i].buf);
        } else if (index->old_index) {
            /* Only needed if incremental updates going on */
            /* Just copy across existing trie if nothing new to merge in */
            if (dump_index_trie_words_no_file(index, i, offset_buf + i) !=
                SQUAT_OK) {
                r = SQUAT_ERR;
                goto cleanup;
            }
        }
    }

    /* Save the offset where the root of the index trie is going to go. */
    word_list_offset = index->out.total_output_bytes;

    /* Relativize the subtrie offsets. */
    for (i = 0; i < VECTOR_SIZE(offset_buf); i++) {
        if (offset_buf[i] != 0) {
            offset_buf[i] = word_list_offset - offset_buf[i];

            /* Track the valid range so dump_word_table_offsets knows
               which entries of the root table to scan. */
            if ((int)i < index->doc_word_table->first_valid_entry) {
                index->doc_word_table->first_valid_entry = i;
            }
            index->doc_word_table->last_valid_entry = i;
        }
    }

    /* Dump out the offset buffer at last. */
    if (dump_word_table_offsets(index, index->doc_word_table, offset_buf)
        != SQUAT_OK) {
        r = SQUAT_ERR;
        goto cleanup;
    }

    /* finally, write trailing zeroes and the header ... now that we know
       we initialized the file with no errors */
    if ((buf =
         prepare_buffered_write(&index->out, SQUAT_SAFETY_ZONE)) == NULL) {
        r = SQUAT_ERR;
        goto cleanup;
    }
    memset(buf, 0, SQUAT_SAFETY_ZONE);
    complete_buffered_write(&index->out, buf + SQUAT_SAFETY_ZONE);

    /* Flush writes before we seek back to the start to write the header */
    if (flush_and_reset_buffered_writes(&index->out) != SQUAT_OK) {
        r = SQUAT_ERR;
        goto cleanup;
    }

    /* Blat out the header */
    if ((header = (SquatDiskHeader *) prepare_buffered_write(&index->out,
                                                             sizeof
                                                             (SquatDiskHeader)))
        == NULL) {
        r = SQUAT_ERR;
        goto cleanup;
    }
    memcpy(header->header_text, squat_index_file_header, 8);
    squat_encode_64(header->doc_list_offset, doc_list_offset);
    squat_encode_64(header->doc_ID_list_offset, doc_ID_list_offset);
    squat_encode_64(header->word_list_offset, word_list_offset);
    memcpy(header->valid_char_bits, index->valid_char_bits,
           sizeof(header->valid_char_bits));
    complete_buffered_write(&index->out, (char *)(header + 1));

    /* Flush out the header */
    if (flush_and_reset_buffered_writes(&index->out) != SQUAT_OK) {
        r = SQUAT_ERR;
        goto cleanup;
    }

    /* WOOHOO! It's done! */

  cleanup:
    buf_free(&index->out.buf);
    word_table_delete(index->doc_word_table, SQUAT_WORD_SIZE - 1);
    /* If we're bailing out because of an error, we might not have
       released all the temporary file resources. */
    for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
        if (index->index_buffers[i].fd >= 0)
            close(index->index_buffers[i].fd);
        buf_free(&index->index_buffers[i].buf);
    }
    free(index->tmp_path);
    free(index->doc_ID_list);
    doc_ID_map_free(&index->doc_ID_map);
    free(index);

    return r;
}
1691
/* Complete the index build, write out all on-disk structures, and
   free 'index'. Returns SQUAT_OK on success. */
int squat_index_finish(SquatIndex *index)
{
    return index_close_internal(index, 1);
}
1696
/* Abandon the index build: release all resources and free 'index'
   without writing the final on-disk structures. */
int squat_index_destroy(SquatIndex *index)
{
    return index_close_internal(index, 0);
}
1701