1 /*
2  * Copyright (c) 1994-2008 Carnegie Mellon University.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. The name "Carnegie Mellon University" must not be used to
17  *    endorse or promote products derived from this software without
18  *    prior written permission. For permission or any legal
19  *    details, please contact
20  *      Carnegie Mellon University
21  *      Center for Technology Transfer and Enterprise Creation
22  *      4615 Forbes Avenue
23  *      Suite 302
24  *      Pittsburgh, PA  15213
25  *      (412) 268-7393, fax: (412) 268-7395
26  *      innovation@andrew.cmu.edu
27  *
28  * 4. Redistributions of any form whatsoever must retain the following
29  *    acknowledgment:
30  *    "This product includes software developed by Computing Services
31  *     at Carnegie Mellon University (http://www.cmu.edu/computing/)."
32  *
33  * CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
34  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
35  * AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
36  * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
37  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
38  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
39  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
40  */
41 
42 /*
43   SQUAT code for building indexes.
44   Robert O'Callahan
45 
46   IMPLEMENTATION NOTES:
47 
48   The basic strategy here is pretty simple. During the index build
49   process we keep 256 temporary files. Each time we read a source
50   document, we add all its words that start with byte i, along with
51   the document ID, to file #i. Once we've seen all the source
52   documents we proceed through each temporary file #i, one by one,
53   constructing a trie of all the words starting with byte i, and which
54   stores the IDs of the documents that contain each word. When we get
55   to the end of each temporary file, we can write out the trie to the
56   index file and start all over again on the next temporary file.
57 
58   This is marvellously scalable! During the document reading phase,
59   we're just dumping data out into temporary files, and the amount of
60   data we dump out is proportional to the total size of the source
61   documents. (In the worst case, with large input files of random
62   data, we write out 3 bytes per input byte into temporary files.)
63   During the trie-building phase, we reread the temporary files and
64   output the final index. In this phase we consume a fair bit of
65   memory, but in the worst case only 8 bytes per document ID per word
66   which starts with the right byte. Even in the very worst case, if
67   there were gigabytes of random data, there are only 2^24 possible
68   such words, and in practice of course there are far fewer.
69 
70   In practice performance is dominated by sequential I/O. On my email,
71   I can index half a megabyte of source text per second on a
72   single-disk desktop PC.
73 
74   The same trie data structures are used to build tries to record the
75   words used in a particular document (while the source document is
76   being fed in) and to build tries to record the words used in all
77   documents that start with a given byte (while we process each
78   temporary file).
79 
80   Each "per document" trie stores all words occurring in the
81   document. We make it a depth 3 trie, and at the leaves we store a
82   bit vector recording which words are present in the document, with a
83   bit set to 1 if a word occurs with its 4th character set to the
84   corresponding byte.
85 
86   Each "all document" trie assumes a fixed first word byte, and
87   therefore is only of depth 3. The leaves store the list of document
88   IDs containing the word.
89 */
90 
91 #include <config.h>
92 
93 #include <stdlib.h>
94 #include <string.h>
95 #include <unistd.h>
96 #include <sys/mman.h>
97 
98 #include "squat_internal.h"
99 #include "mailbox.h"
100 #include "message.h"
101 
102 #include "assert.h"
103 #include "util.h"
104 #include "index.h"
105 #include "xmalloc.h"
106 
107 /* A simple write-buffering module which avoids copying of the output data. */
108 
109 typedef struct {
110     struct buf buf;         /* The extending malloc'ed buffer */
111     int fd;                 /* The fd to write to. */
112     int total_output_bytes; /* How much data have we written out
113                                through this buffer in total? */
114 } SquatWriteBuffer;
115 
init_write_buffer(SquatWriteBuffer * b,int buf_size,int fd)116 static int init_write_buffer(SquatWriteBuffer* b, int buf_size, int fd)
117 {
118     buf_ensure(&b->buf, buf_size);
119     b->fd = fd;
120     b->total_output_bytes = 0;
121 
122     return SQUAT_OK;
123 }
124 
125 /* Make sure that there is enough space in the buffer to write 'len' bytes.
126    Return a pointer to where the written data should be placed. */
prepare_buffered_write(SquatWriteBuffer * b,int len)127 static char *prepare_buffered_write(SquatWriteBuffer *b, int len)
128 {
129     if (b->buf.len + len >= b->buf.alloc) {
130         if (write(b->fd, b->buf.s, b->buf.len) != (long)b->buf.len) {
131             squat_set_last_error(SQUAT_ERR_SYSERR);
132             return NULL;
133         }
134         buf_reset(&b->buf);
135         buf_ensure(&b->buf, len);
136     }
137 
138     return b->buf.s + b->buf.len;
139 }
140 
141 /* Signal that data has been written up to the mark 'ptr'.
142    Call this after prepare_buffered_write. */
complete_buffered_write(SquatWriteBuffer * b,char * ptr)143 static void complete_buffered_write(SquatWriteBuffer *b, char *ptr)
144 {
145     int oldbytes = b->buf.len;
146     int newbytes = ptr - b->buf.s;
147     buf_truncate(&b->buf, newbytes);
148     b->total_output_bytes += newbytes - oldbytes;
149 }
150 
151 /* Flush the output buffer to the file. Reset the file pointer to the start
152    of the file. */
flush_and_reset_buffered_writes(SquatWriteBuffer * b)153 static int flush_and_reset_buffered_writes(SquatWriteBuffer *b)
154 {
155     if (b->buf.len) {
156         if (write(b->fd, b->buf.s, b->buf.len) != (long)b->buf.len) {
157             squat_set_last_error(SQUAT_ERR_SYSERR);
158             return SQUAT_ERR;
159         }
160         buf_reset(&b->buf);
161     }
162 
163     if (lseek(b->fd, 0, SEEK_SET) != 0) {
164         squat_set_last_error(SQUAT_ERR_SYSERR);
165         return SQUAT_ERR;
166     }
167 
168     return SQUAT_OK;
169 }
170 
171 /* A circular linked list of document IDs, stored in increasing order
172    of document ID. */
173 typedef struct _WordDocEntry {
174     struct _WordDocEntry *next;
175     int doc_ID;
176 } WordDocEntry;
177 
178 /* These form the leaves of the "all documents" tries. For each of the
179    256 words with trailing byte 'i', docs[i] is NULL if the word does
180    not occur in any document, otherwise it is the head of a linked
181    list of document IDs for the documents which contain the word. */
182 typedef struct {
183   short first_valid_entry;  /* We record the first and last valid
184                                entries in the array below. These could
185                                be computed by just scanning the array,
186                                but it turns out that in practice such
187                                array scanning dominates the CPU
188                                consumption of the indexer. We get
189                                major speedup by maintaining these
190                                entries on the fly. */
191   short last_valid_entry;
192   WordDocEntry* docs[256];  /* Pointers to the document ID lists for
193                                each of the 256 words rooted at this
194                                part of the trie. Each non-NULL pointer
195                                points to the LAST element of the
196                                linked list (i.e. the entry with the
197                                highest document ID). This means we can
198                                efficiently add to the end of the
199                                linked list, and also efficiently get
200                                to the start of the linked list (the
201                                element with lowest document ID)
202                                (because it's circular). */
203 } SquatWordTableLeafDocs;
204 
205 /* These form the leaves of the "per document" tries. For each of the
206    256 words with trailing byte 'i', presence[i >> 3] & (1 << (i & 7))
207    is 1 if the word occurs in the document, otherwise 0. */
208 typedef struct {
209   short first_valid_entry;  /* We record the first and last valid
210                                entries in the bit vector below. These
211                                could be computed by just scanning the
212                                array, but we get significant speedup
213                                by maintaining them here. */
214   short last_valid_entry;
215   char presence[32];
216 } SquatWordTableLeafPresence;
217 
218 /* This is an entry in a trie. */
219 typedef union _SquatWordTableEntry {
220   struct _SquatWordTable* table;   /* This is a branch node */
221 
222   /* These variants are used for leaves of "per document" tries.
223      They are distinguished by the value of the low bit. */
224   SquatWordTableLeafPresence* leaf_presence;    /* low bit is 0 */
225   int leaf_presence_singleton;                  /* low bit is 1 */
226 
227   /* This variant is used for leaves of "all document" tries. */
228   SquatWordTableLeafDocs* leaf_docs;
229 } SquatWordTableEntry;
230 
231 /* This is a trie branch node. */
232 typedef struct _SquatWordTable {
233   short first_valid_entry;   /* We record the first and last valid
234                                 entries in the array below, as in the
235                                 above data structures. */
236   short last_valid_entry;
237   SquatWordTableEntry entries[256];
238 } SquatWordTable;
239 
240 /* Map docIDs in existing index to docIDs in the new index */
241 struct doc_ID_map {
242     int *map;
243     int alloc;
244     int max;
245     int new;
246 };
247 
248 struct _SquatIndex {
249   char* tmp_path;                     /* Saved tmp_path option, with
250                                          the temporary filename
251                                          pattern appended */
252   SquatWriteBuffer out;               /* The buffer for the index file itself */
253   char* doc_ID_list;                  /* A buffer where we hold the
254                                          encoded array that maps from
255                                          a document ID to the offset
256                                          of the document record within
257                                          the index file. */
258   int doc_ID_list_size;               /* The allocated size of the
259                                          above buffer, measured in
260                                          multiples of
261                                          sizeof(SquatInt32) (i.e., 4) */
262   int current_doc_ID;                 /* The current document
263                                          ID. Document IDs are numbered
264                                          starting at zero and
265                                          incremented by 1 every time
266                                          we finish processing a source
267                                          document. */
268   int current_doc_len;                /* The total number of bytes
269                                          processed in the current
270                                          source document. */
271   SquatWordTable *doc_word_table;     /* The root of the trie being
272                                          built for the current
273                                          document or for the current
274                                          initial byte. */
275   char runover_buf[SQUAT_WORD_SIZE];  /* holds the last runover_len
276                                          bytes of the current source
277                                          document */
278   int runover_len;
279   WordDocEntry* word_doc_allocator;   /* A preallocated buffer of
280                                          WordDocEntries; this pointer
281                                          is bumped up one every
282                                          allocation */
283   unsigned char valid_char_bits[32];  /* Saved valid_char_bits option */
284   SquatStatsCallback stats_callback;  /* Saved stats_callback option */
285   void* stats_callback_closure;
286 
287   SquatSearchIndex* old_index;        /* Link to old index in incremental */
288   struct doc_ID_map doc_ID_map;       /* Map doc_IDs in old index to new */
289   SquatDocChooserCallback select_doc; /* Decide whether we want doc in new */
290   void *select_doc_closure;           /* Data for handler */
291 
292   /* put the big structures at the end */
293 
294   SquatWriteBuffer index_buffers[256]; /* Buffers for the temporary
295                                           files, one for each first
296                                           byte of words occurring in
297                                           the source documents */
298   int total_num_words[256];  /* total number of words starting with
299                                 given char */
300   int doc_words[256];        /* number of words in current document
301                                 starting with given char */
302 };
303 
304 /* ====================================================================== */
305 
306 /* Collection of utility routines to maintain mapping between doc_ID in
307  * the old and new squat files. Not a one to one map as old documents
308  * (old messages in Cyrus) may have been deleted.
309  */
310 
311 /* Copy existing document details verbatim from old to new index */
squat_index_copy_document(SquatIndex * index,char const * name,SquatInt64 size)312 static int squat_index_copy_document(SquatIndex *index, char const *name,
313                                      SquatInt64 size)
314 {
315     char *buf;
316     int r = squat_index_open_document(index, name);
317 
318     if (r != SQUAT_OK)
319         return (r);
320 
321     squat_set_last_error(SQUAT_ERR_OK);
322     if ((buf = prepare_buffered_write(&index->out, 10)) == NULL) {
323         return SQUAT_ERR;
324     }
325     buf = squat_encode_I(buf, size);
326     complete_buffered_write(&index->out, buf);
327 
328     index->current_doc_len = -1;
329     index->current_doc_ID++;
330 
331     return SQUAT_OK;
332 }
333 
doc_ID_map_init(struct doc_ID_map * doc_ID_map)334 static void doc_ID_map_init(struct doc_ID_map *doc_ID_map)
335 {
336     doc_ID_map->alloc = 50;
337     doc_ID_map->map = xmalloc(doc_ID_map->alloc * sizeof(int));
338     doc_ID_map->max = 0;
339     doc_ID_map->new = 0;
340 }
341 
doc_ID_map_free(struct doc_ID_map * doc_ID_map)342 static void doc_ID_map_free(struct doc_ID_map *doc_ID_map)
343 {
344     if (doc_ID_map->map)
345         free(doc_ID_map->map);
346 
347     memset(doc_ID_map, 0, sizeof(struct doc_ID_map));
348 }
349 
doc_ID_map_add(struct doc_ID_map * doc_ID_map,int exists)350 static void doc_ID_map_add(struct doc_ID_map *doc_ID_map, int exists)
351 {
352     if (doc_ID_map->max == doc_ID_map->alloc) {
353         doc_ID_map->alloc *= 2;
354         doc_ID_map->map =
355             xrealloc(doc_ID_map->map, doc_ID_map->alloc * sizeof(int));
356     }
357     if (exists) {
358         doc_ID_map->map[doc_ID_map->max++] = doc_ID_map->new++;
359     } else {
360         doc_ID_map->map[doc_ID_map->max++] = 0; /* Does not exist in new index */
361     }
362 }
363 
doc_ID_map_lookup(struct doc_ID_map * doc_ID_map,int docID)364 static int doc_ID_map_lookup(struct doc_ID_map *doc_ID_map, int docID)
365 {
366     if ((docID < 1) || (docID > doc_ID_map->max))
367         return (0);
368 
369     return (doc_ID_map->map[docID]);
370 }
371 
copy_docIDs(void * closure,SquatListDoc const * doc)372 static int copy_docIDs(void *closure, SquatListDoc const *doc)
373 {
374     SquatIndex *index = (SquatIndex *) closure;
375     struct doc_ID_map *doc_ID_map = &index->doc_ID_map;
376     int choice = (index->select_doc) (index->select_doc_closure, doc);
377 
378     if (choice > 0) {
379         doc_ID_map_add(doc_ID_map, 1);
380         return (squat_index_copy_document
381                 (index, doc->doc_name, doc->size));
382     }
383 
384     /* This docID no longer exists */
385     doc_ID_map_add(doc_ID_map, 0);
386     return SQUAT_CALLBACK_CONTINUE;
387 }
388 
389 /* Comes later */
390 static int add_word_to_trie(SquatIndex* index, char const* word_ptr,
391                             int doc_ID);
392 
add_word_callback(void * closure,char * name,int doc_ID)393 static int add_word_callback(void *closure, char *name, int doc_ID)
394 {
395     SquatIndex *index = (SquatIndex *) closure;
396     struct doc_ID_map *doc_ID_map = &index->doc_ID_map;
397 
398     /* Find doc_ID in the new index which corresponds to this old doc_ID */
399     if ((doc_ID = doc_ID_map_lookup(doc_ID_map, doc_ID)) == 0)
400         return SQUAT_ERR;
401 
402     add_word_to_trie(index, name + 1, doc_ID);
403 
404     return SQUAT_CALLBACK_CONTINUE;
405 }
406 
squat_index_add_existing(SquatIndex * index,SquatSearchIndex * old_index,SquatDocChooserCallback select_doc,void * select_doc_closure)407 int squat_index_add_existing(SquatIndex *index,
408                              SquatSearchIndex *old_index,
409                              SquatDocChooserCallback select_doc,
410                              void *select_doc_closure)
411 {
412     index->old_index = old_index;
413     index->select_doc = select_doc;
414     index->select_doc_closure = select_doc_closure;
415 
416     return (squat_search_list_docs(old_index, copy_docIDs, index));
417 }
418 
419 /* ====================================================================== */
420 
421 /* Initially, before we see a document, there are no words for the document. */
word_table_new(void)422 static SquatWordTable *word_table_new(void)
423 {
424     SquatWordTable *ret =
425         (SquatWordTable *) xzmalloc(sizeof(SquatWordTable));
426 
427     /* Initially there are no valid entries. Set things up so that
428        the obvious tests will set first_valid_entry and
429        last_valid_entry correctly. */
430     ret->first_valid_entry = 256;
431     ret->last_valid_entry = 0;
432     return ret;
433 }
434 
squat_index_init(int fd,const SquatOptions * options)435 SquatIndex *squat_index_init(int fd, const SquatOptions *options)
436 {
437     SquatIndex *index;
438     unsigned i;
439     char *buf;
440     char const *tmp_path;
441 
442     squat_set_last_error(SQUAT_ERR_OK);
443 
444     index = (SquatIndex *) xzmalloc(sizeof(SquatIndex));
445 
446     /* Copy processed options into the SquatIndex */
447     if (options != NULL
448         && (options->option_mask & SQUAT_OPTION_TMP_PATH) != 0) {
449         tmp_path = options->tmp_path;
450     } else {
451         tmp_path = "/tmp";
452     }
453     index->tmp_path = strconcat(tmp_path, "/squatXXXXXX", (char *)NULL);
454 
455     if (options != NULL &&
456         (options->option_mask & SQUAT_OPTION_VALID_CHARS) != 0) {
457         int i;
458 
459         memset(index->valid_char_bits, 0, sizeof(index->valid_char_bits));
460         for (i = 0; options->valid_chars[i] != 0; i++) {
461             int ch = (unsigned char)options->valid_chars[i];
462 
463             index->valid_char_bits[ch >> 3] |= 1 << (ch & 7);
464         }
465     } else {
466         memset(index->valid_char_bits, 255,
467                sizeof(index->valid_char_bits));
468     }
469 
470     if (options != NULL &&
471         (options->option_mask & SQUAT_OPTION_STATISTICS) != 0) {
472         index->stats_callback = options->stats_callback;
473         index->stats_callback_closure = options->stats_callback_closure;
474     } else {
475         index->stats_callback = NULL;
476     }
477 
478     /* Finish initializing the SquatIndex */
479     for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
480         index->index_buffers[i].fd = -1;
481     }
482 
483     index->doc_ID_list_size = 1000;
484     index->doc_ID_list =
485         (char *)xmalloc(index->doc_ID_list_size * sizeof(SquatInt32));
486 
487     /* Use a 128K write buffer for the main index file */
488     if (init_write_buffer(&index->out, 128 * 1024, fd) != SQUAT_OK) {
489         goto cleanup_doc_ID_list;
490     }
491 
492     /* Write out a dummy header. This will be replaced by the real header at the
493        end of the process. */
494     buf = prepare_buffered_write(&index->out, sizeof(SquatDiskHeader));
495     if (buf == NULL) {
496         goto cleanup_out_buffer;
497     }
498     memset(buf, 0, sizeof(SquatDiskHeader));
499     complete_buffered_write(&index->out, buf + sizeof(SquatDiskHeader));
500 
501     index->current_doc_ID = 0;
502     index->doc_word_table = word_table_new();
503 
504     memset(index->total_num_words, 0, sizeof(index->total_num_words));
505 
506     index->old_index = NULL;    /* Until we are given one */
507     doc_ID_map_init(&index->doc_ID_map);
508 
509     return index;
510 
511 cleanup_out_buffer:
512     buf_free(&index->out.buf);
513 
514 cleanup_doc_ID_list:
515     free(index->doc_ID_list);
516 
517 /*cleanup_tmp_path:*/
518     free(index->tmp_path);
519 
520 /*cleanup_index:*/
521     free(index);
522     return NULL;
523 }
524 
525 /* Initialize a write buffer for a temporary file. We generate the
526    temporary file name here. The file is unlinked right away so if we
527    crash, the temporary file doesn't need to be cleaned up. */
init_write_buffer_to_temp(SquatIndex * index,SquatWriteBuffer * b)528 static int init_write_buffer_to_temp(SquatIndex *index,
529                                      SquatWriteBuffer *b)
530 {
531     char *tmp_path = xstrdup(index->tmp_path);
532     int fd = mkstemp(tmp_path);
533 
534     if (fd < 0) {
535         free(tmp_path);
536         squat_set_last_error(SQUAT_ERR_SYSERR);
537         return SQUAT_ERR;
538     }
539 
540     if (unlink(tmp_path) < 0) {
541         squat_set_last_error(SQUAT_ERR_SYSERR);
542         goto cleanup_fd;
543     }
544 
545     if (init_write_buffer(b, 64 * 1024, fd) != SQUAT_OK) {
546         goto cleanup_fd;
547     }
548 
549     free(tmp_path);
550     return SQUAT_OK;
551 
552 cleanup_fd:
553     close(fd);
554     free(tmp_path);
555     return SQUAT_ERR;
556 }
557 
squat_index_open_document(SquatIndex * index,char const * name)558 int squat_index_open_document(SquatIndex *index, char const *name)
559 {
560     int name_len;
561     char *buf;
562 
563     squat_set_last_error(SQUAT_ERR_OK);
564 
565     /* Grow the document ID array as necessary */
566     if (index->current_doc_ID >= index->doc_ID_list_size) {
567         index->doc_ID_list_size *= 2;
568         index->doc_ID_list =
569             (char *)xrealloc(index->doc_ID_list,
570                              index->doc_ID_list_size * sizeof(SquatInt32));
571     }
572 
573     /* Store the offset of the new document record into the array */
574     squat_encode_32(index->doc_ID_list + index->current_doc_ID * 4,
575                     index->out.total_output_bytes -
576                     sizeof(SquatDiskHeader));
577 
578     /* Now write the new document name out to the file. Later we will
579        write the document length right after this. Nobody writes to the
580        file in the interim. */
581     name_len = strlen(name) + 1;
582     if ((buf = prepare_buffered_write(&index->out, name_len)) == NULL) {
583         return SQUAT_ERR;
584     }
585     strcpy(buf, name);
586     complete_buffered_write(&index->out, buf + name_len);
587 
588     index->current_doc_len = 0;
589     index->runover_len = 0;
590     memset(index->doc_words, 0, sizeof(index->doc_words));
591 
592     return SQUAT_OK;
593 }
594 
595 /* Destroy the SquatWordTable. The leaf data and the internal nodes are free'd. */
word_table_delete(SquatWordTable * t,int depth)596 static void word_table_delete(SquatWordTable *t, int depth)
597 {
598     if (depth > 2) {
599         unsigned i;
600 
601         depth--;
602         for (i = 0; i < VECTOR_SIZE(t->entries); i++) {
603             SquatWordTableEntry *e = &(t->entries[i]);
604 
605             if (e->table != NULL) {
606                 word_table_delete(e->table, depth);
607             }
608         }
609     } else {
610         unsigned i;
611 
612         /* this happens to work whether the leaf entries are leaf_presence
613            or leaf_docs. This is ugly but acceptable :-) */
614         for (i = 0; i < VECTOR_SIZE(t->entries); i++) {
615             SquatWordTableEntry *e = &(t->entries[i]);
616 
617             if (e->leaf_presence != NULL
618                 && ((unsigned long)e->leaf_presence & 1) == 0) {
619                 free(e->leaf_presence);
620             }
621         }
622     }
623     free(t);
624 }
625 
626 #define SQUAT_ADD_NEW_WORD (SQUAT_LAST_BUILTIN + 1)
627 
628 /* Add an entry to the compressed presence set. We maintain
629    first_valid_entry and last_valid_entry.
630    This is faster than scanning to compute them later.
631    We return SQUAT_ADD_NEW_WORD if the bit wasn't already set. */
set_presence_bit(SquatWordTableLeafPresence * p,int ch)632 static int set_presence_bit(SquatWordTableLeafPresence *p, int ch)
633 {
634     int mask = 1 << (ch & 7);
635     char *ptr = p->presence + (ch >> 3);
636 
637     if (ch < p->first_valid_entry) {
638         p->first_valid_entry = ch;
639     }
640     if (ch > p->last_valid_entry) {
641         p->last_valid_entry = ch;
642     }
643 
644     if ((*ptr & mask) == 0) {
645         *ptr |= mask;
646         return SQUAT_ADD_NEW_WORD;
647     } else {
648         return SQUAT_OK;
649     }
650 }
651 
652 /* Add a word to the SquatWordTable trie.
653    If word_entry is NULL then we are in "per document" mode and just record
654    the presence or absence of a word, not the actual document.
655    We return SQUAT_ADD_NEW_WORD if this is the first occurrence of the
656    word in the trie. */
add_to_table(SquatIndex * index,char const * data,int data_len,WordDocEntry * word_entry)657 static int add_to_table(SquatIndex *index, char const *data, int data_len,
658                         WordDocEntry *word_entry)
659 {
660     SquatWordTable *t = index->doc_word_table;
661     int ch;
662     SquatWordTableEntry *e;
663 
664     while (data_len > 2) {
665         /* Follow the branch node down to the next level of the trie. */
666         ch = (unsigned char)data[0];
667         /* Maintain the valid_entry variables so that we don't have to
668            perform expensive scans of the 256-element arrays
669            later. Surprisingly, this optimization really matters! */
670         if (ch < t->first_valid_entry) {
671             t->first_valid_entry = ch;
672         }
673         if (ch > t->last_valid_entry) {
674             t->last_valid_entry = ch;
675         }
676 
677         e = t->entries + ch;
678         t = e->table;
679         /* Allocate the next branch node if it doesn't already exist. */
680         if (t == NULL)
681             e->table = t = word_table_new();
682 
683         data++;
684         data_len--;
685     }
686 
687     /* Follow the branch node down to the leaf level */
688     ch = (unsigned char)data[0];
689     if (ch < t->first_valid_entry) {
690         t->first_valid_entry = ch;
691     }
692     if (ch > t->last_valid_entry) {
693         t->last_valid_entry = ch;
694     }
695     e = t->entries + ch;
696 
697     ch = (unsigned char)data[1];
698 
699     if (word_entry == NULL) {
700         /* We are in "per document" mode. */
701         if (((unsigned long)e->leaf_presence & 1) != 0) {
702             /* We currently have a singleton here. */
703             int oldch = e->leaf_presence_singleton >> 1;
704 
705             /* If the singleton indicates the same word as the current word,
706                then we don't have to do anything. */
707             if (oldch != ch) {
708                 /* Otherwise we have to add the new word. This means we have
709                    to convert the singleton to a bit vector. */
710                 SquatWordTableLeafPresence *p;
711 
712                 /* Make an empty bit vector. */
713                 p = (SquatWordTableLeafPresence *)
714                     xmalloc(sizeof(SquatWordTableLeafPresence));
715                 p->first_valid_entry = 256;
716                 p->last_valid_entry = 0;
717                 memset(p->presence, 0, sizeof(p->presence));
718                 e->leaf_presence = p;
719 
720                 /* Update the bit vector */
721                 set_presence_bit(p, ch);
722                 return set_presence_bit(p, oldch);      /* will always be SQUAT_ADD_NEW_WORD */
723             }
724         } else if (e->leaf_presence == NULL) {
725             /* There's nothing here. Let's make a singleton. */
726             /* this next step might be necessary if sizeof(void*) >
727                sizeof(int). We make sure that the low bit of the pointer in
728                leaf_presence is definitely 1. */
729             e->leaf_presence = (void *)1;
730             e->leaf_presence_singleton = (ch << 1) | 1;
731             return SQUAT_ADD_NEW_WORD;
732         } else {
733             /* We already have the bit vector, so let's just set another bit in it. */
734             return set_presence_bit(e->leaf_presence, ch);
735         }
736     } else {
737         /* We are in "all documents" mode. */
738         SquatWordTableLeafDocs *docs = e->leaf_docs;
739         WordDocEntry **entry_ptr;
740 
741         /* Make a new leaf table if we don't already have one. */
742         if (docs == NULL) {
743             docs = (SquatWordTableLeafDocs *)
744                 xmalloc(sizeof(SquatWordTableLeafDocs));
745             docs->first_valid_entry = 256;
746             docs->last_valid_entry = 0;
747             memset(docs->docs, 0, sizeof(docs->docs));
748             e->leaf_docs = docs;
749         }
750 
751         entry_ptr = docs->docs + ch;
752 
753         if (*entry_ptr == NULL) {
754             /* Adding a new word, so may need to update the valid_entry markers */
755             if (ch < docs->first_valid_entry) {
756                 docs->first_valid_entry = ch;
757             }
758             if (ch > docs->last_valid_entry) {
759                 docs->last_valid_entry = ch;
760             }
761             /* Create the linked list with the single element 'word_entry'. */
762             word_entry->next = word_entry;      /* make it circular */
763             *entry_ptr = word_entry;
764             return SQUAT_ADD_NEW_WORD;
765         } else {
766             /* Just add the document to the linked list. word_entry will be
767                the new last element since the document IDs are strictly
768                increasing as we build the trie from its temporary file. */
769             word_entry->next = (*entry_ptr)->next;      /* (*entry_ptr)->next is
770                                                            (still) the first
771                                                            element of the list */
772             (*entry_ptr)->next = word_entry;    /* the old last element's
773                                                    next now points to the
774                                                    new last element. */
775             *entry_ptr = word_entry;    /* save the new last element */
776         }
777     }
778 
779     return SQUAT_OK;
780 }
781 
782 /* Add 'doc_ID' to the list of document IDs for word 'word_ptr'
783    in the "all documents" trie. */
add_word_to_trie(SquatIndex * index,char const * word_ptr,int doc_ID)784 static int add_word_to_trie(SquatIndex * index, char const *word_ptr,
785                             int doc_ID)
786 {
787     WordDocEntry *word_entry = index->word_doc_allocator++;
788 
789     word_entry->doc_ID = doc_ID;
790     add_to_table(index, word_ptr, SQUAT_WORD_SIZE - 1, word_entry);
791 
792     return SQUAT_OK;
793 }
794 
795 /* Add the word 'data' to the "per document" trie for the current document. */
add_word_to_table(SquatIndex * index,char const * data)796 static int add_word_to_table(SquatIndex *index, char const *data)
797 {
798     int r;
799     int i;
800 
801     /* Just ignore the word if it uses an invalid character. */
802     for (i = 0; i < SQUAT_WORD_SIZE; i++) {
803         int ch = (unsigned char)data[i];
804 
805         if ((index->valid_char_bits[ch >> 3] & (1 << (ch & 7))) == 0) {
806             /* this word contains an invalid character and need not be indexed,
807                since search strings will never contain such a character. */
808             return SQUAT_OK;
809         }
810     }
811 
812     r = add_to_table(index, data, SQUAT_WORD_SIZE, NULL);
813     if (r == SQUAT_ADD_NEW_WORD) {
814         /* Remember how many unique words in this document started with
815            the given first character. */
816         index->doc_words[(unsigned char)data[0]]++;
817         return SQUAT_OK;
818     } else {
819         return r;
820     }
821 }
822 
squat_index_append_document(SquatIndex * index,char const * data,int data_len)823 int squat_index_append_document(SquatIndex * index, char const *data,
824                                 int data_len)
825 {
826     int i;
827     char buf[SQUAT_WORD_SIZE];
828     int new_runover;
829     int new_runover_data;
830 
831     assert(data_len >= 0);
832 
833     squat_set_last_error(SQUAT_ERR_OK);
834 
835     if (data_len == 0) {
836         return SQUAT_OK;
837     }
838 
839     /* Scan runover */
840     for (i = 0; i < index->runover_len; i++) {
841         /* Check if we can make a whole word starting with runover bytes
842            from offset i within the runover buffer and with the remaining
843            bytes taken from the new text */
844         if (index->runover_len - i + data_len >= SQUAT_WORD_SIZE) {
845             /* Yep. Build the complete word into 'buf' and then add it. */
846             memcpy(buf, index->runover_buf + i, index->runover_len - i);
847             memcpy(buf + index->runover_len - i, data,
848                    SQUAT_WORD_SIZE - (index->runover_len - i));
849             if (add_word_to_table(index, buf) != SQUAT_OK) {
850                 return SQUAT_ERR;
851             }
852         }
853     }
854 
855     /* Scan main text */
856     for (i = 0; i <= data_len - SQUAT_WORD_SIZE; i++) {
857         if (add_word_to_table(index, data + i) != SQUAT_OK) {
858             return SQUAT_ERR;
859         }
860     }
861 
862     /* Fill runover. We have to be careful to handle all the cases,
863        particularly we just saw less than SQUAT_WORD_SIZE bytes and we
864        need to copy some data from the old runover buffer into the new
865        runover buffer. */
866     new_runover = index->runover_len + data_len;
867     if (new_runover > SQUAT_WORD_SIZE) {
868         new_runover = SQUAT_WORD_SIZE;
869     }
870     new_runover_data = data_len;
871     if (new_runover_data > new_runover) {
872         new_runover_data = new_runover;
873     }
874     /* Copy data from the old runover buffer into its new position in
875        the new runover buffer */
876     memmove(index->runover_buf,
877             index->runover_buf + index->runover_len -
878             (new_runover - new_runover_data),
879             new_runover - new_runover_data);
880     /* Copy data from the new text into the new runover buffer */
881     memcpy(index->runover_buf + new_runover - new_runover_data,
882            data + data_len - new_runover_data, new_runover_data);
883     index->runover_len = new_runover;
884 
885     /* Tracking how much data we've seen for this document in total */
886     index->current_doc_len += data_len;
887 
888     return SQUAT_OK;
889 }
890 
891 /* Write the word to the given temporary file. Since each temporary
892    file is dedicated to a given initial byte, the word passed to us
893    has the initial byte removed. */
output_word(SquatWriteBuffer * b,char const * word)894 static int output_word(SquatWriteBuffer *b, char const *word)
895 {
896     char *buf = prepare_buffered_write(b, SQUAT_WORD_SIZE - 1);
897 
898     if (buf == NULL) {
899         return SQUAT_ERR;
900     }
901     memcpy(buf, word, SQUAT_WORD_SIZE - 1);
902     complete_buffered_write(b, buf + SQUAT_WORD_SIZE - 1);
903 
904     return SQUAT_OK;
905 }
906 
907 /* Write the word data from the trie 't' into the temporary file
908    accessed through 'b'. Words to write are assembled starting at
909    'word'; we assume that 'len' bytes have already been assembled
910    leading up to 'word'. This function clears the word data after
911    writing it out. This makes it ready to handle the next document
912    without reallocating everything. */
write_words(SquatIndex * index,SquatWriteBuffer * b,SquatWordTable * t,int len,char * word)913 static int write_words(SquatIndex *index, SquatWriteBuffer *b,
914                        SquatWordTable *t, int len, char *word)
915 {
916     if (len == 2) {
917         /* Handle a branch node that refers to leaves. */
918         int i;
919 
920         for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
921             SquatWordTableEntry *e = t->entries + i;
922 
923             word[0] = (char)i;
924 
925             if (((unsigned long)e->leaf_presence & 1) != 0) {
926                 /* Got a singleton at this branch point. Just output the single word. */
927                 word[1] = (char)(e->leaf_presence_singleton >> 1);
928                 e->leaf_presence = NULL;        /* clear the leaf out */
929                 if (output_word(b, word - (SQUAT_WORD_SIZE - 3)) !=
930                     SQUAT_OK) {
931                     return SQUAT_ERR;
932                 }
933             } else if (e->leaf_presence != NULL) {
934                 /* Got a bit vector array which we have to scan. */
935                 /* The following code is performance critical. It can dominate
936                    the performance of the entire indexer. That's why we need
937                    the valid_entry fields! */
938                 SquatWordTableLeafPresence *p = e->leaf_presence;
939                 int i;
940                 int last_byte = p->last_valid_entry >> 3;
941 
942                 for (i = p->first_valid_entry >> 3; i <= last_byte; i++) {
943                     if ((unsigned)i >= VECTOR_SIZE(p->presence)) {
944                         return SQUAT_ERR;
945                     } else {
946                         int bits = (unsigned char)p->presence[i];
947                         int j;
948 
949                         for (j = 0; bits > 0; j++, bits >>= 1) {
950                             if ((bits & 1) != 0) {
951                                 /* Output a word for each bit that is set */
952                                 word[1] = (char)(i * 8 + j);
953                                 if (output_word
954                                     (b,
955                                      word - (SQUAT_WORD_SIZE - 3)) !=
956                                     SQUAT_OK) {
957                                     return SQUAT_ERR;
958                                 }
959                             }
960                         }
961                     }
962                 }
963                 free(p);
964                 e->leaf_presence = NULL;
965             }
966         }
967     } else {
968         /* Handle an interior branch node. A simple matter of recursion. */
969         int i;
970 
971         for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
972             SquatWordTable *new_t = t->entries[i].table;
973 
974             if (new_t != NULL) {
975                 word[0] = (char)i;
976                 if (write_words(index, b, new_t, len - 1, word + 1)
977                     != SQUAT_OK) {
978                     return SQUAT_ERR;
979                 }
980             }
981         }
982     }
983 
984     /* This effectively clears the array because we trust these entries. */
985     t->first_valid_entry = 256;
986     t->last_valid_entry = 0;
987 
988     return SQUAT_OK;
989 }
990 
squat_index_close_document(SquatIndex * index)991 int squat_index_close_document(SquatIndex *index)
992 {
993     char *buf;
994     unsigned i;
995 
996     squat_set_last_error(SQUAT_ERR_OK);
997 
998     /* Write out the length of the current document to the index file,
999        just after the document's name. */
1000     if ((buf = prepare_buffered_write(&index->out, 10)) == NULL) {
1001         return SQUAT_ERR;
1002     }
1003     buf = squat_encode_I(buf, index->current_doc_len);
1004     complete_buffered_write(&index->out, buf);
1005 
1006     if (index->stats_callback != NULL) {
1007         SquatStatsEvent event;
1008 
1009         event.generic.type = SQUAT_STATS_COMPLETED_DOC;
1010         event.completed_doc.num_unique_words = index->doc_words;
1011         index->stats_callback(index->stats_callback_closure, &event);
1012     }
1013 
1014     /* For each byte that started a word in the source document, we need
1015        to dump all the words that occurred starting with that byte to
1016        the corresponding temporary file. */
1017     for (i = 0; i < VECTOR_SIZE(index->doc_words); i++) {
1018         if (index->doc_words[i] > 0) {
1019             char *write_ptr;
1020             char word_buf[SQUAT_WORD_SIZE - 1];
1021             int cur_offset;
1022 
1023             if (index->index_buffers[i].fd < 0) {
1024                 /* This is the first document that used a word starting with this byte.
1025                    We need to create the temporary file. */
1026                 if (init_write_buffer_to_temp
1027                     (index, index->index_buffers + i)
1028                     != SQUAT_OK) {
1029                     return SQUAT_ERR;
1030                 }
1031             }
1032 
1033             index->total_num_words[i] += index->doc_words[i];
1034 
1035             /* Write out the document ID and the number of words in this
1036                document that start with the initial byte. Then we write out
1037                the list of words themselves, SQUAT_WORD_SIZE-1 bytes
1038                each. Very simple format for the temporary files. We could
1039                compress them more but why bother? */
1040             write_ptr =
1041                 prepare_buffered_write(index->index_buffers + i, 20);
1042             if (write_ptr == NULL) {
1043                 return SQUAT_ERR;
1044             }
1045             write_ptr = squat_encode_I(write_ptr, index->current_doc_ID);
1046             write_ptr = squat_encode_I(write_ptr, index->doc_words[i]);
1047             complete_buffered_write(index->index_buffers + i, write_ptr);
1048 
1049             cur_offset = index->index_buffers[i].total_output_bytes;
1050             if (write_words(index, index->index_buffers + i,
1051                             index->doc_word_table->entries[i].table,
1052                             SQUAT_WORD_SIZE - 1, word_buf)
1053                 != SQUAT_OK) {
1054                 return SQUAT_ERR;
1055             }
1056             /* Make sure that we actually output the exact number of words
1057                we thought we added to the trie. It's really easy to break
1058                this invariant with bugs in the above code! */
1059             assert(index->index_buffers[i].total_output_bytes - cur_offset
1060                    == (SQUAT_WORD_SIZE - 1) * index->doc_words[i]);
1061         }
1062     }
1063 
1064     index->current_doc_len = -1;
1065 
1066     index->current_doc_ID++;
1067 
1068     return SQUAT_OK;
1069 }
1070 
1071 /* Dump out a branch node of an "all documents" trie to the index
1072    file. It's dumped as a presence table (telling us which branches
1073    are non-NULL) followed by a list of relative file offsets in
1074    I-format pointing to the subtries for the non-NULL branches. */
dump_word_table_offsets(SquatIndex * index,SquatWordTable * t,int * offset_buf)1075 static int dump_word_table_offsets(SquatIndex *index, SquatWordTable *t,
1076                                    int *offset_buf)
1077 {
1078     int start_present = t->first_valid_entry;
1079     int end_present = t->last_valid_entry;
1080     char *buf;
1081     int present_count;          /* We store here the actual number of present branches */
1082 
1083     if (start_present > end_present) {
1084         /* There are no non-empty branches so just write an empty presence table */
1085         if ((buf = prepare_buffered_write(&index->out, 2)) == NULL) {
1086             return SQUAT_ERR;
1087         } else {
1088             buf[0] = buf[1] = 0;
1089             complete_buffered_write(&index->out, buf + 2);
1090             return SQUAT_OK;
1091         }
1092     }
1093 
1094     /* If there is just one valid entry but its index is < 32, then we
1095        can't use the one-byte representation for a singleton presence
1096        because it would be mistaken for the first byte of a (count,
1097        start) presence vector header. A singleton whose index is >= 32
1098        can be written out without ambiguity. */
1099     if (end_present == start_present && end_present >= 32) {
1100         if ((buf = prepare_buffered_write(&index->out, 1)) == NULL) {
1101             return SQUAT_ERR;
1102         } else {
1103             *buf++ = (char)end_present;
1104             present_count = 1;
1105         }
1106     } else {
1107         /* We're going to use the presence bit vector format. */
1108         int first_byte = start_present >> 3;
1109         int byte_count = (end_present >> 3) - first_byte + 1;
1110 
1111         if ((buf =
1112              prepare_buffered_write(&index->out,
1113                                     2 + byte_count)) == NULL) {
1114             return SQUAT_ERR;
1115         } else {
1116             int i;
1117 
1118             *buf++ = (char)first_byte;
1119             *buf++ = (char)byte_count - 1;      /* subtract 1 to avoid ambiguity
1120                                                    over the value '32' (we
1121                                                    wouldn't use 0 anyway) */
1122             /* Clear the vector */
1123             memset(buf, 0, byte_count);
1124             present_count = 0;
1125             for (i = start_present; i <= end_present; i++) {
1126                 if (offset_buf[i] > 0) {
1127                     present_count++;
1128                     /* Set the bit in the vector. */
1129                     buf[(i >> 3) - first_byte] |= 1 << (i & 7);
1130                 }
1131             }
1132             buf += byte_count;
1133         }
1134     }
1135     complete_buffered_write(&index->out, buf);
1136 
1137     /* Now we write out the actual offset table in I-format. */
1138     if ((buf =
1139          prepare_buffered_write(&index->out,
1140                                 10 * present_count)) == NULL) {
1141         return SQUAT_ERR;
1142     } else {
1143         int i;
1144 
1145         for (i = start_present; i <= end_present; i++) {
1146             int off = offset_buf[i];
1147 
1148             if (off > 0) {
1149                 buf = squat_encode_I(buf, off);
1150             }
1151         }
1152     }
1153     complete_buffered_write(&index->out, buf);
1154 
1155     return SQUAT_OK;
1156 }
1157 
1158 /* Write out the presence table for an "all documents" trie leaf. */
dump_doc_list_present_bits(SquatIndex * index,SquatWordTableLeafDocs * docs)1159 static int dump_doc_list_present_bits(SquatIndex *index,
1160                                       SquatWordTableLeafDocs *docs)
1161 {
1162     int start_present = docs->first_valid_entry;
1163     int end_present = docs->last_valid_entry;
1164     char *buf;
1165     int present_count;
1166 
1167     /* If the leaf is empty, we should never get here! */
1168     assert(start_present <= end_present);
1169 
1170     /* if it's a singleton < 32, then we can't use the one-byte
1171        representation because it would be mistaken for a starting byte */
1172     if (end_present == start_present && end_present >= 32) {
1173         if ((buf = prepare_buffered_write(&index->out, 1)) == NULL) {
1174             return SQUAT_ERR;
1175         } else {
1176             *buf++ = (char)end_present;
1177             present_count = 1;
1178         }
1179     } else {
1180         int first_byte = start_present >> 3;
1181         int byte_count = (end_present >> 3) - first_byte + 1;
1182 
1183         if ((buf =
1184              prepare_buffered_write(&index->out,
1185                                     2 + byte_count)) == NULL) {
1186             return SQUAT_ERR;
1187         } else {
1188             int i;
1189 
1190             *buf++ = (char)first_byte;
1191             *buf++ = (char)byte_count - 1;
1192             memset(buf, 0, byte_count);
1193             present_count = 0;
1194             for (i = start_present; i <= end_present; i++) {
1195                 if (docs->docs[i] != NULL) {
1196                     present_count++;
1197                     buf[(i >> 3) - first_byte] |= 1 << (i & 7);
1198                 }
1199             }
1200             buf += byte_count;
1201         }
1202     }
1203     complete_buffered_write(&index->out, buf);
1204 
1205     return SQUAT_OK;
1206 }
1207 
1208 /* Write out the document lists for an "all documents" trie leaf. */
dump_doc_list_docs(SquatIndex * index,SquatWordTableLeafDocs * docs)1209 static int dump_doc_list_docs(SquatIndex *index,
1210                               SquatWordTableLeafDocs *docs)
1211 {
1212     int i;
1213     WordDocEntry **doc_list = docs->docs;
1214 
1215     for (i = docs->first_valid_entry; i <= docs->last_valid_entry; i++) {
1216         if (doc_list[i] != NULL) {
1217             WordDocEntry *first_doc;
1218             WordDocEntry *doc;
1219             int run_size = 0;   /* Bytes required to store the doclist for this word */
1220             int last_doc_ID;
1221             int run_seq_delta = 0;
1222             int run_seq_count;
1223             int doc_count = 0;  /* number of documents containing this word */
1224             char *buf;
1225 
1226             doc = first_doc = doc_list[i]->next;
1227 
1228             last_doc_ID = 0;
1229             run_seq_count = 0;
1230             /* First compute the run_size bytes required to store the doclist */
1231             do {
1232                 if (doc->doc_ID == last_doc_ID + 1 && run_seq_count > 0) {
1233                     run_seq_count++;
1234                 } else {
1235                     if (run_seq_count > 0) {
1236                         if (run_seq_count > 1) {
1237                             run_size +=
1238                                 squat_count_encode_I(run_seq_count << 1)
1239                                 + squat_count_encode_I(run_seq_delta);
1240                         } else {
1241                             run_size +=
1242                                 squat_count_encode_I((run_seq_delta << 1) |
1243                                                      1);
1244                         }
1245                     }
1246                     run_seq_count = 1;
1247                     run_seq_delta = doc->doc_ID - last_doc_ID;
1248                 }
1249                 last_doc_ID = doc->doc_ID;
1250                 doc = doc->next;
1251                 doc_count++;
1252             } while (doc != first_doc);
1253             if (run_seq_count > 0) {
1254                 if (run_seq_count > 1) {
1255                     run_size += squat_count_encode_I(run_seq_count << 1)
1256                         + squat_count_encode_I(run_seq_delta);
1257                 } else {
1258                     run_size +=
1259                         squat_count_encode_I((run_seq_delta << 1) | 1);
1260                 }
1261             }
1262 
1263             /* reserve more than enough space in the buffer */
1264             if ((buf = prepare_buffered_write(&index->out, 10 + run_size)) == NULL) {
1265                 return SQUAT_ERR;
1266             }
1267 
1268             /* If there's only one document, use singleton document format */
1269             if (doc_count == 1) {
1270                 buf = squat_encode_I(buf, (doc->doc_ID << 1) | 1);
1271             } else {
1272                 /* Store the entire document list, with its size first. */
1273                 buf = squat_encode_I(buf, run_size << 1);
1274 
1275                 last_doc_ID = 0;
1276                 run_seq_count = 0;
1277                 /* This logic should mirror the logic above that counts the bytes. */
1278                 do {
1279                     if (doc->doc_ID == last_doc_ID + 1
1280                         && run_seq_count > 0) {
1281                         run_seq_count++;
1282                     } else {
1283                         if (run_seq_count > 0) {
1284                             if (run_seq_count > 1) {
1285                                 buf =
1286                                     squat_encode_I(buf,
1287                                                    run_seq_count << 1);
1288                                 buf = squat_encode_I(buf, run_seq_delta);
1289                             } else {
1290                                 buf =
1291                                     squat_encode_I(buf,
1292                                                    (run_seq_delta << 1) |
1293                                                    1);
1294                             }
1295                         }
1296                         run_seq_count = 1;
1297                         run_seq_delta = doc->doc_ID - last_doc_ID;
1298                     }
1299                     last_doc_ID = doc->doc_ID;
1300                     doc = doc->next;
1301                 } while (doc != first_doc);
1302                 if (run_seq_count > 0) {
1303                     if (run_seq_count > 1) {
1304                         buf = squat_encode_I(buf, run_seq_count << 1);
1305                         buf = squat_encode_I(buf, run_seq_delta);
1306                     } else {
1307                         buf =
1308                             squat_encode_I(buf, (run_seq_delta << 1) | 1);
1309                     }
1310                 }
1311             }
1312 
1313             complete_buffered_write(&index->out, buf);
1314         }
1315     }
1316 
1317     return SQUAT_OK;
1318 }
1319 
1320 /* Write an "all documents" subtrie to the index file.
1321    'result_offset' is an absolute offset within the file where this
1322    subtrie was stored. We free the trie leaves as we go. */
write_trie_word_data(SquatIndex * index,SquatWordTable * t,int len,int * result_offset)1323 static int write_trie_word_data(SquatIndex *index, SquatWordTable *t,
1324                                 int len, int *result_offset)
1325 {
1326     int i;
1327     int offsets[256];           /* Collect the offsets of the subtries in this array. */
1328     int off;
1329     SquatWordTableEntry *entries = t->entries;
1330     int r;
1331 
1332     memset(offsets, 0, t->first_valid_entry * sizeof(int));
1333     if (len > 2) {
1334         /* interior branch */
1335         for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
1336             SquatWordTable *new_t = entries[i].table;
1337 
1338             if (new_t != NULL) {
1339                 if (write_trie_word_data
1340                     (index, new_t, len - 1, offsets + i)
1341                     != SQUAT_OK) {
1342                     return SQUAT_ERR;
1343                 }
1344                 free(entries[i].table);
1345                 entries[i].table = NULL;
1346             } else {
1347                 offsets[i] = 0;
1348             }
1349         }
1350     } else {
1351         /* Leaf case */
1352         for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
1353             SquatWordTableLeafDocs *leaf_docs = entries[i].leaf_docs;
1354 
1355             if (leaf_docs != NULL) {
1356                 offsets[i] = index->out.total_output_bytes;
1357 
1358                 if (dump_doc_list_present_bits(index, leaf_docs) !=
1359                     SQUAT_OK
1360                     || dump_doc_list_docs(index, leaf_docs) != SQUAT_OK) {
1361                     return SQUAT_ERR;
1362                 }
1363                 free(entries[i].leaf_docs);
1364                 entries[i].leaf_docs = NULL;
1365             } else {
1366                 offsets[i] = 0;
1367             }
1368         }
1369     }
1370     memset(offsets + i, 0, (256 - i) * sizeof(int));
1371 
1372     /* Now we've written out our subtries, we know where our branch
1373        table is going to be. */
1374     *result_offset = off = index->out.total_output_bytes;
1375 
1376     /* Relativize the offsets. This is just to reduce the probable
1377        magnitude of the numbers so they will pack better into I-format. */
1378     for (i = t->first_valid_entry; i <= t->last_valid_entry; i++) {
1379         if (offsets[i] != 0) {
1380             offsets[i] = off - offsets[i];
1381         }
1382     }
1383 
1384     r = dump_word_table_offsets(index, t, offsets);
1385 
1386     /* Mark this subtrie as empty. */
1387     t->first_valid_entry = 256;
1388     t->last_valid_entry = 0;
1389 
1390     return r;
1391 }
1392 
1393 /* Dump out a complete trie for the given initial byte from its temporary file.
1394    The absolute offset of the trie's root table within the file is
1395    returned in 'result_offset'. */
dump_index_trie_words(SquatIndex * index,int first_char,int * result_offset)1396 static int dump_index_trie_words(SquatIndex *index, int first_char,
1397                                  int *result_offset)
1398 {
1399     SquatSearchIndex *old_index = index->old_index;
1400     SquatWriteBuffer *buf = index->index_buffers + first_char;
1401     int num_words = index->total_num_words[first_char];
1402     WordDocEntry *doc_table;
1403     char const *word_list_ptr;
1404     int r = SQUAT_OK;
1405     char const *word_ptr;
1406     int existing = 0;
1407 
1408     if (old_index &&
1409         squat_count_docs(old_index, first_char, &existing) != SQUAT_OK) {
1410         return (SQUAT_ERR);
1411     }
1412 
1413     /* Allocate all the necessary document-ID linked list entries at once. */
1414     doc_table =
1415         (WordDocEntry *) xmalloc(sizeof(WordDocEntry) *
1416                                  (num_words + existing));
1417     index->word_doc_allocator = doc_table;
1418 
1419     /* Send existing trie across first as those leafs have lowest doc IDs */
1420     if (old_index) {
1421         r = squat_scan(old_index, first_char, add_word_callback, index);
1422         if (r != SQUAT_OK) {
1423             r = SQUAT_ERR;
1424             goto cleanup;
1425         }
1426     }
1427 
1428     /* mmap the temporary file. */
1429     word_list_ptr =
1430         mmap(NULL, buf->total_output_bytes, PROT_READ, MAP_SHARED, buf->fd,
1431              0);
1432     if (word_list_ptr == MAP_FAILED) {
1433         squat_set_last_error(SQUAT_ERR_SYSERR);
1434         r = SQUAT_ERR;
1435         goto cleanup;
1436     }
1437     word_ptr = word_list_ptr;
1438 
1439     /* Scan through the file */
1440     while (num_words > 0) {
1441         /* For each document, add all its words to the trie with this document ID */
1442         int doc_ID = (int)squat_decode_I(&word_ptr);
1443         int doc_words = (int)squat_decode_I(&word_ptr);
1444 
1445         num_words -= doc_words;
1446 
1447         while (doc_words > 0) {
1448             if (add_word_to_trie(index, word_ptr, doc_ID)
1449                 != SQUAT_OK) {
1450                 r = SQUAT_ERR;
1451                 goto cleanup_map;
1452             }
1453             word_ptr += SQUAT_WORD_SIZE - 1;
1454             doc_words--;
1455         }
1456     }
1457 
1458     /* Make sure we read all the bytes from the temporary file. */
1459     assert(word_ptr - word_list_ptr == buf->total_output_bytes);
1460 
1461     /* Now dump the trie to the index file. */
1462     r = write_trie_word_data(index, index->doc_word_table,
1463                              SQUAT_WORD_SIZE - 1, result_offset);
1464 
1465 cleanup_map:
1466     if (munmap((void *)word_list_ptr, buf->total_output_bytes) != 0
1467         && r == SQUAT_OK) {
1468         squat_set_last_error(SQUAT_ERR_SYSERR);
1469         r = SQUAT_ERR;
1470     }
1471 
1472 cleanup:
1473     free(doc_table);
1474 
1475     return r;
1476 }
1477 
dump_index_trie_words_no_file(SquatIndex * index,int first_char,int * result_offset)1478 static int dump_index_trie_words_no_file(SquatIndex *index,
1479                                          int first_char,
1480                                          int *result_offset)
1481 {
1482     SquatSearchIndex *old_index = index->old_index;
1483     WordDocEntry *doc_table;
1484     int r = SQUAT_OK;
1485     int existing = 0;
1486 
1487     if (!old_index)
1488         return (SQUAT_OK);      /* Should never happen? */
1489 
1490     if (squat_count_docs(old_index, first_char, &existing) != SQUAT_OK)
1491         return (SQUAT_ERR);
1492     if (existing == 0)
1493         return (SQUAT_OK);
1494 
1495     /* Allocate all the necessary document-ID linked list entries at once. */
1496     doc_table = (WordDocEntry *) xmalloc(sizeof(WordDocEntry) * existing);
1497     index->word_doc_allocator = doc_table;
1498 
1499     /* Send existing trie across first as those leafs have lowest doc IDs */
1500     r = squat_scan(old_index, first_char, add_word_callback, index);
1501     if (r != SQUAT_OK) {
1502         r = SQUAT_ERR;
1503         goto cleanup;
1504     }
1505 
1506     if (index->word_doc_allocator > doc_table) {
1507         /* Now dump the trie to the index file. */
1508         r = write_trie_word_data(index, index->doc_word_table,
1509                                  SQUAT_WORD_SIZE - 1, result_offset);
1510     }
1511 
1512 cleanup:
1513     free(doc_table);
1514     return r;
1515 }
1516 
1517 /* This does the grunt work of completing the index. If OK is false we
1518    just take the cleanup path ... this is used by squat_index_destroy. */
index_close_internal(SquatIndex * index,int OK)1519 static int index_close_internal(SquatIndex *index, int OK)
1520 {
1521     int r = SQUAT_OK;
1522     int doc_list_offset;
1523     int doc_ID_list_offset;
1524     int word_list_offset;
1525     char *buf;
1526     unsigned i;
1527     SquatDiskHeader *header;
1528     int offset_buf[256];
1529 
1530     squat_set_last_error(SQUAT_ERR_OK);
1531 
1532     if (!OK) {
1533         goto cleanup;
1534     }
1535 
1536     /* Close any open document ... this would really be a client bug. */
1537     if (index->current_doc_len >= 0) {
1538         squat_index_close_document(index);
1539     }
1540 
1541     /* Clear the current trie. We are now going to use it to build
1542        all-documents tries. */
1543     word_table_delete(index->doc_word_table, SQUAT_WORD_SIZE);
1544     index->doc_word_table = word_table_new();
1545 
1546     /* Write out the array that maps document IDs to offsets of the
1547        document records. */
1548     doc_list_offset = sizeof(SquatDiskHeader);
1549     doc_ID_list_offset = index->out.total_output_bytes + 1;
1550     if ((buf = prepare_buffered_write(&index->out,
1551                                       SQUAT_SAFETY_ZONE +
1552                                       ((index->current_doc_ID +
1553                                         1) * 4))) == NULL) {
1554         r = SQUAT_ERR;
1555         goto cleanup;
1556     }
1557     *buf++ = 0;
1558     memcpy(buf, index->doc_ID_list, index->current_doc_ID * 4);
1559     buf += index->current_doc_ID * 4;
1560     memset(buf, 0, 4);
1561     complete_buffered_write(&index->out, buf + 4);
1562 
1563     /* Now write out the trie for every initial byte that we saw. The
1564        offsets are collected in 'offset_buf'. */
1565     memset(offset_buf, 0, sizeof(offset_buf));
1566     for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
1567         if (index->stats_callback != NULL) {
1568             SquatStatsEvent event;
1569 
1570             event.generic.type = SQUAT_STATS_COMPLETED_INITIAL_CHAR;
1571             event.completed_initial_char.completed_char = i;
1572             event.completed_initial_char.num_words =
1573                 index->total_num_words[i];
1574             if (index->index_buffers[i].fd >= 0) {
1575                 event.completed_initial_char.temp_file_size =
1576                     index->index_buffers[i].total_output_bytes;
1577             } else {
1578                 event.completed_initial_char.temp_file_size = 0;
1579             }
1580             index->stats_callback(index->stats_callback_closure, &event);
1581         }
1582 
1583         if (index->index_buffers[i].fd >= 0) {
1584             /* We have to flush the temporary file output buffer before we try to use
1585                the temporary file. */
1586             if (flush_and_reset_buffered_writes(index->index_buffers + i)
1587                 != SQUAT_OK
1588                 || dump_index_trie_words(index, i,
1589                                          offset_buf + i) != SQUAT_OK) {
1590                 r = SQUAT_ERR;
1591                 goto cleanup;
1592             }
1593             /* Close files and free memory as we go. This could be important
1594                if disk space is low and we're generating a huge index. */
1595             if (close(index->index_buffers[i].fd) < 0) {
1596                 squat_set_last_error(SQUAT_ERR_SYSERR);
1597                 r = SQUAT_ERR;
1598             }
1599             index->index_buffers[i].fd = -1;
1600             buf_free(&index->index_buffers[i].buf);
1601         } else if (index->old_index) {
1602             /* Only needed if incremental updates going on */
1603             /* Just copy across existing trie if nothing new to merge in */
1604             if (dump_index_trie_words_no_file(index, i, offset_buf + i) !=
1605                 SQUAT_OK) {
1606                 r = SQUAT_ERR;
1607                 goto cleanup;
1608             }
1609         }
1610     }
1611 
1612     /* Save the offset where the root of the index trie is going to go. */
1613     word_list_offset = index->out.total_output_bytes;
1614 
1615     /* Relativize the subtrie offsets. */
1616     for (i = 0; i < VECTOR_SIZE(offset_buf); i++) {
1617         if (offset_buf[i] != 0) {
1618             offset_buf[i] = word_list_offset - offset_buf[i];
1619 
1620             if ((int)i < index->doc_word_table->first_valid_entry) {
1621                 index->doc_word_table->first_valid_entry = i;
1622             }
1623             index->doc_word_table->last_valid_entry = i;
1624         }
1625     }
1626 
1627     /* Dump out the offset buffer at last. */
1628     if (dump_word_table_offsets(index, index->doc_word_table, offset_buf)
1629         != SQUAT_OK) {
1630         r = SQUAT_ERR;
1631         goto cleanup;
1632     }
1633 
1634     /* finally, write trailing zeroes and the header ... now that we know
1635        we initialized the file with no errors */
1636     if ((buf =
1637          prepare_buffered_write(&index->out, SQUAT_SAFETY_ZONE)) == NULL) {
1638         r = SQUAT_ERR;
1639         goto cleanup;
1640     }
1641     memset(buf, 0, SQUAT_SAFETY_ZONE);
1642     complete_buffered_write(&index->out, buf + SQUAT_SAFETY_ZONE);
1643 
1644     /* Flush writes before we seek back to the start to write the header */
1645     if (flush_and_reset_buffered_writes(&index->out) != SQUAT_OK) {
1646         r = SQUAT_ERR;
1647         goto cleanup;
1648     }
1649 
1650     /* Blat out the header */
1651     if ((header = (SquatDiskHeader *) prepare_buffered_write(&index->out,
1652                                                              sizeof
1653                                                              (SquatDiskHeader)))
1654         == NULL) {
1655         r = SQUAT_ERR;
1656         goto cleanup;
1657     }
1658     memcpy(header->header_text, squat_index_file_header, 8);
1659     squat_encode_64(header->doc_list_offset, doc_list_offset);
1660     squat_encode_64(header->doc_ID_list_offset, doc_ID_list_offset);
1661     squat_encode_64(header->word_list_offset, word_list_offset);
1662     memcpy(header->valid_char_bits, index->valid_char_bits,
1663            sizeof(header->valid_char_bits));
1664     complete_buffered_write(&index->out, (char *)(header + 1));
1665 
1666     /* Flush out the header */
1667     if (flush_and_reset_buffered_writes(&index->out) != SQUAT_OK) {
1668         r = SQUAT_ERR;
1669         goto cleanup;
1670     }
1671 
1672     /* WOOHOO! It's done! */
1673 
1674 cleanup:
1675     buf_free(&index->out.buf);
1676     word_table_delete(index->doc_word_table, SQUAT_WORD_SIZE - 1);
1677     /* If we're bailing out because of an error, we might not have
1678        released all the temporary file resources. */
1679     for (i = 0; i < VECTOR_SIZE(index->index_buffers); i++) {
1680         if (index->index_buffers[i].fd >= 0)
1681             close(index->index_buffers[i].fd);
1682         buf_free(&index->index_buffers[i].buf);
1683     }
1684     free(index->tmp_path);
1685     free(index->doc_ID_list);
1686     doc_ID_map_free(&index->doc_ID_map);
1687     free(index);
1688 
1689     return r;
1690 }
1691 
squat_index_finish(SquatIndex * index)1692 int squat_index_finish(SquatIndex *index)
1693 {
1694     return index_close_internal(index, 1);
1695 }
1696 
squat_index_destroy(SquatIndex * index)1697 int squat_index_destroy(SquatIndex *index)
1698 {
1699     return index_close_internal(index, 0);
1700 }
1701