1 /* SQUAT code for searching indexes.
2  *  Robert O'Callahan
3  *
4  * Copyright (c) 1994-2008 Carnegie Mellon University.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * 3. The name "Carnegie Mellon University" must not be used to
19  *    endorse or promote products derived from this software without
20  *    prior written permission. For permission or any legal
21  *    details, please contact
22  *      Carnegie Mellon University
23  *      Center for Technology Transfer and Enterprise Creation
24  *      4615 Forbes Avenue
25  *      Suite 302
26  *      Pittsburgh, PA  15213
27  *      (412) 268-7393, fax: (412) 268-7395
28  *      innovation@andrew.cmu.edu
29  *
30  * 4. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by Computing Services
33  *     at Carnegie Mellon University (http://www.cmu.edu/computing/)."
34  *
35  * CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
36  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
37  * AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
38  * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
40  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
41  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42  */
43 
44 #include <config.h>
45 
46 #include <sys/stat.h>
47 #include <sys/mman.h>
48 #include <unistd.h>
49 #include <stdlib.h>
50 #include <string.h>
51 
52 #include "squat_internal.h"
53 
54 #include "assert.h"
55 #include "xmalloc.h"
56 
57 struct _SquatSearchIndex {
58   int         index_fd;               /* the index file */
59   char const* data;                   /* where it's mmaped to */
60   char const* doc_list;               /* where does the doc-list
61                                          sequence start in memory */
62   char const* word_list;              /* where does the word trie
63                                          offset table start in memory */
64   char const* doc_ID_list;            /* where does the doc-ID-list
65                                          array start in memory */
66   char const* data_end;               /* the end of the mmaped file */
67   unsigned char valid_char_bits[32];  /* which characters are valid in
68                                          queries according to whoever
69                                          created the index */
70 };
71 
72 /* For each 0 <= i < 256, bit_counts[i] is the number of bits set in i */
73 static char bit_counts[256];
74 
75 /* Returns true IFF the 'len' bytes starting at 's' are each equal to 'v' */
memconst(char const * s,int len,char v)76 static int memconst(char const* s, int len, char v) {
77   while (len > 0 && *s == v) {
78     s++;
79     len--;
80   }
81   return len == 0;
82 }
83 
squat_search_open(int fd)84 EXPORTED SquatSearchIndex* squat_search_open(int fd) {
85   struct stat buf;
86   SquatSearchIndex* index;
87   SquatDiskHeader const* header;
88   SquatInt64 doc_list_offset, doc_ID_list_offset, word_list_offset;
89   SquatInt64 data_len;
90 
91   squat_set_last_error(SQUAT_ERR_OK);
92 
93   /* initialize bit_counts constant array.
94      This is so clever, I could die */
95   if (bit_counts[1] == 0) {
96     int c;
97     for (c = 1; c < 256; c++) {
98       bit_counts[c] = bit_counts[c >> 1] + (c & 1);
99     }
100   }
101 
102   index = (SquatSearchIndex*)xmalloc(sizeof(SquatSearchIndex));
103   index->index_fd = fd;
104 
105   if (fstat(fd, &buf) != 0) {  /* fstat64? */
106     squat_set_last_error(SQUAT_ERR_SYSERR);
107     goto cleanup_index;
108   }
109   data_len = buf.st_size - SQUAT_SAFETY_ZONE;
110   if ((size_t)data_len < sizeof(SquatDiskHeader)) {
111     squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
112     goto cleanup_index;
113   }
114 
115   index->data = mmap(NULL, data_len + SQUAT_SAFETY_ZONE, PROT_READ, MAP_SHARED, fd, 0);
116   if (index->data == MAP_FAILED) {
117     squat_set_last_error(SQUAT_ERR_SYSERR);
118     goto cleanup_index;
119   }
120 
121   header = (SquatDiskHeader const*)index->data;
122   doc_list_offset = squat_decode_64(header->doc_list_offset);
123   word_list_offset = squat_decode_64(header->word_list_offset);
124   doc_ID_list_offset = squat_decode_64(header->doc_ID_list_offset);
125 
126   /* Do some sanity checking in case the header was corrupted. We wouldn't
127      want to dereference any bad pointers... */
128   if (memcmp(header->header_text, squat_index_file_header, 8) != 0
129       || doc_list_offset < 0 || doc_list_offset >= data_len
130       || word_list_offset < 0 || word_list_offset >= data_len
131       || doc_ID_list_offset < 0 || doc_ID_list_offset >= data_len
132       || !memconst(index->data + data_len, SQUAT_SAFETY_ZONE, 0)) {
133     squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
134     goto cleanup_unmap;
135   }
136 
137   index->doc_list = index->data + doc_list_offset;
138   index->word_list = index->data + word_list_offset;
139   index->doc_ID_list = index->data + doc_ID_list_offset;
140   index->data_end = index->data + data_len;
141   memcpy(index->valid_char_bits, header->valid_char_bits,
142          sizeof(index->valid_char_bits));
143 
144   return index;
145 
146 cleanup_unmap:
147   munmap((void*)index->data, data_len + SQUAT_SAFETY_ZONE);
148 
149 cleanup_index:
150   free(index);
151   return NULL;
152 }
153 
squat_search_list_docs(SquatSearchIndex * index,SquatListDocCallback handler,void * closure)154 EXPORTED int squat_search_list_docs(SquatSearchIndex* index,
155   SquatListDocCallback handler, void* closure) {
156   char const* s = index->doc_list;
157 
158   squat_set_last_error(SQUAT_ERR_OK);
159 
160   while (*s != 0) {
161     SquatListDoc list_doc;
162     int r;
163 
164     list_doc.doc_name = s;
165     s += strlen(s) + 1;
166     list_doc.size = squat_decode_I(&s);
167     r = handler(closure, &list_doc);
168 
169     if (r == SQUAT_CALLBACK_ABORT) {
170       break;
171     }
172     assert(r == SQUAT_CALLBACK_CONTINUE);
173   }
174 
175   return SQUAT_OK;
176 }
177 
178 /* Get a pointer to the index file's list of documents containing the
179    word 'data' */
lookup_word_docs(SquatSearchIndex * index,char const * data,int * invalid_file)180 static char const* lookup_word_docs(SquatSearchIndex* index,
181   char const* data, int* invalid_file) {
182   int i;
183   char const* s = index->word_list;
184 
185   for (i = 0; i < SQUAT_WORD_SIZE; i++) {
186     char p;
187     char ch = data[i];
188     char const* branch_start = s;
189     int skip;
190 
191     /* decode 'present' bits to see if ch is present at this level of
192        the tries */
193     p = *s++;
194     if ((p & 0xE0) != 0) { /* singleton */
195       if (ch != p) {
196         return NULL;
197       }
198       skip = 0;
199       /* we're done. s is now pointing at the data for the singleton */
200     } else { /* list of bits */
201       char count;
202       char const* base;
203       int offset, j;
204 
205       if ((unsigned char)ch < 8*p) { /* before start of list */
206         return NULL;
207       }
208 
209       count = (*s++) + 1;
210 
211       if ((unsigned char)ch >= 8*(p + count)) { /* beyond end of list */
212         return NULL;
213       }
214 
215       offset = (unsigned char)ch/8 - p;
216       if ((s[offset] & (1 << (ch & 7))) == 0) { /* not in list */
217         return NULL;
218       }
219 
220       base = s;
221       s += count;
222 
223       /* figure out how many entries there are before our entry */
224       skip = 0;
225       for (j = 0; j < offset; j++) {
226         skip += bit_counts[(unsigned char)base[j]];
227       }
228       for (j = 0; j < (ch & 7); j++) {
229         if ((base[offset] & (1 << j)) != 0) {
230           skip++;
231         }
232       }
233     }
234 
235     if (i < SQUAT_WORD_SIZE - 1) {
236       int next_offset;
237 
238       s = squat_decode_skip_I(s, skip);
239 
240       /* find offset to next branch data */
241       next_offset = squat_decode_I(&s);
242       s = branch_start - next_offset;
243       if (next_offset < 0 || s >= index->data_end) {
244         *invalid_file = 1;
245         return NULL; /* corrupt index */
246       }
247     } else {
248       /* leaf case. We need to scan through the document lists for each
249          leaf to skip. */
250       while (skip-- > 0) {
251         char const* t = s;
252         int v = (int)squat_decode_I(&t);
253 
254         if ((v & 1) != 0) {
255           s = t;  /* singleton; no more data to eat for this word */
256         } else {
257           s = t + (v >> 1); /* run-list; size is in v>>1 */
258         }
259       }
260     }
261     /* s now points at the trie branch for the data */
262   }
263 
264   return s;
265 }
266 
267 /* Get the pointer to the list of documents containing 'data' into
268    '*run_start', and return the number of documents in the list. */
count_docs_containing_word(SquatSearchIndex * index,char const * data,char const ** run_start)269 static int count_docs_containing_word(SquatSearchIndex* index,
270   char const* data, char const** run_start) {
271   int invalid_file = 0;
272   char const* raw_doc_list = lookup_word_docs(index, data, &invalid_file);
273   int i;
274 
275   if (raw_doc_list == NULL) {
276     return invalid_file ? -1 : 0;
277   }
278 
279   *run_start = raw_doc_list;
280 
281   i = (int)squat_decode_I(&raw_doc_list);
282   if ((i & 1) != 0) {
283     return 1; /* singleton */
284   } else {
285     int size = i >> 1;
286     char const* s = raw_doc_list;
287     int count = 0;
288 
289     if (raw_doc_list + size >= index->data_end) {
290       return -1;
291     }
292 
293     while (s - raw_doc_list < size) {
294       i = (int)squat_decode_I(&s);
295       if ((i & 1) == 1) {
296         count++;
297       } else {
298         count += i >> 1;
299         s = squat_decode_skip_I(s, 1);
300       }
301     }
302 
303     if (raw_doc_list + size != s) {
304       return -1;
305     }
306 
307     return count;
308   }
309 }
310 
311 /* We store a set of documents in this little structure. The set
312    also maintains a 'current' document pointer. */
313 typedef struct {
314   int array_len;   /* The length of the array below */
315   int* array_data; /* An array of document IDs, sorted by increasing
316                       document ID. It can also contain elements equal
317                       to -1, which means "no document".
318                    */
319   int index;       /* The index of the 'current' document within the array. */
320 } SquatDocSet;
321 
322 /* Extract the list of documents containing the word 'data' into a
323    SquatDocSet. The list is extracted from the index file data
324    'doc_list' which refers to 'doc_count' documents.
325 */
326 static int
set_to_docs_containing_word(SquatSearchIndex * index,SquatDocSet * set,char const * data,int doc_count,char const * doc_list)327 set_to_docs_containing_word(SquatSearchIndex* index __attribute__((unused)),
328                             SquatDocSet* set,
329                             char const* data __attribute__((unused)),
330                             int doc_count, char const* doc_list)
331 {
332   int i;
333 
334   set->array_len = doc_count;
335   set->array_data = (int*)xmalloc(sizeof(int)*set->array_len);
336 
337   i = (int)squat_decode_I(&doc_list);
338   if ((i & 1) != 0) {
339     set->array_data[0] = i >> 1;
340   } else {
341     int size = i >> 1;
342     char const* s = doc_list;
343     int last_doc = 0;
344     int j = 0;
345 
346     while (s - doc_list < size) {
347       i = (int)squat_decode_I(&s);
348       if ((i & 1) == 1) {
349         last_doc = set->array_data[j++] = last_doc + (i >> 1);
350       } else {
351         int count = i >> 1;
352         int delta = squat_decode_I(&s);
353 
354         last_doc += delta;
355         set->array_data[j++] = last_doc;
356         while (--count > 0) {
357           last_doc++;
358           set->array_data[j++] = last_doc;
359         }
360       }
361     }
362   }
363 
364   return SQUAT_OK;
365 }
366 
367 /* Advance the "current document" in the set to the first document
368    with ID > 'doc'. Remove any documents found along the way that were
369    not 'doc'.
370 */
filter_doc(SquatDocSet * set,int doc)371 static void filter_doc(SquatDocSet* set, int doc) {
372   int i = set->index;
373 
374   while (i < set->array_len && set->array_data[i] < doc) {
375     /* this document is not in the currently filtered set */
376     set->array_data[i] = -1;
377     i++;
378   }
379 
380   /* skip over the matched document, if we matched */
381   if (i < set->array_len && set->array_data[i] == doc) {
382     i++;
383   }
384 
385   set->index = i;
386 }
387 
388 /* Remove from a SquatDocSet any documents not in the list of
389    documents containing the word 'data'. The list is extracted from
390    the index file data 'doc_list'.
391 */
392 static void
filter_to_docs_containing_word(SquatSearchIndex * index,SquatDocSet * set,char const * data,char const * doc_list)393 filter_to_docs_containing_word(SquatSearchIndex* index __attribute__((unused)),
394                                SquatDocSet* set,
395                                char const* data __attribute__((unused)),
396                                char const* doc_list)
397 {
398   int i = (int)squat_decode_I(&doc_list);
399 
400   set->index = 0;
401 
402   if ((i & 1) != 0) {
403     filter_doc(set, i >> 1);
404   } else {
405     int size = i >> 1;
406     char const* s = doc_list;
407     int last_doc = 0;
408 
409     while (s - doc_list < size) {
410       i = (int)squat_decode_I(&s);
411       if ((i & 1) == 1) {
412         filter_doc(set, last_doc += i >> 1);
413       } else {
414         int count = i >> 1;
415         int delta = squat_decode_I(&s);
416 
417         last_doc += delta;
418         filter_doc(set, last_doc);
419         while (--count > 0) {
420           last_doc++;
421           filter_doc(set, last_doc);
422         }
423       }
424     }
425   }
426 }
427 
428 /* Advance the "current document" pointer to the first document in the set. */
select_first_doc(SquatDocSet * set)429 static void select_first_doc(SquatDocSet* set) {
430   set->index = 0;
431   while (set->index < set->array_len && set->array_data[set->index] < 0) {
432     set->index++;
433   }
434 }
435 
436 /* Is the "current document" pointer pointing to any real document? */
has_more_docs(SquatDocSet * set)437 static int has_more_docs(SquatDocSet* set) {
438   return set->index < set->array_len;
439 }
440 
441 /* Advance the "current document" pointer to the next document in the set,
442    and return its old value */
get_next_doc(SquatDocSet * set)443 static int get_next_doc(SquatDocSet* set) {
444   int doc = set->array_data[set->index];
445 
446   set->index++;
447   while (set->index < set->array_len && set->array_data[set->index] < 0) {
448     set->index++;
449   }
450 
451   return doc;
452 }
453 
destroy_docset(SquatDocSet * set)454 static void destroy_docset(SquatDocSet* set) {
455   free(set->array_data);
456 }
457 
458 /* The basic strategy here is pretty simple. We just want to find the
459    documents that contain every subword of the search string. The
460    index tells us which documents contain each subword so it's just a
461    matter of doing O(N) lookups into the index. We construct an
462    explicit document list for one of the subwords and then iterate
463    through that list for each other subword, throwing out any
464    documents that don't contain that subword.
465 
466    The only trick is that some subwords may occur in lots of documents
467    while others only occur in a few (or no) documents. In that case we
468    would rather construct the list with the smallest possible number
469    of documents, to save memory and the cost of traversing that list
470    several times.
471 */
squat_search_execute(SquatSearchIndex * index,char const * data,int data_len,SquatSearchResultCallback handler,void * closure)472 HIDDEN int squat_search_execute(SquatSearchIndex* index, char const* data,
473   int data_len, SquatSearchResultCallback handler, void* closure) {
474   int i;
475   int min_doc_count_word; /* The subword of 'data' that appears in
476                              fewest documents */
477   int min_doc_count;      /* The number of documents that include that
478                              subword */
479   SquatDocSet set;
480   char const** run_starts;
481 
482   /* First, do sanity checking on the string. We wouldn't want invalid
483      client searches to mysteriously return 'no documents'. */
484   if (data_len < SQUAT_WORD_SIZE) {
485     squat_set_last_error(SQUAT_ERR_SEARCH_STRING_TOO_SHORT);
486     return SQUAT_ERR;
487   }
488 
489   for (i = 0; i < data_len; i++) {
490     int ch = (unsigned char)data[i];
491 
492     if ((index->valid_char_bits[ch >> 3] & (1 << (ch & 7))) == 0) {
493       squat_set_last_error(SQUAT_ERR_SEARCH_STRING_INVALID_CHAR);
494       return SQUAT_ERR;
495     }
496   }
497 
498   /* We search for every subword of the search string. We save a
499      pointer to the document list for each subword in this array
500      ... so we don't have to traverse the trie data structures more
501      than once per subword.
502   */
503   run_starts = (char const**)xmalloc(sizeof(char const*)*
504                                     (data_len - SQUAT_WORD_SIZE + 1));
505   squat_set_last_error(SQUAT_ERR_OK);
506 
507   /* Now, for each subword, find its list of documents and how many
508      documents are in the list. Remember the word which had minimum
509      number of documents.
510   */
511   min_doc_count = count_docs_containing_word(index, data, run_starts);
512   if (min_doc_count < 0) {
513     squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
514     goto cleanup_run_starts;
515   } else if (min_doc_count == 0) {
516       /* The first word of the substring isn't in any documents, so we
517          can just stop now. */
518     goto cleanup_run_starts_ok;
519   }
520   min_doc_count_word = 0;
521   for (i = 1; i <= data_len - SQUAT_WORD_SIZE; i++) {
522     int doc_count = count_docs_containing_word(index, data + i,
523                                                run_starts + i);
524     if (doc_count < 0) {
525       squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
526       goto cleanup_run_starts;
527     } else if (doc_count == 0) {
528       /* This word isn't in any documents, we can stop now. */
529       goto cleanup_run_starts_ok;
530     } else if (doc_count < min_doc_count) {
531       min_doc_count = doc_count;
532       min_doc_count_word = i;
533     }
534   }
535 
536   /* Now, extract the shortest document list into an array. By
537      starting with the shortest document list we avoid pathological
538      situations where one or more of the subwords occurs in zillions
539      of documents, and we'd allocate a huge array and have to iterate
540      through it all lots of times.
541   */
542   if (set_to_docs_containing_word(index, &set, data + min_doc_count_word,
543         min_doc_count, run_starts[min_doc_count_word]) == SQUAT_ERR) {
544     goto cleanup_run_starts;
545   }
546   /* Scan through the other document lists and throw out any documents
547      that aren't in all those lists. */
548   for (i = 0; i <= data_len - SQUAT_WORD_SIZE; i++) {
549     if (i != min_doc_count_word) {
550       filter_to_docs_containing_word(index, &set, data + i, run_starts[i]);
551     }
552   }
553 
554   /* Now we have the results. Scan through the set and report each
555      element to the callback function. */
556   select_first_doc(&set);
557   while (has_more_docs(&set)) {
558     int next_doc;
559     char const* next_doc_info;
560     char const* next_doc_data;
561     int r;
562 
563     /* Lookup the document info so we can get the document name to report. */
564     next_doc = get_next_doc(&set);
565     next_doc_info = index->doc_ID_list + next_doc*4;
566     if (next_doc < 0 && next_doc_info >= index->data_end) {
567       squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
568       goto cleanup_docset;
569     }
570 
571     next_doc_data = index->doc_list + squat_decode_32(next_doc_info);
572     if (next_doc_data < index->doc_list || next_doc_data >= index->data_end) {
573       squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
574       goto cleanup_docset;
575     }
576 
577     r = handler(closure, next_doc_data);
578     if (r == SQUAT_CALLBACK_ABORT) {
579       break;
580     }
581     assert(r == SQUAT_CALLBACK_CONTINUE);
582   }
583 
584   destroy_docset(&set);
585 
586 cleanup_run_starts_ok:
587   free(run_starts);
588   return SQUAT_OK;
589 
590 cleanup_docset:
591   destroy_docset(&set);
592 
593 cleanup_run_starts:
594   free(run_starts);
595   return SQUAT_ERR;
596 }
597 
squat_search_close(SquatSearchIndex * index)598 EXPORTED int squat_search_close(SquatSearchIndex* index) {
599   int r = SQUAT_OK;
600 
601   squat_set_last_error(SQUAT_ERR_OK);
602 
603   if (munmap((void*)index->data,
604              index->data_end + SQUAT_SAFETY_ZONE - index->data) != 0) {
605     squat_set_last_error(SQUAT_ERR_SYSERR);
606     r = SQUAT_ERR;
607   }
608 
609   free(index);
610   return r;
611 }
612 
613 /* ====================================================================== */
614 
615 /* squat_scan(SquatSearchIndex* index, char first_char
616  *            SquatScanCallback handler, void* closure)
617  *
618  * sweeps the entire subtrie for a given starting character. It generates
619  * callbacks to the hander routine (which has it own opaque data structure
620  * in closure) for each document ID that it finds in the leaves. The
621  * callback function looks something like:
622  *
623  *   int add_word_callback(void* closure, char *name, int doc_ID);
624  *
625  * "name" is the full four character squat "word", including first_char and
626  * a trailing '\0', just in case you want to print it. doc_ID is the
627  * normal squat doc_ID offset.
628  *
629  * squat_scan_recurse() and squat_scan_leaf() are helper functions to
630  * scan the entire trie. squat_find_branch() is used to find the initial
631  * subtrie for a given first_char.
632  *
633  */
634 
squat_scan_leaf(char const * doc_list,char * name,SquatScanCallback handler,void * closure)635 static int squat_scan_leaf(char const* doc_list, char *name,
636                            SquatScanCallback handler, void* closure)
637 {
638   int i;
639 
640   i = (int)squat_decode_I(&doc_list);
641   if ((i & 1) != 0) {
642     handler(closure, name, i >> 1);
643   } else {
644     int size = i >> 1;
645     char const* s = doc_list;
646     int last_doc = 0;
647 
648     while (s - doc_list < size) {
649       i = (int)squat_decode_I(&s);
650       if ((i & 1) == 1) {
651         last_doc = last_doc + (i >> 1);
652         handler(closure, name, last_doc);
653       } else {
654         int count = i >> 1;
655         int delta = squat_decode_I(&s);
656 
657         last_doc += delta;
658         handler(closure, name, last_doc);
659         while (--count > 0) {
660           last_doc++;
661           handler(closure, name, last_doc);
662         }
663       }
664     }
665   }
666   return(SQUAT_OK);
667 }
668 
squat_scan_recurse(char const * s,char const * data_end,char * name,int level,SquatScanCallback handler,void * closure)669 static int squat_scan_recurse(char const* s, char const* data_end,
670                               char *name, int level,
671                               SquatScanCallback handler, void* closure)
672 {
673   int r = SQUAT_OK;
674   char p;
675   char const* branch_start = s;
676   char const* start_of_list;
677   int skip;
678   char count;
679   char const* base;
680   int offset, j;
681   int next_offset;
682   int ch;
683 
684   /* Ignore empty subtrees generated by old versions of squatter */
685   if ((s[0] == 0) && (s[1] == 0))
686     return(SQUAT_OK);
687 
688   p = *s++;
689   if ((p & 0xE0) != 0) {
690     /* Byte >= 0x20 (ASCII SPACE) indicates singleton list */
691 
692     name[level] = p;
693     if (level < (SQUAT_WORD_SIZE-1)) {
694       next_offset = squat_decode_I(&s);
695       s = branch_start - next_offset;
696       if (next_offset < 0 || s >= data_end) {
697         return(SQUAT_ERR);
698       }
699       r = squat_scan_recurse(s, data_end, name, level+1, handler, closure);
700     } else {
701       r = squat_scan_leaf(s, name, handler, closure);
702     }
703     return(r);
704   }
705 
706   count = (*s++) + 1;   /* +1 to allow range 1..32 using (*s) < 32 */
707   base = s;
708   s += count;
709   start_of_list = s;       /* So that we can repeat */
710 
711   /* Up to 256 subtree branches, determined by variable length bitmask
712    * which is up to 32 bytes (plus two lead in bytes).
713    *      p*8  : First p*8 branches all empty, can be ignored
714    *  count*8  : Size of base array, in bits
715    *
716    *
717    * Should be able to optimise following into linear scan,
718    * without returning to start_of_list each time around the loop
719    */
720 
721   for (ch = 8*p; ch < 8*(p+count) && r == SQUAT_OK; ch++) {
722     offset = (unsigned char)ch/8 - p;
723     if ((base[offset] & (1 << (ch & 7))) == 0) /* not in list */
724       continue;
725 
726     /* figure out how many entries there are before our entry */
727     skip = 0;
728     for (j = 0; j < offset; j++) {
729       skip += bit_counts[(unsigned char)base[j]];
730     }
731     for (j = 0; j < (ch & 7); j++) {
732       if ((base[offset] & (1 << j)) != 0) {
733         skip++;
734       }
735     }
736     s = start_of_list;
737 
738     name[level] = ch;
739     if (level < (SQUAT_WORD_SIZE-1)) {
740       s = squat_decode_skip_I(s, skip);
741       next_offset = squat_decode_I(&s);
742 
743       s = branch_start - next_offset;
744       if (next_offset < 0 || s >= data_end) {
745         return(SQUAT_ERR);
746       }
747       r = squat_scan_recurse(s, data_end, name, level+1, handler, closure);
748     } else {
749       /* leaf case. We need to scan through the document lists for each
750          leaf to skip. */
751       while (skip-- > 0) {
752         char const* t = s;
753         int v = (int)squat_decode_I(&t);
754 
755         if ((v & 1) != 0) {
756           s = t;  /* singleton; no more data to eat for this word */
757         } else {
758           s = t + (v >> 1); /* run-list; size is in v>>1 */
759         }
760       }
761       r = squat_scan_leaf(s, name, handler, closure);
762     }
763   }
764   return(r);
765 }
766 
767 /* char const** prev is used by squat_count_docs to find start of
768  * mmap()ed byte range that we need to traverse, so that we can
769  * read in the data before we start trie walk. No other purpose.
770  */
771 
squat_find_branch(char const ** result,char const ** prev,char const * s,char const * data_end,char ch)772 static int squat_find_branch(char const** result, char const** prev,
773                              char const* s,
774                              char const* data_end, char ch)
775 {
776   char p;
777   char const* branch_start = s, *t;
778   int skip;
779   char count;
780   char const* base;
781   int offset, j;
782   int next_offset, prev_offset;
783 
784   *result = NULL;
785   if (prev) *prev = NULL;
786 
787   /* Ignore empty subtrees generated by old versions of squatter */
788   if ((s[0] == 0) && (s[1] == 0)) {
789     *result = NULL;
790     return(SQUAT_OK);
791   }
792 
793   p = *s++;
794 
795   if ((p & 0xE0) != 0) { /* singleton */
796     if (p != ch) {
797       *result = NULL;
798       return(SQUAT_OK);
799     }
800 
801     next_offset = squat_decode_I(&s);
802     s = branch_start - next_offset;
803     if (next_offset < 0 || s >= data_end) {
804       return(SQUAT_ERR);
805     }
806     *result = s;
807     return(SQUAT_OK);
808   }
809 
810   count = (*s++) + 1;
811   offset = (unsigned char)ch/8 - p;
812   base = s;
813   s += count;
814 
815   /* Does this character fall in range */
816   if (((unsigned char)ch < 8*p) ||
817       ((unsigned char)ch >= 8*(p + count)) ||
818       ((base[offset] & (1 << (ch & 7))) == 0)) {
819     *result = NULL;
820     return(SQUAT_OK);
821   }
822 
823   /* figure out how many entries there are before our entry */
824   skip = 0;
825   for (j = 0; j < offset; j++) {
826     skip += bit_counts[(unsigned char)base[j]];
827   }
828   for (j = 0; j < (ch & 7); j++) {
829     if ((base[offset] & (1 << j)) != 0) {
830       skip++;
831     }
832   }
833 
834   if (skip > 0) {
835     s = squat_decode_skip_I(s, skip-1);
836     prev_offset = squat_decode_I(&s);
837 
838     t = branch_start - prev_offset;
839     if (prev && (prev_offset >= 0 && t < data_end)) {
840       *prev = t;
841     }
842   }
843 
844   /* find offset to next branch data */
845   next_offset = squat_decode_I(&s);
846 
847   s = branch_start - next_offset;
848   if (next_offset < 0 || s >= data_end) {
849     return(SQUAT_ERR);
850   }
851   *result = s;
852   return(SQUAT_OK);
853 }
854 
squat_scan(SquatSearchIndex * index,char first_char,SquatScanCallback handler,void * closure)855 EXPORTED int squat_scan(SquatSearchIndex* index, char first_char,
856                SquatScanCallback handler, void* closure)
857 {
858   char buf[SQUAT_WORD_SIZE+1];
859   const char *s;
860   int r = squat_find_branch(&s, NULL,
861                             index->word_list, index->data_end, first_char);
862 
863   if (r != SQUAT_OK)
864     return(r);
865   if (!s)
866     return SQUAT_OK;
867 
868   memset(buf, 0, sizeof(buf));
869   buf[0] = first_char;
870 
871   return(squat_scan_recurse(s, index->data_end, buf, 1, handler, closure));
872 }
873 
874 /* ====================================================================== */
875 
876 /* squat_count_docs(SquatSearchIndex* index, char first_char, int *counter)
877  *
878  * count the total number of document ID which appear in the entire
879  * subtree for a given starting character. Typically followed immediately
880  * by a call to squat_scan after caller has allocated a target array
881  * which is large enough for counter callback elements.
882  *
883  * squat_count_docs() will page in lots of data from disk via the mmapped
884  * index file. Unfortunately squat indexes are built back to front, so
885  * the scan would read blocks out of order as the data structure is
886  * traversed. squat_preload_data() is a rather evil hack to try and
887  * force sequential disk I/O: it serves no other function.
888  */
889 
890 static int
squat_count_docs_callback(void * closure,char * name,int doc_ID)891 squat_count_docs_callback(void* closure,
892                           char *name __attribute__((unused)),
893                           int doc_ID __attribute__((unused)))
894 {
895   int *counter = (int *)closure;
896 
897   (*counter)++;
898 
899   return SQUAT_CALLBACK_CONTINUE;
900 }
901 
902 /* Attempt to load blocks of data from mmap()ed array before we start
903    random access */
904 
905 #define PRELOAD_BLOCK_SIZE (4096)          /*  4 KBytes */
906 #define PRELOAD_MAX_SIZE   (20*1024*1024)  /* 20 MBytes */
907 
908 static void
squat_preload_data(char const * t,char const * s)909 squat_preload_data(char const* t, char const* s)
910 {
911   char buf[PRELOAD_BLOCK_SIZE];
912   char const* start = (t > s) ? s : t;
913   unsigned long len = (t > s) ? (t-s) : (s-t);
914 
915   if (len >= PRELOAD_MAX_SIZE)
916     return;
917 
918   while (len > 0) {
919     unsigned long size = (len > PRELOAD_BLOCK_SIZE) ? PRELOAD_BLOCK_SIZE : len;
920 
921     memcpy(buf, start, size);
922     start += size;
923     len   -= size;
924   }
925 }
926 
squat_count_docs(SquatSearchIndex * index,char first_char,int * counter)927 EXPORTED int squat_count_docs(SquatSearchIndex* index, char first_char, int *counter)
928 {
929   char buf[SQUAT_WORD_SIZE+1];
930   const char *s, *t;
931   int r = squat_find_branch(&s, &t, index->word_list,
932                             index->data_end, first_char);
933   *counter = 0;
934 
935   if (r != SQUAT_OK)
936     return(r);
937   if (!s)
938     return SQUAT_OK;
939 
940   /* First trie is at index->doc_ID_list + (no_docIDs *4). However
941    * no_docIDs doesn't appear to be stored anywhere (zero terminated list).
942    * Following is good enough for our purposes */
943   squat_preload_data((t) ? t : index->doc_ID_list, s);
944 
945   memset(buf, 0, sizeof(buf));
946   buf[0] = first_char;
947 
948   return(squat_scan_recurse(s, index->data_end, buf, 1,
949                             squat_count_docs_callback, counter));
950 }
951