1 /* SQUAT code for searching indexes.
2 * Robert O'Callahan
3 *
4 * Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * 3. The name "Carnegie Mellon University" must not be used to
19 * endorse or promote products derived from this software without
20 * prior written permission. For permission or any legal
21 * details, please contact
22 * Carnegie Mellon University
23 * Center for Technology Transfer and Enterprise Creation
24 * 4615 Forbes Avenue
25 * Suite 302
26 * Pittsburgh, PA 15213
27 * (412) 268-7393, fax: (412) 268-7395
28 * innovation@andrew.cmu.edu
29 *
30 * 4. Redistributions of any form whatsoever must retain the following
31 * acknowledgment:
32 * "This product includes software developed by Computing Services
33 * at Carnegie Mellon University (http://www.cmu.edu/computing/)."
34 *
35 * CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
36 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
37 * AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
38 * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
40 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
41 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42 */
43
44 #include <config.h>
45
46 #include <sys/stat.h>
47 #include <sys/mman.h>
48 #include <unistd.h>
49 #include <stdlib.h>
50 #include <string.h>
51
52 #include "squat_internal.h"
53
54 #include "assert.h"
55 #include "xmalloc.h"
56
57 struct _SquatSearchIndex {
58 int index_fd; /* the index file */
59 char const* data; /* where it's mmaped to */
60 char const* doc_list; /* where does the doc-list
61 sequence start in memory */
62 char const* word_list; /* where does the word trie
63 offset table start in memory */
64 char const* doc_ID_list; /* where does the doc-ID-list
65 array start in memory */
66 char const* data_end; /* the end of the mmaped file */
67 unsigned char valid_char_bits[32]; /* which characters are valid in
68 queries according to whoever
69 created the index */
70 };
71
72 /* For each 0 <= i < 256, bit_counts[i] is the number of bits set in i */
73 static char bit_counts[256];
74
75 /* Returns true IFF the 'len' bytes starting at 's' are each equal to 'v' */
memconst(char const * s,int len,char v)76 static int memconst(char const* s, int len, char v) {
77 while (len > 0 && *s == v) {
78 s++;
79 len--;
80 }
81 return len == 0;
82 }
83
squat_search_open(int fd)84 EXPORTED SquatSearchIndex* squat_search_open(int fd) {
85 struct stat buf;
86 SquatSearchIndex* index;
87 SquatDiskHeader const* header;
88 SquatInt64 doc_list_offset, doc_ID_list_offset, word_list_offset;
89 SquatInt64 data_len;
90
91 squat_set_last_error(SQUAT_ERR_OK);
92
93 /* initialize bit_counts constant array.
94 This is so clever, I could die */
95 if (bit_counts[1] == 0) {
96 int c;
97 for (c = 1; c < 256; c++) {
98 bit_counts[c] = bit_counts[c >> 1] + (c & 1);
99 }
100 }
101
102 index = (SquatSearchIndex*)xmalloc(sizeof(SquatSearchIndex));
103 index->index_fd = fd;
104
105 if (fstat(fd, &buf) != 0) { /* fstat64? */
106 squat_set_last_error(SQUAT_ERR_SYSERR);
107 goto cleanup_index;
108 }
109 data_len = buf.st_size - SQUAT_SAFETY_ZONE;
110 if ((size_t)data_len < sizeof(SquatDiskHeader)) {
111 squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
112 goto cleanup_index;
113 }
114
115 index->data = mmap(NULL, data_len + SQUAT_SAFETY_ZONE, PROT_READ, MAP_SHARED, fd, 0);
116 if (index->data == MAP_FAILED) {
117 squat_set_last_error(SQUAT_ERR_SYSERR);
118 goto cleanup_index;
119 }
120
121 header = (SquatDiskHeader const*)index->data;
122 doc_list_offset = squat_decode_64(header->doc_list_offset);
123 word_list_offset = squat_decode_64(header->word_list_offset);
124 doc_ID_list_offset = squat_decode_64(header->doc_ID_list_offset);
125
126 /* Do some sanity checking in case the header was corrupted. We wouldn't
127 want to dereference any bad pointers... */
128 if (memcmp(header->header_text, squat_index_file_header, 8) != 0
129 || doc_list_offset < 0 || doc_list_offset >= data_len
130 || word_list_offset < 0 || word_list_offset >= data_len
131 || doc_ID_list_offset < 0 || doc_ID_list_offset >= data_len
132 || !memconst(index->data + data_len, SQUAT_SAFETY_ZONE, 0)) {
133 squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
134 goto cleanup_unmap;
135 }
136
137 index->doc_list = index->data + doc_list_offset;
138 index->word_list = index->data + word_list_offset;
139 index->doc_ID_list = index->data + doc_ID_list_offset;
140 index->data_end = index->data + data_len;
141 memcpy(index->valid_char_bits, header->valid_char_bits,
142 sizeof(index->valid_char_bits));
143
144 return index;
145
146 cleanup_unmap:
147 munmap((void*)index->data, data_len + SQUAT_SAFETY_ZONE);
148
149 cleanup_index:
150 free(index);
151 return NULL;
152 }
153
squat_search_list_docs(SquatSearchIndex * index,SquatListDocCallback handler,void * closure)154 EXPORTED int squat_search_list_docs(SquatSearchIndex* index,
155 SquatListDocCallback handler, void* closure) {
156 char const* s = index->doc_list;
157
158 squat_set_last_error(SQUAT_ERR_OK);
159
160 while (*s != 0) {
161 SquatListDoc list_doc;
162 int r;
163
164 list_doc.doc_name = s;
165 s += strlen(s) + 1;
166 list_doc.size = squat_decode_I(&s);
167 r = handler(closure, &list_doc);
168
169 if (r == SQUAT_CALLBACK_ABORT) {
170 break;
171 }
172 assert(r == SQUAT_CALLBACK_CONTINUE);
173 }
174
175 return SQUAT_OK;
176 }
177
178 /* Get a pointer to the index file's list of documents containing the
179 word 'data' */
lookup_word_docs(SquatSearchIndex * index,char const * data,int * invalid_file)180 static char const* lookup_word_docs(SquatSearchIndex* index,
181 char const* data, int* invalid_file) {
182 int i;
183 char const* s = index->word_list;
184
185 for (i = 0; i < SQUAT_WORD_SIZE; i++) {
186 char p;
187 char ch = data[i];
188 char const* branch_start = s;
189 int skip;
190
191 /* decode 'present' bits to see if ch is present at this level of
192 the tries */
193 p = *s++;
194 if ((p & 0xE0) != 0) { /* singleton */
195 if (ch != p) {
196 return NULL;
197 }
198 skip = 0;
199 /* we're done. s is now pointing at the data for the singleton */
200 } else { /* list of bits */
201 char count;
202 char const* base;
203 int offset, j;
204
205 if ((unsigned char)ch < 8*p) { /* before start of list */
206 return NULL;
207 }
208
209 count = (*s++) + 1;
210
211 if ((unsigned char)ch >= 8*(p + count)) { /* beyond end of list */
212 return NULL;
213 }
214
215 offset = (unsigned char)ch/8 - p;
216 if ((s[offset] & (1 << (ch & 7))) == 0) { /* not in list */
217 return NULL;
218 }
219
220 base = s;
221 s += count;
222
223 /* figure out how many entries there are before our entry */
224 skip = 0;
225 for (j = 0; j < offset; j++) {
226 skip += bit_counts[(unsigned char)base[j]];
227 }
228 for (j = 0; j < (ch & 7); j++) {
229 if ((base[offset] & (1 << j)) != 0) {
230 skip++;
231 }
232 }
233 }
234
235 if (i < SQUAT_WORD_SIZE - 1) {
236 int next_offset;
237
238 s = squat_decode_skip_I(s, skip);
239
240 /* find offset to next branch data */
241 next_offset = squat_decode_I(&s);
242 s = branch_start - next_offset;
243 if (next_offset < 0 || s >= index->data_end) {
244 *invalid_file = 1;
245 return NULL; /* corrupt index */
246 }
247 } else {
248 /* leaf case. We need to scan through the document lists for each
249 leaf to skip. */
250 while (skip-- > 0) {
251 char const* t = s;
252 int v = (int)squat_decode_I(&t);
253
254 if ((v & 1) != 0) {
255 s = t; /* singleton; no more data to eat for this word */
256 } else {
257 s = t + (v >> 1); /* run-list; size is in v>>1 */
258 }
259 }
260 }
261 /* s now points at the trie branch for the data */
262 }
263
264 return s;
265 }
266
267 /* Get the pointer to the list of documents containing 'data' into
268 '*run_start', and return the number of documents in the list. */
count_docs_containing_word(SquatSearchIndex * index,char const * data,char const ** run_start)269 static int count_docs_containing_word(SquatSearchIndex* index,
270 char const* data, char const** run_start) {
271 int invalid_file = 0;
272 char const* raw_doc_list = lookup_word_docs(index, data, &invalid_file);
273 int i;
274
275 if (raw_doc_list == NULL) {
276 return invalid_file ? -1 : 0;
277 }
278
279 *run_start = raw_doc_list;
280
281 i = (int)squat_decode_I(&raw_doc_list);
282 if ((i & 1) != 0) {
283 return 1; /* singleton */
284 } else {
285 int size = i >> 1;
286 char const* s = raw_doc_list;
287 int count = 0;
288
289 if (raw_doc_list + size >= index->data_end) {
290 return -1;
291 }
292
293 while (s - raw_doc_list < size) {
294 i = (int)squat_decode_I(&s);
295 if ((i & 1) == 1) {
296 count++;
297 } else {
298 count += i >> 1;
299 s = squat_decode_skip_I(s, 1);
300 }
301 }
302
303 if (raw_doc_list + size != s) {
304 return -1;
305 }
306
307 return count;
308 }
309 }
310
311 /* We store a set of documents in this little structure. The set
312 also maintains a 'current' document pointer. */
313 typedef struct {
314 int array_len; /* The length of the array below */
315 int* array_data; /* An array of document IDs, sorted by increasing
316 document ID. It can also contain elements equal
317 to -1, which means "no document".
318 */
319 int index; /* The index of the 'current' document within the array. */
320 } SquatDocSet;
321
322 /* Extract the list of documents containing the word 'data' into a
323 SquatDocSet. The list is extracted from the index file data
324 'doc_list' which refers to 'doc_count' documents.
325 */
326 static int
set_to_docs_containing_word(SquatSearchIndex * index,SquatDocSet * set,char const * data,int doc_count,char const * doc_list)327 set_to_docs_containing_word(SquatSearchIndex* index __attribute__((unused)),
328 SquatDocSet* set,
329 char const* data __attribute__((unused)),
330 int doc_count, char const* doc_list)
331 {
332 int i;
333
334 set->array_len = doc_count;
335 set->array_data = (int*)xmalloc(sizeof(int)*set->array_len);
336
337 i = (int)squat_decode_I(&doc_list);
338 if ((i & 1) != 0) {
339 set->array_data[0] = i >> 1;
340 } else {
341 int size = i >> 1;
342 char const* s = doc_list;
343 int last_doc = 0;
344 int j = 0;
345
346 while (s - doc_list < size) {
347 i = (int)squat_decode_I(&s);
348 if ((i & 1) == 1) {
349 last_doc = set->array_data[j++] = last_doc + (i >> 1);
350 } else {
351 int count = i >> 1;
352 int delta = squat_decode_I(&s);
353
354 last_doc += delta;
355 set->array_data[j++] = last_doc;
356 while (--count > 0) {
357 last_doc++;
358 set->array_data[j++] = last_doc;
359 }
360 }
361 }
362 }
363
364 return SQUAT_OK;
365 }
366
367 /* Advance the "current document" in the set to the first document
368 with ID > 'doc'. Remove any documents found along the way that were
369 not 'doc'.
370 */
filter_doc(SquatDocSet * set,int doc)371 static void filter_doc(SquatDocSet* set, int doc) {
372 int i = set->index;
373
374 while (i < set->array_len && set->array_data[i] < doc) {
375 /* this document is not in the currently filtered set */
376 set->array_data[i] = -1;
377 i++;
378 }
379
380 /* skip over the matched document, if we matched */
381 if (i < set->array_len && set->array_data[i] == doc) {
382 i++;
383 }
384
385 set->index = i;
386 }
387
388 /* Remove from a SquatDocSet any documents not in the list of
389 documents containing the word 'data'. The list is extracted from
390 the index file data 'doc_list'.
391 */
392 static void
filter_to_docs_containing_word(SquatSearchIndex * index,SquatDocSet * set,char const * data,char const * doc_list)393 filter_to_docs_containing_word(SquatSearchIndex* index __attribute__((unused)),
394 SquatDocSet* set,
395 char const* data __attribute__((unused)),
396 char const* doc_list)
397 {
398 int i = (int)squat_decode_I(&doc_list);
399
400 set->index = 0;
401
402 if ((i & 1) != 0) {
403 filter_doc(set, i >> 1);
404 } else {
405 int size = i >> 1;
406 char const* s = doc_list;
407 int last_doc = 0;
408
409 while (s - doc_list < size) {
410 i = (int)squat_decode_I(&s);
411 if ((i & 1) == 1) {
412 filter_doc(set, last_doc += i >> 1);
413 } else {
414 int count = i >> 1;
415 int delta = squat_decode_I(&s);
416
417 last_doc += delta;
418 filter_doc(set, last_doc);
419 while (--count > 0) {
420 last_doc++;
421 filter_doc(set, last_doc);
422 }
423 }
424 }
425 }
426 }
427
428 /* Advance the "current document" pointer to the first document in the set. */
select_first_doc(SquatDocSet * set)429 static void select_first_doc(SquatDocSet* set) {
430 set->index = 0;
431 while (set->index < set->array_len && set->array_data[set->index] < 0) {
432 set->index++;
433 }
434 }
435
436 /* Is the "current document" pointer pointing to any real document? */
has_more_docs(SquatDocSet * set)437 static int has_more_docs(SquatDocSet* set) {
438 return set->index < set->array_len;
439 }
440
441 /* Advance the "current document" pointer to the next document in the set,
442 and return its old value */
get_next_doc(SquatDocSet * set)443 static int get_next_doc(SquatDocSet* set) {
444 int doc = set->array_data[set->index];
445
446 set->index++;
447 while (set->index < set->array_len && set->array_data[set->index] < 0) {
448 set->index++;
449 }
450
451 return doc;
452 }
453
destroy_docset(SquatDocSet * set)454 static void destroy_docset(SquatDocSet* set) {
455 free(set->array_data);
456 }
457
458 /* The basic strategy here is pretty simple. We just want to find the
459 documents that contain every subword of the search string. The
460 index tells us which documents contain each subword so it's just a
461 matter of doing O(N) lookups into the index. We construct an
462 explicit document list for one of the subwords and then iterate
463 through that list for each other subword, throwing out any
464 documents that don't contain that subword.
465
466 The only trick is that some subwords may occur in lots of documents
467 while others only occur in a few (or no) documents. In that case we
468 would rather construct the list with the smallest possible number
469 of documents, to save memory and the cost of traversing that list
470 several times.
471 */
squat_search_execute(SquatSearchIndex * index,char const * data,int data_len,SquatSearchResultCallback handler,void * closure)472 HIDDEN int squat_search_execute(SquatSearchIndex* index, char const* data,
473 int data_len, SquatSearchResultCallback handler, void* closure) {
474 int i;
475 int min_doc_count_word; /* The subword of 'data' that appears in
476 fewest documents */
477 int min_doc_count; /* The number of documents that include that
478 subword */
479 SquatDocSet set;
480 char const** run_starts;
481
482 /* First, do sanity checking on the string. We wouldn't want invalid
483 client searches to mysteriously return 'no documents'. */
484 if (data_len < SQUAT_WORD_SIZE) {
485 squat_set_last_error(SQUAT_ERR_SEARCH_STRING_TOO_SHORT);
486 return SQUAT_ERR;
487 }
488
489 for (i = 0; i < data_len; i++) {
490 int ch = (unsigned char)data[i];
491
492 if ((index->valid_char_bits[ch >> 3] & (1 << (ch & 7))) == 0) {
493 squat_set_last_error(SQUAT_ERR_SEARCH_STRING_INVALID_CHAR);
494 return SQUAT_ERR;
495 }
496 }
497
498 /* We search for every subword of the search string. We save a
499 pointer to the document list for each subword in this array
500 ... so we don't have to traverse the trie data structures more
501 than once per subword.
502 */
503 run_starts = (char const**)xmalloc(sizeof(char const*)*
504 (data_len - SQUAT_WORD_SIZE + 1));
505 squat_set_last_error(SQUAT_ERR_OK);
506
507 /* Now, for each subword, find its list of documents and how many
508 documents are in the list. Remember the word which had minimum
509 number of documents.
510 */
511 min_doc_count = count_docs_containing_word(index, data, run_starts);
512 if (min_doc_count < 0) {
513 squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
514 goto cleanup_run_starts;
515 } else if (min_doc_count == 0) {
516 /* The first word of the substring isn't in any documents, so we
517 can just stop now. */
518 goto cleanup_run_starts_ok;
519 }
520 min_doc_count_word = 0;
521 for (i = 1; i <= data_len - SQUAT_WORD_SIZE; i++) {
522 int doc_count = count_docs_containing_word(index, data + i,
523 run_starts + i);
524 if (doc_count < 0) {
525 squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
526 goto cleanup_run_starts;
527 } else if (doc_count == 0) {
528 /* This word isn't in any documents, we can stop now. */
529 goto cleanup_run_starts_ok;
530 } else if (doc_count < min_doc_count) {
531 min_doc_count = doc_count;
532 min_doc_count_word = i;
533 }
534 }
535
536 /* Now, extract the shortest document list into an array. By
537 starting with the shortest document list we avoid pathological
538 situations where one or more of the subwords occurs in zillions
539 of documents, and we'd allocate a huge array and have to iterate
540 through it all lots of times.
541 */
542 if (set_to_docs_containing_word(index, &set, data + min_doc_count_word,
543 min_doc_count, run_starts[min_doc_count_word]) == SQUAT_ERR) {
544 goto cleanup_run_starts;
545 }
546 /* Scan through the other document lists and throw out any documents
547 that aren't in all those lists. */
548 for (i = 0; i <= data_len - SQUAT_WORD_SIZE; i++) {
549 if (i != min_doc_count_word) {
550 filter_to_docs_containing_word(index, &set, data + i, run_starts[i]);
551 }
552 }
553
554 /* Now we have the results. Scan through the set and report each
555 element to the callback function. */
556 select_first_doc(&set);
557 while (has_more_docs(&set)) {
558 int next_doc;
559 char const* next_doc_info;
560 char const* next_doc_data;
561 int r;
562
563 /* Lookup the document info so we can get the document name to report. */
564 next_doc = get_next_doc(&set);
565 next_doc_info = index->doc_ID_list + next_doc*4;
566 if (next_doc < 0 && next_doc_info >= index->data_end) {
567 squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
568 goto cleanup_docset;
569 }
570
571 next_doc_data = index->doc_list + squat_decode_32(next_doc_info);
572 if (next_doc_data < index->doc_list || next_doc_data >= index->data_end) {
573 squat_set_last_error(SQUAT_ERR_INVALID_INDEX_FILE);
574 goto cleanup_docset;
575 }
576
577 r = handler(closure, next_doc_data);
578 if (r == SQUAT_CALLBACK_ABORT) {
579 break;
580 }
581 assert(r == SQUAT_CALLBACK_CONTINUE);
582 }
583
584 destroy_docset(&set);
585
586 cleanup_run_starts_ok:
587 free(run_starts);
588 return SQUAT_OK;
589
590 cleanup_docset:
591 destroy_docset(&set);
592
593 cleanup_run_starts:
594 free(run_starts);
595 return SQUAT_ERR;
596 }
597
squat_search_close(SquatSearchIndex * index)598 EXPORTED int squat_search_close(SquatSearchIndex* index) {
599 int r = SQUAT_OK;
600
601 squat_set_last_error(SQUAT_ERR_OK);
602
603 if (munmap((void*)index->data,
604 index->data_end + SQUAT_SAFETY_ZONE - index->data) != 0) {
605 squat_set_last_error(SQUAT_ERR_SYSERR);
606 r = SQUAT_ERR;
607 }
608
609 free(index);
610 return r;
611 }
612
613 /* ====================================================================== */
614
615 /* squat_scan(SquatSearchIndex* index, char first_char
616 * SquatScanCallback handler, void* closure)
617 *
618 * sweeps the entire subtrie for a given starting character. It generates
619 * callbacks to the hander routine (which has it own opaque data structure
620 * in closure) for each document ID that it finds in the leaves. The
621 * callback function looks something like:
622 *
623 * int add_word_callback(void* closure, char *name, int doc_ID);
624 *
625 * "name" is the full four character squat "word", including first_char and
626 * a trailing '\0', just in case you want to print it. doc_ID is the
627 * normal squat doc_ID offset.
628 *
629 * squat_scan_recurse() and squat_scan_leaf() are helper functions to
630 * scan the entire trie. squat_find_branch() is used to find the initial
631 * subtrie for a given first_char.
632 *
633 */
634
squat_scan_leaf(char const * doc_list,char * name,SquatScanCallback handler,void * closure)635 static int squat_scan_leaf(char const* doc_list, char *name,
636 SquatScanCallback handler, void* closure)
637 {
638 int i;
639
640 i = (int)squat_decode_I(&doc_list);
641 if ((i & 1) != 0) {
642 handler(closure, name, i >> 1);
643 } else {
644 int size = i >> 1;
645 char const* s = doc_list;
646 int last_doc = 0;
647
648 while (s - doc_list < size) {
649 i = (int)squat_decode_I(&s);
650 if ((i & 1) == 1) {
651 last_doc = last_doc + (i >> 1);
652 handler(closure, name, last_doc);
653 } else {
654 int count = i >> 1;
655 int delta = squat_decode_I(&s);
656
657 last_doc += delta;
658 handler(closure, name, last_doc);
659 while (--count > 0) {
660 last_doc++;
661 handler(closure, name, last_doc);
662 }
663 }
664 }
665 }
666 return(SQUAT_OK);
667 }
668
squat_scan_recurse(char const * s,char const * data_end,char * name,int level,SquatScanCallback handler,void * closure)669 static int squat_scan_recurse(char const* s, char const* data_end,
670 char *name, int level,
671 SquatScanCallback handler, void* closure)
672 {
673 int r = SQUAT_OK;
674 char p;
675 char const* branch_start = s;
676 char const* start_of_list;
677 int skip;
678 char count;
679 char const* base;
680 int offset, j;
681 int next_offset;
682 int ch;
683
684 /* Ignore empty subtrees generated by old versions of squatter */
685 if ((s[0] == 0) && (s[1] == 0))
686 return(SQUAT_OK);
687
688 p = *s++;
689 if ((p & 0xE0) != 0) {
690 /* Byte >= 0x20 (ASCII SPACE) indicates singleton list */
691
692 name[level] = p;
693 if (level < (SQUAT_WORD_SIZE-1)) {
694 next_offset = squat_decode_I(&s);
695 s = branch_start - next_offset;
696 if (next_offset < 0 || s >= data_end) {
697 return(SQUAT_ERR);
698 }
699 r = squat_scan_recurse(s, data_end, name, level+1, handler, closure);
700 } else {
701 r = squat_scan_leaf(s, name, handler, closure);
702 }
703 return(r);
704 }
705
706 count = (*s++) + 1; /* +1 to allow range 1..32 using (*s) < 32 */
707 base = s;
708 s += count;
709 start_of_list = s; /* So that we can repeat */
710
711 /* Up to 256 subtree branches, determined by variable length bitmask
712 * which is up to 32 bytes (plus two lead in bytes).
713 * p*8 : First p*8 branches all empty, can be ignored
714 * count*8 : Size of base array, in bits
715 *
716 *
717 * Should be able to optimise following into linear scan,
718 * without returning to start_of_list each time around the loop
719 */
720
721 for (ch = 8*p; ch < 8*(p+count) && r == SQUAT_OK; ch++) {
722 offset = (unsigned char)ch/8 - p;
723 if ((base[offset] & (1 << (ch & 7))) == 0) /* not in list */
724 continue;
725
726 /* figure out how many entries there are before our entry */
727 skip = 0;
728 for (j = 0; j < offset; j++) {
729 skip += bit_counts[(unsigned char)base[j]];
730 }
731 for (j = 0; j < (ch & 7); j++) {
732 if ((base[offset] & (1 << j)) != 0) {
733 skip++;
734 }
735 }
736 s = start_of_list;
737
738 name[level] = ch;
739 if (level < (SQUAT_WORD_SIZE-1)) {
740 s = squat_decode_skip_I(s, skip);
741 next_offset = squat_decode_I(&s);
742
743 s = branch_start - next_offset;
744 if (next_offset < 0 || s >= data_end) {
745 return(SQUAT_ERR);
746 }
747 r = squat_scan_recurse(s, data_end, name, level+1, handler, closure);
748 } else {
749 /* leaf case. We need to scan through the document lists for each
750 leaf to skip. */
751 while (skip-- > 0) {
752 char const* t = s;
753 int v = (int)squat_decode_I(&t);
754
755 if ((v & 1) != 0) {
756 s = t; /* singleton; no more data to eat for this word */
757 } else {
758 s = t + (v >> 1); /* run-list; size is in v>>1 */
759 }
760 }
761 r = squat_scan_leaf(s, name, handler, closure);
762 }
763 }
764 return(r);
765 }
766
767 /* char const** prev is used by squat_count_docs to find start of
768 * mmap()ed byte range that we need to traverse, so that we can
769 * read in the data before we start trie walk. No other purpose.
770 */
771
squat_find_branch(char const ** result,char const ** prev,char const * s,char const * data_end,char ch)772 static int squat_find_branch(char const** result, char const** prev,
773 char const* s,
774 char const* data_end, char ch)
775 {
776 char p;
777 char const* branch_start = s, *t;
778 int skip;
779 char count;
780 char const* base;
781 int offset, j;
782 int next_offset, prev_offset;
783
784 *result = NULL;
785 if (prev) *prev = NULL;
786
787 /* Ignore empty subtrees generated by old versions of squatter */
788 if ((s[0] == 0) && (s[1] == 0)) {
789 *result = NULL;
790 return(SQUAT_OK);
791 }
792
793 p = *s++;
794
795 if ((p & 0xE0) != 0) { /* singleton */
796 if (p != ch) {
797 *result = NULL;
798 return(SQUAT_OK);
799 }
800
801 next_offset = squat_decode_I(&s);
802 s = branch_start - next_offset;
803 if (next_offset < 0 || s >= data_end) {
804 return(SQUAT_ERR);
805 }
806 *result = s;
807 return(SQUAT_OK);
808 }
809
810 count = (*s++) + 1;
811 offset = (unsigned char)ch/8 - p;
812 base = s;
813 s += count;
814
815 /* Does this character fall in range */
816 if (((unsigned char)ch < 8*p) ||
817 ((unsigned char)ch >= 8*(p + count)) ||
818 ((base[offset] & (1 << (ch & 7))) == 0)) {
819 *result = NULL;
820 return(SQUAT_OK);
821 }
822
823 /* figure out how many entries there are before our entry */
824 skip = 0;
825 for (j = 0; j < offset; j++) {
826 skip += bit_counts[(unsigned char)base[j]];
827 }
828 for (j = 0; j < (ch & 7); j++) {
829 if ((base[offset] & (1 << j)) != 0) {
830 skip++;
831 }
832 }
833
834 if (skip > 0) {
835 s = squat_decode_skip_I(s, skip-1);
836 prev_offset = squat_decode_I(&s);
837
838 t = branch_start - prev_offset;
839 if (prev && (prev_offset >= 0 && t < data_end)) {
840 *prev = t;
841 }
842 }
843
844 /* find offset to next branch data */
845 next_offset = squat_decode_I(&s);
846
847 s = branch_start - next_offset;
848 if (next_offset < 0 || s >= data_end) {
849 return(SQUAT_ERR);
850 }
851 *result = s;
852 return(SQUAT_OK);
853 }
854
squat_scan(SquatSearchIndex * index,char first_char,SquatScanCallback handler,void * closure)855 EXPORTED int squat_scan(SquatSearchIndex* index, char first_char,
856 SquatScanCallback handler, void* closure)
857 {
858 char buf[SQUAT_WORD_SIZE+1];
859 const char *s;
860 int r = squat_find_branch(&s, NULL,
861 index->word_list, index->data_end, first_char);
862
863 if (r != SQUAT_OK)
864 return(r);
865 if (!s)
866 return SQUAT_OK;
867
868 memset(buf, 0, sizeof(buf));
869 buf[0] = first_char;
870
871 return(squat_scan_recurse(s, index->data_end, buf, 1, handler, closure));
872 }
873
874 /* ====================================================================== */
875
876 /* squat_count_docs(SquatSearchIndex* index, char first_char, int *counter)
877 *
878 * count the total number of document ID which appear in the entire
879 * subtree for a given starting character. Typically followed immediately
880 * by a call to squat_scan after caller has allocated a target array
881 * which is large enough for counter callback elements.
882 *
883 * squat_count_docs() will page in lots of data from disk via the mmapped
884 * index file. Unfortunately squat indexes are built back to front, so
885 * the scan would read blocks out of order as the data structure is
886 * traversed. squat_preload_data() is a rather evil hack to try and
887 * force sequential disk I/O: it serves no other function.
888 */
889
890 static int
squat_count_docs_callback(void * closure,char * name,int doc_ID)891 squat_count_docs_callback(void* closure,
892 char *name __attribute__((unused)),
893 int doc_ID __attribute__((unused)))
894 {
895 int *counter = (int *)closure;
896
897 (*counter)++;
898
899 return SQUAT_CALLBACK_CONTINUE;
900 }
901
902 /* Attempt to load blocks of data from mmap()ed array before we start
903 random access */
904
905 #define PRELOAD_BLOCK_SIZE (4096) /* 4 KBytes */
906 #define PRELOAD_MAX_SIZE (20*1024*1024) /* 20 MBytes */
907
908 static void
squat_preload_data(char const * t,char const * s)909 squat_preload_data(char const* t, char const* s)
910 {
911 char buf[PRELOAD_BLOCK_SIZE];
912 char const* start = (t > s) ? s : t;
913 unsigned long len = (t > s) ? (t-s) : (s-t);
914
915 if (len >= PRELOAD_MAX_SIZE)
916 return;
917
918 while (len > 0) {
919 unsigned long size = (len > PRELOAD_BLOCK_SIZE) ? PRELOAD_BLOCK_SIZE : len;
920
921 memcpy(buf, start, size);
922 start += size;
923 len -= size;
924 }
925 }
926
squat_count_docs(SquatSearchIndex * index,char first_char,int * counter)927 EXPORTED int squat_count_docs(SquatSearchIndex* index, char first_char, int *counter)
928 {
929 char buf[SQUAT_WORD_SIZE+1];
930 const char *s, *t;
931 int r = squat_find_branch(&s, &t, index->word_list,
932 index->data_end, first_char);
933 *counter = 0;
934
935 if (r != SQUAT_OK)
936 return(r);
937 if (!s)
938 return SQUAT_OK;
939
940 /* First trie is at index->doc_ID_list + (no_docIDs *4). However
941 * no_docIDs doesn't appear to be stored anywhere (zero terminated list).
942 * Following is good enough for our purposes */
943 squat_preload_data((t) ? t : index->doc_ID_list, s);
944
945 memset(buf, 0, sizeof(buf));
946 buf[0] = first_char;
947
948 return(squat_scan_recurse(s, index->data_end, buf, 1,
949 squat_count_docs_callback, counter));
950 }
951