1 /* index.c --
2 * Created: Wed Oct 9 14:52:23 1996 by faith@dict.org
3 * Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org)
4 * Copyright 2002-2008 Aleksey Cheusov (vle@gmx.net)
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 1, or (at your option) any
9 * later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21 #include "dictP.h"
22 #include "dictzip.h"
23 #include "index.h"
24 #include "strategy.h"
25 #include "str.h"
26
27 #ifdef USE_PLUGIN
28 #include "plugin.h"
29 #endif
30
31 #include <regex.h>
32
33 #include <sys/stat.h>
34
35 #include <fcntl.h>
36 #include <ctype.h>
37 #include <wctype.h>
38 #include <wchar.h>
39 #include <stdio.h>
40 #include <sys/stat.h>
41
42 extern int mmap_mode;
43
44 #define FIND_PREV(begin, pt) while (pt > begin && pt [-1] != '\n') --pt;
45 #define FIND_NEXT(pt, end) while (pt < end && *pt++ != '\n');
46
47 #define MAXWORDLEN 512
48 #define BMH_THRESHOLD 3 /* When to start using Boyer-Moore-Hoorspool */
49
50 /* defaults to run in UTF-8 mode */
51 int utf8_mode=1; /* dictd uses UTF-8 dictionaries */
52
53 int optStart_mode = 1; /* Optimize search range for constant start */
54
55 dictConfig *DictConfig;
56
57 int _dict_comparisons;
58 static int isspacealnumtab [UCHAR_MAX + 1];
59 static int isspacealnumtab_allchars[UCHAR_MAX + 1];
60 static int isspacepuncttab [UCHAR_MAX + 1];
61 static int char2indextab [UCHAR_MAX + 2];
62 static int index2chartab [UCHAR_MAX + 2];
63 static int tolowertab [UCHAR_MAX + 1];
64
65 char global_alphabet_8bit [UCHAR_MAX + 2];
66 char global_alphabet_ascii [UCHAR_MAX + 2];
67
68 static int chartab [UCHAR_MAX + 1];
69 static int charcount = 0;
70
71 static int define_or_match = 0; /* 1 if define */
72
73 /* #define isspacealnum(x) (isspacealnumtab[(unsigned char)(x)]) */
74 #define c2i(x) (char2indextab[(unsigned char)(x)])
75 #define i2c(x) (index2chartab[(unsigned char)(x)])
76 #define c(x) (((x) < charcount) ? chartab[(unsigned char)(x)] : 0)
77
78 /*
79 compares two 8bit strings (containing one character)
80 according to locate
81 */
dict_table_init_compare_strcoll(const void * a,const void * b)82 static int dict_table_init_compare_strcoll (const void *a, const void *b)
83 {
84 return strcoll(*(const char **)a, *(const char **)b);
85 }
86
87 /*
88 compares two strings (containing one character)
89 converting ASCII character to lower case.
90 */
dict_table_init_compare_utf8(const void * a,const void * b)91 static int dict_table_init_compare_utf8 (const void *a, const void *b)
92 {
93 int c1 = ** (const unsigned char **) a;
94 int c2 = ** (const unsigned char **) b;
95
96 if (c1 <= CHAR_MAX)
97 c1 = tolower (c1);
98 if (c2 <= CHAR_MAX)
99 c2 = tolower (c2);
100
101 return c1 - c2;
102 }
103
dict_make_global_alphabet(void)104 static void dict_make_global_alphabet (void)
105 {
106 int i;
107 int j = 0;
108 int ch;
109
110 for (i=0; i < charcount; ++i){
111 global_alphabet_8bit [i] = ch = c(i);
112
113 if (ch < 128)
114 global_alphabet_ascii [j++] = ch;
115 }
116
117 global_alphabet_8bit [charcount] = 0;
118 global_alphabet_ascii [j] = 0;
119
120 PRINTF(DBG_SEARCH, (
121 "global_alphabet_8bit = '%s'\n",
122 global_alphabet_8bit));
123 PRINTF(DBG_SEARCH, (
124 "global_alphabet_ascii = '%s'\n",
125 global_alphabet_ascii));
126 }
127
dict_table_init(void)128 static void dict_table_init(void)
129 {
130 int i;
131 unsigned char s[2 * UCHAR_MAX + 2];
132 unsigned char *p[UCHAR_MAX + 1];
133
134 for (i = 0; i <= UCHAR_MAX; i++) {
135 if (isspace(i) || isalnum(i) || (utf8_mode && i >= 0x80)){
136 isspacealnumtab [i] = 1;
137 }else{
138 isspacealnumtab [i] = 0;
139 }
140
141 tolowertab [i] = tolower (i);
142 if (i >= 0x80){
143 if (utf8_mode || !utf8_mode){
144 /* utf-8 or ASCII mode */
145 tolowertab [i] = i;
146 }
147 }
148
149 if (utf8_mode && i >= 0x80){
150 isspacepuncttab [i] = 0;
151 }else{
152 isspacepuncttab [i] = isspace(i) || ispunct(i);
153 }
154
155 isspacealnumtab_allchars [i] = 1;
156 }
157 isspacepuncttab['\t'] = isspacepuncttab['\n'] = 1; /* special */
158 isspacealnumtab['\t'] = isspacealnumtab['\n'] = 0; /* special */
159 isspacealnumtab_allchars['\t'] = isspacealnumtab_allchars['\n'] = 0; /* special */
160
161 charcount = 0;
162 for (i = 0; i <= UCHAR_MAX; i++){
163 if (islower (i) || (utf8_mode && i >= 0xC0))
164 chartab [charcount++] = i;
165 }
166
167 /* Populate an array with length-1 strings */
168 for (i = 0; i <= UCHAR_MAX; i++) {
169 if (!isupper (i)){
170 s[2 * i] = i;
171 }else{
172 s[2 * i] = 0;
173 }
174
175 s[2 * i + 1] = '\0';
176 p[i] = &s[2 * i];
177 }
178 /* Sort those strings in the locale */
179 if (utf8_mode)
180 qsort(p, UCHAR_MAX + 1, sizeof(p[0]), dict_table_init_compare_utf8);
181 else
182 qsort(p, UCHAR_MAX + 1, sizeof(p[0]), dict_table_init_compare_strcoll);
183
184 /* Extract our unordered arrays */
185 for (i = 0; i <= UCHAR_MAX; i++) {
186 char2indextab[*p[i]] = i;
187 index2chartab[i] = *p[i];
188 }
189 char2indextab[UCHAR_MAX + 1] = UCHAR_MAX;
190 index2chartab[UCHAR_MAX + 1] = UCHAR_MAX; /* we may index here in */
191
192 if (dbg_test(DBG_SEARCH)) {
193 for (i = 0; i <= UCHAR_MAX; ++i){
194 if (p [i][0] <= CHAR_MAX){
195 PRINTF (DBG_SEARCH,("sorted list: %s\n", p [i]));
196 }else{
197 PRINTF (DBG_SEARCH,("sorted list: %i\n", (unsigned char) p [i] [0]));
198 }
199 }
200 }
201
202 if (dbg_test(DBG_SEARCH)) {
203 for (i = 0; i < charcount; i++)
204 PRINTF(DBG_SEARCH,("%03d %d ('%c')\n", i, c(i), c(i)));
205
206 for (i = 0; i <= UCHAR_MAX; i++)
207 PRINTF(DBG_SEARCH,("c2i(%d/'%c') = %d; i2c(%d) = %d/'%c'\n",
208 i, (char) isgraph(i) ? i : '.',
209 c2i(i), c2i(i),
210 i2c(c2i(i)), (char) i2c(c2i(i)) ? i2c(c2i(i)) : '.'));
211 }
212
213
214 dict_make_global_alphabet ();
215 }
216
compare_allchars(const char * word,const char * start,const char * end)217 static int compare_allchars(
218 const char *word,
219 const char *start, const char *end )
220 {
221 int c1, c2;
222 int result;
223
224 PRINTF(DBG_SEARCH,(" We are inside index.c:compare_allchars\n"));
225
226 while (*word && *word != '\t' && *word != '\n' &&
227 start < end && *start != '\t' && *start != '\n')
228 {
229 #if 0
230 if (isspace( (unsigned char) *start ))
231 c2 = ' ';
232 else
233 c2 = * (unsigned char *) start;
234
235 if (isspace( (unsigned char) *word ))
236 c1 = ' ';
237 else
238 c1 = * (unsigned char *) word;
239
240 #else
241 c2 = * (unsigned char *) start;
242 c1 = * (unsigned char *) word;
243 #endif
244 if (c1 != c2) {
245 if (c1 < c2){
246 result = -2;
247 }else{
248 result = 1;
249 }
250
251 if (dbg_test(DBG_SEARCH)){
252 PRINTF(DBG_SEARCH,(" result = %d (%i != %i) \n", result, c1, c2));
253 }
254 return result;
255 }
256 ++word;
257 ++start;
258 }
259
260 if (*word && *word != '\t' && *word != '\n')
261 result = 1;
262 else if (*start != '\t' && *start != '\n')
263 result = -1;
264 else
265 result = 0;
266
267 PRINTF(DBG_SEARCH,(" result = %d\n", result));
268 return result;
269 }
270
compare_alnumspace(const char * word,const dictIndex * dbindex,const char * start,const char * end)271 static int compare_alnumspace(
272 const char *word,
273 const dictIndex *dbindex,
274 const char *start, const char *end )
275 {
276 int c1, c2;
277 int result;
278 int ret;
279
280 assert (dbindex);
281
282 PRINTF(DBG_SEARCH,(" We are inside index.c:compare_alnumspace\n"));
283
284 /* FIXME. Optimize this inner loop. */
285 while (*word && *word != '\t' && *word != '\n' &&
286 start < end && *start != '\t' && *start != '\n')
287 {
288 if (!dbindex -> isspacealnum[* (const unsigned char *) start]) {
289 ++start;
290 continue;
291 }
292
293 c1 = (unsigned char) *word;
294 c2 = (unsigned char) *start;
295
296 if (!dbindex -> flag_casesensitive){
297 c2 = tolowertab [c2];
298 c1 = tolowertab [c1];
299 }
300
301 if (c1 != c2) {
302 if (utf8_mode){
303 if (
304 (c1 <= CHAR_MAX ? c2i (c1) : c1) <
305 (c2 <= CHAR_MAX ? c2i (c2) : c2))
306 {
307 result = -2;
308 }else{
309 result = 1;
310 }
311 }else{
312 result = (c2i (c1) < c2i (c2) ? -2 : 1);
313 }
314 if (dbg_test(DBG_SEARCH)){
315 if (utf8_mode){
316 PRINTF (DBG_SEARCH,(
317 " result = %d (%i != %i) \n", result, c1, c2));
318 }else{
319 PRINTF (DBG_SEARCH,(
320 " result = %d ('%c'(c2i=%i) != '%c'(c2i=%i)) \n",
321 result,
322 c1,
323 c2i (c1),
324 c2,
325 c2i (c2)));
326 }
327 }
328 return result;
329 }
330 ++word;
331 ++start;
332 }
333
334 while (
335 *start != '\t' && *start != '\n' &&
336 !dbindex -> isspacealnum[* (const unsigned char *) start])
337 {
338 ++start;
339 }
340
341 if (*word && *word != '\t' && *word != '\n')
342 ret = 1;
343 else if (*start != '\t' && *start != '\n')
344 ret = -1;
345 else
346 ret = 0;
347
348 PRINTF(DBG_SEARCH,(" result = %d\n", ret));
349
350 return ret;
351 }
352
353 /* Compare:
354
355 Return -2 if word < word-pointed-to-by-start
356 -1 if word prefix word-pointed-to-by-start
357 0 if word == word-pointed-to-by-start
358 1 if word > word-pointed-to-by-start
359 2 if some kind of error happened
360
361 The comparison must be the same as "sort -df" phone directory order:
362 ignore all characters except letters, digits, and blanks; fold upper
363 case into the equivalent lowercase.
364
365 word already has all of the illegal characters removed
366 */
367
compare(const char * word,const dictIndex * dbindex,const char * start,const char * end)368 static int compare(
369 const char *word,
370 const dictIndex *dbindex,
371 const char *start, const char *end )
372 {
373 char buf[80], *d;
374 const char *s;
375
376 assert (dbindex);
377
378 if (dbg_test(DBG_SEARCH)) {
379 for (
380 d = buf, s = start;
381 d - buf + 1 < (int) sizeof (buf) && s < end && *s != '\t';)
382 {
383 *d++ = *s++;
384 }
385
386 *d = '\0';
387 PRINTF(DBG_SEARCH,
388 ("compare \"%s\" with \"%s\" (sizes: %lu and %lu)\n",
389 word, buf, (unsigned long) strlen( word ), (unsigned long) strlen( buf ) ));
390 }
391
392 ++_dict_comparisons; /* counter for profiling */
393
394 if (dbindex &&
395 (dbindex -> flag_allchars || dbindex -> flag_utf8 ||
396 dbindex -> flag_8bit))
397 {
398 return compare_allchars( word, start, end );
399 }else{
400 return compare_alnumspace( word, dbindex, start, end );
401 }
402 }
403
dict_entry_is_4column(const char * entry,const char * index_end)404 static int dict_entry_is_4column (
405 const char *entry, const char *index_end)
406 {
407 int tab_count = 0;
408
409 for (; entry < index_end; ++entry){
410 switch (*entry){
411 case '\t':
412 ++tab_count;
413 break;
414 case '\n':
415 switch (tab_count){
416 case 2:
417 return 0;
418 case 3:
419 return 1;
420 default:
421 goto err;
422 }
423 }
424 }
425
426 err:
427 err_fatal (__func__, "bad .index file");
428 }
429
compare_1or4(const char * word,const dictIndex * dbindex,const char * start,const char * end,const char ** column_1or4)430 static int compare_1or4 (
431 const char *word,
432 const dictIndex *dbindex,
433 const char *start, const char *end,
434 const char **column_1or4)
435 {
436 if (dict_entry_is_4column (start, end)){
437 while (start < end && *start != '\t')
438 ++start;
439 ++start;
440 while (start < end && *start != '\t')
441 ++start;
442 ++start;
443 }
444
445 if (column_1or4)
446 *column_1or4 = start;
447
448 return compare (word, dbindex, start, end);
449 }
450
binary_search(const char * word,const dictIndex * dbindex,const char * start,const char * end)451 static const char *binary_search(
452 const char *word,
453 const dictIndex *dbindex,
454 const char *start, const char *end )
455 {
456 const char *pt;
457
458 assert (dbindex);
459
460 PRINTF(DBG_SEARCH,("%s %p %p\n", word, start, end));
461
462 pt = start + (end-start)/2;
463 FIND_PREV(start, pt);
464 while (start < end) {
465 switch (compare( word, dbindex, pt, end )){
466 case -2: case -1: case 0:
467 end = pt;
468 break;
469 case 1:
470 start = pt;
471 FIND_NEXT(start, end)
472 break;
473 case 2:
474 return end; /* ERROR!!! */
475 default:
476 assert (0);
477 }
478 PRINTF(DBG_SEARCH,("%s %p %p\n",word,start,end));
479 pt = start + (end-start)/2;
480 FIND_PREV(start, pt);
481 }
482
483 return start;
484 }
485
binary_search_8bit(const char * word,const dictIndex * dbindex,const char * start,const char * end)486 static const char *binary_search_8bit(
487 const char *word,
488 const dictIndex *dbindex,
489 const char *start, const char *end )
490 {
491 char buf[80], *d;
492 const char *s;
493 const char *pt;
494 int cmp;
495
496 assert (dbindex);
497
498 PRINTF(DBG_SEARCH,("word/start/end %s/%p/%p\n",word,start,end));
499
500 pt = start + (end-start)/2;
501 FIND_PREV(start, pt);
502 while (start < end) {
503 if (dbg_test(DBG_SEARCH)) {
504 for (
505 d = buf, s = pt;
506 s < end && *s != '\t' && d - buf + 1 < (int) sizeof (buf);)
507 {
508 *d++ = *s++;
509 }
510
511 *d = '\0';
512 printf( "compare \"%s\" with \"%s\" (sizes: %lu and %lu)\n",
513 word, buf, (unsigned long) strlen( word ), (unsigned long) strlen( buf ) );
514 }
515
516 if (
517 dbindex &&
518 (dbindex -> flag_utf8 || dbindex -> flag_allchars))
519 {
520 cmp = compare_allchars ( word, pt, end );
521 }else{
522 cmp = compare_alnumspace ( word, dbindex, pt, end );
523 }
524
525 switch (cmp){
526 case -2: case -1: case 0:
527 end = pt;
528 break;
529 case 1:
530 start = pt;
531 FIND_NEXT(start, end)
532 break;
533 case 2:
534 return end; /* ERROR!!! */
535 default:
536 assert (0);
537 }
538 PRINTF(DBG_SEARCH,("%s %p %p\n",word,start,end));
539 pt = start + (end-start)/2;
540 FIND_PREV(start, pt);
541 }
542
543 return start;
544 }
545
linear_search(const char * word,const dictIndex * dbindex,const char * start,const char * end)546 static const char *linear_search(
547 const char *word,
548 const dictIndex *dbindex,
549 const char *start, const char *end )
550 {
551 const char *pt;
552
553 assert (dbindex);
554
555 for (pt = start; pt < end;) {
556 switch (compare( word, dbindex, pt, end )) {
557 case -2: return NULL; /* less than and not prefix */
558 case -1: /* prefix */
559 case 0: return pt; /* exact */
560 case 1: break; /* greater than */
561 case 2: return NULL; /* ERROR!!! */
562 }
563 FIND_NEXT(pt,end);
564 }
565 return NULL;
566 }
567
dict_index_search(const char * word,dictIndex * idx)568 static const char *dict_index_search( const char *word, dictIndex *idx )
569 {
570 const char *start;
571 const char *end;
572 int first, last;
573
574 assert (idx);
575
576 /* With optStart:
577 17071 comparisons, 1000 words
578 33946 comparisons, 2000 words
579
580 Without optStart:
581 20889 comparisons, 1000 words
582 41668 comparisons, 2000 words
583
584 Linear:
585 594527 comparisons, 1000 words
586 2097035 comparisons, 2000 words
587 */
588
589 if (optStart_mode){
590 first = * (const unsigned char *) word;
591 last = i2c(c2i(first)+1);
592
593 if (dbg_test(DBG_SEARCH)) {
594 if (!utf8_mode || (last <= CHAR_MAX && first <= CHAR_MAX))
595 printf("binary_search from %c to %c\n", first, last);
596 else
597 printf("binary_search from %i to %i\n", first, last);
598 }
599
600 end = idx->optStart [last];
601 start = idx->optStart [first];
602 #if 0
603 fprintf (stderr, "start1 = %p\n", start);
604 fprintf (stderr, "end1 = %p\n", end);
605 #endif
606 }else{
607 start = idx->start;
608 end = idx->end;
609 }
610
611 if (end < start) end = idx->end;
612
613 start = binary_search( word, idx, start, end );
614
615 PRINTF(DBG_SEARCH,("binary_search returns %p\n",start));
616
617 start = linear_search( word, idx, start, idx->end );
618
619 PRINTF(DBG_SEARCH,("linear_search returns %p\n",start));
620
621 return start;
622 }
623
dict_word_create(const char * entry,const dictDatabase * database,const dictIndex * dbindex)624 static dictWord *dict_word_create(
625 const char *entry,
626 const dictDatabase *database,
627 const dictIndex *dbindex)
628 {
629 int offs_word = 0;
630 int offs_offset = 0;
631 int offs_length = 0;
632 int offset = 0;
633
634 int word_len = 0;
635
636 dictWord *dw = xmalloc( sizeof( struct dictWord ) );
637 const char *pt = entry;
638 char *d;
639
640 assert (dbindex);
641 assert (pt >= dbindex -> start && pt < dbindex -> end);
642
643 memset (dw, 0, sizeof (*dw));
644
645 for (;pt < dbindex->end && *pt != '\n'; pt++, offset++) {
646 if (*pt == '\t') {
647 if (!offs_offset)
648 offs_offset = offset + 1;
649 else if (!offs_length)
650 offs_length = offset + 1;
651 else if (!offs_word)
652 offs_word = offset + 1;
653 else{
654 err_internal( __func__,
655 "Too many tabs in index entry \"%*.*s\"\n",
656 offs_length, offs_length, entry );
657 }
658 }
659 }
660
661 if (!offs_length)
662 err_internal( __func__,
663 "Too few tabs in index entry \"%20.20s\"\n", entry );
664
665 dw->start = b64_decode_buf (entry + offs_offset, offs_length - offs_offset - 1);
666 if (offs_word > 0){
667 word_len = offset - offs_word;
668 dw->end = b64_decode_buf (entry + offs_length, offs_word - offs_length - 1);
669 }else{
670 word_len = offs_offset - 1;
671 dw->end = b64_decode_buf (entry + offs_length, offset - offs_length);
672 }
673
674 dw->def = NULL;
675 dw->def_size = 0;
676 dw->database = database;
677
678 /* Apply quoting to word */
679 dw->word = xmalloc (word_len*2 + 1);
680 entry += offs_word;
681
682 for (d = (char *)dw->word; word_len--;) {
683 switch (*entry) {
684 case '"':
685 case '\\':
686 *d++ = '\\';
687 }
688 *d++ = *entry++;
689 }
690 *d = '\0';
691
692 return dw;
693 }
694
dict_dump_datum(const void * datum)695 static int dict_dump_datum( const void *datum )
696 {
697 dictWord *dw = (dictWord *)datum;
698 printf(
699 "\"%s\" %lu/%lu %p/%i\n",
700 dw->word, dw->start, dw->end,
701 dw->def, dw->def_size );
702
703 return 0;
704 }
705
dict_dump_list(lst_List list)706 void dict_dump_list( lst_List list )
707 {
708 lst_iterate( list, dict_dump_datum );
709 }
710
dict_destroy_datum(const void * datum)711 int dict_destroy_datum( const void *datum )
712 {
713 dictWord *dw = (dictWord *)datum;
714
715 if (!datum)
716 return 0;
717
718 if (dw->word)
719 xfree( (char *)dw->word );
720
721 dw->word = NULL;
722 dw->start = 0;
723 dw->end = 0;
724 dw->database = NULL;
725 xfree( dw );
726 return 0;
727 }
728
dict_destroy_list(lst_List list)729 void dict_destroy_list( lst_List list )
730 {
731 lst_iterate( list, dict_destroy_datum );
732 lst_destroy( list );
733 }
734
735 /* returns NULL if limits exceeded */
dict_add_word_to_list(lst_List l,const dictDatabase * database,const dictIndex * dbindex,const char * pt)736 static dictWord *dict_add_word_to_list (
737 lst_List l,
738 const dictDatabase *database,
739 const dictIndex *dbindex,
740 const char *pt)
741 {
742 dictWord * datum;
743
744 assert (l);
745
746 if (define_or_match){
747 if (_dict_daemon_limit_defs
748 && lst_length (l) >= _dict_daemon_limit_defs)
749 return NULL;
750 }else{
751 if (_dict_daemon_limit_matches
752 && lst_length (l) >= _dict_daemon_limit_matches)
753 return NULL;
754 }
755
756 datum = dict_word_create (pt, database, dbindex);
757 lst_append (l, datum);
758 return datum;
759 }
760
dict_search_exact(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex,int uniq_only)761 static int dict_search_exact( lst_List l,
762 const char *word,
763 const dictDatabase *database,
764 dictIndex *dbindex,
765 int uniq_only)
766 {
767 const char *pt = NULL;
768 int count = 0;
769 const char *previous = NULL;
770
771 assert (dbindex);
772
773 pt = dict_index_search( word, dbindex );
774
775 while (pt && pt < dbindex->end) {
776 if (!compare( word, dbindex, pt, dbindex->end )) {
777 if (!uniq_only || !previous
778 || compare_1or4 (previous, dbindex, pt, dbindex->end, &previous))
779 {
780 ++count;
781 if (l){
782 if (!dict_add_word_to_list (l, database, dbindex, pt)) {
783 break;
784 }
785 }
786 }
787 }else{
788 break;
789 }
790
791 FIND_NEXT( pt, dbindex->end );
792 }
793
794 return count;
795 }
796
797 enum {
798 BMH_SUBSTRING,
799 BMH_PREFIX,
800 BMH_SUFFIX,
801 BMH_WORD,
802 BMH_FIRST,
803 BMH_LAST,
804 };
805
dict_search_prefix_first(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex,int flag,int skip_count,int item_count)806 static int dict_search_prefix_first( lst_List l,
807 const char *word,
808 const dictDatabase *database,
809 dictIndex *dbindex,
810 int flag,
811 int skip_count,
812 int item_count)
813 {
814 const char *pt = dict_index_search( word, dbindex );
815 int count = 0;
816 const char *previous = NULL;
817 int wordlen = strlen (word);
818 int c = 0;
819
820 assert (dbindex);
821
822 if (item_count <= 0){
823 return 0;
824 }
825
826 while (pt && pt < dbindex->end) {
827 switch (compare( word, dbindex, pt, dbindex->end )) {
828 case -2:
829 return count;
830 case -1:
831 case 0:
832 if (!previous
833 || compare_1or4 (previous, dbindex, pt, dbindex->end,
834 &previous))
835 {
836 if (flag == BMH_FIRST){
837 c = (unsigned char) pt [wordlen];
838 if (c != '\t' && !isspacepuncttab [c])
839 break;
840 }
841
842 if (skip_count == 0){
843 ++count;
844
845 if (!dict_add_word_to_list (l, database, dbindex, pt))
846 return count;
847
848 --item_count;
849 if (!item_count){
850 return count;
851 }
852 }else{
853 --skip_count;
854 }
855 }
856 break;
857 case 1:
858 return count;
859 case 2:
860 return count; /* ERROR!!! */
861 default:
862 assert (0);
863 }
864
865 FIND_NEXT( pt, dbindex->end );
866 }
867
868 return count;
869 }
870
dict_search_prefix(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex,int skip_count,int item_count)871 static int dict_search_prefix (
872 lst_List l, const char *word,
873 const dictDatabase *database, dictIndex *dbindex,
874 int skip_count, int item_count)
875 {
876 return dict_search_prefix_first (l, word, database, dbindex,
877 BMH_PREFIX, skip_count, item_count);
878 }
879
dict_search_first(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)880 static int dict_search_first (
881 lst_List l, const char *word,
882 const dictDatabase *database, dictIndex *dbindex)
883 {
884 return dict_search_prefix_first (l, word, database, dbindex,
885 BMH_FIRST, 0, INT_MAX);
886 }
887
dict_search_brute(lst_List l,const unsigned char * word,const dictDatabase * database,dictIndex * dbindex,int flag,int patlen)888 static int dict_search_brute( lst_List l,
889 const unsigned char *word,
890 const dictDatabase *database,
891 dictIndex *dbindex,
892 int flag,
893 int patlen )
894 {
895 const unsigned char *const start = (const unsigned char *) dbindex->start;
896 const unsigned char *const end = (const unsigned char *) dbindex->end;
897 const unsigned char *p, *pt;
898 int count = 0;
899 int result;
900 const char *previous = NULL;
901 int c;
902
903 assert (dbindex);
904
905 p = start;
906 while (p < end && !dbindex -> isspacealnum[*p]) ++p;
907 for (; p < end; ++p) {
908 if (*p == '\t') {
909 while (p < end && *p != '\n') ++p;
910 ++p;
911 while (p < end && !dbindex -> isspacealnum[*p]) ++p;
912 }
913
914 c = *p;
915 if (!dbindex -> flag_casesensitive){
916 c = tolowertab [c];
917 }
918
919 if (c == *word) {
920 result = compare( (const char *) word,
921 dbindex,
922 (const char *) p,
923 (const char *) end );
924
925 if (result == -1 || result == 0) {
926 switch (flag){
927 case BMH_SUBSTRING:
928 break;
929
930 case BMH_SUFFIX:
931 if (result)
932 continue;
933
934 break;
935
936 case BMH_WORD:
937 if (p > start && !isspacepuncttab [p [-1]])
938 continue;
939 if (p+patlen < end && !isspacepuncttab [p [patlen]])
940 continue;
941
942 break;
943
944 case BMH_LAST:
945 if (result)
946 continue;
947 if (p > start && !isspacepuncttab [p [-1]])
948 continue;
949
950 break;
951 default:
952 abort ();
953 }
954
955 for (pt = p; pt >= start && *pt != '\n'; --pt)
956 if (*pt == '\t')
957 goto continue2;
958
959 if (!previous ||
960 compare_1or4 (previous, dbindex, (char*) pt + 1,
961 (const char *) end, &previous))
962 {
963 ++count;
964
965 if (!dict_add_word_to_list
966 (l, database, dbindex, (const char *) pt + 1))
967 {
968 break;
969 }
970 }
971 FIND_NEXT(p,end);
972 --p;
973 }
974 }
975 continue2:
976 ;
977 }
978
979 return count;
980 }
981
982 /* dict_search_bmh implements a version of the Boyer-Moore-Horspool text
983 searching algorithm, as described in G. H. Gonnet and R. Baeza-Yates,
984 HANDBOOK OF ALGORITHMS AND DATA STRUCTURES: IN PASCAL AND C (2nd ed).
985 Addison-Wesley Publishing Co., 1991. Pages 258-9. */
986
dict_search_bmh(lst_List l,const unsigned char * word,const dictDatabase * database,dictIndex * dbindex,int flag)987 static int dict_search_bmh( lst_List l,
988 const unsigned char *word,
989 const dictDatabase *database,
990 dictIndex *dbindex,
991 int flag )
992 {
993 const unsigned char *const start = (const unsigned char *) dbindex->start;
994 const unsigned char *const end = (const unsigned char *) dbindex->end;
995 const unsigned patlen = strlen( (const char *) word );
996 int skip[UCHAR_MAX + 1];
997 int i;
998 int j;
999 int c;
1000 #if 0
1001 int k;
1002 #endif
1003 const unsigned char *p, *pt, *ptr;
1004 int count = 0;
1005 const unsigned char *f = NULL; /* Boolean flag, but has to be a pointer */
1006 const unsigned char *wpt;
1007 const char *previous = NULL;
1008
1009 assert (dbindex);
1010
1011 if (patlen < BMH_THRESHOLD)
1012 return dict_search_brute( l, word, database, dbindex, flag, patlen );
1013
1014 for (i = 0; i <= UCHAR_MAX; i++) {
1015 if (dbindex -> isspacealnum[i])
1016 skip[i] = patlen;
1017 else
1018 skip[i] = 1;
1019 }
1020 for (i = 0; i < patlen-1; i++)
1021 skip[(unsigned char)word[i]] = patlen-i-1;
1022
1023 for (p = start+patlen-1; p < end; ) {
1024 while (*p == '\t') {
1025 FIND_NEXT(p,end);
1026 p += patlen-1;
1027 if (p > end)
1028 return count;
1029 }
1030 ++_dict_comparisons; /* counter for profiling */
1031
1032 /* FIXME. Optimize this inner loop. */
1033 for (j = patlen - 1, pt = p, wpt = word + patlen - 1; j >= 0; j--) {
1034 if (pt < start)
1035 break;
1036
1037 while (pt >= start && !dbindex -> isspacealnum[*pt]) {
1038 if (*pt == '\n' || *pt == '\t')
1039 goto continue2;
1040
1041 --pt;
1042 }
1043
1044 c = *pt--;
1045 if (!dbindex -> flag_casesensitive){
1046 c = tolowertab [c];
1047 }
1048
1049 if (c != *wpt--)
1050 break;
1051 }
1052
1053 if (j == -1) {
1054 switch (flag){
1055 case BMH_SUBSTRING:
1056 break;
1057 case BMH_SUFFIX:
1058 if (p[1] != '\t')
1059 goto continue2;
1060
1061 break;
1062 case BMH_WORD:
1063 ptr = p - patlen + 1;
1064
1065 if (ptr > start && !isspacepuncttab [ptr [-1]])
1066 goto continue2;
1067 if (p < end && !isspacepuncttab [p [1]])
1068 goto continue2;
1069
1070 break;
1071 case BMH_LAST:
1072 if (p[1] != '\t')
1073 goto continue2;
1074
1075 ptr = p - patlen + 1;
1076
1077 if (ptr > start && !isspacepuncttab [ptr [-1]])
1078 goto continue2;
1079
1080 break;
1081 }
1082
1083 for (; pt >= start && *pt != '\n'; --pt)
1084 if (*pt == '\t')
1085 goto continue2;
1086
1087 ++pt;
1088
1089 assert (pt >= start && pt < end);
1090
1091 if (!previous ||
1092 compare_1or4 (previous, dbindex, (const char *) pt,
1093 dbindex->end, &previous))
1094 {
1095 ++count;
1096 if (l){
1097 if (!dict_add_word_to_list
1098 (l, database, dbindex, (const char *) pt))
1099 {
1100 return count;
1101 }
1102 }
1103 }
1104 FIND_NEXT(p,end);
1105 f = p += patlen-1; /* Set boolean flag to non-NULL value */
1106 if (p > end)
1107 return count;
1108 }
1109 continue2:
1110 if (f){
1111 f = NULL;
1112 }else{
1113 c = *p;
1114 if (!dbindex -> flag_casesensitive){
1115 c = tolowertab [c];
1116 }
1117 p += skip [c];
1118 }
1119 }
1120
1121 return count;
1122 }
1123
dict_search_substring(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1124 static int dict_search_substring( lst_List l,
1125 const char *word,
1126 const dictDatabase *database,
1127 dictIndex *dbindex)
1128 {
1129 return dict_search_bmh (
1130 l, (const unsigned char *) word,
1131 database, dbindex, BMH_SUBSTRING );
1132 }
1133
dict_search_word(lst_List l,const char * word,const dictDatabase * database)1134 static int dict_search_word(
1135 lst_List l,
1136 const char *word,
1137 const dictDatabase *database)
1138 {
1139 lst_Position pos;
1140 dictWord *dw;
1141 const char *p;
1142 int ret1, ret2;
1143 int count;
1144 int len;
1145
1146 assert (database);
1147 assert (database -> index);
1148
1149 if (database->index_word){
1150 ret2 = dict_search_exact( l, word, database, database->index, 0 );
1151 if (ret2 < 0)
1152 return ret2;
1153
1154 count = lst_length (l);
1155
1156 ret1 = dict_search_exact( l, word, database, database->index_word, 0 );
1157 if (ret1 < 0)
1158 return ret1;
1159
1160 LST_ITERATE (l, pos, dw){
1161 if (count-- <= 0){
1162 xfree (dw -> word);
1163
1164 p = database -> index -> start + dw -> start;
1165 assert (p == database -> index -> start || p [-1] == '\n');
1166 len = strchr (p, '\t') - p;
1167
1168 dw -> word = xmalloc (len + 1);
1169 memcpy (dw -> word, p, len);
1170 dw -> word [len] = 0;
1171 dw -> start = -2;
1172 dw -> end = 0;
1173 }
1174 }
1175
1176 return ret1 + ret2;
1177 }else{
1178 return dict_search_bmh (
1179 l, (const unsigned char *) word,
1180 database, database->index, BMH_WORD );
1181 }
1182 }
1183
1184 /* return non-zero if success, 0 otherwise */
dict_match(const regex_t * re,const char * word,size_t word_len,int eflags)1185 static int dict_match (
1186 const regex_t *re,
1187 const char *word, size_t word_len,
1188 int eflags)
1189 {
1190 #if defined(REG_STARTEND)
1191 regmatch_t subs[1];
1192 subs [0].rm_so = 0;
1193 subs [0].rm_eo = word_len;
1194 return !regexec(re, word, 1, subs, eflags | REG_STARTEND);
1195 #else
1196 char *word_copy = (char *) alloca (word_len + 1);
1197 memcpy (word_copy, word, word_len);
1198 word_copy [word_len] = 0;
1199 return !regexec(re, word_copy, 0, NULL, eflags);
1200 #endif
1201 }
1202
dict_search_regexpr(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex,int type)1203 static int dict_search_regexpr( lst_List l,
1204 const char *word,
1205 const dictDatabase *database,
1206 dictIndex *dbindex,
1207 int type )
1208 {
1209 const char *start = dbindex->start;
1210 const char *end = dbindex->end;
1211 const char *p, *pt;
1212 int count = 0;
1213 regex_t re;
1214 char erbuf[100];
1215 int err;
1216 unsigned char first;
1217 const char *previous = NULL;
1218
1219 assert (dbindex);
1220
1221 #if 1
1222 /* optimization code */
1223 if (optStart_mode){
1224 if (
1225 *word == '^'
1226 && dbindex -> isspacealnum [(unsigned char) word[1]]
1227 && strchr (word, '|') == NULL)
1228 {
1229 first = word[1];
1230
1231 end = dbindex->optStart[i2c(c2i(first)+1)];
1232 start = dbindex->optStart[first];
1233
1234 #if 0
1235 fprintf (stderr, "optStart_regexp [%i] = %p\n", first, start);
1236 fprintf (stderr, "optStart_regexp [%i] = %p\n", i2c(c2i(first)+1), end);
1237 #endif
1238
1239 if (end < start)
1240 end = dbindex->end;
1241
1242 /* FIND_NEXT(end, dbindex -> end); */
1243 }
1244 }
1245 #endif
1246
1247 if ((err = regcomp(&re, word, REG_ICASE|REG_NOSUB|type))) {
1248 regerror(err, &re, erbuf, sizeof(erbuf));
1249 log_info( "regcomp(%s): %s\n", word, erbuf );
1250 return 0;
1251 }
1252
1253 pt = start;
1254 while (pt && pt < end) {
1255 for (p = pt; *p != '\t' && p < end; ++p);
1256 ++_dict_comparisons;
1257
1258 if (dict_match (&re, pt, p - pt, 0)) {
1259 if (!previous || compare_1or4 (previous, dbindex, pt, end, &previous))
1260 {
1261 ++count;
1262 if (!dict_add_word_to_list
1263 (l, database, dbindex, pt))
1264 {
1265 break;
1266 }
1267 }
1268 }
1269 pt = p + 1;
1270 FIND_NEXT( pt, end );
1271 }
1272
1273 regfree(&re);
1274
1275 return count;
1276 }
1277
dict_search_re(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1278 static int dict_search_re( lst_List l,
1279 const char *word,
1280 const dictDatabase *database,
1281 dictIndex *dbindex)
1282 {
1283 return dict_search_regexpr( l, word, database, dbindex, REG_EXTENDED );
1284 }
1285
dict_search_regexp(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1286 static int dict_search_regexp( lst_List l,
1287 const char *word,
1288 const dictDatabase *database,
1289 dictIndex *dbindex)
1290 {
1291 return dict_search_regexpr( l, word, database, dbindex, 0 /*REG_BASIC*/ );
1292 }
1293
dict_search_soundex(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1294 static int dict_search_soundex( lst_List l,
1295 const char *word,
1296 const dictDatabase *database,
1297 dictIndex *dbindex)
1298 {
1299 const char *pt;
1300 const char *end;
1301 int count = 0;
1302 char soundex [10];
1303 char soundex2 [5];
1304 char buffer[MAXWORDLEN];
1305 char *d;
1306 const unsigned char *s;
1307 int i;
1308 int c = (unsigned char)*word;
1309 const char *previous = NULL;
1310
1311 assert (dbindex);
1312
1313 if (optStart_mode){
1314 pt = dbindex->optStart[ c ];
1315 end = dbindex->optStart[ i2c(c2i(c)+1) ];
1316 if (end < pt)
1317 end = dbindex->end;
1318 }else{
1319 pt = dbindex->start;
1320 end = dbindex->end;
1321 }
1322
1323 txt_soundex2 (word, soundex);
1324
1325 while (pt && pt < end) {
1326 s = (const unsigned char *) pt;
1327 for (i = 0, d = buffer; i < MAXWORDLEN - 1; i++, ++s) {
1328 if (*s == '\t') break;
1329 if (!dbindex -> isspacealnum [*s]) continue;
1330 *d++ = *s;
1331 }
1332 *d = '\0';
1333
1334 txt_soundex2 (buffer, soundex2);
1335 if (!strcmp (soundex, soundex2)) {
1336 if (!previous || compare_1or4 (previous, dbindex, pt, end, &previous))
1337 {
1338 if (!dict_add_word_to_list
1339 (l, database, dbindex, pt))
1340 {
1341 break;
1342 }
1343
1344 ++count;
1345 }
1346 }
1347 FIND_NEXT(pt,end);
1348 }
1349
1350 return count;
1351 }
1352
1353 /* I was unable to locate V. I. Levenshtein, "Binary codes capable of
1354 correcting deletions, insertions, and reversals,"
1355 Sov. Phys.-Dokt. 10(8): Feb. 1966, 707-710.
1356
1357 So, I used Joseph J. Pollock and Antonio Zamora, "Automatic spelling
1358 correction in scientific and scholarly text," CACM, 27(4): Apr. 1985,
1359 358-368. They point out (p. 363) that the precedence of these tests,
1360 when used for spelling correction, is OMISSION = TRANSPOSITION >
1361 INSERTION > SUBSTITUTION. If list is not sorted, then this order should
1362 be used.
1363
1364 In this routine, we only consider corrections with a Levenshtein
1365 distance of 1.
1366
1367 */
1368
1369 typedef struct lev_args_ {
1370 const dictDatabase *database;
1371 dictIndex *dbindex;
1372 lst_List l;
1373 } LEV_ARGS;
1374
1375 #define CHECK(word, args) \
1376 if ((pt = dict_index_search((word), (args) -> dbindex)) \
1377 && !compare((word), (args) -> dbindex, pt, (args) -> dbindex -> end)) \
1378 { \
1379 if (!set_member(s,(word))) { \
1380 ++count; \
1381 set_insert(s,str_find((word))); \
1382 if (!dict_add_word_to_list ((args) -> l, (args) -> database, (args) -> dbindex, pt)) return count; \
1383 PRINTF(DBG_LEV,(" %s added\n",(word))); \
1384 } \
1385 }
1386
1387 #define LEV_VARS \
1388 char tmp;
1389
1390 #include "lev.h"
1391
dict_search_levenshtein(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1392 static int dict_search_levenshtein (
1393 lst_List l,
1394 const char *word,
1395 const dictDatabase *database,
1396 dictIndex *dbindex)
1397 {
1398 LEV_ARGS lev_args = { database, dbindex, l };
1399
1400 assert (database);
1401 assert (dbindex);
1402
1403 if (database -> alphabet){
1404 return dict_search_lev (
1405 word, database -> alphabet, dbindex -> flag_utf8, &lev_args);
1406 }else{
1407 if (dbindex -> flag_utf8){
1408 return dict_search_lev (
1409 word, global_alphabet_ascii, 1, &lev_args);
1410 }else{
1411 return dict_search_lev (
1412 word, global_alphabet_8bit, 0, &lev_args);
1413 }
1414 }
1415 }
1416
1417 /*
1418 makes anagram of the 8-bit string 'str'
1419 if length == -1 then str is 0-terminated string
1420 */
stranagram_8bit(char * str,int length)1421 static void stranagram_8bit (char *str, int length)
1422 {
1423 char* i = str;
1424 char* j;
1425 char v;
1426
1427 assert (str);
1428
1429 if (length == -1)
1430 length = strlen (str);
1431
1432 j = str + length - 1;
1433
1434 while (i < j){
1435 v = *i;
1436 *i = *j;
1437 *j = v;
1438
1439 ++i;
1440 --j;
1441 }
1442 }
1443
1444 #if HAVE_UTF8
1445 /*
1446 makes anagram of the utf-8 string 'str'
1447 Returns non-zero if success, 0 otherwise
1448 */
stranagram_utf8(char * str)1449 static int stranagram_utf8 (char *str)
1450 {
1451 size_t len;
1452 char *p;
1453
1454 mbstate_t ps;
1455
1456 assert (str);
1457
1458 memset (&ps, 0, sizeof (ps));
1459
1460 for (p = str; *p; ){
1461 len = mbrlen__ (p, MB_CUR_MAX__, &ps);
1462 if ((int) len < 0)
1463 return 0; /* not a UTF-8 string */
1464
1465 if (len > 1)
1466 stranagram_8bit (p, len);
1467
1468 p += len;
1469 }
1470
1471 stranagram_8bit (str, -1);
1472 return 1;
1473 }
1474 #endif
1475
1476 /* makes anagram of utf-8 string 'str' */
stranagram(char * str,int utf8_string)1477 static int stranagram (char *str, int utf8_string)
1478 {
1479 assert (str);
1480
1481 #if HAVE_UTF8
1482 if (utf8_string){
1483 return stranagram_utf8 (str);
1484 }else{
1485 stranagram_8bit (str, -1);
1486 return 1;
1487 }
1488 #else
1489 stranagram_8bit (str, -1);
1490 return 1;
1491 #endif
1492 }
1493
dict_search_suffix(lst_List l,const char * word,const dictDatabase * database)1494 static int dict_search_suffix(
1495 lst_List l,
1496 const char *word,
1497 const dictDatabase *database)
1498 {
1499 int ret;
1500 lst_Position p;
1501 dictWord *dw;
1502 char *buf = NULL;
1503 int count;
1504
1505 assert (database);
1506
1507 if (database->index_suffix){
1508 buf = (char *) alloca (strlen (word));
1509 strcpy (buf, word);
1510
1511 PRINTF(DBG_SEARCH, ("anagram: '%s' ==> ", buf));
1512 if (!stranagram (buf, utf8_mode)){
1513 PRINTF(DBG_SEARCH, ("failed building anagram\n"));
1514 return 0; /* invalid utf8 string */
1515 }
1516
1517 count = lst_length (l);
1518
1519 PRINTF(DBG_SEARCH, ("'%s'\n", buf));
1520 ret = dict_search_prefix (
1521 l, buf, database, database->index_suffix, 0, INT_MAX);
1522
1523 LST_ITERATE (l, p, dw) {
1524 if (count <= 0){
1525 stranagram (dw -> word, utf8_mode);
1526 }
1527
1528 --count;
1529 }
1530 return ret;
1531 }else{
1532 return dict_search_bmh(
1533 l, (const unsigned char *) word,
1534 database, database -> index, BMH_SUFFIX );
1535 }
1536 }
1537
dict_search_last(lst_List l,const char * word,const dictDatabase * database)1538 static int dict_search_last (
1539 lst_List l,
1540 const char *word,
1541 const dictDatabase *database)
1542 {
1543 int ret;
1544 lst_Position p;
1545 dictWord *dw;
1546 char *buf = NULL;
1547 int count;
1548
1549 assert (database);
1550
1551 if (database->index_suffix){
1552 buf = (char *) alloca (strlen (word));
1553 strcpy (buf, word);
1554
1555 PRINTF(DBG_SEARCH, ("anagram: '%s' ==> ", buf));
1556 if (!stranagram (buf, utf8_mode)){
1557 PRINTF(DBG_SEARCH, ("failed building anagram\n"));
1558 return 0; /* invalid utf8 string */
1559 }
1560
1561 count = lst_length (l);
1562
1563 PRINTF(DBG_SEARCH, ("'%s'\n", buf));
1564 ret = dict_search_first (l, buf, database, database->index_suffix);
1565
1566 LST_ITERATE (l, p, dw) {
1567 if (count-- <= 0){
1568 stranagram (dw -> word, utf8_mode);
1569 }
1570 }
1571 return ret;
1572 }else{
1573 return dict_search_bmh(
1574 l, (const unsigned char *) word,
1575 database, database -> index, BMH_LAST );
1576 }
1577 }
1578
1579 #if HAVE_UTF8
1580 static const char *utf8_err_msg = "\
1581 error: The request is not a valid UTF-8 string";
1582 #endif
1583
1584 /*
1585 returns a number of matches ( >= 0 ) or
1586 negative value for invalid UTF-8 string
1587 */
dict_search_database_(lst_List l,const char * word,const dictDatabase * database,int strategy_or_define)1588 int dict_search_database_ (
1589 lst_List l,
1590 const char *word,
1591 const dictDatabase *database,
1592 int strategy_or_define )
1593 {
1594 char *buf = NULL;
1595 #if HAVE_UTF8
1596 dictWord *dw = NULL;
1597 #endif
1598 unsigned int skip_count = 0;
1599 unsigned int item_count = INT_MAX;
1600
1601 int strategy = strategy_or_define & ~DICT_MATCH_MASK;
1602
1603 define_or_match = (strategy == strategy_or_define);
1604
1605 assert (database);
1606 assert (database -> index);
1607
1608 if (strategy == DICT_STRAT_DOT){
1609 strategy = database -> default_strategy;
1610 }
1611
1612 if (strategy == DICT_STRAT_NPREFIX){
1613 if (2 == sscanf (word, "%u#%u#", &skip_count, &item_count)){
1614 while (*word++ != '#');
1615 ++word;
1616 while (*word++ != '#');
1617 }
1618 }
1619
1620 buf = alloca( strlen( word ) + 1 );
1621
1622 #if HAVE_UTF8
1623 if (
1624 !strcmp(utf8_err_msg, word) ||
1625 tolower_alnumspace (
1626 word, buf,
1627 database -> index -> flag_allchars,
1628 database -> index -> flag_casesensitive,
1629 utf8_mode))
1630 {
1631 PRINTF(DBG_SEARCH, ("tolower_... ERROR!!!\n"));
1632
1633 dw = xmalloc (sizeof (dictWord));
1634 memset (dw, 0, sizeof (dictWord));
1635
1636 dw -> database = database;
1637 dw -> def = utf8_err_msg;
1638 dw -> def_size = -1;
1639 dw -> word = strdup (word);
1640
1641 lst_append (l, dw);
1642
1643 return -1;
1644 }
1645 #else
1646 tolower_alnumspace (
1647 word, buf,
1648 database -> index -> flag_allchars,
1649 database -> index -> flag_casesensitive,
1650 utf8_mode);
1651 #endif
1652
1653 if (!buf [0] && word [0]){
1654 /*
1655 This may happen because of invalid --locale specified.
1656 Without following line entire dictionary will be returned
1657 for non-ASCII words.
1658 */
1659 return 0;
1660 }
1661
1662 switch (strategy) {
1663 case DICT_STRAT_EXACT:
1664 return dict_search_exact( l, buf, database, database->index,
1665 strategy_or_define != strategy);
1666
1667 case DICT_STRAT_PREFIX:
1668 case DICT_STRAT_NPREFIX:
1669 return dict_search_prefix (l, buf, database, database->index,
1670 skip_count, item_count);
1671
1672 case DICT_STRAT_SUBSTRING:
1673 return dict_search_substring( l, buf, database, database->index );
1674
1675 case DICT_STRAT_SUFFIX:
1676 return dict_search_suffix( l, buf, database );
1677
1678 case DICT_STRAT_RE:
1679 return dict_search_re( l, word, database, database->index );
1680
1681 case DICT_STRAT_REGEXP:
1682 return dict_search_regexp( l, word, database, database->index );
1683
1684 case DICT_STRAT_SOUNDEX:
1685 return dict_search_soundex( l, buf, database, database->index );
1686
1687 case DICT_STRAT_LEVENSHTEIN:
1688 return dict_search_levenshtein( l, buf, database, database->index);
1689
1690 case DICT_STRAT_WORD:
1691 return dict_search_word( l, buf, database);
1692
1693 case DICT_STRAT_FIRST:
1694 return dict_search_first( l, buf, database, database->index );
1695
1696 case DICT_STRAT_LAST:
1697 return dict_search_last( l, buf, database );
1698
1699 default:
1700 /* plugins may support unusual search strategies */
1701 return 0;
1702 }
1703 }
1704
1705 /*
1706 Replaces invisible databases with db argument.
1707 */
replace_invisible_databases(lst_Position pos,const dictDatabase * db)1708 static void replace_invisible_databases (
1709 lst_Position pos,
1710 const dictDatabase *db)
1711 {
1712 dictWord *dw;
1713
1714 while (pos){
1715 dw = (dictWord *) lst_get_position (pos);
1716
1717 if (
1718 dw -> database &&
1719 dw -> database -> invisible &&
1720 !dw -> database_visible)
1721 {
1722 dw -> database_visible = db;
1723 }
1724
1725 pos = lst_next_position (pos);
1726 }
1727 }
1728
1729 /*
1730 returns a number of matches ( >= 0 ) or
1731 negative value for invalid UTF-8 string
1732 */
dict_search(lst_List l,const char * const word,const dictDatabase * database,int strategy,int option_mime,int * extra_result,const dictPluginData ** extra_data,int * extra_data_size)1733 int dict_search (
1734 lst_List l,
1735 const char *const word,
1736 const dictDatabase *database,
1737 int strategy,
1738 int option_mime,
1739 int *extra_result,
1740 const dictPluginData **extra_data,
1741 int *extra_data_size)
1742 {
1743 int count = 0;
1744
1745 int norm_strategy = strategy & ~DICT_MATCH_MASK;
1746
1747 if (extra_result)
1748 *extra_result = DICT_PLUGIN_RESULT_NOTFOUND;
1749
1750 assert (word);
1751 assert (database);
1752
1753 if (
1754 database -> strategy_disabled &&
1755 database -> strategy_disabled [norm_strategy])
1756 {
1757 /* disable_strategy keyword from configuration file */
1758 #if 0
1759 PRINTF (DBG_SEARCH, (
1760 ":S: strategy '%s' is disabled for database '%s'\n",
1761 get_strategies () [norm_strategy] -> name,
1762 database -> databaseName ? database -> databaseName : "(unknown)"));
1763 #endif
1764 return 0;
1765 }
1766
1767 PRINTF (DBG_SEARCH, (":S: Searching in '%s'\n", database -> databaseName));
1768
1769 #if 0
1770 fprintf (stderr, "STRATEGY: %x\n", strategy);
1771 #endif
1772
1773 if (database -> index){
1774 PRINTF (DBG_SEARCH, (":S: database search\n"));
1775 count = dict_search_database_ (l, word, database, strategy);
1776 }
1777
1778 #ifdef USE_PLUGIN
1779 if (!count && database -> plugin){
1780 PRINTF (DBG_SEARCH, (":S: plugin search\n"));
1781 count = dict_search_plugin (
1782 l, word, database, strategy,
1783 extra_result, extra_data, extra_data_size);
1784
1785 if (count)
1786 return count;
1787 }
1788 #endif
1789
1790 if (!count && database -> virtual_db_list){
1791 lst_Position db_list_pos;
1792 dictDatabase *db = NULL;
1793 int old_count = lst_length (l);
1794
1795 assert (lst_init_position (database -> virtual_db_list));
1796
1797 LST_ITERATE (database -> virtual_db_list, db_list_pos, db){
1798 count += dict_search (
1799 l, word, db, strategy, option_mime,
1800 extra_result, extra_data, extra_data_size);
1801 }
1802
1803 if (count > old_count){
1804 replace_invisible_databases (
1805 lst_nth_position (l, old_count + 1),
1806 database);
1807 }
1808 }
1809
1810 if (!count && database -> mime_db){
1811 int old_count = lst_length (l);
1812
1813 count += dict_search (
1814 l, word,
1815 (option_mime ? database -> mime_mimeDB :
1816 database -> mime_nomimeDB),
1817 strategy, 0,
1818 extra_result, extra_data, extra_data_size);
1819
1820 if (count > old_count){
1821 replace_invisible_databases (
1822 lst_nth_position (l, old_count + 1),
1823 database);
1824 }
1825 }
1826
1827 if (count > 0 && extra_result)
1828 *extra_result = DICT_PLUGIN_RESULT_FOUND;
1829
1830 return count;
1831 }
1832
dict_index_open(const char * filename,int init_flags,const dictIndex * base)1833 dictIndex *dict_index_open(
1834 const char *filename,
1835 int init_flags,
1836 const dictIndex *base)
1837 {
1838 struct stat sb;
1839 static int tabInit = 0;
1840 dictIndex *i;
1841 dictDatabase db;
1842 int j;
1843 char buf[2];
1844
1845 int old_8bit_format = 0;
1846
1847 int first_char;
1848 int first_char_uc;
1849
1850 if (!filename)
1851 return NULL;
1852
1853 i = xmalloc( sizeof( struct dictIndex ) );
1854
1855 if (!tabInit) dict_table_init();
1856 tabInit = 1;
1857
1858 memset( i, 0, sizeof( struct dictIndex ) );
1859
1860 if ((i->fd = open( filename, O_RDONLY )) < 0)
1861 err_fatal_errno( __func__,
1862 "Cannot open index file \"%s\"\n", filename );
1863 if (fstat( i->fd, &sb ))
1864 err_fatal_errno( __func__,
1865 "Cannot stat index file \"%s\"\n", filename );
1866 i->size = sb.st_size;
1867
1868 if (mmap_mode){
1869 if (i->size) {
1870 i->start = mmap( NULL, i->size, PROT_READ, MAP_SHARED, i->fd, 0 );
1871 if ((void *)i->start == (void *)(-1))
1872 err_fatal_errno (
1873 __func__,
1874 "Cannot mmap index file \"%s\"\b", filename );
1875 } else i->start = NULL; /* allow for /dev/null dummy index */
1876 }else{
1877 i->start = xmalloc (i->size);
1878 if (-1 == read (i->fd, (char *) i->start, i->size))
1879 err_fatal_errno (
1880 __func__,
1881 "Cannot read index file \"%s\"\b", filename );
1882
1883 close (i -> fd);
1884 i -> fd = 0;
1885 }
1886
1887 i->end = i->start + i->size;
1888
1889 i->flag_8bit = 0;
1890 if (base){
1891 i->flag_utf8 = base -> flag_utf8;
1892 i->flag_allchars = base -> flag_allchars;
1893 i->flag_casesensitive = base -> flag_casesensitive;
1894 }
1895 i->isspacealnum = isspacealnumtab;
1896
1897 if (optStart_mode){
1898 for (j = 0; j <= UCHAR_MAX; j++)
1899 i->optStart[j] = i->start;
1900 }
1901
1902 if (init_flags){
1903 memset (&db, 0, sizeof (db));
1904 db.index = i;
1905
1906 /* for exact search */
1907 i->flag_allchars = 1;
1908 i->isspacealnum = isspacealnumtab_allchars;
1909
1910 /* allchars flag */
1911 i->flag_allchars =
1912 0 != dict_search_database_ (NULL, DICT_FLAG_ALLCHARS, &db, DICT_STRAT_EXACT);
1913 PRINTF(DBG_INIT, (":I: \"%s\": flag_allchars=%i\n", filename, i->flag_allchars));
1914
1915 /* case-sensitive flag */
1916 i->flag_casesensitive =
1917 0 != dict_search_database_ (NULL, DICT_FLAG_CASESENSITIVE, &db, DICT_STRAT_EXACT);
1918 PRINTF(DBG_INIT, (":I: \"%s\": flag_casesensitive=%i\n", filename, i->flag_casesensitive));
1919
1920 /* utf8 flag */
1921 if (!i -> flag_allchars)
1922 i -> isspacealnum = isspacealnumtab;
1923
1924 i->flag_utf8 =
1925 0 != dict_search_database_ (NULL, DICT_FLAG_UTF8, &db, DICT_STRAT_EXACT);
1926 PRINTF(DBG_INIT, (":I: \"%s\": flag_utf8=%i\n", filename, i->flag_utf8));
1927 if (i->flag_utf8 && !utf8_mode){
1928 log_info( ":E: locale '%s' can not be used for utf-8 dictionaries. Exiting\n", locale );
1929 exit (1);
1930 }
1931
1932 /* 8bit flag */
1933 i->flag_8bit =
1934 0 != dict_search_database_ (NULL, DICT_FLAG_8BIT_NEW, &db, DICT_STRAT_EXACT);
1935 old_8bit_format =
1936 0 != dict_search_database_ (NULL, DICT_FLAG_8BIT_OLD, &db, DICT_STRAT_EXACT);
1937
1938 if (old_8bit_format){
1939 log_info( ":E: index file '%s' was created using dictfmt <1.9.15\n"
1940 ":E: and can not be used with dictd-1.9.15 or later\n"
1941 ":E: Rebuild it like this:\n"
1942 ":E: dictunformat db.index < db.dict | dictfmt -t --locale <8bit-locale> db-new\n", filename );
1943 exit (1);
1944 }
1945
1946 PRINTF(DBG_INIT, (":I: \"%s\": flag_8bit=%i\n", filename, i->flag_8bit));
1947 if (i->flag_8bit){
1948 log_info( ":E: locale '%s' can not be used for 8-bit dictionaries. Exiting\n", locale );
1949 exit (1);
1950 }
1951 }
1952
1953 if (optStart_mode){
1954 buf[0] = ' ';
1955 buf[1] = '\0';
1956 i->optStart[ ' ' ] = binary_search_8bit( buf, i, i->start, i->end );
1957
1958 for (j = 0; j < charcount; j++) {
1959 first_char = c(j);
1960
1961 buf[0] = first_char;
1962 buf[1] = '\0';
1963
1964 i->optStart [first_char]
1965 = binary_search_8bit( buf, i, i->start, i->end );
1966
1967 if (!utf8_mode || first_char < 128){
1968 first_char_uc = toupper (first_char);
1969
1970 assert (first_char_uc >= 0 && first_char_uc <= UCHAR_MAX);
1971
1972 i->optStart [first_char_uc] = i->optStart [first_char];
1973 }
1974 }
1975
1976 for (j = '0'; j <= '9'; j++) {
1977 buf[0] = j;
1978 buf[1] = '\0';
1979 i->optStart[j] = binary_search_8bit( buf, i, i->start, i->end );
1980 }
1981
1982 i->optStart[UCHAR_MAX] = i->end;
1983 i->optStart[UCHAR_MAX+1] = i->end;
1984
1985 if (dbg_test (DBG_SEARCH)){
1986 for (j=0; j <= UCHAR_MAX; ++j){
1987 if (!utf8_mode || j <= CHAR_MAX)
1988 printf (
1989 "optStart [%c] = (%p) %10s\n",
1990 j,
1991 i->optStart [j],
1992 i->optStart [j]);
1993 else
1994 printf (
1995 "optStart [%i] = (%p) %10s\n",
1996 j,
1997 i->optStart [j],
1998 i->optStart [j]);
1999 }
2000 }
2001 }
2002
2003 return i;
2004 }
2005
dict_index_close(dictIndex * i)2006 void dict_index_close( dictIndex *i )
2007 {
2008 if (!i)
2009 return;
2010
2011 if (mmap_mode){
2012 if (i->fd >= 0) {
2013 if(i->start)
2014 munmap( (void *)i->start, i->size );
2015 close( i->fd );
2016 i->fd = 0;
2017 }
2018 }else{
2019 if (i -> start)
2020 xfree ((char *) i -> start);
2021 }
2022
2023 xfree (i);
2024 }
2025