1 /* index.c --
2  * Created: Wed Oct  9 14:52:23 1996 by faith@dict.org
3  * Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org)
4  * Copyright 2002-2008 Aleksey Cheusov (vle@gmx.net)
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License as published by the
8  * Free Software Foundation; either version 1, or (at your option) any
9  * later version.
10  *
11  * This program is distributed in the hope that it will be useful, but
12  * WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along
17  * with this program; if not, write to the Free Software Foundation, Inc.,
18  * 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20 
21 #include "dictP.h"
22 #include "dictzip.h"
23 #include "index.h"
24 #include "strategy.h"
25 #include "str.h"
26 
27 #ifdef USE_PLUGIN
28 #include "plugin.h"
29 #endif
30 
31 #include <regex.h>
32 
33 #include <sys/stat.h>
34 
35 #include <fcntl.h>
36 #include <ctype.h>
37 #include <wctype.h>
38 #include <wchar.h>
39 #include <stdio.h>
40 #include <sys/stat.h>
41 
42 extern int mmap_mode;
43 
44 #define FIND_PREV(begin, pt) while (pt > begin && pt [-1] != '\n') --pt;
45 #define FIND_NEXT(pt, end) while (pt < end && *pt++ != '\n');
46 
47 #define MAXWORDLEN    512
48 #define BMH_THRESHOLD   3	/* When to start using Boyer-Moore-Hoorspool */
49 
50 /* defaults to run in UTF-8 mode */
51 int utf8_mode=1;        /* dictd uses UTF-8 dictionaries */
52 
53 int optStart_mode = 1;	/* Optimize search range for constant start */
54 
55 dictConfig *DictConfig;
56 
57 int _dict_comparisons;
58 static int isspacealnumtab [UCHAR_MAX + 1];
59 static int isspacealnumtab_allchars[UCHAR_MAX + 1];
60 static int isspacepuncttab [UCHAR_MAX + 1];
61 static int char2indextab   [UCHAR_MAX + 2];
62 static int index2chartab   [UCHAR_MAX + 2];
63 static int tolowertab      [UCHAR_MAX + 1];
64 
65 char global_alphabet_8bit [UCHAR_MAX + 2];
66 char global_alphabet_ascii [UCHAR_MAX + 2];
67 
68 static int chartab [UCHAR_MAX + 1];
69 static int charcount = 0;
70 
71 static int define_or_match = 0; /* 1 if define */
72 
73 /* #define isspacealnum(x) (isspacealnumtab[(unsigned char)(x)]) */
74 #define c2i(x) (char2indextab[(unsigned char)(x)])
75 #define i2c(x) (index2chartab[(unsigned char)(x)])
76 #define c(x)   (((x) < charcount) ? chartab[(unsigned char)(x)] : 0)
77 
78 /*
79   compares two 8bit strings (containing one character)
80   according to locate
81 */
dict_table_init_compare_strcoll(const void * a,const void * b)82 static int dict_table_init_compare_strcoll (const void *a, const void *b)
83 {
84     return strcoll(*(const char **)a, *(const char **)b);
85 }
86 
87 /*
88   compares two strings (containing one character)
89   converting ASCII character to lower case.
90 */
dict_table_init_compare_utf8(const void * a,const void * b)91 static int dict_table_init_compare_utf8 (const void *a, const void *b)
92 {
93     int c1 = ** (const unsigned char **) a;
94     int c2 = ** (const unsigned char **) b;
95 
96     if (c1 <= CHAR_MAX)
97 	c1 = tolower (c1);
98     if (c2 <= CHAR_MAX)
99 	c2 = tolower (c2);
100 
101     return c1 - c2;
102 }
103 
dict_make_global_alphabet(void)104 static void dict_make_global_alphabet (void)
105 {
106    int i;
107    int j = 0;
108    int ch;
109 
110    for (i=0; i < charcount; ++i){
111       global_alphabet_8bit [i] = ch = c(i);
112 
113       if (ch < 128)
114 	 global_alphabet_ascii [j++] = ch;
115    }
116 
117    global_alphabet_8bit  [charcount] = 0;
118    global_alphabet_ascii [j] = 0;
119 
120    PRINTF(DBG_SEARCH, (
121 	  "global_alphabet_8bit  = '%s'\n",
122 	  global_alphabet_8bit));
123    PRINTF(DBG_SEARCH, (
124 	  "global_alphabet_ascii = '%s'\n",
125 	  global_alphabet_ascii));
126 }
127 
dict_table_init(void)128 static void dict_table_init(void)
129 {
130    int      i;
131    unsigned char s[2 * UCHAR_MAX + 2];
132    unsigned char *p[UCHAR_MAX + 1];
133 
134    for (i = 0; i <= UCHAR_MAX; i++) {
135       if (isspace(i) || isalnum(i) || (utf8_mode && i >= 0x80)){
136 	 isspacealnumtab [i] = 1;
137       }else{
138 	 isspacealnumtab [i] = 0;
139       }
140 
141       tolowertab [i] = tolower (i);
142       if (i >= 0x80){
143 	 if (utf8_mode || !utf8_mode){
144 	    /* utf-8 or ASCII mode */
145 	    tolowertab [i] = i;
146 	 }
147       }
148 
149       if (utf8_mode && i >= 0x80){
150 	 isspacepuncttab [i] = 0;
151       }else{
152 	 isspacepuncttab [i] = isspace(i) || ispunct(i);
153       }
154 
155       isspacealnumtab_allchars [i] = 1;
156    }
157    isspacepuncttab['\t'] = isspacepuncttab['\n'] = 1; /* special */
158    isspacealnumtab['\t'] = isspacealnumtab['\n'] = 0; /* special */
159    isspacealnumtab_allchars['\t'] = isspacealnumtab_allchars['\n'] = 0; /* special */
160 
161    charcount = 0;
162    for (i = 0; i <= UCHAR_MAX; i++){
163       if (islower (i) || (utf8_mode && i >= 0xC0))
164 	 chartab [charcount++] = i;
165    }
166 
167                                 /* Populate an array with length-1 strings */
168    for (i = 0; i <= UCHAR_MAX; i++) {
169       if (!isupper (i)){
170 	 s[2 * i] = i;
171       }else{
172 	 s[2 * i] = 0;
173       }
174 
175       s[2 * i + 1] = '\0';
176       p[i]         = &s[2 * i];
177    }
178                                 /* Sort those strings in the locale */
179    if (utf8_mode)
180       qsort(p, UCHAR_MAX + 1, sizeof(p[0]), dict_table_init_compare_utf8);
181    else
182       qsort(p, UCHAR_MAX + 1, sizeof(p[0]), dict_table_init_compare_strcoll);
183 
184                                 /* Extract our unordered arrays */
185    for (i = 0; i <= UCHAR_MAX; i++) {
186       char2indextab[*p[i]] = i;
187       index2chartab[i]     = *p[i];
188    }
189    char2indextab[UCHAR_MAX + 1] = UCHAR_MAX;
190    index2chartab[UCHAR_MAX + 1] = UCHAR_MAX; /* we may index here in  */
191 
192    if (dbg_test(DBG_SEARCH)) {
193       for (i = 0; i <= UCHAR_MAX; ++i){
194 	 if (p [i][0] <= CHAR_MAX){
195 	    PRINTF (DBG_SEARCH,("sorted list: %s\n", p [i]));
196 	 }else{
197 	    PRINTF (DBG_SEARCH,("sorted list: %i\n", (unsigned char) p [i] [0]));
198 	 }
199       }
200    }
201 
202    if (dbg_test(DBG_SEARCH)) {
203       for (i = 0; i < charcount; i++)
204 	 PRINTF(DBG_SEARCH,("%03d %d ('%c')\n", i, c(i), c(i)));
205 
206       for (i = 0; i <= UCHAR_MAX; i++)
207 	 PRINTF(DBG_SEARCH,("c2i(%d/'%c') = %d; i2c(%d) = %d/'%c'\n",
208 		i, (char) isgraph(i) ? i : '.',
209 		c2i(i), c2i(i),
210 		i2c(c2i(i)), (char) i2c(c2i(i)) ? i2c(c2i(i)) : '.'));
211    }
212 
213 
214    dict_make_global_alphabet ();
215 }
216 
compare_allchars(const char * word,const char * start,const char * end)217 static int compare_allchars(
218     const char *word,
219     const char *start, const char *end )
220 {
221    int c1, c2;
222    int result;
223 
224    PRINTF(DBG_SEARCH,("   We are inside index.c:compare_allchars\n"));
225 
226    while (*word && *word != '\t' && *word != '\n' &&
227 	  start < end && *start != '\t' && *start != '\n')
228    {
229 #if 0
230       if (isspace( (unsigned char) *start ))
231 	 c2 = ' ';
232       else
233 	 c2 = * (unsigned char *) start;
234 
235       if (isspace( (unsigned char) *word ))
236 	 c1 = ' ';
237       else
238 	 c1 = * (unsigned char *) word;
239 
240 #else
241       c2 = * (unsigned char *) start;
242       c1 = * (unsigned char *) word;
243 #endif
244       if (c1 != c2) {
245 	 if (c1 < c2){
246 	    result = -2;
247 	 }else{
248 	    result = 1;
249 	 }
250 
251 	 if (dbg_test(DBG_SEARCH)){
252 	    PRINTF(DBG_SEARCH,("   result = %d (%i != %i) \n", result, c1, c2));
253 	 }
254          return result;
255       }
256       ++word;
257       ++start;
258    }
259 
260    if (*word && *word != '\t' && *word != '\n')
261       result = 1;
262    else if (*start != '\t' && *start != '\n')
263       result = -1;
264    else
265       result = 0;
266 
267    PRINTF(DBG_SEARCH,("   result = %d\n", result));
268    return  result;
269 }
270 
compare_alnumspace(const char * word,const dictIndex * dbindex,const char * start,const char * end)271 static int compare_alnumspace(
272     const char *word,
273     const dictIndex *dbindex,
274     const char *start, const char *end )
275 {
276    int c1, c2;
277    int result;
278    int ret;
279 
280    assert (dbindex);
281 
282    PRINTF(DBG_SEARCH,("   We are inside index.c:compare_alnumspace\n"));
283 
284    /* FIXME.  Optimize this inner loop. */
285    while (*word && *word != '\t' && *word != '\n' &&
286 	  start < end && *start != '\t' && *start != '\n')
287    {
288       if (!dbindex -> isspacealnum[* (const unsigned char *) start]) {
289 	 ++start;
290 	 continue;
291       }
292 
293       c1 = (unsigned char) *word;
294       c2 = (unsigned char) *start;
295 
296       if (!dbindex -> flag_casesensitive){
297 	 c2 = tolowertab [c2];
298 	 c1 = tolowertab [c1];
299       }
300 
301       if (c1 != c2) {
302 	 if (utf8_mode){
303 	    if (
304 	       (c1 <= CHAR_MAX ? c2i (c1) : c1) <
305 	       (c2 <= CHAR_MAX ? c2i (c2) : c2))
306 	    {
307 	       result = -2;
308 	    }else{
309 	       result = 1;
310 	    }
311 	 }else{
312 	    result = (c2i (c1) < c2i (c2) ? -2 : 1);
313 	 }
314 	 if (dbg_test(DBG_SEARCH)){
315 	    if (utf8_mode){
316 	       PRINTF (DBG_SEARCH,(
317 			 "   result = %d (%i != %i) \n", result, c1, c2));
318 	    }else{
319 	       PRINTF (DBG_SEARCH,(
320 		  "   result = %d ('%c'(c2i=%i) != '%c'(c2i=%i)) \n",
321 		  result,
322 		  c1,
323 		  c2i (c1),
324 		  c2,
325 		  c2i (c2)));
326 	    }
327 	 }
328          return result;
329       }
330       ++word;
331       ++start;
332    }
333 
334    while (
335        *start != '\t' && *start != '\n' &&
336        !dbindex -> isspacealnum[* (const unsigned char *) start])
337    {
338       ++start;
339    }
340 
341    if (*word && *word != '\t' && *word != '\n')
342       ret = 1;
343    else if (*start != '\t' && *start != '\n')
344       ret = -1;
345    else
346       ret = 0;
347 
348    PRINTF(DBG_SEARCH,("   result = %d\n", ret));
349 
350    return ret;
351 }
352 
353 /* Compare:
354 
355    Return -2 if word <  word-pointed-to-by-start
356           -1 if word prefix word-pointed-to-by-start
357            0 if word == word-pointed-to-by-start
358 	   1 if word >  word-pointed-to-by-start
359 	   2 if some kind of error happened
360 
361    The comparison must be the same as "sort -df" phone directory order:
362    ignore all characters except letters, digits, and blanks; fold upper
363    case into the equivalent lowercase.
364 
365    word already has all of the illegal characters removed
366 */
367 
compare(const char * word,const dictIndex * dbindex,const char * start,const char * end)368 static int compare(
369    const char *word,
370    const dictIndex *dbindex,
371    const char *start, const char *end )
372 {
373    char       buf[80], *d;
374    const char *s;
375 
376    assert (dbindex);
377 
378    if (dbg_test(DBG_SEARCH)) {
379       for (
380 	 d = buf, s = start;
381 	 d - buf + 1 < (int) sizeof (buf) && s < end && *s != '\t';)
382       {
383 	 *d++ = *s++;
384       }
385 
386       *d = '\0';
387       PRINTF(DBG_SEARCH,
388 	     ("compare \"%s\" with \"%s\" (sizes: %lu and %lu)\n",
389 	      word, buf, (unsigned long) strlen( word ), (unsigned long) strlen( buf ) ));
390    }
391 
392    ++_dict_comparisons;		/* counter for profiling */
393 
394    if (dbindex &&
395        (dbindex -> flag_allchars || dbindex -> flag_utf8 ||
396 	dbindex -> flag_8bit))
397    {
398       return compare_allchars( word, start, end );
399    }else{
400       return compare_alnumspace( word, dbindex, start, end );
401    }
402 }
403 
dict_entry_is_4column(const char * entry,const char * index_end)404 static int dict_entry_is_4column (
405    const char *entry, const char *index_end)
406 {
407    int tab_count = 0;
408 
409    for (; entry < index_end; ++entry){
410       switch (*entry){
411       case '\t':
412 	 ++tab_count;
413 	 break;
414       case '\n':
415 	 switch (tab_count){
416 	 case 2:
417 	    return 0;
418 	 case 3:
419 	    return 1;
420 	 default:
421 	    goto err;
422 	 }
423       }
424    }
425 
426  err:
427    err_fatal (__func__, "bad .index file");
428 }
429 
compare_1or4(const char * word,const dictIndex * dbindex,const char * start,const char * end,const char ** column_1or4)430 static int compare_1or4 (
431    const char *word,
432    const dictIndex *dbindex,
433    const char *start, const char *end,
434    const char **column_1or4)
435 {
436    if (dict_entry_is_4column (start, end)){
437       while (start < end && *start != '\t')
438 	 ++start;
439       ++start;
440       while (start < end && *start != '\t')
441 	 ++start;
442       ++start;
443    }
444 
445    if (column_1or4)
446       *column_1or4 = start;
447 
448    return compare (word, dbindex, start, end);
449 }
450 
binary_search(const char * word,const dictIndex * dbindex,const char * start,const char * end)451 static const char *binary_search(
452     const char *word,
453     const dictIndex *dbindex,
454     const char *start, const char *end )
455 {
456    const char *pt;
457 
458    assert (dbindex);
459 
460    PRINTF(DBG_SEARCH,("%s %p %p\n", word, start, end));
461 
462    pt = start + (end-start)/2;
463    FIND_PREV(start, pt);
464    while (start < end) {
465       switch (compare( word, dbindex, pt, end )){
466 	 case -2: case -1: case 0:
467 	    end = pt;
468 	    break;
469 	 case 1:
470 	    start = pt;
471 	    FIND_NEXT(start, end)
472 	    break;
473 	 case  2:
474 	    return end;     /* ERROR!!! */
475 	 default:
476 	    assert (0);
477       }
478       PRINTF(DBG_SEARCH,("%s %p %p\n",word,start,end));
479       pt = start + (end-start)/2;
480       FIND_PREV(start, pt);
481    }
482 
483    return start;
484 }
485 
binary_search_8bit(const char * word,const dictIndex * dbindex,const char * start,const char * end)486 static const char *binary_search_8bit(
487     const char *word,
488     const dictIndex *dbindex,
489     const char *start, const char *end )
490 {
491    char       buf[80], *d;
492    const char *s;
493    const char *pt;
494    int cmp;
495 
496    assert (dbindex);
497 
498    PRINTF(DBG_SEARCH,("word/start/end %s/%p/%p\n",word,start,end));
499 
500    pt = start + (end-start)/2;
501    FIND_PREV(start, pt);
502    while (start < end) {
503       if (dbg_test(DBG_SEARCH)) {
504          for (
505 	    d = buf, s = pt;
506 	    s < end && *s != '\t' && d - buf + 1 < (int) sizeof (buf);)
507 	 {
508 	    *d++ = *s++;
509 	 }
510 
511          *d = '\0';
512          printf( "compare \"%s\" with \"%s\" (sizes: %lu and %lu)\n",
513             word, buf, (unsigned long) strlen( word ), (unsigned long) strlen( buf ) );
514       }
515 
516       if (
517 	 dbindex &&
518 	 (dbindex -> flag_utf8 || dbindex -> flag_allchars))
519       {
520 	 cmp = compare_allchars ( word, pt, end );
521       }else{
522 	 cmp = compare_alnumspace ( word, dbindex, pt, end );
523       }
524 
525       switch (cmp){
526       case -2: case -1: case 0:
527 	 end = pt;
528 	 break;
529       case 1:
530 	 start = pt;
531 	 FIND_NEXT(start, end)
532 	 break;
533       case  2:
534 	 return end;     /* ERROR!!! */
535       default:
536 	 assert (0);
537       }
538       PRINTF(DBG_SEARCH,("%s %p %p\n",word,start,end));
539       pt = start + (end-start)/2;
540       FIND_PREV(start, pt);
541    }
542 
543    return start;
544 }
545 
linear_search(const char * word,const dictIndex * dbindex,const char * start,const char * end)546 static const char *linear_search(
547     const char *word,
548     const dictIndex *dbindex,
549     const char *start, const char *end )
550 {
551    const char *pt;
552 
553    assert (dbindex);
554 
555    for (pt = start; pt < end;) {
556       switch (compare( word, dbindex, pt, end )) {
557       case -2: return NULL;	/* less than and not prefix */
558       case -1:			/* prefix */
559       case  0: return pt;	/* exact */
560       case  1: break;           /* greater than */
561       case  2: return NULL;     /* ERROR!!! */
562       }
563       FIND_NEXT(pt,end);
564    }
565    return NULL;
566 }
567 
dict_index_search(const char * word,dictIndex * idx)568 static const char *dict_index_search( const char *word, dictIndex *idx )
569 {
570    const char    *start;
571    const char    *end;
572    int first, last;
573 
574    assert (idx);
575 
576    /* With optStart:
577       17071 comparisons, 1000 words
578       33946 comparisons, 2000 words
579 
580       Without optStart:
581       20889 comparisons, 1000 words
582       41668 comparisons, 2000 words
583 
584       Linear:
585        594527 comparisons, 1000 words
586       2097035 comparisons, 2000 words
587    */
588 
589    if (optStart_mode){
590       first   = * (const unsigned char *) word;
591       last    = i2c(c2i(first)+1);
592 
593       if (dbg_test(DBG_SEARCH)) {
594 	 if (!utf8_mode || (last <= CHAR_MAX && first <= CHAR_MAX))
595 	    printf("binary_search from %c to %c\n", first, last);
596 	 else
597 	    printf("binary_search from %i to %i\n", first, last);
598       }
599 
600       end   = idx->optStart [last];
601       start = idx->optStart [first];
602 #if 0
603       fprintf (stderr, "start1 = %p\n", start);
604       fprintf (stderr, "end1   = %p\n", end);
605 #endif
606    }else{
607       start = idx->start;
608       end   = idx->end;
609    }
610 
611    if (end < start) end = idx->end;
612 
613    start = binary_search( word, idx, start, end );
614 
615    PRINTF(DBG_SEARCH,("binary_search returns %p\n",start));
616 
617    start = linear_search( word, idx, start, idx->end );
618 
619    PRINTF(DBG_SEARCH,("linear_search returns %p\n",start));
620 
621    return start;
622 }
623 
dict_word_create(const char * entry,const dictDatabase * database,const dictIndex * dbindex)624 static dictWord *dict_word_create(
625     const char *entry,
626     const dictDatabase *database,
627     const dictIndex *dbindex)
628 {
629    int        offs_word   = 0;
630    int        offs_offset = 0;
631    int        offs_length = 0;
632    int        offset      = 0;
633 
634    int        word_len    = 0;
635 
636    dictWord   *dw       = xmalloc( sizeof( struct dictWord ) );
637    const char *pt       = entry;
638    char       *d;
639 
640    assert (dbindex);
641    assert (pt >= dbindex -> start && pt < dbindex -> end);
642 
643    memset (dw, 0, sizeof (*dw));
644 
645    for (;pt < dbindex->end && *pt != '\n'; pt++, offset++) {
646       if (*pt == '\t') {
647 	 if (!offs_offset)
648 	    offs_offset = offset + 1;
649 	 else if (!offs_length)
650 	    offs_length = offset + 1;
651 	 else if (!offs_word)
652 	    offs_word = offset + 1;
653 	 else{
654 	    err_internal( __func__,
655 			  "Too many tabs in index entry \"%*.*s\"\n",
656 			  offs_length, offs_length, entry );
657 	 }
658       }
659    }
660 
661    if (!offs_length)
662       err_internal( __func__,
663 		    "Too few tabs in index entry \"%20.20s\"\n", entry );
664 
665    dw->start    = b64_decode_buf (entry + offs_offset, offs_length - offs_offset - 1);
666    if (offs_word > 0){
667       word_len  = offset - offs_word;
668       dw->end   = b64_decode_buf (entry + offs_length, offs_word - offs_length - 1);
669    }else{
670       word_len  = offs_offset - 1;
671       dw->end   = b64_decode_buf (entry + offs_length, offset - offs_length);
672    }
673 
674    dw->def      = NULL;
675    dw->def_size = 0;
676    dw->database = database;
677 
678 				/* Apply quoting to word */
679    dw->word     = xmalloc (word_len*2 + 1);
680    entry += offs_word;
681 
682    for (d = (char *)dw->word; word_len--;) {
683        switch (*entry) {
684        case '"':
685        case '\\':
686 	   *d++ = '\\';
687        }
688        *d++ = *entry++;
689    }
690    *d = '\0';
691 
692    return dw;
693 }
694 
dict_dump_datum(const void * datum)695 static int dict_dump_datum( const void *datum )
696 {
697    dictWord *dw = (dictWord *)datum;
698    printf(
699       "\"%s\" %lu/%lu %p/%i\n",
700       dw->word, dw->start, dw->end,
701       dw->def, dw->def_size );
702 
703    return 0;
704 }
705 
dict_dump_list(lst_List list)706 void dict_dump_list( lst_List list )
707 {
708    lst_iterate( list, dict_dump_datum );
709 }
710 
dict_destroy_datum(const void * datum)711 int dict_destroy_datum( const void *datum )
712 {
713    dictWord *dw = (dictWord *)datum;
714 
715    if (!datum)
716       return 0;
717 
718    if (dw->word)
719       xfree( (char *)dw->word );
720 
721    dw->word     = NULL;
722    dw->start    = 0;
723    dw->end      = 0;
724    dw->database = NULL;
725    xfree( dw );
726    return 0;
727 }
728 
dict_destroy_list(lst_List list)729 void dict_destroy_list( lst_List list )
730 {
731    lst_iterate( list, dict_destroy_datum );
732    lst_destroy( list );
733 }
734 
735 /* returns NULL if limits exceeded */
dict_add_word_to_list(lst_List l,const dictDatabase * database,const dictIndex * dbindex,const char * pt)736 static dictWord *dict_add_word_to_list (
737    lst_List l,
738    const dictDatabase *database,
739    const dictIndex *dbindex,
740    const char *pt)
741 {
742    dictWord * datum;
743 
744    assert (l);
745 
746    if (define_or_match){
747       if (_dict_daemon_limit_defs
748 	  && lst_length (l) >= _dict_daemon_limit_defs)
749 	 return NULL;
750    }else{
751       if (_dict_daemon_limit_matches
752 	  && lst_length (l) >= _dict_daemon_limit_matches)
753 	 return NULL;
754    }
755 
756    datum = dict_word_create (pt, database, dbindex);
757    lst_append (l, datum);
758    return datum;
759 }
760 
dict_search_exact(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex,int uniq_only)761 static int dict_search_exact( lst_List l,
762 			      const char *word,
763 			      const dictDatabase *database,
764 			      dictIndex *dbindex,
765 			      int uniq_only)
766 {
767    const char *pt   = NULL;
768    int        count = 0;
769    const char *previous = NULL;
770 
771    assert (dbindex);
772 
773    pt   = dict_index_search( word, dbindex );
774 
775    while (pt && pt < dbindex->end) {
776       if (!compare( word, dbindex, pt, dbindex->end )) {
777 	 if (!uniq_only || !previous
778 	     || compare_1or4 (previous, dbindex, pt, dbindex->end, &previous))
779 	 {
780 	    ++count;
781 	    if (l){
782 	       if (!dict_add_word_to_list (l, database, dbindex, pt)) {
783 		  break;
784 	       }
785 	    }
786 	 }
787       }else{
788 	 break;
789       }
790 
791       FIND_NEXT( pt, dbindex->end );
792    }
793 
794    return count;
795 }
796 
797 enum {
798    BMH_SUBSTRING,
799    BMH_PREFIX,
800    BMH_SUFFIX,
801    BMH_WORD,
802    BMH_FIRST,
803    BMH_LAST,
804 };
805 
dict_search_prefix_first(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex,int flag,int skip_count,int item_count)806 static int dict_search_prefix_first( lst_List l,
807 			       const char *word,
808 			       const dictDatabase *database,
809 			       dictIndex *dbindex,
810 			       int flag,
811 			       int skip_count,
812 			       int item_count)
813 {
814    const char *pt   = dict_index_search( word, dbindex );
815    int        count = 0;
816    const char *previous = NULL;
817    int wordlen          = strlen (word);
818    int c                = 0;
819 
820    assert (dbindex);
821 
822    if (item_count <= 0){
823       return 0;
824    }
825 
826    while (pt && pt < dbindex->end) {
827       switch (compare( word, dbindex, pt, dbindex->end )) {
828 	 case -2:
829 	    return count;
830 	 case -1:
831 	 case 0:
832 	    if (!previous
833 		|| compare_1or4 (previous, dbindex, pt, dbindex->end,
834 				 &previous))
835 	    {
836 	       if (flag == BMH_FIRST){
837 		  c = (unsigned char) pt [wordlen];
838 		  if (c != '\t' && !isspacepuncttab [c])
839 		     break;
840 	       }
841 
842 	       if (skip_count == 0){
843 		  ++count;
844 
845 		  if (!dict_add_word_to_list (l, database, dbindex, pt))
846 		     return count;
847 
848 		  --item_count;
849 		  if (!item_count){
850 		     return count;
851 		  }
852 	       }else{
853 		  --skip_count;
854 	       }
855 	    }
856 	    break;
857 	 case 1:
858 	    return count;
859 	 case 2:
860 	    return count; /* ERROR!!! */
861 	 default:
862 	    assert (0);
863       }
864 
865       FIND_NEXT( pt, dbindex->end );
866    }
867 
868    return count;
869 }
870 
dict_search_prefix(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex,int skip_count,int item_count)871 static int dict_search_prefix (
872    lst_List l, const char *word,
873    const dictDatabase *database, dictIndex *dbindex,
874    int skip_count, int item_count)
875 {
876    return dict_search_prefix_first (l, word, database, dbindex,
877 			     BMH_PREFIX, skip_count, item_count);
878 }
879 
dict_search_first(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)880 static int dict_search_first (
881    lst_List l, const char *word,
882    const dictDatabase *database, dictIndex *dbindex)
883 {
884    return dict_search_prefix_first (l, word, database, dbindex,
885 			     BMH_FIRST, 0, INT_MAX);
886 }
887 
dict_search_brute(lst_List l,const unsigned char * word,const dictDatabase * database,dictIndex * dbindex,int flag,int patlen)888 static int dict_search_brute( lst_List l,
889 			      const unsigned char *word,
890 			      const dictDatabase *database,
891 			      dictIndex *dbindex,
892 			      int flag,
893 			      int patlen )
894 {
895    const unsigned char *const start = (const unsigned char *) dbindex->start;
896    const unsigned char *const end   = (const unsigned char *) dbindex->end;
897    const unsigned char *p, *pt;
898    int        count = 0;
899    int        result;
900    const char *previous = NULL;
901    int c;
902 
903    assert (dbindex);
904 
905    p = start;
906    while (p < end && !dbindex -> isspacealnum[*p]) ++p;
907    for (; p < end; ++p) {
908       if (*p == '\t') {
909 	 while (p < end && *p != '\n') ++p;
910 	 ++p;
911 	 while (p < end && !dbindex -> isspacealnum[*p]) ++p;
912       }
913 
914       c = *p;
915       if (!dbindex -> flag_casesensitive){
916 	 c = tolowertab [c];
917       }
918 
919       if (c == *word) {
920 	 result = compare( (const char *) word,
921 			   dbindex,
922 			   (const char *) p,
923 			   (const char *) end );
924 
925 	 if (result == -1 || result == 0) {
926 	    switch (flag){
927 	    case BMH_SUBSTRING:
928 	       break;
929 
930 	    case BMH_SUFFIX:
931 	       if (result)
932 		  continue;
933 
934 	       break;
935 
936 	    case BMH_WORD:
937 	       if (p > start && !isspacepuncttab [p [-1]])
938 		  continue;
939 	       if (p+patlen < end && !isspacepuncttab [p [patlen]])
940 		  continue;
941 
942 	       break;
943 
944 	    case BMH_LAST:
945 	       if (result)
946 		  continue;
947 	       if (p > start && !isspacepuncttab [p [-1]])
948 		  continue;
949 
950 	       break;
951 	    default:
952 	       abort ();
953 	    }
954 
955 	    for (pt = p; pt >= start && *pt != '\n'; --pt)
956 	       if (*pt == '\t')
957 		  goto continue2;
958 
959 	    if (!previous ||
960 		compare_1or4 (previous, dbindex, (char*) pt + 1,
961 			      (const char *) end, &previous))
962 	    {
963 	       ++count;
964 
965 	       if (!dict_add_word_to_list
966 		   (l, database, dbindex, (const char *) pt + 1))
967 	       {
968 		  break;
969 	       }
970 	    }
971 	    FIND_NEXT(p,end);
972 	    --p;
973 	 }
974       }
975  continue2:
976       ;
977    }
978 
979    return count;
980 }
981 
982 /* dict_search_bmh implements a version of the Boyer-Moore-Horspool text
983    searching algorithm, as described in G. H. Gonnet and R. Baeza-Yates,
984    HANDBOOK OF ALGORITHMS AND DATA STRUCTURES: IN PASCAL AND C (2nd ed).
985    Addison-Wesley Publishing Co., 1991.  Pages 258-9. */
986 
dict_search_bmh(lst_List l,const unsigned char * word,const dictDatabase * database,dictIndex * dbindex,int flag)987 static int dict_search_bmh( lst_List l,
988 			    const unsigned char *word,
989 			    const dictDatabase *database,
990 			    dictIndex *dbindex,
991 			    int flag )
992 {
993    const unsigned char *const start = (const unsigned char *) dbindex->start;
994    const unsigned char *const end   = (const unsigned char *) dbindex->end;
995    const unsigned patlen = strlen( (const char *) word );
996    int        skip[UCHAR_MAX + 1];
997    int        i;
998    int        j;
999    int c;
1000 #if 0
1001    int k;
1002 #endif
1003    const unsigned char *p, *pt, *ptr;
1004    int        count = 0;
1005    const unsigned char *f = NULL; /* Boolean flag, but has to be a pointer */
1006    const unsigned char *wpt;
1007    const char *previous = NULL;
1008 
1009    assert (dbindex);
1010 
1011    if (patlen < BMH_THRESHOLD)
1012       return dict_search_brute( l, word, database, dbindex, flag, patlen );
1013 
1014    for (i = 0; i <= UCHAR_MAX; i++) {
1015       if (dbindex -> isspacealnum[i])
1016 	 skip[i] = patlen;
1017       else
1018 	 skip[i] = 1;
1019    }
1020    for (i = 0; i < patlen-1; i++)
1021       skip[(unsigned char)word[i]] = patlen-i-1;
1022 
1023    for (p = start+patlen-1; p < end; ) {
1024       while (*p == '\t') {
1025 	 FIND_NEXT(p,end);
1026 	 p += patlen-1;
1027 	 if (p > end)
1028 	    return count;
1029       }
1030       ++_dict_comparisons;		/* counter for profiling */
1031 
1032 				/* FIXME.  Optimize this inner loop. */
1033       for (j = patlen - 1, pt = p, wpt = word + patlen - 1; j >= 0; j--) {
1034 	 if (pt < start)
1035 	    break;
1036 
1037  	 while (pt >= start && !dbindex -> isspacealnum[*pt]) {
1038 	    if (*pt == '\n' || *pt == '\t')
1039 	       goto continue2;
1040 
1041 	    --pt;
1042 	 }
1043 
1044 	 c = *pt--;
1045 	 if (!dbindex -> flag_casesensitive){
1046 	    c = tolowertab [c];
1047 	 }
1048 
1049 	 if (c != *wpt--)
1050 	    break;
1051       }
1052 
1053       if (j == -1) {
1054 	 switch (flag){
1055 	 case BMH_SUBSTRING:
1056 	    break;
1057 	 case BMH_SUFFIX:
1058 	    if (p[1] != '\t')
1059 	       goto continue2;
1060 
1061 	    break;
1062 	 case BMH_WORD:
1063 	    ptr = p - patlen + 1;
1064 
1065 	    if (ptr > start && !isspacepuncttab [ptr [-1]])
1066 	       goto continue2;
1067 	    if (p < end && !isspacepuncttab [p [1]])
1068 	       goto continue2;
1069 
1070 	    break;
1071 	 case BMH_LAST:
1072 	    if (p[1] != '\t')
1073 	       goto continue2;
1074 
1075 	    ptr = p - patlen + 1;
1076 
1077 	    if (ptr > start && !isspacepuncttab [ptr [-1]])
1078 	       goto continue2;
1079 
1080 	    break;
1081 	 }
1082 
1083 	 for (; pt >= start && *pt != '\n'; --pt)
1084 	    if (*pt == '\t')
1085 	       goto continue2;
1086 
1087 	 ++pt;
1088 
1089 	 assert (pt >= start && pt < end);
1090 
1091 	 if (!previous ||
1092 	     compare_1or4 (previous, dbindex, (const char *) pt,
1093 			   dbindex->end, &previous))
1094 	 {
1095 	    ++count;
1096 	    if (l){
1097 	       if (!dict_add_word_to_list
1098 		   (l, database, dbindex, (const char *) pt))
1099 	       {
1100 		  return count;
1101 	       }
1102 	    }
1103 	 }
1104 	 FIND_NEXT(p,end);
1105 	 f = p += patlen-1;	/* Set boolean flag to non-NULL value */
1106 	 if (p > end)
1107 	    return count;
1108       }
1109 continue2:
1110       if (f){
1111 	 f = NULL;
1112       }else{
1113 	 c = *p;
1114 	 if (!dbindex -> flag_casesensitive){
1115 	    c = tolowertab [c];
1116 	 }
1117 	 p += skip [c];
1118       }
1119    }
1120 
1121    return count;
1122 }
1123 
dict_search_substring(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1124 static int dict_search_substring( lst_List l,
1125 				  const char *word,
1126 				  const dictDatabase *database,
1127 				  dictIndex *dbindex)
1128 {
1129    return dict_search_bmh (
1130       l, (const unsigned char *) word,
1131       database, dbindex, BMH_SUBSTRING );
1132 }
1133 
dict_search_word(lst_List l,const char * word,const dictDatabase * database)1134 static int dict_search_word(
1135    lst_List l,
1136    const char *word,
1137    const dictDatabase *database)
1138 {
1139    lst_Position pos;
1140    dictWord *dw;
1141    const char *p;
1142    int ret1, ret2;
1143    int count;
1144    int len;
1145 
1146    assert (database);
1147    assert (database -> index);
1148 
1149    if (database->index_word){
1150       ret2 = dict_search_exact( l, word, database, database->index, 0 );
1151       if (ret2 < 0)
1152 	 return ret2;
1153 
1154       count = lst_length (l);
1155 
1156       ret1 = dict_search_exact( l, word, database, database->index_word, 0 );
1157       if (ret1 < 0)
1158 	 return ret1;
1159 
1160       LST_ITERATE (l, pos, dw){
1161 	 if (count-- <= 0){
1162 	    xfree (dw -> word);
1163 
1164 	    p = database -> index -> start + dw -> start;
1165 	    assert (p == database -> index -> start || p [-1] == '\n');
1166 	    len = strchr (p, '\t') - p;
1167 
1168 	    dw -> word = xmalloc (len + 1);
1169 	    memcpy (dw -> word, p, len);
1170 	    dw -> word [len] = 0;
1171 	    dw -> start = -2;
1172 	    dw -> end   = 0;
1173 	 }
1174       }
1175 
1176       return ret1 + ret2;
1177    }else{
1178       return dict_search_bmh (
1179 	 l, (const unsigned char *) word,
1180 	 database, database->index, BMH_WORD );
1181    }
1182 }
1183 
1184 /* return non-zero if success, 0 otherwise */
dict_match(const regex_t * re,const char * word,size_t word_len,int eflags)1185 static int dict_match (
1186    const regex_t *re,
1187    const char *word, size_t word_len,
1188    int eflags)
1189 {
1190 #if defined(REG_STARTEND)
1191    regmatch_t    subs[1];
1192    subs [0].rm_so = 0;
1193    subs [0].rm_eo = word_len;
1194    return !regexec(re, word, 1, subs, eflags | REG_STARTEND);
1195 #else
1196    char *word_copy = (char *) alloca (word_len + 1);
1197    memcpy (word_copy, word, word_len);
1198    word_copy [word_len] = 0;
1199    return !regexec(re, word_copy, 0, NULL, eflags);
1200 #endif
1201 }
1202 
dict_search_regexpr(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex,int type)1203 static int dict_search_regexpr( lst_List l,
1204 				const char *word,
1205 				const dictDatabase *database,
1206 				dictIndex *dbindex,
1207 				int type )
1208 {
1209    const char    *start = dbindex->start;
1210    const char    *end   = dbindex->end;
1211    const char    *p, *pt;
1212    int           count = 0;
1213    regex_t       re;
1214    char          erbuf[100];
1215    int           err;
1216    unsigned char first;
1217    const char    *previous = NULL;
1218 
1219    assert (dbindex);
1220 
1221 #if 1
1222    /* optimization code */
1223    if (optStart_mode){
1224       if (
1225 	 *word == '^'
1226 	 && dbindex -> isspacealnum [(unsigned char) word[1]]
1227 	 && strchr (word, '|') == NULL)
1228       {
1229 	 first = word[1];
1230 
1231 	 end   = dbindex->optStart[i2c(c2i(first)+1)];
1232 	 start = dbindex->optStart[first];
1233 
1234 #if 0
1235 	 fprintf (stderr, "optStart_regexp [%i] = %p\n", first, start);
1236 	 fprintf (stderr, "optStart_regexp [%i] = %p\n", i2c(c2i(first)+1), end);
1237 #endif
1238 
1239 	 if (end < start)
1240 	    end = dbindex->end;
1241 
1242 /*	 FIND_NEXT(end, dbindex -> end); */
1243       }
1244    }
1245 #endif
1246 
1247    if ((err = regcomp(&re, word, REG_ICASE|REG_NOSUB|type))) {
1248       regerror(err, &re, erbuf, sizeof(erbuf));
1249       log_info( "regcomp(%s): %s\n", word, erbuf );
1250       return 0;
1251    }
1252 
1253    pt = start;
1254    while (pt && pt < end) {
1255       for (p = pt; *p != '\t' && p < end; ++p);
1256       ++_dict_comparisons;
1257 
1258       if (dict_match (&re, pt, p - pt, 0)) {
1259 	 if (!previous || compare_1or4 (previous, dbindex, pt, end, &previous))
1260 	 {
1261 	    ++count;
1262 	    if (!dict_add_word_to_list
1263 		(l, database, dbindex, pt))
1264 	    {
1265 	       break;
1266 	    }
1267 	 }
1268       }
1269       pt = p + 1;
1270       FIND_NEXT( pt, end );
1271    }
1272 
1273    regfree(&re);
1274 
1275    return count;
1276 }
1277 
dict_search_re(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1278 static int dict_search_re( lst_List l,
1279 			   const char *word,
1280 			   const dictDatabase *database,
1281 			   dictIndex    *dbindex)
1282 {
1283    return dict_search_regexpr( l, word, database, dbindex, REG_EXTENDED );
1284 }
1285 
dict_search_regexp(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1286 static int dict_search_regexp( lst_List l,
1287 			       const char *word,
1288 			       const dictDatabase *database,
1289 			       dictIndex    *dbindex)
1290 {
1291    return dict_search_regexpr( l, word, database, dbindex, 0 /*REG_BASIC*/ );
1292 }
1293 
dict_search_soundex(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1294 static int dict_search_soundex( lst_List l,
1295 				const char *word,
1296 				const dictDatabase *database,
1297 				dictIndex    *dbindex)
1298 {
1299    const char *pt;
1300    const char *end;
1301    int        count = 0;
1302    char       soundex  [10];
1303    char       soundex2 [5];
1304    char       buffer[MAXWORDLEN];
1305    char       *d;
1306    const unsigned char *s;
1307    int        i;
1308    int        c = (unsigned char)*word;
1309    const char *previous = NULL;
1310 
1311    assert (dbindex);
1312 
1313    if (optStart_mode){
1314       pt  = dbindex->optStart[ c ];
1315       end = dbindex->optStart[ i2c(c2i(c)+1) ];
1316       if (end < pt)
1317 	 end = dbindex->end;
1318    }else{
1319       pt  = dbindex->start;
1320       end = dbindex->end;
1321    }
1322 
1323    txt_soundex2 (word, soundex);
1324 
1325    while (pt && pt < end) {
1326       s = (const unsigned char *) pt;
1327       for (i = 0, d = buffer; i < MAXWORDLEN - 1; i++, ++s) {
1328 	 if (*s == '\t') break;
1329 	 if (!dbindex -> isspacealnum [*s]) continue;
1330 	 *d++ = *s;
1331       }
1332       *d = '\0';
1333 
1334       txt_soundex2 (buffer, soundex2);
1335       if (!strcmp (soundex, soundex2)) {
1336 	 if (!previous || compare_1or4 (previous, dbindex, pt, end, &previous))
1337 	 {
1338 	    if (!dict_add_word_to_list
1339 		(l, database, dbindex, pt))
1340 	    {
1341 	       break;
1342 	    }
1343 
1344 	    ++count;
1345 	 }
1346       }
1347       FIND_NEXT(pt,end);
1348    }
1349 
1350    return count;
1351 }
1352 
1353 /* I was unable to locate V. I. Levenshtein, "Binary codes capable of
1354    correcting deletions, insertions, and reversals,"
1355    Sov. Phys.-Dokt. 10(8): Feb. 1966, 707-710.
1356 
1357    So, I used Joseph J. Pollock and Antonio Zamora, "Automatic spelling
1358    correction in scientific and scholarly text," CACM, 27(4): Apr. 1985,
1359    358-368.  They point out (p. 363) that the precedence of these tests,
1360    when used for spelling correction, is OMISSION = TRANSPOSITION >
1361    INSERTION > SUBSTITUTION.  If list is not sorted, then this order should
1362    be used.
1363 
1364    In this routine, we only consider corrections with a Levenshtein
1365    distance of 1.
1366 
1367 */
1368 
1369 typedef struct lev_args_ {
1370    const dictDatabase *database;
1371    dictIndex          *dbindex;
1372    lst_List            l;
1373 } LEV_ARGS;
1374 
1375 #define CHECK(word, args)                                \
1376    if ((pt = dict_index_search((word), (args) -> dbindex))         \
1377        && !compare((word), (args) -> dbindex, pt, (args) -> dbindex -> end)) \
1378    { \
1379       if (!set_member(s,(word))) {                       \
1380 	 ++count;                                        \
1381 	 set_insert(s,str_find((word)));                 \
1382 	 if (!dict_add_word_to_list ((args) -> l, (args) -> database, (args) -> dbindex, pt)) return count; \
1383          PRINTF(DBG_LEV,("  %s added\n",(word)));     \
1384       }                                               \
1385    }
1386 
1387 #define LEV_VARS \
1388       char tmp;
1389 
1390 #include "lev.h"
1391 
dict_search_levenshtein(lst_List l,const char * word,const dictDatabase * database,dictIndex * dbindex)1392 static int dict_search_levenshtein (
1393    lst_List l,
1394    const char *word,
1395    const dictDatabase *database,
1396    dictIndex *dbindex)
1397 {
1398    LEV_ARGS lev_args = { database, dbindex, l };
1399 
1400    assert (database);
1401    assert (dbindex);
1402 
1403    if (database -> alphabet){
1404       return dict_search_lev (
1405 	 word, database -> alphabet, dbindex -> flag_utf8, &lev_args);
1406    }else{
1407       if (dbindex -> flag_utf8){
1408 	 return dict_search_lev (
1409 	    word, global_alphabet_ascii, 1, &lev_args);
1410       }else{
1411 	 return dict_search_lev (
1412 	    word, global_alphabet_8bit, 0, &lev_args);
1413       }
1414    }
1415 }
1416 
1417 /*
1418   makes anagram of the 8-bit string 'str'
1419   if length == -1 then str is 0-terminated string
1420 */
stranagram_8bit(char * str,int length)1421 static void stranagram_8bit (char *str, int length)
1422 {
1423    char* i = str;
1424    char* j;
1425    char v;
1426 
1427    assert (str);
1428 
1429    if (length == -1)
1430        length = strlen (str);
1431 
1432    j = str + length - 1;
1433 
1434    while (i < j){
1435        v = *i;
1436        *i = *j;
1437        *j = v;
1438 
1439        ++i;
1440        --j;
1441    }
1442 }
1443 
1444 #if HAVE_UTF8
1445 /*
1446   makes anagram of the utf-8 string 'str'
1447   Returns non-zero if success, 0 otherwise
1448 */
stranagram_utf8(char * str)1449 static int stranagram_utf8 (char *str)
1450 {
1451    size_t len;
1452    char   *p;
1453 
1454    mbstate_t ps;
1455 
1456    assert (str);
1457 
1458    memset (&ps,  0, sizeof (ps));
1459 
1460    for (p = str; *p; ){
1461       len = mbrlen__ (p, MB_CUR_MAX__, &ps);
1462       if ((int) len < 0)
1463 	 return 0; /* not a UTF-8 string */
1464 
1465       if (len > 1)
1466 	  stranagram_8bit (p, len);
1467 
1468       p += len;
1469    }
1470 
1471    stranagram_8bit (str, -1);
1472    return 1;
1473 }
1474 #endif
1475 
1476 /* makes anagram of utf-8 string 'str' */
stranagram(char * str,int utf8_string)1477 static int stranagram (char *str, int utf8_string)
1478 {
1479    assert (str);
1480 
1481 #if HAVE_UTF8
1482    if (utf8_string){
1483       return stranagram_utf8 (str);
1484    }else{
1485       stranagram_8bit (str, -1);
1486       return 1;
1487    }
1488 #else
1489    stranagram_8bit (str, -1);
1490    return 1;
1491 #endif
1492 }
1493 
dict_search_suffix(lst_List l,const char * word,const dictDatabase * database)1494 static int dict_search_suffix(
1495    lst_List l,
1496    const char *word,
1497    const dictDatabase *database)
1498 {
1499    int ret;
1500    lst_Position p;
1501    dictWord *dw;
1502    char *buf = NULL;
1503    int count;
1504 
1505    assert (database);
1506 
1507    if (database->index_suffix){
1508       buf = (char *) alloca (strlen (word));
1509       strcpy (buf, word);
1510 
1511       PRINTF(DBG_SEARCH, ("anagram: '%s' ==> ", buf));
1512       if (!stranagram (buf, utf8_mode)){
1513 	 PRINTF(DBG_SEARCH, ("failed building anagram\n"));
1514 	 return 0; /* invalid utf8 string */
1515       }
1516 
1517       count = lst_length (l);
1518 
1519       PRINTF(DBG_SEARCH, ("'%s'\n", buf));
1520       ret = dict_search_prefix (
1521 	 l, buf, database, database->index_suffix, 0, INT_MAX);
1522 
1523       LST_ITERATE (l, p, dw) {
1524 	 if (count <= 0){
1525 	    stranagram (dw -> word, utf8_mode);
1526 	 }
1527 
1528 	 --count;
1529       }
1530       return ret;
1531    }else{
1532       return dict_search_bmh(
1533 	 l, (const unsigned char *) word,
1534 	 database, database -> index, BMH_SUFFIX );
1535    }
1536 }
1537 
dict_search_last(lst_List l,const char * word,const dictDatabase * database)1538 static int dict_search_last (
1539    lst_List l,
1540    const char *word,
1541    const dictDatabase *database)
1542 {
1543    int ret;
1544    lst_Position p;
1545    dictWord *dw;
1546    char *buf = NULL;
1547    int count;
1548 
1549    assert (database);
1550 
1551    if (database->index_suffix){
1552       buf = (char *) alloca (strlen (word));
1553       strcpy (buf, word);
1554 
1555       PRINTF(DBG_SEARCH, ("anagram: '%s' ==> ", buf));
1556       if (!stranagram (buf, utf8_mode)){
1557 	 PRINTF(DBG_SEARCH, ("failed building anagram\n"));
1558 	 return 0; /* invalid utf8 string */
1559       }
1560 
1561       count = lst_length (l);
1562 
1563       PRINTF(DBG_SEARCH, ("'%s'\n", buf));
1564       ret = dict_search_first (l, buf, database, database->index_suffix);
1565 
1566       LST_ITERATE (l, p, dw) {
1567 	 if (count-- <= 0){
1568 	    stranagram (dw -> word, utf8_mode);
1569 	 }
1570       }
1571       return ret;
1572    }else{
1573       return dict_search_bmh(
1574 	 l, (const unsigned char *) word,
1575 	 database, database -> index, BMH_LAST );
1576    }
1577 }
1578 
1579 #if HAVE_UTF8
1580 static const char *utf8_err_msg = "\
1581 error: The request is not a valid UTF-8 string";
1582 #endif
1583 
1584 /*
1585   returns a number of matches ( >= 0 ) or
1586   negative value for invalid UTF-8 string
1587 */
dict_search_database_(lst_List l,const char * word,const dictDatabase * database,int strategy_or_define)1588 int dict_search_database_ (
1589    lst_List l,
1590    const char *word,
1591    const dictDatabase *database,
1592    int strategy_or_define )
1593 {
1594    char       *buf      = NULL;
1595 #if HAVE_UTF8
1596    dictWord   *dw       = NULL;
1597 #endif
1598    unsigned int skip_count       = 0;
1599    unsigned int item_count       = INT_MAX;
1600 
1601    int strategy = strategy_or_define & ~DICT_MATCH_MASK;
1602 
1603    define_or_match = (strategy == strategy_or_define);
1604 
1605    assert (database);
1606    assert (database -> index);
1607 
1608    if (strategy == DICT_STRAT_DOT){
1609       strategy = database -> default_strategy;
1610    }
1611 
1612    if (strategy == DICT_STRAT_NPREFIX){
1613       if (2 == sscanf (word, "%u#%u#", &skip_count, &item_count)){
1614 	 while (*word++ != '#');
1615 	 ++word;
1616 	 while (*word++ != '#');
1617       }
1618    }
1619 
1620    buf = alloca( strlen( word ) + 1 );
1621 
1622 #if HAVE_UTF8
1623    if (
1624       !strcmp(utf8_err_msg, word) ||
1625       tolower_alnumspace (
1626 	 word, buf,
1627 	 database -> index -> flag_allchars,
1628 	 database -> index -> flag_casesensitive,
1629 	 utf8_mode))
1630    {
1631       PRINTF(DBG_SEARCH, ("tolower_... ERROR!!!\n"));
1632 
1633       dw = xmalloc (sizeof (dictWord));
1634       memset (dw, 0, sizeof (dictWord));
1635 
1636       dw -> database = database;
1637       dw -> def      = utf8_err_msg;
1638       dw -> def_size = -1;
1639       dw -> word     = strdup (word);
1640 
1641       lst_append (l, dw);
1642 
1643       return -1;
1644    }
1645 #else
1646    tolower_alnumspace (
1647       word, buf,
1648       database -> index -> flag_allchars,
1649       database -> index -> flag_casesensitive,
1650       utf8_mode);
1651 #endif
1652 
1653    if (!buf [0] && word [0]){
1654       /*
1655         This may happen because of invalid --locale specified.
1656 	Without following line entire dictionary will be returned
1657           for non-ASCII words.
1658       */
1659       return 0;
1660    }
1661 
1662    switch (strategy) {
1663    case DICT_STRAT_EXACT:
1664       return dict_search_exact( l, buf, database, database->index,
1665 				strategy_or_define != strategy);
1666 
1667    case DICT_STRAT_PREFIX:
1668    case DICT_STRAT_NPREFIX:
1669       return dict_search_prefix (l, buf, database, database->index,
1670 				 skip_count, item_count);
1671 
1672    case DICT_STRAT_SUBSTRING:
1673       return dict_search_substring( l, buf, database, database->index );
1674 
1675    case DICT_STRAT_SUFFIX:
1676       return dict_search_suffix( l, buf, database );
1677 
1678    case DICT_STRAT_RE:
1679       return dict_search_re( l, word, database, database->index );
1680 
1681    case DICT_STRAT_REGEXP:
1682       return dict_search_regexp( l, word, database, database->index );
1683 
1684    case DICT_STRAT_SOUNDEX:
1685       return dict_search_soundex( l, buf, database, database->index );
1686 
1687    case DICT_STRAT_LEVENSHTEIN:
1688       return dict_search_levenshtein( l, buf, database, database->index);
1689 
1690    case DICT_STRAT_WORD:
1691       return dict_search_word( l, buf, database);
1692 
1693    case DICT_STRAT_FIRST:
1694       return dict_search_first( l, buf, database, database->index );
1695 
1696    case DICT_STRAT_LAST:
1697       return dict_search_last( l, buf, database );
1698 
1699    default:
1700       /* plugins may support unusual search strategies */
1701       return 0;
1702    }
1703 }
1704 
1705 /*
1706   Replaces invisible databases with db argument.
1707  */
replace_invisible_databases(lst_Position pos,const dictDatabase * db)1708 static void replace_invisible_databases (
1709    lst_Position pos,
1710    const dictDatabase *db)
1711 {
1712    dictWord *dw;
1713 
1714    while (pos){
1715       dw = (dictWord *) lst_get_position (pos);
1716 
1717       if (
1718 	 dw -> database &&
1719 	 dw -> database -> invisible &&
1720 	 !dw -> database_visible)
1721       {
1722 	 dw -> database_visible = db;
1723       }
1724 
1725       pos = lst_next_position (pos);
1726    }
1727 }
1728 
1729 /*
1730   returns a number of matches ( >= 0 ) or
1731   negative value for invalid UTF-8 string
1732 */
dict_search(lst_List l,const char * const word,const dictDatabase * database,int strategy,int option_mime,int * extra_result,const dictPluginData ** extra_data,int * extra_data_size)1733 int dict_search (
1734    lst_List l,
1735    const char *const word,
1736    const dictDatabase *database,
1737    int strategy,
1738    int option_mime,
1739    int *extra_result,
1740    const dictPluginData **extra_data,
1741    int *extra_data_size)
1742 {
1743    int count = 0;
1744 
1745    int norm_strategy = strategy & ~DICT_MATCH_MASK;
1746 
1747    if (extra_result)
1748       *extra_result = DICT_PLUGIN_RESULT_NOTFOUND;
1749 
1750    assert (word);
1751    assert (database);
1752 
1753    if (
1754       database -> strategy_disabled &&
1755       database -> strategy_disabled [norm_strategy])
1756    {
1757       /* disable_strategy keyword from configuration file */
1758 #if 0
1759       PRINTF (DBG_SEARCH, (
1760 	 ":S: strategy '%s' is disabled for database '%s'\n",
1761 	 get_strategies () [norm_strategy] -> name,
1762 	 database -> databaseName ? database -> databaseName : "(unknown)"));
1763 #endif
1764       return 0;
1765    }
1766 
1767    PRINTF (DBG_SEARCH, (":S: Searching in '%s'\n", database -> databaseName));
1768 
1769 #if 0
1770    fprintf (stderr, "STRATEGY: %x\n", strategy);
1771 #endif
1772 
1773    if (database -> index){
1774       PRINTF (DBG_SEARCH, (":S:   database search\n"));
1775       count = dict_search_database_ (l, word, database, strategy);
1776    }
1777 
1778 #ifdef USE_PLUGIN
1779    if (!count && database -> plugin){
1780       PRINTF (DBG_SEARCH, (":S:   plugin search\n"));
1781       count = dict_search_plugin (
1782 	 l, word, database, strategy,
1783 	 extra_result, extra_data, extra_data_size);
1784 
1785       if (count)
1786 	 return count;
1787    }
1788 #endif
1789 
1790    if (!count && database -> virtual_db_list){
1791       lst_Position db_list_pos;
1792       dictDatabase *db = NULL;
1793       int old_count = lst_length (l);
1794 
1795       assert (lst_init_position (database -> virtual_db_list));
1796 
1797       LST_ITERATE (database -> virtual_db_list, db_list_pos, db){
1798 	 count += dict_search (
1799 	    l, word, db, strategy, option_mime,
1800 	    extra_result, extra_data, extra_data_size);
1801       }
1802 
1803       if (count > old_count){
1804 	 replace_invisible_databases (
1805 	    lst_nth_position (l, old_count + 1),
1806 	    database);
1807       }
1808    }
1809 
1810    if (!count && database -> mime_db){
1811       int old_count = lst_length (l);
1812 
1813       count += dict_search (
1814 	 l, word,
1815 	 (option_mime ? database -> mime_mimeDB :
1816 	  database -> mime_nomimeDB),
1817 	 strategy, 0,
1818 	 extra_result, extra_data, extra_data_size);
1819 
1820       if (count > old_count){
1821 	 replace_invisible_databases (
1822 	    lst_nth_position (l, old_count + 1),
1823 	    database);
1824       }
1825    }
1826 
1827    if (count > 0 && extra_result)
1828       *extra_result = DICT_PLUGIN_RESULT_FOUND;
1829 
1830    return count;
1831 }
1832 
dict_index_open(const char * filename,int init_flags,const dictIndex * base)1833 dictIndex *dict_index_open(
1834    const char *filename,
1835    int init_flags,
1836    const dictIndex *base)
1837 {
1838    struct stat sb;
1839    static int  tabInit = 0;
1840    dictIndex   *i;
1841    dictDatabase db;
1842    int         j;
1843    char        buf[2];
1844 
1845    int         old_8bit_format = 0;
1846 
1847    int first_char;
1848    int first_char_uc;
1849 
1850    if (!filename)
1851       return NULL;
1852 
1853    i = xmalloc( sizeof( struct dictIndex ) );
1854 
1855    if (!tabInit) dict_table_init();
1856    tabInit = 1;
1857 
1858    memset( i, 0, sizeof( struct dictIndex ) );
1859 
1860    if ((i->fd = open( filename, O_RDONLY )) < 0)
1861       err_fatal_errno( __func__,
1862 		       "Cannot open index file \"%s\"\n", filename );
1863    if (fstat( i->fd, &sb ))
1864       err_fatal_errno( __func__,
1865 		       "Cannot stat index file \"%s\"\n", filename );
1866    i->size = sb.st_size;
1867 
1868    if (mmap_mode){
1869       if (i->size) {
1870          i->start = mmap( NULL, i->size, PROT_READ, MAP_SHARED, i->fd, 0 );
1871          if ((void *)i->start == (void *)(-1))
1872             err_fatal_errno (
1873                __func__,
1874                "Cannot mmap index file \"%s\"\b", filename );
1875       } else i->start = NULL;  /* allow for /dev/null dummy index */
1876    }else{
1877       i->start = xmalloc (i->size);
1878       if (-1 == read (i->fd, (char *) i->start, i->size))
1879 	 err_fatal_errno (
1880 	    __func__,
1881 	    "Cannot read index file \"%s\"\b", filename );
1882 
1883       close (i -> fd);
1884       i -> fd = 0;
1885    }
1886 
1887    i->end = i->start + i->size;
1888 
1889    i->flag_8bit     = 0;
1890    if (base){
1891       i->flag_utf8          = base -> flag_utf8;
1892       i->flag_allchars      = base -> flag_allchars;
1893       i->flag_casesensitive = base -> flag_casesensitive;
1894    }
1895    i->isspacealnum  = isspacealnumtab;
1896 
1897    if (optStart_mode){
1898       for (j = 0; j <= UCHAR_MAX; j++)
1899 	 i->optStart[j] = i->start;
1900    }
1901 
1902    if (init_flags){
1903       memset (&db, 0, sizeof (db));
1904       db.index = i;
1905 
1906       /* for exact search */
1907       i->flag_allchars = 1;
1908       i->isspacealnum = isspacealnumtab_allchars;
1909 
1910       /* allchars flag */
1911       i->flag_allchars =
1912 	 0 != dict_search_database_ (NULL, DICT_FLAG_ALLCHARS, &db, DICT_STRAT_EXACT);
1913       PRINTF(DBG_INIT, (":I:     \"%s\": flag_allchars=%i\n", filename, i->flag_allchars));
1914 
1915       /* case-sensitive flag */
1916       i->flag_casesensitive =
1917 	 0 != dict_search_database_ (NULL, DICT_FLAG_CASESENSITIVE, &db, DICT_STRAT_EXACT);
1918       PRINTF(DBG_INIT, (":I:     \"%s\": flag_casesensitive=%i\n", filename, i->flag_casesensitive));
1919 
1920       /* utf8 flag */
1921       if (!i -> flag_allchars)
1922 	 i -> isspacealnum = isspacealnumtab;
1923 
1924       i->flag_utf8 =
1925          0 != dict_search_database_ (NULL, DICT_FLAG_UTF8, &db, DICT_STRAT_EXACT);
1926       PRINTF(DBG_INIT, (":I:     \"%s\": flag_utf8=%i\n", filename, i->flag_utf8));
1927       if (i->flag_utf8 && !utf8_mode){
1928 	 log_info( ":E: locale '%s' can not be used for utf-8 dictionaries. Exiting\n", locale );
1929 	 exit (1);
1930       }
1931 
1932       /* 8bit flag */
1933       i->flag_8bit =
1934 	 0 != dict_search_database_ (NULL, DICT_FLAG_8BIT_NEW, &db, DICT_STRAT_EXACT);
1935       old_8bit_format =
1936 	 0 != dict_search_database_ (NULL, DICT_FLAG_8BIT_OLD, &db, DICT_STRAT_EXACT);
1937 
1938       if (old_8bit_format){
1939 	 log_info( ":E: index file '%s' was created using dictfmt <1.9.15\n"
1940 	           ":E:   and can not be used with dictd-1.9.15 or later\n"
1941 	           ":E:   Rebuild it like this:\n"
1942 	           ":E:   dictunformat db.index < db.dict | dictfmt -t --locale <8bit-locale> db-new\n", filename );
1943 	 exit (1);
1944       }
1945 
1946       PRINTF(DBG_INIT, (":I:     \"%s\": flag_8bit=%i\n", filename, i->flag_8bit));
1947       if (i->flag_8bit){
1948 	 log_info( ":E: locale '%s' can not be used for 8-bit dictionaries. Exiting\n", locale );
1949 	 exit (1);
1950       }
1951    }
1952 
1953    if (optStart_mode){
1954       buf[0] = ' ';
1955       buf[1] = '\0';
1956       i->optStart[ ' ' ] = binary_search_8bit( buf, i, i->start, i->end );
1957 
1958       for (j = 0; j < charcount; j++) {
1959 	 first_char = c(j);
1960 
1961 	 buf[0] = first_char;
1962 	 buf[1] = '\0';
1963 
1964 	 i->optStart [first_char]
1965 	    = binary_search_8bit( buf, i, i->start, i->end );
1966 
1967 	 if (!utf8_mode || first_char < 128){
1968 	    first_char_uc = toupper (first_char);
1969 
1970 	    assert (first_char_uc >= 0 && first_char_uc <= UCHAR_MAX);
1971 
1972 	    i->optStart [first_char_uc] = i->optStart [first_char];
1973 	 }
1974       }
1975 
1976       for (j = '0'; j <= '9'; j++) {
1977 	 buf[0] = j;
1978 	 buf[1] = '\0';
1979 	 i->optStart[j] = binary_search_8bit( buf, i, i->start, i->end );
1980       }
1981 
1982       i->optStart[UCHAR_MAX]   = i->end;
1983       i->optStart[UCHAR_MAX+1] = i->end;
1984 
1985       if (dbg_test (DBG_SEARCH)){
1986 	 for (j=0; j <= UCHAR_MAX; ++j){
1987 	    if (!utf8_mode || j <= CHAR_MAX)
1988 	       printf (
1989 		  "optStart [%c] = (%p) %10s\n",
1990 		  j,
1991 		  i->optStart [j],
1992 		  i->optStart [j]);
1993 	    else
1994 	       printf (
1995 		  "optStart [%i] = (%p) %10s\n",
1996 		  j,
1997 		  i->optStart [j],
1998 		  i->optStart [j]);
1999 	 }
2000       }
2001    }
2002 
2003    return i;
2004 }
2005 
dict_index_close(dictIndex * i)2006 void dict_index_close( dictIndex *i )
2007 {
2008    if (!i)
2009       return;
2010 
2011    if (mmap_mode){
2012       if (i->fd >= 0) {
2013          if(i->start)
2014             munmap( (void *)i->start, i->size );
2015 	 close( i->fd );
2016 	 i->fd = 0;
2017       }
2018    }else{
2019       if (i -> start)
2020 	 xfree ((char *) i -> start);
2021    }
2022 
2023    xfree (i);
2024 }
2025