1 /* Copyright (C) 2003-2009 Nadav Har'El and Dan Kenigsberg */
2
3 #include <stdio.h>
4 #include <sys/types.h>
5 #include <stdlib.h>
6 #include <string.h>
7
8 /* This is for declaring the uint32_t type, a type holding a 32-bit unsigned
9 integer. It exists on Linux and on fairly modern Solaris, but
10 maybe not anywhere else. We should use autoconf to solve this portability
11 nightmare.
12 */
13 #include <inttypes.h>
14
15 /* If the Zlib library is available, we use it for reading the compressed
16 dictionaries, rather than opening a pipe to an external gzip process.
17 In one measurement, this halved the loading time (0.1 sec to 0.05 sec).
18 It also allowed Hspell to be used on systems where the Zlib library
19 is available, but the gzip program is not (e.g., OpenOffice on MS-Windows).
20
21 The definitions which are pretty bizarre, but help us convert the existing
22 code into something that will work with zlib without ugly ifdefs
23 everywhere (further ifdefs are only needed in some places). Note that
24 when BUFFERED_ZLIB is enabled (and it is enabled by default here) we
25 enable special buffered version of zlib (gzbuffered.h) instead of the
26 normal zlib functions.
27 */
28 #ifdef HAVE_ZLIB
29 #define BUFFERED_ZLIB
30 #undef FILE
31 #undef pclose
32 #undef getc
33 #ifdef BUFFERED_ZLIB
34 #include "gzbuffered.h"
35 #undef gzopen
36 #undef gzdopen
37 #define FILE void /* void* can be either normal FILE* or gzbFile*. Eek. */
38 #define gzopen(path,mode) gzb_open(path,mode)
39 #define gzdopen(path,mode) gzb_dopen(path,mode)
40 #define pclose(f) (gzb_close((gzbFile *)(f)))
41 #define getc(f) (gzb_getc(((gzbFile *)(f))))
42 #else
43 #include <zlib.h>
44 #define FILE void /* FILE* is void*, a.k.a. voidp or gzFile */
45 #define pclose(f) (gzclose((f)))
46 #define getc(f) (gzgetc((f)))
47 #endif
48 #endif /* HAVE_ZLIB */
49
50 /* Our radix tree has four types of "nodes": leaf nodes, small nodes
51 * (carrying up to SMALL_NODE_CHILDREN children), medium nodes (carrying up to
52 * MEDIUM_NODE_CHILDREN) and full nodes carrying exactly NUM_LETTERS children.
53 *
54 * Since there are plenty of leaf nodes, we want these to be tiny, containing
55 * basically just a value. Therefore we overload the same 32-bit "val_or_index"
56 * position to be one of:
57 * 1. Empty (in this case val_or_index==0)
58 * 2. Value (value must be non-zero and 30 bit only!)
59 * 3. Index of full node (3 on highest 2 bits, the 30 lowest are the index)
60 * 4. Index of medium node (2 on highest 2 bits, the 30 lowest are the index)
61 * 5. Index of small node (1 on highest 2 bits, the 30 lowest are the index)
62 */
63 #define CONST32(x) ((uint32_t)(x))
64 #define HIGHBITS ((CONST32(1)<<31) | (CONST32(1)<<30))
65 #define HIGHBITS_VALUE (CONST32(0) << 30)
66 #define HIGHBITS_SMALL (CONST32(1) << 30)
67 #define HIGHBITS_MEDIUM (CONST32(2) << 30)
68 #define HIGHBITS_FULL (CONST32(3) << 30)
69 #define VALUEMASK (~HIGHBITS)
70
71 #define NUM_LETTERS 29 /* 27 Hebrew letters, " and ' */
72 /* When trying on the Hebrew dictionary, when there are only small and
73 * full nodes, small_node_children=4 was the clear winner, taking 3363K
74 * of memory.
75 * When added medium nodes, there are two ties for minimal space usage
76 * (at 2260K each): 2,8 and 3,8. Both have 1831 full nodes, 2,8 results in
77 * 61771/25072 small/medium nodes, and 3,8 results in 71856/14987 small/medium
78 * nodes.
79 * One way to choose among them is to minimize search time. On average
80 * searching a node with N children takes N/2 comparisons. If we pass
81 * all nodes (and I doubt this is a meaningful measure... :( ) the 2,8
82 * will make 162059 comparisons and 3,8 will make 167732. Again, roughly
83 * the same, so I can't decide :(
84 * Another deciding factor: read time. 2,8 is slightly quicker - I have
85 * no idea why.
86 *
87 * Note: to minimize search time we might want to choose a set of sizes
88 * which does not assure the smallest size. HOWEVER, one interesting thing
89 * to note: the children in small and medium nodes are sorted. This might
90 * mean that it is quicker to search the medium node using a binary search,
91 * rather than linear? I don't know. Maybe for N=8, it ain't worth it.
92 */
93 /*#define SMALL_NODE_CHILDREN 4*/
94 #define SMALL_NODE_CHILDREN 2
95 #define MEDIUM_NODE_CHILDREN 8
96
97 #if 0
98 /*
99 * NOTE: SMALL-MEDIUM = 1-4 has an interesting advantage. At 2876K It wasn't
100 * smallest (2-8 was, with 2257K) but it makes a lot of nodes full or
101 * 1-child only (and therefore very quick to search) and only some nodes
102 * with 4 children which is only slightly harder to search (only 2.5
103 * comparisons needed on average).
104 * */
105 /* search speed optimization */
106 #define SMALL_NODE_CHILDREN 1
107 #define MEDIUM_NODE_CHILDREN 4
108 #endif
109
110 struct node_index {
111 /* if most-significant bit of val is on, it's an index. Otherwise,
112 * it's only a value (31 bit and nonzero).
113 */
114 uint32_t val_or_index;
115 };
116
117 struct node {
118 uint32_t value;
119 struct node_index children[NUM_LETTERS];
120 };
121 struct node_small {
122 uint32_t value;
123 char chars[SMALL_NODE_CHILDREN];
124 struct node_index children[SMALL_NODE_CHILDREN];
125 };
126 struct node_medium {
127 uint32_t value;
128 char chars[MEDIUM_NODE_CHILDREN];
129 struct node_index children[MEDIUM_NODE_CHILDREN];
130 };
131
132
133 /* Note: char_to_letter prints a message when it comes across an invalid
134 letter, so it should not be used in lookup(), only in reading the
135 dictionary (which is assumed to contain only valid words). lookup()
136 has its own implementation of this function inside it.
137 */
char_to_letter(unsigned char c)138 static inline int char_to_letter(unsigned char c)
139 {
140 if(c>=(unsigned char)'�' && c<(unsigned char)'�'+27){
141 return c - (unsigned char)'�' + 2;
142 } else if (c=='"'){
143 return 0;
144 } else if (c=='\''){
145 return 1;
146 } else {
147 fprintf(stderr,"Hspell: unknown letter %c...\n",c);
148 /* a silly thing to do, but what the heck */
149 return 0;
150 }
151 }
152
letter_to_char(int l)153 static inline unsigned char letter_to_char(int l)
154 {
155 if(l>=2 && l<29){
156 return l+(unsigned char)'�'-2;
157 } else if(l==0){
158 return '"';
159 } else if(l==1){
160 return '\'';
161 } else {
162 /* this will never happen in the current code: */
163 fprintf(stderr,"Hspell: internal error: unknown letter %d... "
164 "exiting.\n",l);
165 exit(1);
166 }
167 }
168
169 /* This routine was written for debugging purposes only, and not for
170 * absolute efficiency.
171 */
172 static void
do_print_tree(struct node * nodes,struct node_small * nodes_small,struct node_medium * nodes_medium,struct node_index head,char * word,int len,int maxlen)173 do_print_tree(struct node *nodes, struct node_small *nodes_small,
174 struct node_medium *nodes_medium,
175 struct node_index head, char *word, int len, int maxlen){
176 int i;
177 if(len>=maxlen){
178 fprintf(stderr,"Hspell: do_print_tree(): warning: buffer overflow.\n");
179 return;
180 }
181 if((head.val_or_index & HIGHBITS) == HIGHBITS_FULL){
182 struct node *n = &nodes[head.val_or_index & VALUEMASK];
183 if(n->value){
184 word[len]='\0';
185 printf("%s %d\n", word, n->value);
186 }
187 for(i=0;i<NUM_LETTERS;i++){
188 word[len]=letter_to_char(i);
189 do_print_tree(nodes,nodes_small,nodes_medium,
190 n->children[i],word,len+1,maxlen);
191 }
192 } else if((head.val_or_index & HIGHBITS) == HIGHBITS_SMALL){
193 struct node_small *n = &nodes_small[head.val_or_index & VALUEMASK];
194 if(n->value){
195 word[len]='\0';
196 printf("%s %d\n", word, n->value);
197 }
198 for(i=0;i<SMALL_NODE_CHILDREN;i++){
199 if(n->chars[i]){
200 word[len]=n->chars[i];
201 do_print_tree(nodes,nodes_small,nodes_medium,
202 n->children[i],word,len+1,maxlen);
203 }
204 }
205 } else if((head.val_or_index & HIGHBITS) == HIGHBITS_MEDIUM){
206 struct node_medium *n = &nodes_medium[head.val_or_index & VALUEMASK];
207 if(n->value){
208 word[len]='\0';
209 printf("%s %d\n", word, n->value);
210 }
211 for(i=0;i<MEDIUM_NODE_CHILDREN;i++){
212 if(n->chars[i]){
213 word[len]=n->chars[i];
214 do_print_tree(nodes,nodes_small,nodes_medium,
215 n->children[i],word,len+1,maxlen);
216 }
217 }
218 } else if(head.val_or_index){
219 word[len]='\0';
220 printf("%s %d\n", word, head.val_or_index);
221 }
222 }
223
224 struct dict_radix {
225 /* The nodes used by the radix tree representation of the dictionary */
226 int nnodes_small, size_nodes_small;
227 struct node_small *nodes_small;
228
229 int nnodes_medium, size_nodes_medium;
230 struct node_medium *nodes_medium;
231
232 int nnodes, size_nodes;
233 struct node *nodes;
234
235 struct node_index head;
236
237 /* Freelist of recycled small nodes. As more words are added to the
238 dictionary in the process of read_dict(), small nodes become
239 medium and medium nodes become full. Because these small/medium
240 nodes that are no longer needed are in the middle of the node
241 list, we keep them aside in a freelist. They are recycled quickly,
242 as new small/medium nodes are continued to be created.
243 */
244 int free_nodes_small[16], nfree_nodes_small;
245 int free_nodes_medium[16], nfree_nodes_medium;
246
247 int nwords;
248 };
249
250 /* new_dict_radix is the constructor for an opaque (to the includer of
251 dict_radix.h) object.
252 */
253 struct dict_radix *
new_dict_radix(void)254 new_dict_radix(void)
255 {
256 struct dict_radix *dict;
257 dict= (struct dict_radix *) malloc(sizeof(struct dict_radix));
258 /* By default, zero all fields in dict_radix */
259 if(dict)
260 memset(dict, 0, sizeof(*dict));
261 return dict;
262 }
263
264 /* Note that delete_dict_radix frees everything inside a dict_radix, and
265 the dict_radix structure itself. The pointer given to it is no longer
266 a valid pointer after this call.
267 */
268 void
delete_dict_radix(struct dict_radix * dict)269 delete_dict_radix(struct dict_radix *dict)
270 {
271 if(!dict)
272 return; /* allow deleting null object, like in C++... */
273 if(dict->nodes_small)
274 free(dict->nodes_small);
275 if(dict->nodes_medium)
276 free(dict->nodes_medium);
277 if(dict->nodes)
278 free(dict->nodes);
279 free(dict);
280 }
281
282 int
allocate_nodes(struct dict_radix * dict,int nsmall,int nmedium,int nfull)283 allocate_nodes(struct dict_radix *dict, int nsmall, int nmedium, int nfull)
284 {
285 /* if already allocated, it's an error */
286 if(dict->nodes)
287 return -1;
288
289 dict->nodes_small = malloc(sizeof(struct node_small)*nsmall);
290 dict->size_nodes_small = nsmall;
291
292 dict->nodes_medium = malloc(sizeof(struct node_medium)*nmedium);
293 dict->size_nodes_medium = nmedium;
294
295 dict->nodes = malloc(sizeof(struct node)*nfull);
296 dict->size_nodes = nfull;
297
298 if(dict->nodes_small==NULL || dict->nodes_medium==NULL ||
299 dict->nodes==NULL)
300 return -2;
301
302 return 0;
303 }
304
305
306 /* Efficiently read a compressed dictionary from the given directory.
307 Use memory pre-allocation hints from another file in this directory.
308
309 returns 1 on success, 0 on failure.
310
311 TODO: there are too many printouts here. We need to return error
312 numbers instead of all those printouts.
313 */
314
315 #define PREFIX_FILE
316
317 #ifdef PREFIX_FILE
318 static int do_read_dict(FILE *fp, FILE *prefixes, struct dict_radix *dict);
319 #else
320 static int do_read_dict(FILE *fp, struct dict_radix *dict);
321 #endif
322
323 int
read_dict(struct dict_radix * dict,const char * dir)324 read_dict(struct dict_radix *dict, const char *dir)
325 {
326 if(dir){
327 FILE *fp;
328 char s[1024];
329 int small,medium,full,ret;
330 #ifdef PREFIX_FILE
331 FILE *prefixes;
332 #endif
333
334 snprintf(s,sizeof(s),"%s.sizes",dir);
335 if(!(fp=fopen(s,"r"))){
336 fprintf(stderr,"Hspell: can't open %s.\n",s);
337 return 0;
338 }
339 if(fscanf(fp,"%d %d %d",&small,&medium,&full)!=3){
340 fprintf(stderr,"Hspell: can't read from %s.\n",s);
341 return 0;
342 }
343 fclose(fp);
344
345 #ifdef HAVE_ZLIB
346 if(!(fp=gzopen(dir,"r"))){
347 fprintf(stderr,"Hspell: can't open %s.\n",dir);
348 return 0;
349 }
350 #else
351 snprintf(s,sizeof(s),"gzip -dc '%s'",dir);
352 if(!(fp=popen(s,"r"))){
353 fprintf(stderr,"Hspell: can't run %s.\n",s);
354 return 0;
355 }
356 #endif /* HAVE_ZLIB */
357
358 #ifdef PREFIX_FILE
359 #ifdef HAVE_ZLIB
360 snprintf(s,sizeof(s),"%s.prefixes",dir);
361 if(!(prefixes=gzopen(s,"rb"))){
362 fprintf(stderr,"Hspell: can't open %s.\n",s);
363 return 0;
364 }
365 #else
366 snprintf(s,sizeof(s),"gzip -dc '%s.prefixes'",dir);
367 if(!(prefixes=popen(s,"rb"))){
368 fprintf(stderr,"Hspell: can't run %s.\n",s);
369 return 0;
370 }
371 #endif /* HAVE_ZLIB */
372 #endif
373
374 allocate_nodes(dict,small,medium,full);
375 #ifdef PREFIX_FILE
376 ret=do_read_dict(fp, prefixes, dict);
377 pclose(prefixes);
378 #else
379 ret=do_read_dict(fp, dict);
380 #endif
381 pclose(fp);
382 return ret;
383 } else {
384 #ifdef HAVE_ZLIB
385 /* note that gzopen also works on non-gzipped files */
386 FILE *in=gzdopen(fileno(stdin),"r");
387 #ifdef PREFIX_FILE
388 FILE *zero=gzopen("/dev/zero","r");
389 #endif
390 #else
391 FILE *in=stdin;
392 #ifdef PREFIX_FILE
393 FILE *zero=fopen("/dev/zero","r");
394 #endif
395 #endif /* HAVE_ZLIB */
396
397 #ifdef PREFIX_FILE
398 return do_read_dict(in, zero, dict);
399 #else
400 return do_read_dict(in, dict);
401 #endif
402 }
403 }
404
405 #ifdef PREFIX_FILE
do_read_dict(FILE * fp,FILE * prefixes,struct dict_radix * dict)406 static int do_read_dict(FILE *fp, FILE *prefixes, struct dict_radix *dict)
407 #else
408 static int do_read_dict(FILE *fp, struct dict_radix *dict)
409 #endif
410 {
411 struct node_index *stack[256];
412 int sdepth=0;
413 int c,n,cc;
414 /* Local copies of dict-> variables, for efficiency. */
415 int nwords=0;
416 struct node *nodes = dict->nodes;
417 struct node_small *nodes_small = dict->nodes_small;
418 struct node_medium *nodes_medium = dict->nodes_medium;
419 int nnodes_small=0, nnodes_medium=0, nnodes=0;
420
421 if(dict->nnodes||dict->nnodes_small||dict->nnodes_medium||
422 dict->nwords){
423 fprintf(stderr, "Hspell: do_read_dict(): called for a non-"
424 "empty dictionary\n");
425 return 0;
426 }
427 if(!nodes||!nodes_small||!nodes_medium){
428 fprintf(stderr, "Hspell: do_read_dict(): allocate_nodes() must"
429 " be called first\n");
430 return 0;
431 }
432
433 memset(&nodes[nnodes], 0, sizeof(nodes[nnodes]));
434 dict->head.val_or_index=(nnodes++) | HIGHBITS_FULL;
435 stack[0]=&dict->head;
436 sdepth=0;
437 while((c=getc(fp))!=EOF){
438 if(c>='0' && c<='9'){
439 /* new word - finalize old word first (set value) */
440 nwords++; /* statistics */
441 /* assert(!stack[sdepth]->val_or_index) */
442 #ifdef PREFIX_FILE
443 stack[sdepth]->val_or_index=getc(prefixes);
444 #else
445 stack[sdepth]->val_or_index=nwords; /** TODO: different values */
446 #endif
447 /* and read how much to go back */
448 n=0;
449 do {
450 /* base 10... */
451 n*=10;
452 n+=(c-'0');
453 } while ((c=getc(fp))!=EOF && c>='0' && c<='9');
454 sdepth-=n;
455 if(sdepth<0 || sdepth >= (sizeof(stack)/sizeof(stack[0]))-1){
456 fprintf(stderr,"Hspell: bad backlength %d... giving up\n", sdepth);
457 return 0;
458 }
459 /* we got a new letter c - continue the loop */
460 }
461 /* word letter - add it */
462 if(sdepth>=sizeof(stack)/sizeof(stack[0])-1){
463 fprintf(stderr,"Hspell: word too long... giving up\n");
464 return 0;
465 }
466 cc=char_to_letter(c);
467 /* make sure previous node is a small or full node, not just a
468 * value, and if it is small, that it's not full */
469 if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_VALUE){
470 int chosen;
471 if(dict->nfree_nodes_small){
472 chosen=dict->free_nodes_small
473 [--(dict->nfree_nodes_small)];
474 } else {
475 chosen=nnodes_small;
476 if(nnodes_small>=dict->size_nodes_small){
477 fprintf(stderr,"Hspell: Realloc needed (small) - failing.\n");
478 return 0;
479 }
480 nnodes_small++;
481 }
482 memset(&nodes_small[chosen], 0, sizeof(nodes_small[chosen]));
483 nodes_small[chosen].value = stack[sdepth]->val_or_index;
484 stack[sdepth]->val_or_index = chosen | HIGHBITS_SMALL;
485
486 nodes_small[chosen].chars[0]=c;
487 stack[sdepth+1] = &nodes_small[chosen].children[0];
488 } else if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_SMALL){
489 int j;
490 struct node_small *n=
491 &nodes_small[stack[sdepth]->val_or_index&VALUEMASK];
492 /* is the small node not full yet? */
493 for(j=0;j<SMALL_NODE_CHILDREN;j++)
494 if(!n->chars[j]){
495 n->chars[j]=c;
496 stack[sdepth+1] = &n->children[j];
497 break;
498 }
499 if(j==SMALL_NODE_CHILDREN){
500 /* small node full! convert it to medium node */
501 int chosen;
502 if(dict->nfree_nodes_medium){
503 chosen=dict->free_nodes_medium
504 [--(dict->nfree_nodes_medium)];
505 } else {
506 chosen=nnodes_medium;
507 if(nnodes_medium>=dict->size_nodes_medium){
508 fprintf(stderr,"Hspell: Realloc needed (medium) - failing.\n");
509 return 0;
510 }
511 nnodes_medium++;
512 }
513 memset(&nodes_medium[chosen], 0, sizeof(nodes_medium[chosen]));
514 if(dict->nfree_nodes_small>=
515 sizeof(dict->free_nodes_small)/
516 sizeof(dict->free_nodes_small[0])){
517 fprintf(stderr,"Hspell: overflow in free_nodes_small.\n");
518 return 0;
519 }
520 dict->free_nodes_small
521 [(dict->nfree_nodes_small)++]=
522 stack[sdepth]->val_or_index & VALUEMASK;
523 stack[sdepth]->val_or_index = chosen | HIGHBITS_MEDIUM;
524 /* copy the children from n to nodes[nnodes]: */
525 /* TODO: use memcpy instead! */
526 nodes_medium[chosen].value = n->value;
527 for(j=0;j<SMALL_NODE_CHILDREN;j++){
528 nodes_medium[chosen].chars[j]=
529 n->chars[j];
530 nodes_medium[chosen].children[j]=
531 n->children[j];
532 }
533 /* and finally choose the next child */
534 nodes_medium[chosen].chars[SMALL_NODE_CHILDREN]=
535 c;
536 stack[sdepth+1] = &nodes_medium[chosen].
537 children[SMALL_NODE_CHILDREN];
538 }
539 } else if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_MEDIUM){
540 int j;
541 struct node_medium *n=
542 &nodes_medium[stack[sdepth]->val_or_index&VALUEMASK];
543 /* is the medium node not full yet? */
544 for(j=0;j<MEDIUM_NODE_CHILDREN;j++)
545 if(!n->chars[j]){
546 n->chars[j]=c;
547 stack[sdepth+1] = &n->children[j];
548 break;
549 }
550 if(j==MEDIUM_NODE_CHILDREN){
551 /* medium node full! convert it to full node */
552 if(nnodes>=dict->size_nodes){
553 fprintf(stderr,"Hspell: Realloc needed (full) - failing.\n");
554 return 0;
555 }
556 memset(&nodes[nnodes], 0, sizeof(nodes[nnodes]));
557 nodes[nnodes].value = n->value;
558 if(dict->nfree_nodes_medium>=
559 sizeof(dict->free_nodes_medium)/
560 sizeof(dict->free_nodes_medium[0])){
561 fprintf(stderr,"Hspell: overflow in free_nodes_medium.\n");
562 return 0;
563 }
564 dict->free_nodes_medium
565 [(dict->nfree_nodes_medium)++]=
566 stack[sdepth]->val_or_index & VALUEMASK;
567 stack[sdepth]->val_or_index = nnodes | HIGHBITS_FULL;
568 /* copy the children from n to nodes[nnodes]: */
569 for(j=0;j<MEDIUM_NODE_CHILDREN;j++)
570 nodes[nnodes].children[char_to_letter(
571 n->chars[j])]=
572 n->children[j];
573 /* and finally choose the next child */
574 stack[sdepth+1] = &nodes[nnodes].children[cc];
575 nnodes++;
576 }
577 } else { /* HIGHBITS_FULL */
578 stack[sdepth+1] = &nodes[
579 stack[sdepth]->val_or_index & VALUEMASK].children[cc];
580 }
581 sdepth++;
582 }
583 /* output last word */
584 nwords++; /* statistics */
585 #ifdef PREFIX_FILE
586 stack[sdepth]->val_or_index=getc(prefixes);
587 #else
588 stack[sdepth]->val_or_index=nwords; /** TODO: different values */
589 #endif
590
591 /* return local copies to dict-> structure */
592 dict->nwords=nwords;
593 dict->nnodes_small=nnodes_small;
594 dict->nnodes_medium=nnodes_medium;
595 dict->nnodes=nnodes;
596
597 return 1;
598 }
599
600 void
print_stats(struct dict_radix * dict)601 print_stats(struct dict_radix *dict)
602 {
603 fprintf(stderr, "%d words in %d full nodes, %d medium nodes, "
604 "%d small nodes.\n", dict->nwords, dict->nnodes,
605 dict->nnodes_medium, dict->nnodes_small);
606 fprintf(stderr, "%d nfree_nodes_small %d nfree_nodes_medium.\n",
607 dict->nfree_nodes_small,dict->nfree_nodes_medium);
608 fprintf(stderr, "node memory filled: %d K\n",
609 (int)(dict->nnodes*sizeof(struct node)
610 + dict->nnodes_small*sizeof(struct node_small)
611 + dict->nnodes_medium*sizeof(struct node_medium)
612 )/1024);
613 }
614
615 void
print_tree(struct dict_radix * dict)616 print_tree(struct dict_radix *dict)
617 {
618 char word[256];
619 do_print_tree(dict->nodes,dict->nodes_small,dict->nodes_medium,
620 dict->head,word,0,sizeof(word));
621
622 }
623
624 void
print_sizes(struct dict_radix * dict)625 print_sizes(struct dict_radix *dict)
626 {
627 printf("%d %d %d\n", dict->nnodes_small, dict->nnodes_medium,
628 dict->nnodes);
629 }
630
631 int
lookup(const struct dict_radix * dict,const char * word)632 lookup(const struct dict_radix *dict, const char *word)
633 {
634 struct node_index current = dict->head;
635 for(;;){
636 switch(current.val_or_index & HIGHBITS){
637 case HIGHBITS_VALUE:
638 if(*word){
639 /* The word isn't over yet but we reached a
640 leaf node. So the word isn't in the dict */
641 return 0;
642 } else {
643 return current.val_or_index & VALUEMASK;
644 }
645 break;
646 case HIGHBITS_SMALL:
647 if(*word){
648 struct node_small *n =
649 &dict->nodes_small[current.val_or_index
650 & VALUEMASK];
651 #if SMALL_NODE_CHILDREN==2
652 if(n->chars[0]==*word)
653 current=n->children[0];
654 else if(n->chars[1]==*word)
655 current=n->children[1];
656 else
657 return 0; /* not found... */
658 #else
659 #error "small node lookup not implemented except for 2 children."
660 #endif
661 } else {
662 return dict->nodes_small[current.val_or_index
663 & VALUEMASK]
664 .value;
665 }
666 break;
667 case HIGHBITS_MEDIUM:
668 if(*word){
669 struct node_medium *n =
670 &dict->nodes_medium[current.val_or_index
671 & VALUEMASK];
672 #if MEDIUM_NODE_CHILDREN==8
673 register char c=*word, *cs=n->chars;
674 /* TODO: use binary search? stop searching
675 on the first 0? All these optimizations
676 are probably useless for 8 chars... */
677 if(*(cs++)==c) current=n->children[0];
678 else if(*(cs++)==c) current=n->children[1];
679 else if(*(cs++)==c) current=n->children[2];
680 else if(*(cs++)==c) current=n->children[3];
681 else if(*(cs++)==c) current=n->children[4];
682 else if(*(cs++)==c) current=n->children[5];
683 else if(*(cs++)==c) current=n->children[6];
684 else if(*(cs++)==c) current=n->children[7];
685 else
686 return 0; /* not found... */
687 #else
688 #error "medium node lookup not implemented except for 8 children."
689 #endif
690 } else {
691 return dict->nodes_medium[current.val_or_index
692 & VALUEMASK]
693 .value;
694 }
695 break;
696 case HIGHBITS_FULL:
697 if(*word){
698 /* the following is a copy of char_to_letter */
699 register int ind;
700 register unsigned char c = *word;
701 if(c>=(unsigned char)'�' &&
702 c<(unsigned char)'�'+27)
703 ind = c - (unsigned char)'�' + 2;
704 else if (c=='"')
705 ind = 0;
706 else if (c=='\'')
707 ind = 1;
708 else
709 return 0; /* non-Hebrew letter */
710 current=dict->nodes[current.val_or_index
711 & VALUEMASK]
712 .children[ind];
713 } else {
714 return dict->nodes[current.val_or_index
715 & VALUEMASK].value;
716 }
717 break;
718 }
719 word++;
720 }
721 }
722