1 /* Copyright (C) 2003-2009 Nadav Har'El and Dan Kenigsberg */
2 
3 #include <stdio.h>
4 #include <sys/types.h>
5 #include <stdlib.h>
6 #include <string.h>
7 
8 /* This is for declaring the uint32_t type, a type holding a 32-bit unsigned
9    integer. It exists on Linux and on fairly modern Solaris, but
10    maybe not anywhere else. We should use autoconf to solve this portability
11    nightmare.
12 */
13 #include <inttypes.h>
14 
15 /* If the Zlib library is available, we use it for reading the compressed
16    dictionaries, rather than opening a pipe to an external gzip process.
17    In one measurement, this halved the loading time (0.1 sec to 0.05 sec).
18    It also allowed Hspell to be used on systems where the Zlib library
19    is available, but the gzip program is not (e.g., OpenOffice on MS-Windows).
20 
21    The definitions which are pretty bizarre, but help us convert the existing
22    code into something that will work with zlib without ugly ifdefs
23    everywhere (further ifdefs are only needed in some places). Note that
24    when BUFFERED_ZLIB is enabled (and it is enabled by default here) we
25    enable special buffered version of zlib (gzbuffered.h) instead of the
26    normal zlib functions.
27 */
28 #ifdef HAVE_ZLIB
29 #define BUFFERED_ZLIB
30 #undef FILE
31 #undef pclose
32 #undef getc
33 #ifdef BUFFERED_ZLIB
34 #include "gzbuffered.h"
35 #undef gzopen
36 #undef gzdopen
37 #define FILE void /* void* can be either normal FILE* or gzbFile*. Eek. */
38 #define gzopen(path,mode) gzb_open(path,mode)
39 #define gzdopen(path,mode) gzb_dopen(path,mode)
40 #define pclose(f) (gzb_close((gzbFile *)(f)))
41 #define getc(f) (gzb_getc(((gzbFile *)(f))))
42 #else
43 #include <zlib.h>
44 #define FILE void    /* FILE* is void*, a.k.a. voidp or gzFile */
45 #define pclose(f) (gzclose((f)))
46 #define getc(f) (gzgetc((f)))
47 #endif
48 #endif /* HAVE_ZLIB */
49 
50 /* Our radix tree has four types of "nodes": leaf nodes, small nodes
51  * (carrying up to SMALL_NODE_CHILDREN children), medium nodes (carrying up to
52  * MEDIUM_NODE_CHILDREN) and full nodes carrying exactly NUM_LETTERS children.
53  *
54  * Since there are plenty of leaf nodes, we want these to be tiny, containing
55  * basically just a value. Therefore we overload the same 32-bit "val_or_index"
56  * position to be one of:
57  * 1.  Empty  (in this case val_or_index==0)
58  * 2.  Value  (value must be non-zero and 30 bit only!)
59  * 3.  Index of full node (3 on highest 2 bits, the 30 lowest are the index)
60  * 4.  Index of medium node (2 on highest 2 bits, the 30 lowest are the index)
61  * 5.  Index of small node (1 on highest 2 bits, the 30 lowest are the index)
62  */
63 #define CONST32(x) ((uint32_t)(x))
64 #define HIGHBITS ((CONST32(1)<<31) | (CONST32(1)<<30))
65 #define HIGHBITS_VALUE  (CONST32(0) << 30)
66 #define HIGHBITS_SMALL  (CONST32(1) << 30)
67 #define HIGHBITS_MEDIUM (CONST32(2) << 30)
68 #define HIGHBITS_FULL   (CONST32(3) << 30)
69 #define VALUEMASK (~HIGHBITS)
70 
71 #define NUM_LETTERS 29  /* 27 Hebrew letters, " and ' */
72 /* When trying on the Hebrew dictionary, when there are only small and
73  * full nodes, small_node_children=4 was the clear winner, taking 3363K
74  * of memory.
75  * When added medium nodes, there are two ties for minimal space usage
76  * (at 2260K each): 2,8 and 3,8. Both have 1831 full nodes, 2,8 results in
77  * 61771/25072 small/medium nodes, and 3,8 results in 71856/14987 small/medium
78  * nodes.
79  * One way to choose among them is to minimize search time. On average
80  * searching a node with N children takes N/2 comparisons. If we pass
81  * all nodes (and I doubt this is a meaningful measure... :( ) the 2,8
82  * will make 162059 comparisons and 3,8 will make 167732. Again, roughly
83  * the same, so I can't decide :(
84  * Another deciding factor: read time. 2,8 is slightly quicker - I have
85  * no idea why.
86  *
87  * Note: to minimize search time we might want to choose a set of sizes
88  * which does not assure the smallest size. HOWEVER, one interesting thing
89  * to note: the children in small and medium nodes are sorted. This might
90  * mean that it is quicker to search the medium node using a binary search,
91  * rather than linear? I don't know. Maybe for N=8, it ain't worth it.
92  */
93 /*#define SMALL_NODE_CHILDREN 4*/
94 #define SMALL_NODE_CHILDREN 2
95 #define MEDIUM_NODE_CHILDREN 8
96 
97 #if 0
98  /*
99  * NOTE:  SMALL-MEDIUM = 1-4 has an interesting advantage. At 2876K It wasn't
100  * smallest (2-8 was, with 2257K) but it makes a lot of nodes full or
101  * 1-child only (and therefore very quick to search) and only some nodes
102  * with 4 children which is only slightly harder to search (only 2.5
103  * comparisons needed on average).
104  * */
105 /* search speed optimization */
106 #define SMALL_NODE_CHILDREN 1
107 #define MEDIUM_NODE_CHILDREN 4
108 #endif
109 
110 struct node_index {
111 	/* if most-significant bit of val is on, it's an index. Otherwise,
112 	 * it's only a value (31 bit and nonzero).
113 	*/
114 	uint32_t val_or_index;
115 };
116 
117 struct node {
118 	uint32_t value;
119 	struct node_index children[NUM_LETTERS];
120 };
121 struct node_small {
122 	uint32_t value;
123 	char chars[SMALL_NODE_CHILDREN];
124 	struct node_index children[SMALL_NODE_CHILDREN];
125 };
126 struct node_medium {
127 	uint32_t value;
128 	char chars[MEDIUM_NODE_CHILDREN];
129 	struct node_index children[MEDIUM_NODE_CHILDREN];
130 };
131 
132 
133 /* Note: char_to_letter prints a message when it comes across an invalid
134    letter, so it should not be used in lookup(), only in reading the
135    dictionary (which is assumed to contain only valid words). lookup()
136    has its own implementation of this function inside it.
137 */
char_to_letter(unsigned char c)138 static inline int char_to_letter(unsigned char c)
139 {
140 	if(c>=(unsigned char)'�' && c<(unsigned char)'�'+27){
141 		return c - (unsigned char)'�' + 2;
142 	} else if (c=='"'){
143 		return 0;
144 	} else if (c=='\''){
145 		return 1;
146 	} else {
147 		fprintf(stderr,"Hspell: unknown letter %c...\n",c);
148 		/* a silly thing to do, but what the heck */
149 		return 0;
150 	}
151 }
152 
letter_to_char(int l)153 static inline unsigned char letter_to_char(int l)
154 {
155 	if(l>=2 && l<29){
156 		return l+(unsigned char)'�'-2;
157 	} else if(l==0){
158 		return '"';
159 	} else if(l==1){
160 		return '\'';
161 	} else {
162 		/* this will never happen in the current code: */
163 		fprintf(stderr,"Hspell: internal error: unknown letter %d... "
164 				"exiting.\n",l);
165 		exit(1);
166 	}
167 }
168 
169 /* This routine was written for debugging purposes only, and not for
170  * absolute efficiency.
171  */
172 static void
do_print_tree(struct node * nodes,struct node_small * nodes_small,struct node_medium * nodes_medium,struct node_index head,char * word,int len,int maxlen)173 do_print_tree(struct node *nodes, struct node_small *nodes_small,
174 	   struct node_medium *nodes_medium,
175            struct node_index head, char *word, int len, int maxlen){
176 	int i;
177 	if(len>=maxlen){
178 		fprintf(stderr,"Hspell: do_print_tree(): warning: buffer overflow.\n");
179 		return;
180 	}
181 	if((head.val_or_index & HIGHBITS) == HIGHBITS_FULL){
182 		struct node *n = &nodes[head.val_or_index & VALUEMASK];
183 		if(n->value){
184 			word[len]='\0';
185 			printf("%s %d\n", word, n->value);
186 		}
187 		for(i=0;i<NUM_LETTERS;i++){
188 			word[len]=letter_to_char(i);
189 			do_print_tree(nodes,nodes_small,nodes_medium,
190 					n->children[i],word,len+1,maxlen);
191 		}
192 	} else if((head.val_or_index & HIGHBITS) == HIGHBITS_SMALL){
193 		struct node_small *n = &nodes_small[head.val_or_index & VALUEMASK];
194 		if(n->value){
195 			word[len]='\0';
196 			printf("%s %d\n", word, n->value);
197 		}
198 		for(i=0;i<SMALL_NODE_CHILDREN;i++){
199 			if(n->chars[i]){
200 				word[len]=n->chars[i];
201 				do_print_tree(nodes,nodes_small,nodes_medium,
202 					n->children[i],word,len+1,maxlen);
203 			}
204 		}
205 	} else if((head.val_or_index & HIGHBITS) == HIGHBITS_MEDIUM){
206 		struct node_medium *n = &nodes_medium[head.val_or_index & VALUEMASK];
207 		if(n->value){
208 			word[len]='\0';
209 			printf("%s %d\n", word, n->value);
210 		}
211 		for(i=0;i<MEDIUM_NODE_CHILDREN;i++){
212 			if(n->chars[i]){
213 				word[len]=n->chars[i];
214 				do_print_tree(nodes,nodes_small,nodes_medium,
215 					n->children[i],word,len+1,maxlen);
216 			}
217 		}
218 	} else if(head.val_or_index){
219 		word[len]='\0';
220 		printf("%s %d\n", word, head.val_or_index);
221 	}
222 }
223 
224 struct dict_radix {
225 	/* The nodes used by the radix tree representation of the dictionary */
226 	int nnodes_small, size_nodes_small;
227 	struct node_small *nodes_small;
228 
229 	int nnodes_medium, size_nodes_medium;
230 	struct node_medium *nodes_medium;
231 
232 	int nnodes, size_nodes;
233 	struct node *nodes;
234 
235 	struct node_index head;
236 
237 	/* Freelist of recycled small nodes. As more words are added to the
238 	   dictionary in the process of read_dict(), small nodes become
239 	   medium and medium nodes become full. Because these small/medium
240 	   nodes that are no longer needed are in the middle of the node
241 	   list, we keep them aside in a freelist. They are recycled quickly,
242 	   as new small/medium nodes are continued to be created.
243 	 */
244 	int free_nodes_small[16], nfree_nodes_small;
245 	int free_nodes_medium[16], nfree_nodes_medium;
246 
247 	int nwords;
248 };
249 
250 /* new_dict_radix is the constructor for an opaque (to the includer of
251    dict_radix.h) object.
252 */
253 struct dict_radix *
new_dict_radix(void)254 new_dict_radix(void)
255 {
256 	struct dict_radix *dict;
257 	dict= (struct dict_radix *) malloc(sizeof(struct dict_radix));
258 	/* By default, zero all fields in dict_radix */
259 	if(dict)
260 		memset(dict, 0, sizeof(*dict));
261 	return dict;
262 }
263 
264 /* Note that delete_dict_radix frees everything inside a dict_radix, and
265    the dict_radix structure itself. The pointer given to it is no longer
266    a valid pointer after this call.
267 */
268 void
delete_dict_radix(struct dict_radix * dict)269 delete_dict_radix(struct dict_radix *dict)
270 {
271 	if(!dict)
272 		return; /* allow deleting null object, like in C++... */
273 	if(dict->nodes_small)
274 		free(dict->nodes_small);
275 	if(dict->nodes_medium)
276 		free(dict->nodes_medium);
277 	if(dict->nodes)
278 		free(dict->nodes);
279 	free(dict);
280 }
281 
282 int
allocate_nodes(struct dict_radix * dict,int nsmall,int nmedium,int nfull)283 allocate_nodes(struct dict_radix *dict, int nsmall, int nmedium, int nfull)
284 {
285 	/* if already allocated, it's an error */
286 	if(dict->nodes)
287 		return -1;
288 
289 	dict->nodes_small = malloc(sizeof(struct node_small)*nsmall);
290 	dict->size_nodes_small = nsmall;
291 
292 	dict->nodes_medium = malloc(sizeof(struct node_medium)*nmedium);
293 	dict->size_nodes_medium = nmedium;
294 
295 	dict->nodes = malloc(sizeof(struct node)*nfull);
296 	dict->size_nodes = nfull;
297 
298 	if(dict->nodes_small==NULL || dict->nodes_medium==NULL ||
299 	   dict->nodes==NULL)
300 		return -2;
301 
302 	return 0;
303 }
304 
305 
306 /* Efficiently read a compressed dictionary from the given directory.
307    Use memory pre-allocation hints from another file in this directory.
308 
309    returns 1 on success, 0 on failure.
310 
311    TODO: there are too many printouts here. We need to return error
312    numbers instead of all those printouts.
313 */
314 
315 #define PREFIX_FILE
316 
317 #ifdef PREFIX_FILE
318 static int do_read_dict(FILE *fp, FILE *prefixes, struct dict_radix *dict);
319 #else
320 static int do_read_dict(FILE *fp, struct dict_radix *dict);
321 #endif
322 
323 int
read_dict(struct dict_radix * dict,const char * dir)324 read_dict(struct dict_radix *dict, const char *dir)
325 {
326 	if(dir){
327 		FILE *fp;
328 		char s[1024];
329 		int small,medium,full,ret;
330 #ifdef PREFIX_FILE
331 		FILE *prefixes;
332 #endif
333 
334 		snprintf(s,sizeof(s),"%s.sizes",dir);
335 		if(!(fp=fopen(s,"r"))){
336 			fprintf(stderr,"Hspell: can't open %s.\n",s);
337 			return 0;
338 		}
339 		if(fscanf(fp,"%d %d %d",&small,&medium,&full)!=3){
340 			fprintf(stderr,"Hspell: can't read from %s.\n",s);
341 			return 0;
342 		}
343 		fclose(fp);
344 
345 #ifdef HAVE_ZLIB
346 		if(!(fp=gzopen(dir,"r"))){
347 			fprintf(stderr,"Hspell: can't open %s.\n",dir);
348 			return 0;
349 		}
350 #else
351 		snprintf(s,sizeof(s),"gzip -dc '%s'",dir);
352 		if(!(fp=popen(s,"r"))){
353 			fprintf(stderr,"Hspell: can't run %s.\n",s);
354 			return 0;
355 		}
356 #endif /* HAVE_ZLIB */
357 
358 #ifdef PREFIX_FILE
359 #ifdef HAVE_ZLIB
360 		snprintf(s,sizeof(s),"%s.prefixes",dir);
361 		if(!(prefixes=gzopen(s,"rb"))){
362 			fprintf(stderr,"Hspell: can't open %s.\n",s);
363 			return 0;
364 		}
365 #else
366 		snprintf(s,sizeof(s),"gzip -dc '%s.prefixes'",dir);
367 		if(!(prefixes=popen(s,"rb"))){
368 			fprintf(stderr,"Hspell: can't run %s.\n",s);
369 			return 0;
370 		}
371 #endif /* HAVE_ZLIB */
372 #endif
373 
374 		allocate_nodes(dict,small,medium,full);
375 #ifdef PREFIX_FILE
376 		ret=do_read_dict(fp, prefixes, dict);
377 		pclose(prefixes);
378 #else
379 		ret=do_read_dict(fp, dict);
380 #endif
381 		pclose(fp);
382 		return ret;
383 	} else {
384 #ifdef HAVE_ZLIB
385 		/* note that gzopen also works on non-gzipped files */
386 		FILE *in=gzdopen(fileno(stdin),"r");
387 #ifdef PREFIX_FILE
388 		FILE *zero=gzopen("/dev/zero","r");
389 #endif
390 #else
391 		FILE *in=stdin;
392 #ifdef PREFIX_FILE
393 		FILE *zero=fopen("/dev/zero","r");
394 #endif
395 #endif /* HAVE_ZLIB */
396 
397 #ifdef PREFIX_FILE
398 		return do_read_dict(in, zero, dict);
399 #else
400 		return do_read_dict(in, dict);
401 #endif
402 	}
403 }
404 
405 #ifdef PREFIX_FILE
do_read_dict(FILE * fp,FILE * prefixes,struct dict_radix * dict)406 static int do_read_dict(FILE *fp, FILE *prefixes, struct dict_radix *dict)
407 #else
408 static int do_read_dict(FILE *fp, struct dict_radix *dict)
409 #endif
410 {
411 	struct node_index *stack[256];
412 	int sdepth=0;
413 	int c,n,cc;
414 	/* Local copies of dict-> variables, for efficiency. */
415 	int nwords=0;
416 	struct node *nodes = dict->nodes;
417 	struct node_small *nodes_small = dict->nodes_small;
418 	struct node_medium *nodes_medium = dict->nodes_medium;
419 	int nnodes_small=0, nnodes_medium=0, nnodes=0;
420 
421 	if(dict->nnodes||dict->nnodes_small||dict->nnodes_medium||
422 	   dict->nwords){
423 		fprintf(stderr, "Hspell: do_read_dict(): called for a non-"
424 			"empty dictionary\n");
425 		return 0;
426 	}
427 	if(!nodes||!nodes_small||!nodes_medium){
428 		fprintf(stderr, "Hspell: do_read_dict(): allocate_nodes() must"
429 			" be called first\n");
430 		return 0;
431 	}
432 
433 	memset(&nodes[nnodes], 0, sizeof(nodes[nnodes]));
434 	dict->head.val_or_index=(nnodes++) | HIGHBITS_FULL;
435 	stack[0]=&dict->head;
436 	sdepth=0;
437 	while((c=getc(fp))!=EOF){
438 		if(c>='0' && c<='9'){
439 			/* new word - finalize old word first (set value) */
440 			nwords++; /* statistics */
441 			/* assert(!stack[sdepth]->val_or_index) */
442 #ifdef PREFIX_FILE
443 			stack[sdepth]->val_or_index=getc(prefixes);
444 #else
445 			stack[sdepth]->val_or_index=nwords; /** TODO: different values */
446 #endif
447 			/* and read how much to go back */
448 			n=0;
449 			do {
450 				/* base 10... */
451 				n*=10;
452 				n+=(c-'0');
453 			} while ((c=getc(fp))!=EOF && c>='0' && c<='9');
454 			sdepth-=n;
455 			if(sdepth<0 || sdepth >= (sizeof(stack)/sizeof(stack[0]))-1){
456 				fprintf(stderr,"Hspell: bad backlength %d... giving up\n", sdepth);
457 				return 0;
458 			}
459 			/* we got a new letter c - continue the loop */
460 		}
461 		/* word letter - add it */
462 		if(sdepth>=sizeof(stack)/sizeof(stack[0])-1){
463 			fprintf(stderr,"Hspell: word too long... giving up\n");
464 			return 0;
465 		}
466 		cc=char_to_letter(c);
467 		/* make sure previous node is a small or full node, not just a
468 		 * value, and if it is small, that it's not full */
469 		if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_VALUE){
470 			int chosen;
471 			if(dict->nfree_nodes_small){
472 				chosen=dict->free_nodes_small
473 					[--(dict->nfree_nodes_small)];
474 			} else {
475 				chosen=nnodes_small;
476 				if(nnodes_small>=dict->size_nodes_small){
477 					fprintf(stderr,"Hspell: Realloc needed (small) - failing.\n");
478 					return 0;
479 				}
480 				nnodes_small++;
481 			}
482 			memset(&nodes_small[chosen], 0, sizeof(nodes_small[chosen]));
483 			nodes_small[chosen].value = stack[sdepth]->val_or_index;
484 			stack[sdepth]->val_or_index = chosen | HIGHBITS_SMALL;
485 
486 			nodes_small[chosen].chars[0]=c;
487 			stack[sdepth+1] = &nodes_small[chosen].children[0];
488 		} else if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_SMALL){
489 			int j;
490 			struct node_small *n=
491 			   &nodes_small[stack[sdepth]->val_or_index&VALUEMASK];
492 			/* is the small node not full yet? */
493 			for(j=0;j<SMALL_NODE_CHILDREN;j++)
494 				if(!n->chars[j]){
495 					n->chars[j]=c;
496 					stack[sdepth+1] = &n->children[j];
497 					break;
498 				}
499 			if(j==SMALL_NODE_CHILDREN){
500 				/* small node full! convert it to medium node */
501 				int chosen;
502 				if(dict->nfree_nodes_medium){
503 					chosen=dict->free_nodes_medium
504 						[--(dict->nfree_nodes_medium)];
505 				} else {
506 					chosen=nnodes_medium;
507 					if(nnodes_medium>=dict->size_nodes_medium){
508 						fprintf(stderr,"Hspell: Realloc needed (medium) - failing.\n");
509 						return 0;
510 					}
511 					nnodes_medium++;
512 				}
513 				memset(&nodes_medium[chosen], 0, sizeof(nodes_medium[chosen]));
514 				if(dict->nfree_nodes_small>=
515 				   sizeof(dict->free_nodes_small)/
516 				   sizeof(dict->free_nodes_small[0])){
517 					fprintf(stderr,"Hspell: overflow in free_nodes_small.\n");
518 					return 0;
519 				}
520 				dict->free_nodes_small
521 					[(dict->nfree_nodes_small)++]=
522 					stack[sdepth]->val_or_index & VALUEMASK;
523 				stack[sdepth]->val_or_index = chosen | HIGHBITS_MEDIUM;
524 				/* copy the children from n to nodes[nnodes]: */
525 				/* TODO: use memcpy instead! */
526 				nodes_medium[chosen].value = n->value;
527 				for(j=0;j<SMALL_NODE_CHILDREN;j++){
528 					nodes_medium[chosen].chars[j]=
529 						n->chars[j];
530 					nodes_medium[chosen].children[j]=
531 						n->children[j];
532 				}
533 				/* and finally choose the next child */
534 				nodes_medium[chosen].chars[SMALL_NODE_CHILDREN]=
535 					c;
536 				stack[sdepth+1] = &nodes_medium[chosen].
537 					children[SMALL_NODE_CHILDREN];
538 			}
539 		} else if((stack[sdepth]->val_or_index & HIGHBITS)==HIGHBITS_MEDIUM){
540 			int j;
541 			struct node_medium *n=
542 			   &nodes_medium[stack[sdepth]->val_or_index&VALUEMASK];
543 			/* is the medium node not full yet? */
544 			for(j=0;j<MEDIUM_NODE_CHILDREN;j++)
545 				if(!n->chars[j]){
546 					n->chars[j]=c;
547 					stack[sdepth+1] = &n->children[j];
548 					break;
549 				}
550 			if(j==MEDIUM_NODE_CHILDREN){
551 				/* medium node full! convert it to full node */
552 				if(nnodes>=dict->size_nodes){
553 					fprintf(stderr,"Hspell: Realloc needed (full) - failing.\n");
554 					return 0;
555 				}
556 				memset(&nodes[nnodes], 0, sizeof(nodes[nnodes]));
557 				nodes[nnodes].value = n->value;
558 				if(dict->nfree_nodes_medium>=
559 				   sizeof(dict->free_nodes_medium)/
560 				   sizeof(dict->free_nodes_medium[0])){
561 					fprintf(stderr,"Hspell: overflow in free_nodes_medium.\n");
562 					return 0;
563 				}
564 				dict->free_nodes_medium
565 					[(dict->nfree_nodes_medium)++]=
566 					stack[sdepth]->val_or_index & VALUEMASK;
567 				stack[sdepth]->val_or_index = nnodes | HIGHBITS_FULL;
568 				/* copy the children from n to nodes[nnodes]: */
569 				for(j=0;j<MEDIUM_NODE_CHILDREN;j++)
570 					nodes[nnodes].children[char_to_letter(
571 						n->chars[j])]=
572 						n->children[j];
573 				/* and finally choose the next child */
574 				stack[sdepth+1] = &nodes[nnodes].children[cc];
575 				nnodes++;
576 			}
577 		} else { /* HIGHBITS_FULL */
578 			stack[sdepth+1] = &nodes[
579 			  stack[sdepth]->val_or_index & VALUEMASK].children[cc];
580 		}
581 		sdepth++;
582 	}
583 	/* output last word */
584 	nwords++; /* statistics */
585 #ifdef PREFIX_FILE
586 	stack[sdepth]->val_or_index=getc(prefixes);
587 #else
588 	stack[sdepth]->val_or_index=nwords; /** TODO: different values */
589 #endif
590 
591 	/* return local copies to dict-> structure */
592 	dict->nwords=nwords;
593 	dict->nnodes_small=nnodes_small;
594 	dict->nnodes_medium=nnodes_medium;
595 	dict->nnodes=nnodes;
596 
597 	return 1;
598 }
599 
600 void
print_stats(struct dict_radix * dict)601 print_stats(struct dict_radix *dict)
602 {
603 	fprintf(stderr,	"%d words in %d full nodes, %d medium nodes, "
604 		"%d small nodes.\n", dict->nwords, dict->nnodes,
605 		dict->nnodes_medium, dict->nnodes_small);
606 	fprintf(stderr, "%d nfree_nodes_small %d nfree_nodes_medium.\n",
607 		dict->nfree_nodes_small,dict->nfree_nodes_medium);
608 	fprintf(stderr, "node memory filled: %d K\n",
609 	       (int)(dict->nnodes*sizeof(struct node)
610 		+ dict->nnodes_small*sizeof(struct node_small)
611 		+ dict->nnodes_medium*sizeof(struct node_medium)
612 		       )/1024);
613 }
614 
615 void
print_tree(struct dict_radix * dict)616 print_tree(struct dict_radix *dict)
617 {
618 	char word[256];
619 	do_print_tree(dict->nodes,dict->nodes_small,dict->nodes_medium,
620 		      dict->head,word,0,sizeof(word));
621 
622 }
623 
624 void
print_sizes(struct dict_radix * dict)625 print_sizes(struct dict_radix *dict)
626 {
627 	printf("%d %d %d\n", dict->nnodes_small, dict->nnodes_medium,
628 		dict->nnodes);
629 }
630 
631 int
lookup(const struct dict_radix * dict,const char * word)632 lookup(const struct dict_radix *dict, const char *word)
633 {
634 	struct node_index current = dict->head;
635 	for(;;){
636 		switch(current.val_or_index & HIGHBITS){
637 		case HIGHBITS_VALUE:
638 			if(*word){
639 				/* The word isn't over yet but we reached a
640 				   leaf node. So the word isn't in the dict */
641 				return 0;
642 			} else {
643 				return current.val_or_index & VALUEMASK;
644 			}
645 			break;
646 		case HIGHBITS_SMALL:
647 			if(*word){
648 				struct node_small *n =
649 				      &dict->nodes_small[current.val_or_index
650 							 & VALUEMASK];
651 #if SMALL_NODE_CHILDREN==2
652 				if(n->chars[0]==*word)
653 					current=n->children[0];
654 				else if(n->chars[1]==*word)
655 					current=n->children[1];
656 				else
657 					return 0; /* not found... */
658 #else
659 #error "small node lookup not implemented except for 2 children."
660 #endif
661 			} else {
662 				return dict->nodes_small[current.val_or_index
663 							 & VALUEMASK]
664 					.value;
665 			}
666 			break;
667 		case HIGHBITS_MEDIUM:
668 			if(*word){
669 				struct node_medium *n =
670 				      &dict->nodes_medium[current.val_or_index
671 							  & VALUEMASK];
672 #if MEDIUM_NODE_CHILDREN==8
673 				register char c=*word, *cs=n->chars;
674 				/* TODO: use binary search? stop searching
675 				   on the first 0? All these optimizations
676 				   are probably useless for 8 chars... */
677 				if(*(cs++)==c)      current=n->children[0];
678 				else if(*(cs++)==c) current=n->children[1];
679 				else if(*(cs++)==c) current=n->children[2];
680 				else if(*(cs++)==c) current=n->children[3];
681 				else if(*(cs++)==c) current=n->children[4];
682 				else if(*(cs++)==c) current=n->children[5];
683 				else if(*(cs++)==c) current=n->children[6];
684 				else if(*(cs++)==c) current=n->children[7];
685 				else
686 					return 0; /* not found... */
687 #else
688 #error "medium node lookup not implemented except for 8 children."
689 #endif
690 			} else {
691 				return dict->nodes_medium[current.val_or_index
692 							  & VALUEMASK]
693 					.value;
694 			}
695 			break;
696 		case HIGHBITS_FULL:
697 			if(*word){
698 				/* the following is a copy of char_to_letter */
699 				register int ind;
700 				register unsigned char c = *word;
701 				if(c>=(unsigned char)'�' &&
702 				   c<(unsigned char)'�'+27)
703 					ind = c - (unsigned char)'�' + 2;
704 				else if (c=='"')
705 					ind = 0;
706 				else if (c=='\'')
707 					ind = 1;
708 				else
709 					return 0; /* non-Hebrew letter */
710 				current=dict->nodes[current.val_or_index
711 						    & VALUEMASK]
712 					.children[ind];
713 			} else {
714 				return dict->nodes[current.val_or_index
715 						   & VALUEMASK].value;
716 			}
717 			break;
718 		}
719 		word++;
720 	}
721 }
722