1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright 2013, 2014 Linas Vepstas                                    */
5 /* All rights reserved                                                   */
6 /*                                                                       */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software.    */
9 /* This license allows free redistribution and use in source and binary  */
10 /* forms, with or without modification, subject to certain conditions.   */
11 /*                                                                       */
12 /*************************************************************************/
13 
14 #include <string.h>
15 
16 #include "api-structures.h"             // Sentence_s (add_empty_word)
17 #include "dict-common/dialect.h"
18 #include "dict-common/dict-affix.h"     // is_stem
19 #include "dict-common/dict-common.h"
20 #include "dict-common/dict-defines.h"   // SUBSCRIPT_MARK
21 #include "dict-common/file-utils.h"
22 #include "dict-common/idiom.h"
23 #include "error.h"
24 #include "externs.h"
25 #include "print/print.h"
26 #include "read-dict.h"
27 #include "string-set.h"
28 #include "tokenize/tok-structures.h"    // MT_WALL
29 #include "utilities.h"
30 #include "word-file.h"
31 
32 /*
33   The dictionary format:
34 
35   In what follows:
36     Every "%" symbol and everything after it is ignored on every line.
37     Every newline or tab is replaced by a space.
38 
39   The dictionary file is a sequence of ENTRIES.  Each ENTRY is one or
40   more WORDS (a sequence of upper or lower case letters) separated by
41   spaces, followed by a ":", followed by an EXPRESSION followed by a
42   ";".  An EXPRESSION is an expression where the operators are "&"
43   or "and" or "|" or "or", and there are three types of parentheses:
44   "()", "{}", and "[]".  The terminal symbols of this grammar are the
45   connectors, which are strings of letters or numbers or *s.
46   Expressions are written in infix form.
47 
48   The connector begins with an optional @, which is followed by an upper
49   case sequence of letters. Each subsequent *, lower case letter or
50   number is a subscript. At the end is a + or - sign.  The "@" allows
51   this connector to attach to one or more other connectors.
52 
53   Here is a sample dictionary entry:
54 
55       gone:         T- & {@EV+};
56 
57   (See our paper for more about how to interpret the meaning of the
58   dictionary expressions.)
59 
60   A previously defined word (such as "gone" above) may be used instead
61   of a connector to specify the expression it was defined to be.  Of
62   course, in this case, it must uniquely specify a word in the
63   dictionary, and have been previously defined.
64 
65   If a word is of the form "/foo", then the file current-dir/foo
66   is a so-called word file, and is read in as a list of words.
67   A word file is just a list of words separated by blanks or newlines.
68 
69   A word that contains the character "_" defines an idiomatic use of
70   the words separated by the "_".  For example "kind of" is an idiomatic
71   expression, so a word "kind_of" is defined in the dictionary.
72   Idiomatic expressions of any number of words can be defined in this way.
73   When the word "kind" is encountered, all the idiomatic uses of the word
74   are considered.
75 
76   An expression enclosed in "[..]" is give a cost of 1.  This means
77   that if any of the connectors inside the square braces are used,
78   a cost of 1 is incurred.  (This cost is the first element of the cost
79   vector printed when a sentence is parsed.)  Of course if something is
80   inside of 10 levels of "[..]" then using it incurs a cost of 10.
81   These costs are called "disjunct costs".  The linkages are printed out
82   in order of non-increasing disjunct cost.
83 
84   A number following a square bracket over-rides the cost of that bracket.
85   Thus, [...].5 has a cost of 0.5 while [...]2.0 has a cost of 2; that
86   is it is the same as [[...]].  Any floating point number (including
87   exponents!) is allowed.
88 
89   The expression "(A+ or ())" means that you can choose either "A+" or
90   the empty expression "()", that is, that the connector "A+" is
91   optional.  This is more compactly expressed as "{A+}".  In other words,
92   curly braces indicate an optional expression.
93 
94   The expression "(A+ or [])" is the same as that above, but there is a
95   cost of 1 incurred for choosing not to use "A+".  The expression
96   "(EXP1 & [EXP2])" is exactly the same as "[EXP1 & EXP2]".  The difference
97   between "({[A+]} & B+)" and "([{A+}] & B+)" is that the latter always
98   incurs a cost of 1, while the former only gets a cost of 1 if "A+" is
99   used.
100 
101   The dictionary writer is not allowed to use connectors that begin in
102   "ID".  This is reserved for the connectors automatically
103   generated for idioms.
104 
105   Dictionary words may be followed by a dot (period, "."), and a "subscript"
106   identifying the word type. The subscript may be one or more letters or
107   numbers, but must begin with a letter. Currently, the dictionary contains
108   (mostly?) subscripts consisting of a single letter, and these serve mostly
109   to identify the part-of-speech. In general, subscripts can also be used
110   to distinguish different word senses.
111 */
112 
113 static bool link_advance(Dictionary dict);
114 
dict_error2(Dictionary dict,const char * s,const char * s2)115 static void dict_error2(Dictionary dict, const char * s, const char *s2)
116 {
117 #define ERRBUFLEN 1024
118 	char tokens[ERRBUFLEN], t[ERRBUFLEN];
119 	int pos = 1;
120 	int i;
121 
122 	/* The link_advance used to print the error message can
123 	 * throw more errors while printing... */
124 	if (dict->recursive_error) return;
125 	dict->recursive_error = true;
126 
127 	char token[MAX_TOKEN_LENGTH];
128 	strcpy(token, dict->token);
129 	bool save_is_special    = dict->is_special;
130 	const char * save_input = dict->input;
131 	const char * save_pin   = dict->pin;
132 	int save_already_got_it = dict->already_got_it;
133 	int save_line_number    = dict->line_number;
134 
135 	tokens[0] = '\0';
136 	for (i=0; i<5 && dict->token[0] != '\0'; i++)
137 	{
138 		pos += snprintf(t, ERRBUFLEN, "\"%s\" ", dict->token);
139 		strncat(tokens, t, ERRBUFLEN-1-pos);
140 		if (!link_advance(dict)) break;
141 	}
142 	tokens[pos] = '\0';
143 
144 	strcpy(dict->token, token);
145 	dict->is_special     = save_is_special;
146 	dict->input          = save_input;
147 	dict->pin            = save_pin;
148 	dict->already_got_it = save_already_got_it;
149 	dict->line_number    = save_line_number;
150 
151 	if (s2)
152 	{
153 		prt_error("Error: While parsing dictionary \"%s\":\n"
154 		          "%s %s\n\t Line %d, next tokens: %s\n",
155 		          dict->name, s, s2, dict->line_number, tokens);
156 	}
157 	else
158 	{
159 		prt_error("Error: While parsing dictionary \"%s\":\n"
160 		          "%s\n\t Line %d, next tokens: %s\n",
161 		          dict->name, s, dict->line_number, tokens);
162 	}
163 	dict->recursive_error = false;
164 }
165 
dict_error(Dictionary dict,const char * s)166 static void dict_error(Dictionary dict, const char * s)
167 {
168 	dict_error2(dict, s, NULL);
169 }
170 
warning(Dictionary dict,const char * s)171 static void warning(Dictionary dict, const char * s)
172 {
173 	prt_error("Warning: %s\n"
174 	        "\tline %d, current token = \"%s\"\n",
175 	        s, dict->line_number, dict->token);
176 }
177 
178 /**
179  * This gets the next UTF8 character from the input, eliminating comments.
180  * If we're in quote mode, it does not consider the % character for
181  * comments.   Note that the returned character is a wide character!
182  */
183 #define MAXUTFLEN 7
184 typedef char utf8char[MAXUTFLEN];
get_character(Dictionary dict,int quote_mode,utf8char uc)185 static bool get_character(Dictionary dict, int quote_mode, utf8char uc)
186 {
187 	int i = 0;
188 
189 	while (1)
190 	{
191 		char c = *(dict->pin++);
192 
193 		/* Skip over all comments */
194 		if ((c == '%') && (!quote_mode))
195 		{
196 			if (0 == strncmp(dict->pin, SUPPRESS, sizeof(SUPPRESS)-1))
197 			{
198 				const char *nl = strchr(dict->pin + sizeof(SUPPRESS)-1, '\n');
199 				if (NULL != nl)
200 				{
201 					dict->suppress_warning =
202 						strndup(dict->pin + sizeof(SUPPRESS)-1,
203 					           nl - dict->pin - sizeof(SUPPRESS) + 1);
204 				}
205 			}
206 			while ((c != 0x0) && (c != '\n')) c = *(dict->pin++);
207 			if (c == 0x0) break;
208 			dict->line_number++;
209 			continue;
210 		}
211 
212 		/* Newline counts as whitespace */
213 		if (c == '\n')
214 			dict->line_number++;
215 
216 		/* If it's a 7-bit ascii, we are done */
217 		if ((0 == i) && ((c & 0x80) == 0x0))
218 		{
219 			uc[0] = c;
220 			uc[1] = 0x0;
221 			return true;
222 		}
223 
224 		uc[0] = c;
225 		i = 1;
226 		while (i < MAXUTFLEN-1)
227 		{
228 			c = *(dict->pin++);
229 			/* If we're onto the next char, we're done. */
230 			if (((c & 0x80) == 0x0) || ((c & 0xc0) == 0xc0))
231 			{
232 				dict->pin--;
233 				uc[i] = 0x0;
234 				return true;
235 			}
236 			uc[i] = c;
237 			i++;
238 		}
239 		dict_error(dict, "UTF8 char is too long");
240 		return false;
241 	}
242 	uc[0] = 0x0;
243 	return true;
244 }
245 
246 
247 /*
248  * This set of 10 characters are the ones defining the syntax of the
249  * dictionary.
250  */
251 #define SPECIAL "(){};[]&^|:"
252 
253 /* Commutative (symmetric) AND */
254 #define SYM_AND '^'
255 
256 /* Bi-directional connector: + or - */
257 #define ANY_DIR '$'
258 
259 /* Wild-card link type */
260 #define WILD_TYPE '*'
261 
262 /**
263  * Return true if the input character is one of the special
264  * characters used to define the syntax of the dictionary.
265  */
char_is_special(char c)266 static bool char_is_special(char c)
267 {
268 	return (NULL != strchr(SPECIAL, c));
269 }
270 
271 /**
272  * This reads the next token from the input into 'token'.
273  * Return 1 if a character was read, else return 0 (and print a warning).
274  */
275 NO_SAN_DICT
link_advance(Dictionary dict)276 static bool link_advance(Dictionary dict)
277 {
278 	utf8char c;
279 	int nr, i;
280 	int quote_mode;
281 
282 	dict->is_special = false;
283 
284 	if (dict->already_got_it != '\0')
285 	{
286 		dict->is_special = char_is_special(dict->already_got_it);
287 		if (dict->already_got_it == EOF) {
288 			dict->token[0] = '\0';
289 		} else {
290 			dict->token[0] = (char)dict->already_got_it; /* specials are one byte */
291 			dict->token[1] = '\0';
292 		}
293 		dict->already_got_it = '\0';
294 		return true;
295 	}
296 
297 	do
298 	{
299 		bool ok = get_character(dict, false, c);
300 		if (!ok) return false;
301 	}
302 	while (lg_isspace(c[0]));
303 
304 	quote_mode = false;
305 
306 	i = 0;
307 	for (;;)
308 	{
309 		if (i > MAX_TOKEN_LENGTH-3) {
310 			dict_error(dict, "Token too long");
311 			return false;
312 		}
313 		if (quote_mode) {
314 			if (c[0] == '"' &&
315 			    /* Check the next character too, to allow " in words */
316 			    (*dict->pin == ':' || *dict->pin == ';' ||
317 			    lg_isspace(*dict->pin))) {
318 
319 				dict->token[i] = '\0';
320 				return true;
321 			}
322 			if (lg_isspace(c[0])) {
323 				dict_error(dict, "White space inside of token");
324 				return false;
325 			}
326 			if (c[0] == '\0')
327 			{
328 				dict_error(dict, "EOF while reading quoted token");
329 				return false;
330 			}
331 
332 			nr = 0;
333 			while (c[nr]) {dict->token[i] = c[nr]; i++; nr++; }
334 		} else {
335 			if ('\0' == c[1] && char_is_special(c[0]))
336 			{
337 				if (i == 0)
338 				{
339 					dict->token[0] = c[0];  /* special toks are one char always */
340 					dict->token[1] = '\0';
341 					dict->is_special = true;
342 					return true;
343 				}
344 				dict->token[i] = '\0';
345 				dict->already_got_it = c[0];
346 				return true;
347 			}
348 			if (c[0] == 0x0) {
349 				if (i != 0) dict->already_got_it = '\0';
350 				dict->token[0] = '\0';
351 				return true;
352 			}
353 			if (lg_isspace(c[0])) {
354 				dict->token[i] = '\0';
355 				return true;
356 			}
357 			if (c[0] == '\"') {
358 				quote_mode = true;
359 			} else {
360 				nr = 0;
361 				while (c[nr]) {dict->token[i] = c[nr]; i++; nr++; }
362 			}
363 		}
364 		bool ok = get_character(dict, quote_mode, c);
365 		if (!ok) return false;
366 	}
367 	/* unreachable */
368 }
369 
370 /**
371  * Returns true if this token is a special token and it is equal to c
372  */
is_equal(Dictionary dict,char c)373 static int is_equal(Dictionary dict, char c)
374 {
375 	return (dict->is_special &&
376 	        c == dict->token[0] &&
377 	        dict->token[1] == '\0');
378 }
379 
380 /**
381  * Make sure the string s is a valid connector.
382  * Return true if the connector is valid, else return false,
383  * and print an appropriate warning message.
384  */
check_connector(Dictionary dict,const char * s)385 static bool check_connector(Dictionary dict, const char * s)
386 {
387 	int i;
388 	i = strlen(s);
389 	if (i < 1) {
390 		dict_error(dict, "Expecting a connector.");
391 		return false;
392 	}
393 	i = s[i-1];  /* the last character of the token */
394 	if ((i != '+') && (i != '-') && (i != ANY_DIR)) {
395 		dict_error(dict, "A connector must end in a \"+\", \"-\" or \"$\".");
396 		return false;
397 	}
398 	if (*s == '@') s++;
399 	if (('h' == *s) || ('d' == *s)) s++;
400 	if (!isupper((unsigned char)*s)) {
401 		dict_error(dict, "Connectors must start with uppercase after an optional h or d.");
402 		return false;
403 	}
404 	/* Note that IDx when x is a subscript is allowed (to allow e.g. ID4id+). */
405 	if ((*s == 'I') && (*(s+1) == 'D') && isupper((unsigned char)*(s+2))) {
406 		dict_error(dict, "Connectors beginning with \"ID\" are forbidden");
407 		return false;
408 	}
409 
410 	bool connector_base = true;
411 	s++; /* The first uppercase has been validated above. */
412 	while (*(s+1)) {
413 		if ((!isalnum((unsigned char)*s)) && (*s != WILD_TYPE)) {
414 			dict_error(dict, "All letters of a connector must be ASCII alpha-numeric.");
415 			return false;
416 		}
417 		if (isupper((unsigned char)*s))
418 		{
419 			if (!connector_base)
420 			{
421 				dict_error(dict, "Connector subscript contains uppercase.");
422 				return false;
423 			}
424 		}
425 		else
426 		{
427 			connector_base = false;
428 		}
429 		s++;
430 	}
431 	return true;
432 }
433 
434 /* ======================================================================== */
435 /**
436  * Dictionary entry comparison and ordering functions.
437  *
438  * The data structure storing the dictionary is simply a binary tree.
439  * The entries in the binary tree are sorted by alphabetical order.
440  * There is one catch, however: words may have suffixes (a dot, followed
441  * by the suffix), and these suffixes are to be handled appropriately
442  * during sorting and comparison.
443  *
444  * The use of suffixes means that the ordering of the words is not
445  * exactly the order given by strcmp.  The order must be such that, for
446  * example, "make" < "make.n" < "make-up" -- suffixed words come after
447  * the bare words, but before any other other words with non-alphabetic
448  * characters (such as the hyphen in "make-up", or possibly UTF8
449  * characters). Thus, plain "strcmp" can't be used to determine
450  * dictionary order.
451  *
452  * Thus, a set of specialized string comparison and ordering functions
453  * are provided. These "do the right thing" when matching string with
454  * and without suffixes.
455  */
456 /**
457  * dict_order_strict - order two dictionary words in proper sort order.
458  * Return zero if the strings match, else return in a unique order.
459  * The order is NOT (locale-dependent) UTF8 sort order; its ordered
460  * based on numeric values of single bytes.  This will uniquely order
461  * UTF8 strings, just not in a LANG-dependent (locale-dependent) order.
462  * But we don't need/want locale-dependent ordering!
463  */
464 /* verbose version, for demonstration only */
465 /*
466 int dict_order_strict(char *s, char *t)
467 {
468 	int ss, tt;
469 	while (*s != '\0' && *s == *t) {
470 		s++;
471 		t++;
472 	}
473 	if (*s == SUBSCRIPT_MARK) {
474 		ss = 1;
475 	} else {
476 		ss = (*s)<<1;
477 	}
478 	if (*t == SUBSCRIPT_MARK) {
479 		tt = 1;
480 	} else {
481 		tt = (*t)<<1;
482 	}
483 	return (ss - tt);
484 }
485 */
486 
487 /* terse version */
488 /* If one word contains a dot, the other one must also! */
489 NO_SAN_DICT
dict_order_strict(const char * s,const Dict_node * dn)490 static inline int dict_order_strict(const char *s, const Dict_node * dn)
491 {
492 	const char * t = dn->string;
493 	while (*s != '\0' && *s == *t) {s++; t++;}
494 	return ((*s == SUBSCRIPT_MARK)?(1):(*s))  -  ((*t == SUBSCRIPT_MARK)?(1):(*t));
495 }
496 
497 /**
498  * dict_order_bare() -- order user vs. dictionary string.
499  *
500  * Similar to above, except that a "bare" search string will match
501  * a dictionary entry with a dot.
502  *
503  * Assuming that s is a pointer to the search string, and that t is
504  * a pointer to a dictionary string, this returns 0 if they match,
505  * returns >0 if s>t, and <0 if s<t.
506  *
507  * The matching is done as follows.  Walk down the strings until you
508  * come to the end of one of them, or until you find unequal characters.
509  * If the dictionary string contains a SUBSCRIPT_MARK, then replace the
510  * mark by "\0", and take the difference.
511  */
512 NO_SAN_DICT
dict_order_bare(const char * s,const Dict_node * dn)513 static inline int dict_order_bare(const char *s, const Dict_node * dn)
514 {
515 	const char * t = dn->string;
516 	while (*s != '\0' && *s == *t) {s++; t++;}
517 	return (*s)  -  ((*t == SUBSCRIPT_MARK)?(0):(*t));
518 }
519 
520 /**
521  * dict_order_wild() -- order dictionary strings, with wildcard.
522  *
523  * This routine is used to support command-line parser users who
524  * want to search for all dictionary entries of some given word or
525  * partial word, containing a wild-card. This is done by using the
526  * !!blah* command at the command-line.  Users need this function to
527  * debug the dictionary.  This is the ONLY place in the link-parser
528  * where wild-card search is needed; ordinary parsing does not use it.
529  *
530  * !!blah*.sub is also supported.
531  *
532  * Assuming that s is a pointer to a search string, and that
533  * t is a pointer to a dictionary string, this returns 0 if they
534  * match, >0 if s>t, and <0 if s<t.
535  *
536  * The matching is done as follows.  Walk down the strings until
537  * you come to the end of one of them, or until you find unequal
538  * characters.  A "*" matches anything before the subscript mark.
539  * Otherwise, replace SUBSCRIPT_MARK by "\0", and take the difference.
540  * This behavior matches that of the function dict_order_bare().
541  */
542 #define D_DOW 6
dict_order_wild(const char * s,const Dict_node * dn)543 static inline int dict_order_wild(const char * s, const Dict_node * dn)
544 {
545 	const char * t = dn->string;
546 
547 	lgdebug(+D_DOW, "search-word='%s' dict-word='%s'\n", s, t);
548 	while((*s != '\0') && (*s != SUBSCRIPT_MARK) && (*s == *t)) {s++; t++;}
549 
550 	if (*s == WILD_TYPE) return 0;
551 
552 	lgdebug(D_DOW, "Result: '%s'-'%s'=%d\n",
553 	 s, t, ((*s == SUBSCRIPT_MARK)?(0):(*s)) - ((*t == SUBSCRIPT_MARK)?(0):(*t)));
554 	return ((*s == SUBSCRIPT_MARK)?(0):(*s)) - ((*t == SUBSCRIPT_MARK)?(0):(*t));
555 }
556 #undef D_DOW
557 
558 /**
559  * dict_match --  return true if strings match, else false.
560  * A "bare" string (one without a subscript) will match any corresponding
561  * string with a subscript; so, for example, "make" and "make.n" are
562  * a match.  If both strings have subscripts, then the subscripts must match.
563  *
564  * A subscript is the part that follows the SUBSCRIPT_MARK.
565  */
dict_match(const char * s,const char * t)566 static bool dict_match(const char * s, const char * t)
567 {
568 	while ((*s != '\0') && (*s == *t)) { s++; t++; }
569 
570 	if (*s == *t) return true; /* both are '\0' */
571 	if ((*s == 0) && (*t == SUBSCRIPT_MARK)) return true;
572 	if ((*s == SUBSCRIPT_MARK) && (*t == 0)) return true;
573 
574 	return false;
575 }
576 
577 /* ======================================================================== */
578 
dict_node_new(void)579 static inline Dict_node * dict_node_new(void)
580 {
581 	return (Dict_node*) malloc(sizeof(Dict_node));
582 }
583 
584 /**
585  * prune_lookup_list -- discard all list entries that don't match string
586  * Walk the lookup list (of right links), discarding all nodes that do
587  * not match the dictionary string s. The matching is dictionary matching:
588  * subscripted entries will match "bare" entries.
589  */
prune_lookup_list(Dict_node * restrict llist,const char * restrict s)590 static Dict_node * prune_lookup_list(Dict_node * restrict llist, const char * restrict s)
591 {
592 	Dict_node *dn, *dnx, *list_new;
593 
594 	list_new = NULL;
595 	for (dn = llist; dn != NULL; dn = dnx)
596 	{
597 		dnx = dn->right;
598 		/* now put dn onto the answer list, or free it */
599 		if (dict_match(dn->string, s))
600 		{
601 			dn->right = list_new;
602 			list_new = dn;
603 		}
604 		else
605 		{
606 			free(dn);
607 		}
608 	}
609 
610 	/* now reverse the list back */
611 	llist = NULL;
612 	for (dn = list_new; dn != NULL; dn = dnx)
613 	{
614 		dnx = dn->right;
615 		dn->right = llist;
616 		llist = dn;
617 	}
618 	return llist;
619 }
620 
621 /* ======================================================================== */
subscr_match(const char * s,const Dict_node * dn)622 static bool subscr_match(const char *s, const Dict_node * dn)
623 {
624 	const char * s_sub = strrchr(s, SUBSCRIPT_MARK);
625 	const char * t_sub;
626 
627 	if (NULL == s_sub) return true;
628 	t_sub = strrchr(dn->string, SUBSCRIPT_MARK);
629 	if (NULL == t_sub) return false;
630 	if (0 == strcmp(s_sub, t_sub)) return true;
631 
632 	return false;
633 }
634 
635 /**
636  * rdictionary_lookup() -- recursive dictionary lookup
637  * Walk binary tree, given by 'dn', looking for the string 's'.
638  * For every node in the tree where 's' matches,
639  * make a copy of that node, and append it to llist.
640  */
641 static Dict_node *
rdictionary_lookup(Dict_node * restrict llist,const Dict_node * restrict dn,const char * restrict s,bool match_idiom,int (* dict_order)(const char *,const Dict_node *))642 rdictionary_lookup(Dict_node * restrict llist,
643                    const Dict_node * restrict dn,
644                    const char * restrict s,
645                    bool match_idiom,
646                    int (*dict_order)(const char *, const Dict_node *))
647 {
648 	/* see comment in dictionary_lookup below */
649 	int m;
650 	Dict_node * dn_new;
651 	if (dn == NULL) return llist;
652 
653 	m = dict_order(s, dn);
654 
655 	if (m >= 0)
656 	{
657 		llist = rdictionary_lookup(llist, dn->right, s, match_idiom, dict_order);
658 	}
659 	if ((m == 0) && (match_idiom || !is_idiom_word(dn->string)) &&
660 		 (dict_order != dict_order_wild || subscr_match(s, dn)))
661 	{
662 		dn_new = dict_node_new();
663 		*dn_new = *dn;
664 		dn_new->right = llist;
665 		llist = dn_new;
666 	}
667 	if (m <= 0)
668 	{
669 		llist = rdictionary_lookup(llist, dn->left, s, match_idiom, dict_order);
670 	}
671 	return llist;
672 }
673 
674 /**
675  * file_lookup_list() - return list of words in the file-backed dictionary.
676  *
677  * Returns a pointer to a lookup list of the words in the dictionary.
678  * Matches include words that appear in idioms.  To exclude idioms, use
679  * abridged_lookup_list() to obtain matches.
680  *
681  * This list is made up of Dict_nodes, linked by their right pointers.
682  * The node, file and string fields are copied from the dictionary.
683  *
684  * The returned list must be freed with file_free_lookup().
685  */
file_lookup_list(const Dictionary dict,const char * s)686 Dict_node * file_lookup_list(const Dictionary dict, const char *s)
687 {
688 	Dict_node * llist =
689 		rdictionary_lookup(NULL, dict->root, s, true, dict_order_bare);
690 	llist = prune_lookup_list(llist, s);
691 	return llist;
692 }
693 
file_boolean_lookup(Dictionary dict,const char * s)694 bool file_boolean_lookup(Dictionary dict, const char *s)
695 {
696 	Dict_node *llist = file_lookup_list(dict, s);
697 	bool boool = (llist != NULL);
698 	file_free_lookup(llist);
699 	return boool;
700 }
701 
file_free_lookup(Dict_node * llist)702 void file_free_lookup(Dict_node *llist)
703 {
704 	Dict_node * n;
705 	while (llist != NULL)
706 	{
707 		n = llist->right;
708 		free(llist);
709 		llist = n;
710 	}
711 }
712 
free_insert_list(Dict_node * ilist)713 void free_insert_list(Dict_node *ilist)
714 {
715 	Dict_node * n;
716 	while (ilist != NULL)
717 	{
718 		n = ilist->left;
719 		free(ilist);
720 		ilist = n;
721 	}
722 }
723 
724 /**
725  * file_lookup_wild -- allows for wildcard searches (globs)
726  * Used to support the !! command in the parser command-line tool.
727  */
file_lookup_wild(Dictionary dict,const char * s)728 Dict_node * file_lookup_wild(Dictionary dict, const char *s)
729 {
730 	bool lookup_idioms = test_enabled("lookup-idioms");
731 	char * ds = strrchr(s, SUBSCRIPT_DOT); /* Only the rightmost dot is a
732 	                                          candidate for SUBSCRIPT_DOT */
733 	char * ws = strrchr(s, WILD_TYPE);     /* A SUBSCRIPT_DOT can only appear
734                                              after a wild-card */
735 	Dict_node * result;
736 	char * stmp = strdup(s);
737 
738 	/* It is not a SUBSCRIPT_DOT if it is at the end or before the wild-card.
739 	 * E.g: "Dr.", "i.*", "." */
740 	if ((NULL != ds) && ('\0' != ds[1]) && ((NULL == ws) || (ds > ws)))
741 		stmp[ds-s] = SUBSCRIPT_MARK;
742 
743 	result =
744 	 rdictionary_lookup(NULL, dict->root, stmp, lookup_idioms, dict_order_wild);
745 	free(stmp);
746 	return result;
747 }
748 
749 /**
750  * abridged_lookup_list() - return lookup list of words in the dictionary
751  *
752  * Returns a pointer to a lookup list of the words in the dictionary.
753  * Excludes any idioms that contain the word; use
754  * dictionary_lookup_list() to obtain the complete list.
755  *
756  * This list is made up of Dict_nodes, linked by their right pointers.
757  * The node, file and string fields are copied from the dictionary.
758  *
759  * The returned list must be freed with file_free_lookup().
760  */
abridged_lookup_list(const Dictionary dict,const char * s)761 static Dict_node * abridged_lookup_list(const Dictionary dict, const char *s)
762 {
763 	Dict_node *llist;
764 	llist = rdictionary_lookup(NULL, dict->root, s, false, dict_order_bare);
765 	llist = prune_lookup_list(llist, s);
766 	return llist;
767 }
768 
769 /**
770  * strict_lookup_list() - return exact match in the dictionary
771  *
772  * Returns a pointer to a lookup list of the words in the dictionary.
773  * Excludes any idioms that contain the word.
774  *
775  * This list is made up of Dict_nodes, linked by their right pointers.
776  * The node, file and string fields are copied from the dictionary.
777  *
778  * The list normally has 0 or 1 elements, unless the given word
779  * appears more than once in the dictionary.
780  *
781  * The returned list must be freed with file_free_lookup().
782  */
strict_lookup_list(const Dictionary dict,const char * s)783 static Dict_node * strict_lookup_list(const Dictionary dict, const char *s)
784 {
785 	Dict_node *llist;
786 	llist = rdictionary_lookup(NULL, dict->root, s, false, dict_order_strict);
787 	llist = prune_lookup_list(llist, s);
788 	return llist;
789 }
790 
791 /* ======================================================================== */
792 /**
793  * Allocate a new Exp node.
794  */
Exp_create(Pool_desc * mp)795 Exp *Exp_create(Pool_desc *mp)
796 {
797 	Exp *e = pool_alloc(mp);
798 	e->tag_type = Exptag_none;
799 	return e;
800 }
801 
802 /**
803  * Duplicate the given Exp node.
804  * This is needed in case it participates more than once in a
805  * single expression.
806  */
Exp_create_dup(Pool_desc * mp,Exp * old_e)807 Exp *Exp_create_dup(Pool_desc *mp, Exp *old_e)
808 {
809 	Exp *new_e = Exp_create(mp);
810 
811 	*new_e = *old_e;
812 
813 	return new_e;
814 }
815 
816 /**
817  * This creates a node with zero children.  Initializes
818  * the cost to zero.
819  */
make_zeroary_node(Pool_desc * mp)820 static Exp * make_zeroary_node(Pool_desc *mp)
821 {
822 	Exp * n = Exp_create(mp);
823 	n->type = AND_type;  /* these must be AND types */
824 	n->cost = 0.0;
825 	n->operand_first = NULL;
826 	n->operand_next = NULL;
827 	return n;
828 }
829 
830 /**
831  * This creates a node with one child (namely e).  Initializes
832  * the cost to zero.
833  */
make_unary_node(Pool_desc * mp,Exp * e)834 Exp *make_unary_node(Pool_desc *mp, Exp * e)
835 {
836 	Exp * n;
837 	n = Exp_create(mp);
838 	n->type = AND_type;  /* these must be AND types */
839 	n->operand_next = NULL;
840 	n->cost = 0.0;
841 	n->operand_first = e;
842 	return n;
843 }
844 
845 /**
846  * Create an AND_type expression. The expressions nl, nr will be
847  * AND-ed together.
848  */
make_and_node(Pool_desc * mp,Exp * nl,Exp * nr)849 static Exp * make_and_node(Pool_desc *mp, Exp* nl, Exp* nr)
850 {
851 	Exp* n;
852 
853 	n = Exp_create(mp);
854 	n->type = AND_type;
855 	n->operand_next = NULL;
856 	n->cost = 0.0;
857 
858 	n->operand_first = nl;
859 	nl->operand_next = nr;
860 	nr->operand_next = NULL;
861 
862 	return n;
863 }
864 
make_op_Exp(Pool_desc * mp,Exp_type t)865 static Exp *make_op_Exp(Pool_desc *mp, Exp_type t)
866 {
867 	Exp * n = Exp_create(mp);
868 	n->type = t;
869 	n->operand_next = NULL;
870 	n->cost = 0.0;
871 
872 	/* The caller is supposed to assign n->operand. */
873 	return n;
874 }
875 
876 /**
877  * Create an OR_type expression. The expressions nl, nr will be
878  * OR-ed together.
879  */
make_or_node(Pool_desc * mp,Exp * nl,Exp * nr)880 static Exp * make_or_node(Pool_desc *mp, Exp* nl, Exp* nr)
881 {
882 	Exp* n;
883 
884 	n = Exp_create(mp);
885 	n->type = OR_type;
886 	n->operand_next = NULL;
887 	n->cost = 0.0;
888 
889 	n->operand_first = nl;
890 	nl->operand_next = nr;
891 	nr->operand_next = NULL;
892 
893 	return n;
894 }
895 
896 /**
897  * This creates an OR node with two children, one the given node,
898  * and the other as zeroary node.  This has the effect of creating
899  * what used to be called an optional node.
900  */
make_optional_node(Pool_desc * mp,Exp * e)901 static Exp *make_optional_node(Pool_desc *mp, Exp *e)
902 {
903 	return make_or_node(mp, make_zeroary_node(mp), e);
904 }
905 
906 /**
907  * make_dir_connector() -- make a single node for a connector
908  * that is a + or a - connector.
909  *
910  * Assumes the current token is the connector.
911  */
make_dir_connector(Dictionary dict,int i)912 static Exp * make_dir_connector(Dictionary dict, int i)
913 {
914 	Exp* n = Exp_create(dict->Exp_pool);
915 	char *constring;
916 
917 	n->dir = dict->token[i];
918 	dict->token[i] = '\0';   /* get rid of the + or - */
919 	if (dict->token[0] == '@')
920 	{
921 		constring = dict->token+1;
922 		n->multi = true;
923 	}
924 	else
925 	{
926 		constring = dict->token;
927 		n->multi = false;
928 	}
929 
930 	n->condesc = condesc_add(&dict->contable,
931 	                         string_set_add(constring, dict->string_set));
932 	if (NULL == n->condesc) return NULL; /* Table ovf */
933 	n->type = CONNECTOR_type;
934 	n->operand_next = NULL; /* unused, but accessed by copy_Exp() and some more */
935 	n->cost = 0.0;
936 	return n;
937 }
938 
939 /* ======================================================================== */
940 /**
941  * Add an optional macro/word tag, for expression debugging.
942  * Enabled by !test="macro-tag". This tag is used only in expression printing.
943  */
exptag_macro_add(Dictionary dict,const char * tag)944 static unsigned int exptag_macro_add(Dictionary dict, const char *tag)
945 {
946 	expression_tag *mt = dict->macro_tag;
947 	if (mt == NULL) return 0;
948 
949 	if (mt->num == mt->size)
950 	{
951 		if (mt->num == 0)
952 			mt->size = 128;
953 		else
954 			mt->size *= 2;
955 
956 		mt->name = realloc(mt->name, mt->size * sizeof(*mt->name));
957 	}
958 	mt->name[mt->num] = tag;
959 
960 	return mt->num++;
961 }
962 
963 /**
964  * make_connector() -- make a node for a connector or dictionary word.
965  *
966  * Assumes the current token is a connector or dictionary word.
967  */
make_connector(Dictionary dict)968 static Exp * make_connector(Dictionary dict)
969 {
970 	Exp * n;
971 	Dict_node *dn, *dn_head;
972 	int i;
973 
974 	i = strlen(dict->token) - 1;  /* this must be +, - or $ if a connector */
975 	if ((dict->token[i] != '+') &&
976 	    (dict->token[i] != '-') &&
977 	    (dict->token[i] != ANY_DIR))
978 	{
979 		/* If we are here, token is a word */
980 		patch_subscript(dict->token);
981 		dn_head = abridged_lookup_list(dict, dict->token);
982 		dn = dn_head;
983 		while ((dn != NULL) && (strcmp(dn->string, dict->token) != 0))
984 		{
985 			dn = dn->right;
986 		}
987 		if (dn == NULL)
988 		{
989 			file_free_lookup(dn_head);
990 			dict_error(dict, "Perhaps missing + or - in a connector.\n"
991 			                 "Or perhaps you forgot the subscript on a word.\n"
992 			                 "Or perhaps a word is used before it is defined.");
993 			return NULL;
994 		}
995 
996 		/* Wrap it in a unary node as a placeholder for a macro tag and cost. */
997 		n = make_unary_node(dict->Exp_pool, dn->exp);
998 		n->tag_id = exptag_macro_add(dict, dn->string);
999 		if (n->tag_id != 0) n->tag_type = Exptag_macro;
1000 
1001 		file_free_lookup(dn_head);
1002 	}
1003 	else
1004 	{
1005 		/* If we are here, token is a connector */
1006 		if (!check_connector(dict, dict->token))
1007 		{
1008 			return NULL;
1009 		}
1010 		if ((dict->token[i] == '+') || (dict->token[i] == '-'))
1011 		{
1012 			/* A simple, unidirectional connector. Just make that. */
1013 			n = make_dir_connector(dict, i);
1014 			if (NULL == n) return NULL;
1015 		}
1016 		else if (dict->token[i] == ANY_DIR)
1017 		{
1018 			Exp *plu, *min;
1019 			/* If we are here, then it's a bi-directional connector.
1020 			 * Make both a + and a - version, and or them together.  */
1021 			dict->token[i] = '+';
1022 			plu = make_dir_connector(dict, i);
1023 			if (NULL == plu) return NULL;
1024 			dict->token[i] = '-';
1025 			min = make_dir_connector(dict, i);
1026 			if (NULL == min) return NULL;
1027 
1028 			n = make_or_node(dict->Exp_pool, plu, min);
1029 		}
1030 		else
1031 		{
1032 			dict_error(dict, "Unknown connector direction type.");
1033 			return NULL;
1034 		}
1035 	}
1036 
1037 	if (!link_advance(dict))
1038 	{
1039 		free(n);
1040 		return NULL;
1041 	}
1042 	return n;
1043 }
1044 
1045 /* ======================================================================== */
1046 /* Empty-word handling. */
1047 
1048 /** Insert ZZZ+ connectors.
1049  *  This function was mainly used to support using empty-words, a concept
1050  *  that has been eliminated. However, it is still used to support linking of
1051  *  quotes that don't get the QUc/QUd links.
1052  */
add_empty_word(Sentence sent,X_node * x)1053 void add_empty_word(Sentence sent, X_node *x)
1054 {
1055 	Exp *zn, *an;
1056 	const char *ZZZ = string_set_lookup(EMPTY_CONNECTOR, sent->dict->string_set);
1057 	/* This function is called only if ZZZ is in the dictionary. */
1058 
1059 	/* The left-wall already has ZZZ-. The right-wall will not arrive here. */
1060 	if (MT_WALL == x->word->morpheme_type) return;
1061 
1062 	/* Replace plain-word-exp by {ZZZ+} & (plain-word-exp) in each X_node.  */
1063 	for(; NULL != x; x = x->next)
1064 	{
1065 		/* Ignore stems for now, decreases a little the overhead for
1066 		 * stem-suffix languages. */
1067 		if (is_stem(x->string)) continue; /* Avoid an unneeded overhead. */
1068 		//lgdebug(+0, "Processing '%s'\n", x->string);
1069 
1070 		/* zn points at {ZZZ+} */
1071 		zn = Exp_create(sent->Exp_pool);
1072 		zn->dir = '+';
1073 		zn->condesc = condesc_add(&sent->dict->contable, ZZZ);
1074 		zn->multi = false;
1075 		zn->type = CONNECTOR_type;
1076 		zn->operand_next = NULL; /* unused, but to be on the safe side */
1077 		zn->cost = 0.0;
1078 		zn = make_optional_node(sent->Exp_pool, zn);
1079 
1080 		/* an will be {ZZZ+} & (plain-word-exp) */
1081 		an = Exp_create(sent->Exp_pool);
1082 		an->type = AND_type;
1083 		an->operand_next = NULL;
1084 		an->cost = 0.0;
1085 		an->operand_first = zn;
1086 		zn->operand_next = x->exp;
1087 
1088 		x->exp = an;
1089 	}
1090 }
1091 
1092 /* ======================================================================== */
1093 
1094 /**
1095  * Return true if the string is a (floating point) number.
1096  * Float points can be proceeded by a single plus or minus sign.
1097  */
is_number(const char * str)1098 static bool is_number(const char * str)
1099 {
1100 	if (str[0] == '\0') return false; /* End of file. */
1101 	if ('+' == str[0] || '-' == str[0]) str++;
1102 	size_t numlen = strspn(str, "0123456789.");
1103 
1104 	return str[numlen] == '\0';
1105 }
1106 
1107 /* ======================================================================== */
1108 
1109 /**
1110  * Build (and return the root of) the tree for the expression beginning
1111  * with the current token.  At the end, the token is the first one not
1112  * part of this expression.
1113  */
make_expression(Dictionary dict)1114 static Exp *make_expression(Dictionary dict)
1115 {
1116 	Exp *nl = NULL;
1117 	Exp *e_head = NULL;
1118 	Exp *e_tail = NULL; /* last part of the expression */
1119 	bool is_sym_and = false;
1120 
1121 	while (true)
1122 	{
1123 		if (is_equal(dict, '('))
1124 		{
1125 			if (!link_advance(dict)) {
1126 				return NULL;
1127 			}
1128 			nl = make_expression(dict);
1129 			if (nl == NULL) {
1130 				return NULL;
1131 			}
1132 			if (!is_equal(dict, ')')) {
1133 				dict_error(dict, "Expecting a \")\".");
1134 				return NULL;
1135 			}
1136 			if (!link_advance(dict)) {
1137 				return NULL;
1138 			}
1139 		}
1140 		else if (is_equal(dict, '{'))
1141 		{
1142 			if (!link_advance(dict)) {
1143 				return NULL;
1144 			}
1145 			nl = make_expression(dict);
1146 			if (nl == NULL) {
1147 				return NULL;
1148 			}
1149 			if (!is_equal(dict, '}')) {
1150 				dict_error(dict, "Expecting a \"}\".");
1151 				return NULL;
1152 			}
1153 			if (!link_advance(dict)) {
1154 				return NULL;
1155 			}
1156 			nl = make_optional_node(dict->Exp_pool, nl);
1157 		}
1158 		else if (is_equal(dict, '['))
1159 		{
1160 			if (!link_advance(dict)) {
1161 				return NULL;
1162 			}
1163 			nl = make_expression(dict);
1164 			if (nl == NULL) {
1165 				return NULL;
1166 			}
1167 			if (!is_equal(dict, ']')) {
1168 				dict_error(dict, "Expecting a \"]\".");
1169 				return NULL;
1170 			}
1171 			if (!link_advance(dict)) {
1172 				return NULL;
1173 			}
1174 
1175 			/* A square bracket can have a number or a name after it.
1176 			 * If a number is present, then that it is interpreted
1177 			 * as the cost of the bracket. If a name is present, it
1178 			 * is used as an expression tag. Else, the cost of a
1179 			 * square bracket is 1.0.
1180 			 */
1181 			if (is_number(dict->token))
1182 			{
1183 				float cost;
1184 
1185 				if (strtodC(dict->token, &cost))
1186 				{
1187 					nl->cost += cost;
1188 				}
1189 				else
1190 				{
1191 					warning(dict, "Invalid cost (using 1.0)\n");
1192 					nl->cost += 1.0;
1193 				}
1194 				if (!link_advance(dict)) {
1195 					return NULL;
1196 				}
1197 			}
1198 			else if ((strcmp(dict->token, "or") != 0) &&
1199 			         (strcmp(dict->token, "and") != 0) &&
1200 			         isalpha(dict->token[0]))
1201 			{
1202 				const char *bad = valid_dialect_name(dict->token);
1203 				if (bad != NULL)
1204 				{
1205 					char badchar[] = { *bad, '\0' };
1206 					dict_error2(dict, "Invalid character in dialect tag name:",
1207 					           badchar);
1208 					return NULL;
1209 				}
1210 				if (nl->tag_type != Exptag_none)
1211 				{
1212 					nl = make_unary_node(dict->Exp_pool, nl);
1213 				}
1214 				nl->tag_id = exptag_dialect_add(dict, dict->token);
1215 				nl->tag_type = Exptag_dialect;
1216 				if (!link_advance(dict)) {
1217 					return NULL;
1218 				}
1219 			}
1220 			else
1221 			{
1222 				nl->cost += 1.0;
1223 			}
1224 		}
1225 		else if (!dict->is_special)
1226 		{
1227 			nl = make_connector(dict);
1228 			if (nl == NULL) {
1229 				return NULL;
1230 			}
1231 		}
1232 		else if (is_equal(dict, ')') || is_equal(dict, ']'))
1233 		{
1234 			/* allows "()" or "[]" */
1235 			nl = make_zeroary_node(dict->Exp_pool);
1236 		}
1237 		else
1238 		{
1239 			dict_error(dict, "Connector, \"(\", \"[\", or \"{\" expected.");
1240 			return NULL;
1241 		}
1242 
1243 		if (is_sym_and)
1244 		{
1245 			/* Part 2/2 of SYM_AND processing. */
1246 
1247 			/* Expand A ^ B into the expr ((A & B) or (B & A)). */
1248 			Exp *na = make_and_node(dict->Exp_pool,
1249 			                   Exp_create_dup(dict->Exp_pool, e_tail),
1250 			                   Exp_create_dup(dict->Exp_pool, nl));
1251 			Exp *nb = make_and_node(dict->Exp_pool,
1252 			                   Exp_create_dup(dict->Exp_pool, nl),
1253 			                   Exp_create_dup(dict->Exp_pool, e_tail));
1254 			Exp *or = make_or_node(dict->Exp_pool, na, nb);
1255 
1256 			*e_tail = *or; /* SYM_AND result */
1257 			is_sym_and = false;
1258 		}
1259 		else if (e_tail != NULL)
1260 		{
1261 			/* This is Non-commuting AND or Commuting OR.
1262 			 * Append the just read expression (nl) to its operand chain. */
1263 			e_tail->operand_next = nl;
1264 			e_tail = nl;
1265 		}
1266 
1267 		/* Extract the operator. */
1268 
1269 		Exp_type op;
1270 
1271 		/* Non-commuting AND */
1272 		if (is_equal(dict, '&') || (strcmp(dict->token, "and") == 0))
1273 		{
1274 			op = AND_type;
1275 		}
1276 		/* Commuting OR */
1277 		else if (is_equal(dict, '|') || (strcmp(dict->token, "or") == 0))
1278 		{
1279 			op =  OR_type;
1280 		}
1281 		/* Commuting AND */
1282 		else if (is_equal(dict, SYM_AND) || (strcmp(dict->token, "sym") == 0))
1283 		{
1284 			/* Part 1/2 of SYM_AND processing */
1285 			op = AND_type; /* allow mixing with ordinary ands at the same level */
1286 			is_sym_and = true; /* processing to be completed after next argument */
1287 		}
1288 		else
1289 		{
1290 			if (e_head != NULL) return e_head;
1291 			return nl;
1292 		}
1293 
1294 		/* If this is the first operand, use nl for it.  Else just validate
1295 		 * that the current operator is consistent with that of the current
1296 		 * expression level. */
1297 		if (e_head == NULL)
1298 		{
1299 			e_head = make_op_Exp(dict->Exp_pool, op);
1300 			e_head->operand_first = nl;
1301 		}
1302 		else
1303 		{
1304 			if (e_head->type != op)
1305 			{
1306 				dict_error(dict, "\"and\" and \"or\" at the same level in an expression");
1307 				return NULL;
1308 			}
1309 		}
1310 
1311 		if (!link_advance(dict)) {
1312 			return NULL;
1313 		}
1314 
1315 		if (e_tail == NULL)
1316 			e_tail = e_head->operand_first;
1317 	}
1318 		/* unreachable */
1319 }
1320 
1321 /* ======================================================================== */
1322 /* Implementation of the DSW algo for rebalancing a binary tree.
1323  * The point is -- after building the dictionary tree, we rebalance it
1324  * once at the end. This is a **LOT LOT** quicker than maintaining an
1325  * AVL tree along the way (less than quarter-of-a-second vs. about
1326  * a minute or more!) FWIW, the DSW tree is even more balanced than
1327  * the AVL tree is (it's less deep, more full).
1328  *
1329  * The DSW algo, with C++ code, is described in
1330  *
1331  * Timothy J. Rolfe, "One-Time Binary Search Tree Balancing:
1332  * The Day/Stout/Warren (DSW) Algorithm", inroads, Vol. 34, No. 4
1333  * (December 2002), pp. 85-88
1334  * http://penguin.ewu.edu/~trolfe/DSWpaper/
1335  */
1336 
rotate_right(Dict_node * root)1337 static Dict_node *rotate_right(Dict_node *root)
1338 {
1339 	Dict_node *pivot = root->left;
1340 	root->left = pivot->right;
1341 	pivot->right = root;
1342 	return pivot;
1343 }
1344 
dsw_tree_to_vine(Dict_node * root)1345 static Dict_node * dsw_tree_to_vine (Dict_node *root)
1346 {
1347 	Dict_node *vine_tail, *vine_head, *rest;
1348 	Dict_node vh;
1349 
1350 	vine_head = &vh;
1351 	vine_head->left = NULL;
1352 	vine_head->right = root;
1353 	vine_tail = vine_head;
1354 	rest = root;
1355 
1356 	while (NULL != rest)
1357 	{
1358 		/* If no left, we are done, do the right */
1359 		if (NULL == rest->left)
1360 		{
1361 			vine_tail = rest;
1362 			rest = rest->right;
1363 		}
1364 		/* eliminate the left subtree */
1365 		else
1366 		{
1367 			rest = rotate_right(rest);
1368 			vine_tail->right = rest;
1369 		}
1370 	}
1371 
1372 	return vh.right;
1373 }
1374 
1375 NO_SAN_DICT
dsw_compression(Dict_node * root,unsigned int count)1376 static void dsw_compression (Dict_node *root, unsigned int count)
1377 {
1378 	unsigned int j;
1379 	for (j = 0; j < count; j++)
1380 	{
1381 		/* Compound left rotation */
1382 		Dict_node * pivot = root->right;
1383 		root->right = pivot->right;
1384 		root = pivot->right;
1385 		pivot->right = root->left;
1386 		root->left = pivot;
1387 	}
1388 }
1389 
1390 /* Return size of the full portion of the tree
1391  * Gets the next pow(2,k)-1
1392  */
full_tree_size(unsigned int size)1393 static inline unsigned int full_tree_size (unsigned int size)
1394 {
1395 	unsigned int pk = 1;
1396 	while (pk < size) pk = 2*pk + 1;
1397 	return pk/2;
1398 }
1399 
dsw_vine_to_tree(Dict_node * root,int size)1400 static Dict_node * dsw_vine_to_tree (Dict_node *root, int size)
1401 {
1402 	Dict_node vine_head;
1403 	unsigned int full_count = full_tree_size(size +1);
1404 
1405 	vine_head.left = NULL;
1406 	vine_head.right = root;
1407 
1408 	dsw_compression(&vine_head, size - full_count);
1409 	for (size = full_count; size > 1; size /= 2)
1410 	{
1411 		dsw_compression(&vine_head, size / 2);
1412 	}
1413 	return vine_head.right;
1414 }
1415 
1416 /* ======================================================================== */
1417 /**
1418  * Insert the new node into the dictionary below node n.
1419  * "newnode" left and right fields are NULL, and its string is already
1420  * there.  If the string is already found in the dictionary, give an error
1421  * message and effectively ignore it.
1422  *
1423  * The resulting tree is highly unbalanced. It needs to be rebalanced
1424  * before being used.  The DSW algo below is ideal for that.
1425  */
1426 NO_SAN_DICT
insert_dict(Dictionary dict,Dict_node * n,Dict_node * newnode)1427 Dict_node *insert_dict(Dictionary dict, Dict_node *n, Dict_node *newnode)
1428 {
1429 	if (NULL == n) return newnode;
1430 
1431 	static Exp null_exp =
1432 	{
1433 		.type = AND_type,
1434 		.operand_first = NULL,
1435 		.operand_next = NULL,
1436 	};
1437 	int comp = dict_order_strict(newnode->string, n);
1438 
1439 	if (0 == comp &&
1440 	    /* Suppress reporting duplicate idioms until they are fixed. */
1441 	    (!contains_underbar(newnode->string) || test_enabled("dup-idioms")))
1442 	{
1443 		char t[80+MAX_TOKEN_LENGTH];
1444 		snprintf(t, sizeof(t),
1445 		         "Ignoring word \"%s\", which has been multiply defined:",
1446 		         newnode->string);
1447 		dict_error(dict, t);
1448 		/* Too late to skip insertion - insert it with a null expression. */
1449 		newnode->exp = &null_exp;
1450 		comp = -1;
1451 	}
1452 
1453 	if (comp < 0)
1454 	{
1455 		if (NULL == n->left)
1456 		{
1457 			n->left = newnode;
1458 			return n;
1459 		}
1460 		n->left = insert_dict(dict, n->left, newnode);
1461 	}
1462 	else
1463 	{
1464 		if (NULL == n->right)
1465 		{
1466 			n->right = newnode;
1467 			return n;
1468 		}
1469 		n->right = insert_dict(dict, n->right, newnode);
1470 	}
1471 
1472 	return n;
1473 	/* return rebalance(n); Uncomment to get an AVL tree */
1474 }
1475 
1476 /**
1477  * Find if a warning symbol exists in the currently suppress list.
1478  * The warning symbols are constructed in a way that disallow overlap
1479  * matching.
1480  */
is_warning_suppressed(Dictionary dict,const char * warning_symbol)1481 static bool is_warning_suppressed(Dictionary dict, const char *warning_symbol)
1482 {
1483 	if (NULL == dict->suppress_warning) return false;
1484 	return (NULL != strstr(dict->suppress_warning, warning_symbol));
1485 }
1486 
1487 /**
1488  * Remember the length_limit definitions in a list according to their order.
1489  * The order is kept to allow later more specific definitions to override
1490  * already applied ones.
1491  */
add_condesc_length_limit(Dictionary dict,Dict_node * dn,int length_limit)1492 static void add_condesc_length_limit(Dictionary dict, Dict_node *dn,
1493                                      int length_limit)
1494 {
1495 	length_limit_def_t *lld = malloc(sizeof(*lld));
1496 	lld->next = NULL;
1497 	lld->length_limit = length_limit;
1498 	lld->defexp = dn->exp;
1499 	lld->defword = dn->string;
1500 	*dict->contable.length_limit_def_next = lld;
1501 	dict->contable.length_limit_def_next = &lld->next;
1502 }
1503 
insert_length_limit(Dictionary dict,Dict_node * dn)1504 static void insert_length_limit(Dictionary dict, Dict_node *dn)
1505 {
1506 	int length_limit;
1507 
1508 	if (0 == strcmp(UNLIMITED_CONNECTORS_WORD, dn->string))
1509 	{
1510 		length_limit = UNLIMITED_LEN;
1511 	}
1512 	else
1513 	if (0 == strncmp(LIMITED_CONNECTORS_WORD, dn->string,
1514 	                 sizeof(LIMITED_CONNECTORS_WORD)-1))
1515 	{
1516 		char *endp;
1517 		length_limit =
1518 			(int)strtol(dn->string + sizeof(LIMITED_CONNECTORS_WORD)-1, &endp, 10);
1519 		if ((length_limit < 0) || (length_limit > MAX_SENTENCE) ||
1520 		  (('\0' != *endp) && (SUBSCRIPT_MARK != *endp)))
1521 		{
1522 			prt_error("Warning: Word \"%s\" found near line %d of \"%s\".\n"
1523 					  "\tThis word should end with a number (1-%d).\n"
1524 					  "\tThis word will be ignored.\n",
1525 					  dn->string, dict->line_number, dict->name, MAX_SENTENCE);
1526 			return;
1527 		}
1528 	}
1529 	else return;
1530 
1531 	/* We cannot set the connectors length_limit yet because the
1532 	 * needed data structure is not defined yet. For now, just
1533 	 * remember the definitions in their order. */
1534 	add_condesc_length_limit(dict, dn, length_limit);
1535 }
1536 
1537 /**
1538  * insert_list() -
1539  * p points to a list of dict_nodes connected by their left pointers.
1540  * l is the length of this list (the last ptr may not be NULL).
1541  * It inserts the list into the dictionary.
1542  * It does the middle one first, then the left half, then the right.
1543  *
1544  * Note: I think this "insert middle, then left, then right" algo has
1545  * its origins as a lame attempt to hack around the fact that the
1546  * resulting binary tree is rather badly unbalanced. This has been
1547  * fixed by using the DSW rebalancing algo. Now, that would seem
1548  * to render this crazy bisected-insertion algo obsolete, but ..
1549  * oddly enough, it seems to make the DSW balancing go really fast!
1550  * Faster than a simple insertion. Go figure. I think this has
1551  * something to do with the fact that the dictionaries are in
1552  * alphabetical order! This subdivision helps randomize a bit.
1553  */
insert_list(Dictionary dict,Dict_node * p,int l)1554 void insert_list(Dictionary dict, Dict_node * p, int l)
1555 {
1556 	Dict_node * dn, *dn_head, *dn_second_half;
1557 	int k, i; /* length of first half */
1558 
1559 	if (l == 0) return;
1560 
1561 	k = (l-1)/2;
1562 	dn = p;
1563 	for (i = 0; i < k; i++)
1564 	{
1565 		dn = dn->left;
1566 	}
1567 
1568 	/* dn now points to the middle element */
1569 	dn_second_half = dn->left;
1570 	dn->left = dn->right = NULL;
1571 
1572 	if (is_idiom_word(dn->string))
1573 	{
1574 		prt_error("Warning: Word \"%s\" found near line %d of \"%s\".\n"
1575 		        "\tWords ending \".Ix\" (x a number) are reserved for idioms.\n"
1576 		        "\tThis word will be ignored.\n",
1577 		        dn->string, dict->line_number, dict->name);
1578 		free(dn);
1579 	}
1580 	else
1581 	{
1582 		if (contains_underbar(dn->string))
1583 		{
1584 			insert_idiom(dict, dn);
1585 		}
1586 
1587 		dict->root = insert_dict(dict, dict->root, dn);
1588 		insert_length_limit(dict, dn);
1589 		dict->num_entries++;
1590 
1591 		if ((verbosity_level(D_DICT+0) && !is_warning_suppressed(dict, DUP_BASE)) ||
1592 		    verbosity_level(D_SPEC+3))
1593 		{
1594 			/* Warn if there are words with a subscript that match a bare word. */
1595 			const char *sm = strchr(dn->string, SUBSCRIPT_MARK);
1596 			char *bareword;
1597 
1598 			if (NULL != sm)
1599 			{
1600 				bareword = strdupa(dn->string);
1601 				bareword[sm - dn->string] = '\0';
1602 			}
1603 			else
1604 			{
1605 				bareword = (char *)dn->string;
1606 			}
1607 
1608 			if ((dn_head = strict_lookup_list(dict, bareword)) != NULL)
1609 			{
1610 				bool match_found = false;
1611 				for (Dict_node *dnx = dn_head; dnx != NULL; dnx = dnx->right) {
1612 					if (0 != strcmp(dnx->string, dn->string))
1613 					{
1614 						if (!match_found)
1615 						{
1616 							prt_error("Warning: The word \"%s\" found near line "
1617 										 "%d of \"%s\"\n\t matches the following words:",
1618 										 dn->string, dict->line_number, dict->name);
1619 							match_found = true;
1620 						}
1621 						prt_error("\t%s", dnx->string);
1622 					}
1623 				}
1624 				if (match_found) prt_error("\n");
1625 				file_free_lookup(dn_head);
1626 			}
1627 		}
1628 	}
1629 
1630 	insert_list(dict, p, k);
1631 	insert_list(dict, dn_second_half, l-k-1);
1632 }
1633 
1634 /**
1635  * read_entry() -- read one dictionary entry
1636  * Starting with the current token, parse one dictionary entry.
1637  * A single dictionary entry must have one and only one colon in it,
1638  * and is terminated by a semi-colon.
1639  * Add these words to the dictionary.
1640  */
read_entry(Dictionary dict)1641 static bool read_entry(Dictionary dict)
1642 {
1643 	Exp *n;
1644 	int i;
1645 
1646 	Dict_node *dnx, *dn = NULL;
1647 
1648 	while (!is_equal(dict, ':'))
1649 	{
1650 		if (dict->is_special)
1651 		{
1652 			dict_error(dict, "I expected a word but didn\'t get it.");
1653 			goto syntax_error;
1654 		}
1655 
1656 		/* If it's a word-file name */
1657 		/* However, be careful to reject "/.v" which is the division symbol
1658 		 * used in equations (.v means verb-like) */
1659 		if ((dict->token[0] == '/') && (dict->token[1] != '.'))
1660 		{
1661 			dn = read_word_file(dict, dn, dict->token);
1662 			if (dn == NULL)
1663 			{
1664 				prt_error("Error: Cannot open word file \"%s\".\n", dict->token);
1665 				return false;
1666 			}
1667 		}
1668 		else if (0 == strcmp(dict->token, "#include"))
1669 		{
1670 			bool rc;
1671 			char* instr;
1672 			char* dict_name;
1673 			const char * save_name;
1674 			bool save_is_special;
1675 			const char * save_input;
1676 			const char * save_pin;
1677 			int save_already_got_it;
1678 			int save_line_number;
1679 			size_t skip_slash;
1680 
1681 			if (!link_advance(dict)) goto syntax_error;
1682 
1683 			skip_slash          = ('/' == dict->token[0]) ? 1 : 0;
1684 			dict_name           = strdupa(dict->token);
1685 			save_name           = dict->name;
1686 			save_is_special     = dict->is_special;
1687 			save_input          = dict->input;
1688 			save_pin            = dict->pin;
1689 			save_already_got_it = dict->already_got_it;
1690 			save_line_number    = dict->line_number;
1691 
1692 			/* OK, token contains the filename to read ... */
1693 			instr = get_file_contents(dict_name + skip_slash);
1694 			if (NULL == instr)
1695 			{
1696 				prt_error("Error: While parsing dictionary \"%s\":\n"
1697 				          "\t Line %d: Could not open subdictionary \"%s\"\n",
1698 				          dict->name, dict->line_number-1, dict_name);
1699 				goto syntax_error;
1700 			}
1701 			dict->input = instr;
1702 			dict->pin = dict->input;
1703 
1704 			/* The line number and dict name are used for error reporting */
1705 			dict->line_number = 1;
1706 			dict->name = dict_name;
1707 
1708 			/* Now read the thing in. */
1709 			rc = read_dictionary(dict);
1710 
1711 			dict->name           = save_name;
1712 			dict->is_special     = save_is_special;
1713 			dict->input          = save_input;
1714 			dict->pin            = save_pin;
1715 			dict->already_got_it = save_already_got_it;
1716 			dict->line_number    = save_line_number;
1717 
1718 			free(instr);
1719 			if (!rc) goto syntax_error;
1720 
1721 			/* when we return, point to the next entry */
1722 			if (!link_advance(dict)) goto syntax_error;
1723 
1724 			/* If a semicolon follows the include, that's OK... ignore it. */
1725 			if (';' == dict->token[0])
1726 			{
1727 				if (!link_advance(dict)) goto syntax_error;
1728 			}
1729 
1730 			return true;
1731 		}
1732 		else
1733 		{
1734 			Dict_node * dn_new = dict_node_new();
1735 			dn_new->left = dn;
1736 			dn_new->right = NULL;
1737 			dn_new->exp = NULL;
1738 			dn = dn_new;
1739 			dn->file = NULL;
1740 
1741 			/* Note: The following patches a dot in regexes appearing in
1742 			 * the affix file... It is corrected later. */
1743 			patch_subscript(dict->token);
1744 			dn->string = string_set_add(dict->token, dict->string_set);
1745 		}
1746 
1747 		/* Advance to next entry, unless error */
1748 		if (!link_advance(dict)) goto syntax_error;
1749 	}
1750 
1751 	/* pass the : */
1752 	if (!link_advance(dict))
1753 	{
1754 		goto syntax_error;
1755 	}
1756 
1757 	n = make_expression(dict);
1758 	if (n == NULL)
1759 	{
1760 		goto syntax_error;
1761 	}
1762 
1763 	if (!is_equal(dict, ';'))
1764 	{
1765 		dict_error(dict, "Expecting \";\" at the end of an entry.");
1766 		goto syntax_error;
1767 	}
1768 
1769 	if (dn == NULL)
1770 	{
1771 		dict_error(dict, "Expecting a token before \":\".");
1772 		goto syntax_error;
1773 	}
1774 
1775 	/* At this point, dn points to a list of Dict_nodes connected by
1776 	 * their left pointers. These are to be inserted into the dictionary. */
1777 	i = 0;
1778 	for (dnx = dn; dnx != NULL; dnx = dnx->left)
1779 	{
1780 		dnx->exp = n;
1781 		i++;
1782 	}
1783 	dict->insert_entry(dict, dn, i);
1784 
1785 	if (dict->suppress_warning)
1786 	{
1787 		free((void *)dict->suppress_warning);
1788 		dict->suppress_warning = NULL;
1789 	}
1790 
1791 	/* pass the ; */
1792 	if (!link_advance(dict))
1793 	{
1794 		/* Avoid freeing dn, since it is already inserted into the dict. */
1795 		return false;
1796 	}
1797 
1798 	return true;
1799 
1800 syntax_error:
1801 	free_insert_list(dn);
1802 	return false;
1803 }
1804 
1805 
rprint_dictionary_data(Dict_node * n)1806 static void rprint_dictionary_data(Dict_node * n)
1807 {
1808 	if (n == NULL) return;
1809 	rprint_dictionary_data(n->left);
1810 	printf("%s: %s\n", n->string, lg_exp_stringify(n->exp));
1811 	rprint_dictionary_data(n->right);
1812 }
1813 
1814 /**
1815  * Dump the entire contents of the dictionary
1816  * XXX This is not currently called by anything, but is a "good thing
1817  * to keep around".
1818  */
print_dictionary_data(Dictionary dict)1819 void print_dictionary_data(Dictionary dict)
1820 {
1821 	rprint_dictionary_data(dict->root);
1822 }
1823 
read_dictionary(Dictionary dict)1824 bool read_dictionary(Dictionary dict)
1825 {
1826 	if (!link_advance(dict))
1827 	{
1828 		return false;
1829 	}
1830 	/* The last character of a dictionary is NUL.
1831 	 * Note: At the end of reading a dictionary, dict->pin points to one
1832 	 * character after the input. Referring its [-1] element is safe even if
1833 	 * the dict file size is 0. */
1834 	while ('\0' != dict->pin[-1])
1835 	{
1836 		if (!read_entry(dict))
1837 		{
1838 			return false;
1839 		}
1840 	}
1841 	dict->root = dsw_tree_to_vine(dict->root);
1842 	dict->root = dsw_vine_to_tree(dict->root, dict->num_entries);
1843 	return true;
1844 }
1845 
1846 /* ======================================================================= */
1847