1 /*************************************************************************/
2 /* Copyright (c) 2004 */
3 /* Daniel Sleator, David Temperley, and John Lafferty */
4 /* Copyright 2013, 2014 Linas Vepstas */
5 /* All rights reserved */
6 /* */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software. */
9 /* This license allows free redistribution and use in source and binary */
10 /* forms, with or without modification, subject to certain conditions. */
11 /* */
12 /*************************************************************************/
13
14 #include <string.h>
15
16 #include "api-structures.h" // Sentence_s (add_empty_word)
17 #include "dict-common/dialect.h"
18 #include "dict-common/dict-affix.h" // is_stem
19 #include "dict-common/dict-common.h"
20 #include "dict-common/dict-defines.h" // SUBSCRIPT_MARK
21 #include "dict-common/file-utils.h"
22 #include "dict-common/idiom.h"
23 #include "error.h"
24 #include "externs.h"
25 #include "print/print.h"
26 #include "read-dict.h"
27 #include "string-set.h"
28 #include "tokenize/tok-structures.h" // MT_WALL
29 #include "utilities.h"
30 #include "word-file.h"
31
32 /*
33 The dictionary format:
34
35 In what follows:
36 Every "%" symbol and everything after it is ignored on every line.
37 Every newline or tab is replaced by a space.
38
39 The dictionary file is a sequence of ENTRIES. Each ENTRY is one or
40 more WORDS (a sequence of upper or lower case letters) separated by
41 spaces, followed by a ":", followed by an EXPRESSION followed by a
42 ";". An EXPRESSION is an expression where the operators are "&"
43 or "and" or "|" or "or", and there are three types of parentheses:
44 "()", "{}", and "[]". The terminal symbols of this grammar are the
45 connectors, which are strings of letters or numbers or *s.
46 Expressions are written in infix form.
47
48 The connector begins with an optional @, which is followed by an upper
49 case sequence of letters. Each subsequent *, lower case letter or
50 number is a subscript. At the end is a + or - sign. The "@" allows
51 this connector to attach to one or more other connectors.
52
53 Here is a sample dictionary entry:
54
55 gone: T- & {@EV+};
56
57 (See our paper for more about how to interpret the meaning of the
58 dictionary expressions.)
59
60 A previously defined word (such as "gone" above) may be used instead
61 of a connector to specify the expression it was defined to be. Of
62 course, in this case, it must uniquely specify a word in the
63 dictionary, and have been previously defined.
64
65 If a word is of the form "/foo", then the file current-dir/foo
66 is a so-called word file, and is read in as a list of words.
67 A word file is just a list of words separated by blanks or newlines.
68
69 A word that contains the character "_" defines an idiomatic use of
70 the words separated by the "_". For example "kind of" is an idiomatic
71 expression, so a word "kind_of" is defined in the dictionary.
72 Idiomatic expressions of any number of words can be defined in this way.
73 When the word "kind" is encountered, all the idiomatic uses of the word
74 are considered.
75
76 An expression enclosed in "[..]" is give a cost of 1. This means
77 that if any of the connectors inside the square braces are used,
78 a cost of 1 is incurred. (This cost is the first element of the cost
79 vector printed when a sentence is parsed.) Of course if something is
80 inside of 10 levels of "[..]" then using it incurs a cost of 10.
81 These costs are called "disjunct costs". The linkages are printed out
82 in order of non-increasing disjunct cost.
83
84 A number following a square bracket over-rides the cost of that bracket.
85 Thus, [...].5 has a cost of 0.5 while [...]2.0 has a cost of 2; that
86 is it is the same as [[...]]. Any floating point number (including
87 exponents!) is allowed.
88
89 The expression "(A+ or ())" means that you can choose either "A+" or
90 the empty expression "()", that is, that the connector "A+" is
91 optional. This is more compactly expressed as "{A+}". In other words,
92 curly braces indicate an optional expression.
93
94 The expression "(A+ or [])" is the same as that above, but there is a
95 cost of 1 incurred for choosing not to use "A+". The expression
96 "(EXP1 & [EXP2])" is exactly the same as "[EXP1 & EXP2]". The difference
97 between "({[A+]} & B+)" and "([{A+}] & B+)" is that the latter always
98 incurs a cost of 1, while the former only gets a cost of 1 if "A+" is
99 used.
100
101 The dictionary writer is not allowed to use connectors that begin in
102 "ID". This is reserved for the connectors automatically
103 generated for idioms.
104
105 Dictionary words may be followed by a dot (period, "."), and a "subscript"
106 identifying the word type. The subscript may be one or more letters or
107 numbers, but must begin with a letter. Currently, the dictionary contains
108 (mostly?) subscripts consisting of a single letter, and these serve mostly
109 to identify the part-of-speech. In general, subscripts can also be used
110 to distinguish different word senses.
111 */
112
113 static bool link_advance(Dictionary dict);
114
dict_error2(Dictionary dict,const char * s,const char * s2)115 static void dict_error2(Dictionary dict, const char * s, const char *s2)
116 {
117 #define ERRBUFLEN 1024
118 char tokens[ERRBUFLEN], t[ERRBUFLEN];
119 int pos = 1;
120 int i;
121
122 /* The link_advance used to print the error message can
123 * throw more errors while printing... */
124 if (dict->recursive_error) return;
125 dict->recursive_error = true;
126
127 char token[MAX_TOKEN_LENGTH];
128 strcpy(token, dict->token);
129 bool save_is_special = dict->is_special;
130 const char * save_input = dict->input;
131 const char * save_pin = dict->pin;
132 int save_already_got_it = dict->already_got_it;
133 int save_line_number = dict->line_number;
134
135 tokens[0] = '\0';
136 for (i=0; i<5 && dict->token[0] != '\0'; i++)
137 {
138 pos += snprintf(t, ERRBUFLEN, "\"%s\" ", dict->token);
139 strncat(tokens, t, ERRBUFLEN-1-pos);
140 if (!link_advance(dict)) break;
141 }
142 tokens[pos] = '\0';
143
144 strcpy(dict->token, token);
145 dict->is_special = save_is_special;
146 dict->input = save_input;
147 dict->pin = save_pin;
148 dict->already_got_it = save_already_got_it;
149 dict->line_number = save_line_number;
150
151 if (s2)
152 {
153 prt_error("Error: While parsing dictionary \"%s\":\n"
154 "%s %s\n\t Line %d, next tokens: %s\n",
155 dict->name, s, s2, dict->line_number, tokens);
156 }
157 else
158 {
159 prt_error("Error: While parsing dictionary \"%s\":\n"
160 "%s\n\t Line %d, next tokens: %s\n",
161 dict->name, s, dict->line_number, tokens);
162 }
163 dict->recursive_error = false;
164 }
165
dict_error(Dictionary dict,const char * s)166 static void dict_error(Dictionary dict, const char * s)
167 {
168 dict_error2(dict, s, NULL);
169 }
170
warning(Dictionary dict,const char * s)171 static void warning(Dictionary dict, const char * s)
172 {
173 prt_error("Warning: %s\n"
174 "\tline %d, current token = \"%s\"\n",
175 s, dict->line_number, dict->token);
176 }
177
178 /**
179 * This gets the next UTF8 character from the input, eliminating comments.
180 * If we're in quote mode, it does not consider the % character for
181 * comments. Note that the returned character is a wide character!
182 */
183 #define MAXUTFLEN 7
184 typedef char utf8char[MAXUTFLEN];
get_character(Dictionary dict,int quote_mode,utf8char uc)185 static bool get_character(Dictionary dict, int quote_mode, utf8char uc)
186 {
187 int i = 0;
188
189 while (1)
190 {
191 char c = *(dict->pin++);
192
193 /* Skip over all comments */
194 if ((c == '%') && (!quote_mode))
195 {
196 if (0 == strncmp(dict->pin, SUPPRESS, sizeof(SUPPRESS)-1))
197 {
198 const char *nl = strchr(dict->pin + sizeof(SUPPRESS)-1, '\n');
199 if (NULL != nl)
200 {
201 dict->suppress_warning =
202 strndup(dict->pin + sizeof(SUPPRESS)-1,
203 nl - dict->pin - sizeof(SUPPRESS) + 1);
204 }
205 }
206 while ((c != 0x0) && (c != '\n')) c = *(dict->pin++);
207 if (c == 0x0) break;
208 dict->line_number++;
209 continue;
210 }
211
212 /* Newline counts as whitespace */
213 if (c == '\n')
214 dict->line_number++;
215
216 /* If it's a 7-bit ascii, we are done */
217 if ((0 == i) && ((c & 0x80) == 0x0))
218 {
219 uc[0] = c;
220 uc[1] = 0x0;
221 return true;
222 }
223
224 uc[0] = c;
225 i = 1;
226 while (i < MAXUTFLEN-1)
227 {
228 c = *(dict->pin++);
229 /* If we're onto the next char, we're done. */
230 if (((c & 0x80) == 0x0) || ((c & 0xc0) == 0xc0))
231 {
232 dict->pin--;
233 uc[i] = 0x0;
234 return true;
235 }
236 uc[i] = c;
237 i++;
238 }
239 dict_error(dict, "UTF8 char is too long");
240 return false;
241 }
242 uc[0] = 0x0;
243 return true;
244 }
245
246
247 /*
248 * This set of 10 characters are the ones defining the syntax of the
249 * dictionary.
250 */
251 #define SPECIAL "(){};[]&^|:"
252
253 /* Commutative (symmetric) AND */
254 #define SYM_AND '^'
255
256 /* Bi-directional connector: + or - */
257 #define ANY_DIR '$'
258
259 /* Wild-card link type */
260 #define WILD_TYPE '*'
261
262 /**
263 * Return true if the input character is one of the special
264 * characters used to define the syntax of the dictionary.
265 */
char_is_special(char c)266 static bool char_is_special(char c)
267 {
268 return (NULL != strchr(SPECIAL, c));
269 }
270
271 /**
272 * This reads the next token from the input into 'token'.
273 * Return 1 if a character was read, else return 0 (and print a warning).
274 */
275 NO_SAN_DICT
link_advance(Dictionary dict)276 static bool link_advance(Dictionary dict)
277 {
278 utf8char c;
279 int nr, i;
280 int quote_mode;
281
282 dict->is_special = false;
283
284 if (dict->already_got_it != '\0')
285 {
286 dict->is_special = char_is_special(dict->already_got_it);
287 if (dict->already_got_it == EOF) {
288 dict->token[0] = '\0';
289 } else {
290 dict->token[0] = (char)dict->already_got_it; /* specials are one byte */
291 dict->token[1] = '\0';
292 }
293 dict->already_got_it = '\0';
294 return true;
295 }
296
297 do
298 {
299 bool ok = get_character(dict, false, c);
300 if (!ok) return false;
301 }
302 while (lg_isspace(c[0]));
303
304 quote_mode = false;
305
306 i = 0;
307 for (;;)
308 {
309 if (i > MAX_TOKEN_LENGTH-3) {
310 dict_error(dict, "Token too long");
311 return false;
312 }
313 if (quote_mode) {
314 if (c[0] == '"' &&
315 /* Check the next character too, to allow " in words */
316 (*dict->pin == ':' || *dict->pin == ';' ||
317 lg_isspace(*dict->pin))) {
318
319 dict->token[i] = '\0';
320 return true;
321 }
322 if (lg_isspace(c[0])) {
323 dict_error(dict, "White space inside of token");
324 return false;
325 }
326 if (c[0] == '\0')
327 {
328 dict_error(dict, "EOF while reading quoted token");
329 return false;
330 }
331
332 nr = 0;
333 while (c[nr]) {dict->token[i] = c[nr]; i++; nr++; }
334 } else {
335 if ('\0' == c[1] && char_is_special(c[0]))
336 {
337 if (i == 0)
338 {
339 dict->token[0] = c[0]; /* special toks are one char always */
340 dict->token[1] = '\0';
341 dict->is_special = true;
342 return true;
343 }
344 dict->token[i] = '\0';
345 dict->already_got_it = c[0];
346 return true;
347 }
348 if (c[0] == 0x0) {
349 if (i != 0) dict->already_got_it = '\0';
350 dict->token[0] = '\0';
351 return true;
352 }
353 if (lg_isspace(c[0])) {
354 dict->token[i] = '\0';
355 return true;
356 }
357 if (c[0] == '\"') {
358 quote_mode = true;
359 } else {
360 nr = 0;
361 while (c[nr]) {dict->token[i] = c[nr]; i++; nr++; }
362 }
363 }
364 bool ok = get_character(dict, quote_mode, c);
365 if (!ok) return false;
366 }
367 /* unreachable */
368 }
369
370 /**
371 * Returns true if this token is a special token and it is equal to c
372 */
is_equal(Dictionary dict,char c)373 static int is_equal(Dictionary dict, char c)
374 {
375 return (dict->is_special &&
376 c == dict->token[0] &&
377 dict->token[1] == '\0');
378 }
379
380 /**
381 * Make sure the string s is a valid connector.
382 * Return true if the connector is valid, else return false,
383 * and print an appropriate warning message.
384 */
check_connector(Dictionary dict,const char * s)385 static bool check_connector(Dictionary dict, const char * s)
386 {
387 int i;
388 i = strlen(s);
389 if (i < 1) {
390 dict_error(dict, "Expecting a connector.");
391 return false;
392 }
393 i = s[i-1]; /* the last character of the token */
394 if ((i != '+') && (i != '-') && (i != ANY_DIR)) {
395 dict_error(dict, "A connector must end in a \"+\", \"-\" or \"$\".");
396 return false;
397 }
398 if (*s == '@') s++;
399 if (('h' == *s) || ('d' == *s)) s++;
400 if (!isupper((unsigned char)*s)) {
401 dict_error(dict, "Connectors must start with uppercase after an optional h or d.");
402 return false;
403 }
404 /* Note that IDx when x is a subscript is allowed (to allow e.g. ID4id+). */
405 if ((*s == 'I') && (*(s+1) == 'D') && isupper((unsigned char)*(s+2))) {
406 dict_error(dict, "Connectors beginning with \"ID\" are forbidden");
407 return false;
408 }
409
410 bool connector_base = true;
411 s++; /* The first uppercase has been validated above. */
412 while (*(s+1)) {
413 if ((!isalnum((unsigned char)*s)) && (*s != WILD_TYPE)) {
414 dict_error(dict, "All letters of a connector must be ASCII alpha-numeric.");
415 return false;
416 }
417 if (isupper((unsigned char)*s))
418 {
419 if (!connector_base)
420 {
421 dict_error(dict, "Connector subscript contains uppercase.");
422 return false;
423 }
424 }
425 else
426 {
427 connector_base = false;
428 }
429 s++;
430 }
431 return true;
432 }
433
434 /* ======================================================================== */
435 /**
436 * Dictionary entry comparison and ordering functions.
437 *
438 * The data structure storing the dictionary is simply a binary tree.
439 * The entries in the binary tree are sorted by alphabetical order.
440 * There is one catch, however: words may have suffixes (a dot, followed
441 * by the suffix), and these suffixes are to be handled appropriately
442 * during sorting and comparison.
443 *
444 * The use of suffixes means that the ordering of the words is not
445 * exactly the order given by strcmp. The order must be such that, for
446 * example, "make" < "make.n" < "make-up" -- suffixed words come after
447 * the bare words, but before any other other words with non-alphabetic
448 * characters (such as the hyphen in "make-up", or possibly UTF8
449 * characters). Thus, plain "strcmp" can't be used to determine
450 * dictionary order.
451 *
452 * Thus, a set of specialized string comparison and ordering functions
453 * are provided. These "do the right thing" when matching string with
454 * and without suffixes.
455 */
456 /**
457 * dict_order_strict - order two dictionary words in proper sort order.
458 * Return zero if the strings match, else return in a unique order.
459 * The order is NOT (locale-dependent) UTF8 sort order; its ordered
460 * based on numeric values of single bytes. This will uniquely order
461 * UTF8 strings, just not in a LANG-dependent (locale-dependent) order.
462 * But we don't need/want locale-dependent ordering!
463 */
464 /* verbose version, for demonstration only */
465 /*
466 int dict_order_strict(char *s, char *t)
467 {
468 int ss, tt;
469 while (*s != '\0' && *s == *t) {
470 s++;
471 t++;
472 }
473 if (*s == SUBSCRIPT_MARK) {
474 ss = 1;
475 } else {
476 ss = (*s)<<1;
477 }
478 if (*t == SUBSCRIPT_MARK) {
479 tt = 1;
480 } else {
481 tt = (*t)<<1;
482 }
483 return (ss - tt);
484 }
485 */
486
487 /* terse version */
488 /* If one word contains a dot, the other one must also! */
489 NO_SAN_DICT
dict_order_strict(const char * s,const Dict_node * dn)490 static inline int dict_order_strict(const char *s, const Dict_node * dn)
491 {
492 const char * t = dn->string;
493 while (*s != '\0' && *s == *t) {s++; t++;}
494 return ((*s == SUBSCRIPT_MARK)?(1):(*s)) - ((*t == SUBSCRIPT_MARK)?(1):(*t));
495 }
496
497 /**
498 * dict_order_bare() -- order user vs. dictionary string.
499 *
500 * Similar to above, except that a "bare" search string will match
501 * a dictionary entry with a dot.
502 *
503 * Assuming that s is a pointer to the search string, and that t is
504 * a pointer to a dictionary string, this returns 0 if they match,
505 * returns >0 if s>t, and <0 if s<t.
506 *
507 * The matching is done as follows. Walk down the strings until you
508 * come to the end of one of them, or until you find unequal characters.
509 * If the dictionary string contains a SUBSCRIPT_MARK, then replace the
510 * mark by "\0", and take the difference.
511 */
512 NO_SAN_DICT
dict_order_bare(const char * s,const Dict_node * dn)513 static inline int dict_order_bare(const char *s, const Dict_node * dn)
514 {
515 const char * t = dn->string;
516 while (*s != '\0' && *s == *t) {s++; t++;}
517 return (*s) - ((*t == SUBSCRIPT_MARK)?(0):(*t));
518 }
519
520 /**
521 * dict_order_wild() -- order dictionary strings, with wildcard.
522 *
523 * This routine is used to support command-line parser users who
524 * want to search for all dictionary entries of some given word or
525 * partial word, containing a wild-card. This is done by using the
526 * !!blah* command at the command-line. Users need this function to
527 * debug the dictionary. This is the ONLY place in the link-parser
528 * where wild-card search is needed; ordinary parsing does not use it.
529 *
530 * !!blah*.sub is also supported.
531 *
532 * Assuming that s is a pointer to a search string, and that
533 * t is a pointer to a dictionary string, this returns 0 if they
534 * match, >0 if s>t, and <0 if s<t.
535 *
536 * The matching is done as follows. Walk down the strings until
537 * you come to the end of one of them, or until you find unequal
538 * characters. A "*" matches anything before the subscript mark.
539 * Otherwise, replace SUBSCRIPT_MARK by "\0", and take the difference.
540 * This behavior matches that of the function dict_order_bare().
541 */
542 #define D_DOW 6
dict_order_wild(const char * s,const Dict_node * dn)543 static inline int dict_order_wild(const char * s, const Dict_node * dn)
544 {
545 const char * t = dn->string;
546
547 lgdebug(+D_DOW, "search-word='%s' dict-word='%s'\n", s, t);
548 while((*s != '\0') && (*s != SUBSCRIPT_MARK) && (*s == *t)) {s++; t++;}
549
550 if (*s == WILD_TYPE) return 0;
551
552 lgdebug(D_DOW, "Result: '%s'-'%s'=%d\n",
553 s, t, ((*s == SUBSCRIPT_MARK)?(0):(*s)) - ((*t == SUBSCRIPT_MARK)?(0):(*t)));
554 return ((*s == SUBSCRIPT_MARK)?(0):(*s)) - ((*t == SUBSCRIPT_MARK)?(0):(*t));
555 }
556 #undef D_DOW
557
558 /**
559 * dict_match -- return true if strings match, else false.
560 * A "bare" string (one without a subscript) will match any corresponding
561 * string with a subscript; so, for example, "make" and "make.n" are
562 * a match. If both strings have subscripts, then the subscripts must match.
563 *
564 * A subscript is the part that follows the SUBSCRIPT_MARK.
565 */
dict_match(const char * s,const char * t)566 static bool dict_match(const char * s, const char * t)
567 {
568 while ((*s != '\0') && (*s == *t)) { s++; t++; }
569
570 if (*s == *t) return true; /* both are '\0' */
571 if ((*s == 0) && (*t == SUBSCRIPT_MARK)) return true;
572 if ((*s == SUBSCRIPT_MARK) && (*t == 0)) return true;
573
574 return false;
575 }
576
577 /* ======================================================================== */
578
dict_node_new(void)579 static inline Dict_node * dict_node_new(void)
580 {
581 return (Dict_node*) malloc(sizeof(Dict_node));
582 }
583
584 /**
585 * prune_lookup_list -- discard all list entries that don't match string
586 * Walk the lookup list (of right links), discarding all nodes that do
587 * not match the dictionary string s. The matching is dictionary matching:
588 * subscripted entries will match "bare" entries.
589 */
prune_lookup_list(Dict_node * restrict llist,const char * restrict s)590 static Dict_node * prune_lookup_list(Dict_node * restrict llist, const char * restrict s)
591 {
592 Dict_node *dn, *dnx, *list_new;
593
594 list_new = NULL;
595 for (dn = llist; dn != NULL; dn = dnx)
596 {
597 dnx = dn->right;
598 /* now put dn onto the answer list, or free it */
599 if (dict_match(dn->string, s))
600 {
601 dn->right = list_new;
602 list_new = dn;
603 }
604 else
605 {
606 free(dn);
607 }
608 }
609
610 /* now reverse the list back */
611 llist = NULL;
612 for (dn = list_new; dn != NULL; dn = dnx)
613 {
614 dnx = dn->right;
615 dn->right = llist;
616 llist = dn;
617 }
618 return llist;
619 }
620
621 /* ======================================================================== */
subscr_match(const char * s,const Dict_node * dn)622 static bool subscr_match(const char *s, const Dict_node * dn)
623 {
624 const char * s_sub = strrchr(s, SUBSCRIPT_MARK);
625 const char * t_sub;
626
627 if (NULL == s_sub) return true;
628 t_sub = strrchr(dn->string, SUBSCRIPT_MARK);
629 if (NULL == t_sub) return false;
630 if (0 == strcmp(s_sub, t_sub)) return true;
631
632 return false;
633 }
634
635 /**
636 * rdictionary_lookup() -- recursive dictionary lookup
637 * Walk binary tree, given by 'dn', looking for the string 's'.
638 * For every node in the tree where 's' matches,
639 * make a copy of that node, and append it to llist.
640 */
641 static Dict_node *
rdictionary_lookup(Dict_node * restrict llist,const Dict_node * restrict dn,const char * restrict s,bool match_idiom,int (* dict_order)(const char *,const Dict_node *))642 rdictionary_lookup(Dict_node * restrict llist,
643 const Dict_node * restrict dn,
644 const char * restrict s,
645 bool match_idiom,
646 int (*dict_order)(const char *, const Dict_node *))
647 {
648 /* see comment in dictionary_lookup below */
649 int m;
650 Dict_node * dn_new;
651 if (dn == NULL) return llist;
652
653 m = dict_order(s, dn);
654
655 if (m >= 0)
656 {
657 llist = rdictionary_lookup(llist, dn->right, s, match_idiom, dict_order);
658 }
659 if ((m == 0) && (match_idiom || !is_idiom_word(dn->string)) &&
660 (dict_order != dict_order_wild || subscr_match(s, dn)))
661 {
662 dn_new = dict_node_new();
663 *dn_new = *dn;
664 dn_new->right = llist;
665 llist = dn_new;
666 }
667 if (m <= 0)
668 {
669 llist = rdictionary_lookup(llist, dn->left, s, match_idiom, dict_order);
670 }
671 return llist;
672 }
673
674 /**
675 * file_lookup_list() - return list of words in the file-backed dictionary.
676 *
677 * Returns a pointer to a lookup list of the words in the dictionary.
678 * Matches include words that appear in idioms. To exclude idioms, use
679 * abridged_lookup_list() to obtain matches.
680 *
681 * This list is made up of Dict_nodes, linked by their right pointers.
682 * The node, file and string fields are copied from the dictionary.
683 *
684 * The returned list must be freed with file_free_lookup().
685 */
file_lookup_list(const Dictionary dict,const char * s)686 Dict_node * file_lookup_list(const Dictionary dict, const char *s)
687 {
688 Dict_node * llist =
689 rdictionary_lookup(NULL, dict->root, s, true, dict_order_bare);
690 llist = prune_lookup_list(llist, s);
691 return llist;
692 }
693
file_boolean_lookup(Dictionary dict,const char * s)694 bool file_boolean_lookup(Dictionary dict, const char *s)
695 {
696 Dict_node *llist = file_lookup_list(dict, s);
697 bool boool = (llist != NULL);
698 file_free_lookup(llist);
699 return boool;
700 }
701
file_free_lookup(Dict_node * llist)702 void file_free_lookup(Dict_node *llist)
703 {
704 Dict_node * n;
705 while (llist != NULL)
706 {
707 n = llist->right;
708 free(llist);
709 llist = n;
710 }
711 }
712
free_insert_list(Dict_node * ilist)713 void free_insert_list(Dict_node *ilist)
714 {
715 Dict_node * n;
716 while (ilist != NULL)
717 {
718 n = ilist->left;
719 free(ilist);
720 ilist = n;
721 }
722 }
723
724 /**
725 * file_lookup_wild -- allows for wildcard searches (globs)
726 * Used to support the !! command in the parser command-line tool.
727 */
file_lookup_wild(Dictionary dict,const char * s)728 Dict_node * file_lookup_wild(Dictionary dict, const char *s)
729 {
730 bool lookup_idioms = test_enabled("lookup-idioms");
731 char * ds = strrchr(s, SUBSCRIPT_DOT); /* Only the rightmost dot is a
732 candidate for SUBSCRIPT_DOT */
733 char * ws = strrchr(s, WILD_TYPE); /* A SUBSCRIPT_DOT can only appear
734 after a wild-card */
735 Dict_node * result;
736 char * stmp = strdup(s);
737
738 /* It is not a SUBSCRIPT_DOT if it is at the end or before the wild-card.
739 * E.g: "Dr.", "i.*", "." */
740 if ((NULL != ds) && ('\0' != ds[1]) && ((NULL == ws) || (ds > ws)))
741 stmp[ds-s] = SUBSCRIPT_MARK;
742
743 result =
744 rdictionary_lookup(NULL, dict->root, stmp, lookup_idioms, dict_order_wild);
745 free(stmp);
746 return result;
747 }
748
749 /**
750 * abridged_lookup_list() - return lookup list of words in the dictionary
751 *
752 * Returns a pointer to a lookup list of the words in the dictionary.
753 * Excludes any idioms that contain the word; use
754 * dictionary_lookup_list() to obtain the complete list.
755 *
756 * This list is made up of Dict_nodes, linked by their right pointers.
757 * The node, file and string fields are copied from the dictionary.
758 *
759 * The returned list must be freed with file_free_lookup().
760 */
abridged_lookup_list(const Dictionary dict,const char * s)761 static Dict_node * abridged_lookup_list(const Dictionary dict, const char *s)
762 {
763 Dict_node *llist;
764 llist = rdictionary_lookup(NULL, dict->root, s, false, dict_order_bare);
765 llist = prune_lookup_list(llist, s);
766 return llist;
767 }
768
769 /**
770 * strict_lookup_list() - return exact match in the dictionary
771 *
772 * Returns a pointer to a lookup list of the words in the dictionary.
773 * Excludes any idioms that contain the word.
774 *
775 * This list is made up of Dict_nodes, linked by their right pointers.
776 * The node, file and string fields are copied from the dictionary.
777 *
778 * The list normally has 0 or 1 elements, unless the given word
779 * appears more than once in the dictionary.
780 *
781 * The returned list must be freed with file_free_lookup().
782 */
strict_lookup_list(const Dictionary dict,const char * s)783 static Dict_node * strict_lookup_list(const Dictionary dict, const char *s)
784 {
785 Dict_node *llist;
786 llist = rdictionary_lookup(NULL, dict->root, s, false, dict_order_strict);
787 llist = prune_lookup_list(llist, s);
788 return llist;
789 }
790
791 /* ======================================================================== */
792 /**
793 * Allocate a new Exp node.
794 */
Exp_create(Pool_desc * mp)795 Exp *Exp_create(Pool_desc *mp)
796 {
797 Exp *e = pool_alloc(mp);
798 e->tag_type = Exptag_none;
799 return e;
800 }
801
802 /**
803 * Duplicate the given Exp node.
804 * This is needed in case it participates more than once in a
805 * single expression.
806 */
Exp_create_dup(Pool_desc * mp,Exp * old_e)807 Exp *Exp_create_dup(Pool_desc *mp, Exp *old_e)
808 {
809 Exp *new_e = Exp_create(mp);
810
811 *new_e = *old_e;
812
813 return new_e;
814 }
815
816 /**
817 * This creates a node with zero children. Initializes
818 * the cost to zero.
819 */
make_zeroary_node(Pool_desc * mp)820 static Exp * make_zeroary_node(Pool_desc *mp)
821 {
822 Exp * n = Exp_create(mp);
823 n->type = AND_type; /* these must be AND types */
824 n->cost = 0.0;
825 n->operand_first = NULL;
826 n->operand_next = NULL;
827 return n;
828 }
829
830 /**
831 * This creates a node with one child (namely e). Initializes
832 * the cost to zero.
833 */
make_unary_node(Pool_desc * mp,Exp * e)834 Exp *make_unary_node(Pool_desc *mp, Exp * e)
835 {
836 Exp * n;
837 n = Exp_create(mp);
838 n->type = AND_type; /* these must be AND types */
839 n->operand_next = NULL;
840 n->cost = 0.0;
841 n->operand_first = e;
842 return n;
843 }
844
845 /**
846 * Create an AND_type expression. The expressions nl, nr will be
847 * AND-ed together.
848 */
make_and_node(Pool_desc * mp,Exp * nl,Exp * nr)849 static Exp * make_and_node(Pool_desc *mp, Exp* nl, Exp* nr)
850 {
851 Exp* n;
852
853 n = Exp_create(mp);
854 n->type = AND_type;
855 n->operand_next = NULL;
856 n->cost = 0.0;
857
858 n->operand_first = nl;
859 nl->operand_next = nr;
860 nr->operand_next = NULL;
861
862 return n;
863 }
864
make_op_Exp(Pool_desc * mp,Exp_type t)865 static Exp *make_op_Exp(Pool_desc *mp, Exp_type t)
866 {
867 Exp * n = Exp_create(mp);
868 n->type = t;
869 n->operand_next = NULL;
870 n->cost = 0.0;
871
872 /* The caller is supposed to assign n->operand. */
873 return n;
874 }
875
876 /**
877 * Create an OR_type expression. The expressions nl, nr will be
878 * OR-ed together.
879 */
make_or_node(Pool_desc * mp,Exp * nl,Exp * nr)880 static Exp * make_or_node(Pool_desc *mp, Exp* nl, Exp* nr)
881 {
882 Exp* n;
883
884 n = Exp_create(mp);
885 n->type = OR_type;
886 n->operand_next = NULL;
887 n->cost = 0.0;
888
889 n->operand_first = nl;
890 nl->operand_next = nr;
891 nr->operand_next = NULL;
892
893 return n;
894 }
895
896 /**
897 * This creates an OR node with two children, one the given node,
898 * and the other as zeroary node. This has the effect of creating
899 * what used to be called an optional node.
900 */
make_optional_node(Pool_desc * mp,Exp * e)901 static Exp *make_optional_node(Pool_desc *mp, Exp *e)
902 {
903 return make_or_node(mp, make_zeroary_node(mp), e);
904 }
905
906 /**
907 * make_dir_connector() -- make a single node for a connector
908 * that is a + or a - connector.
909 *
910 * Assumes the current token is the connector.
911 */
make_dir_connector(Dictionary dict,int i)912 static Exp * make_dir_connector(Dictionary dict, int i)
913 {
914 Exp* n = Exp_create(dict->Exp_pool);
915 char *constring;
916
917 n->dir = dict->token[i];
918 dict->token[i] = '\0'; /* get rid of the + or - */
919 if (dict->token[0] == '@')
920 {
921 constring = dict->token+1;
922 n->multi = true;
923 }
924 else
925 {
926 constring = dict->token;
927 n->multi = false;
928 }
929
930 n->condesc = condesc_add(&dict->contable,
931 string_set_add(constring, dict->string_set));
932 if (NULL == n->condesc) return NULL; /* Table ovf */
933 n->type = CONNECTOR_type;
934 n->operand_next = NULL; /* unused, but accessed by copy_Exp() and some more */
935 n->cost = 0.0;
936 return n;
937 }
938
939 /* ======================================================================== */
940 /**
941 * Add an optional macro/word tag, for expression debugging.
942 * Enabled by !test="macro-tag". This tag is used only in expression printing.
943 */
exptag_macro_add(Dictionary dict,const char * tag)944 static unsigned int exptag_macro_add(Dictionary dict, const char *tag)
945 {
946 expression_tag *mt = dict->macro_tag;
947 if (mt == NULL) return 0;
948
949 if (mt->num == mt->size)
950 {
951 if (mt->num == 0)
952 mt->size = 128;
953 else
954 mt->size *= 2;
955
956 mt->name = realloc(mt->name, mt->size * sizeof(*mt->name));
957 }
958 mt->name[mt->num] = tag;
959
960 return mt->num++;
961 }
962
963 /**
964 * make_connector() -- make a node for a connector or dictionary word.
965 *
966 * Assumes the current token is a connector or dictionary word.
967 */
make_connector(Dictionary dict)968 static Exp * make_connector(Dictionary dict)
969 {
970 Exp * n;
971 Dict_node *dn, *dn_head;
972 int i;
973
974 i = strlen(dict->token) - 1; /* this must be +, - or $ if a connector */
975 if ((dict->token[i] != '+') &&
976 (dict->token[i] != '-') &&
977 (dict->token[i] != ANY_DIR))
978 {
979 /* If we are here, token is a word */
980 patch_subscript(dict->token);
981 dn_head = abridged_lookup_list(dict, dict->token);
982 dn = dn_head;
983 while ((dn != NULL) && (strcmp(dn->string, dict->token) != 0))
984 {
985 dn = dn->right;
986 }
987 if (dn == NULL)
988 {
989 file_free_lookup(dn_head);
990 dict_error(dict, "Perhaps missing + or - in a connector.\n"
991 "Or perhaps you forgot the subscript on a word.\n"
992 "Or perhaps a word is used before it is defined.");
993 return NULL;
994 }
995
996 /* Wrap it in a unary node as a placeholder for a macro tag and cost. */
997 n = make_unary_node(dict->Exp_pool, dn->exp);
998 n->tag_id = exptag_macro_add(dict, dn->string);
999 if (n->tag_id != 0) n->tag_type = Exptag_macro;
1000
1001 file_free_lookup(dn_head);
1002 }
1003 else
1004 {
1005 /* If we are here, token is a connector */
1006 if (!check_connector(dict, dict->token))
1007 {
1008 return NULL;
1009 }
1010 if ((dict->token[i] == '+') || (dict->token[i] == '-'))
1011 {
1012 /* A simple, unidirectional connector. Just make that. */
1013 n = make_dir_connector(dict, i);
1014 if (NULL == n) return NULL;
1015 }
1016 else if (dict->token[i] == ANY_DIR)
1017 {
1018 Exp *plu, *min;
1019 /* If we are here, then it's a bi-directional connector.
1020 * Make both a + and a - version, and or them together. */
1021 dict->token[i] = '+';
1022 plu = make_dir_connector(dict, i);
1023 if (NULL == plu) return NULL;
1024 dict->token[i] = '-';
1025 min = make_dir_connector(dict, i);
1026 if (NULL == min) return NULL;
1027
1028 n = make_or_node(dict->Exp_pool, plu, min);
1029 }
1030 else
1031 {
1032 dict_error(dict, "Unknown connector direction type.");
1033 return NULL;
1034 }
1035 }
1036
1037 if (!link_advance(dict))
1038 {
1039 free(n);
1040 return NULL;
1041 }
1042 return n;
1043 }
1044
1045 /* ======================================================================== */
1046 /* Empty-word handling. */
1047
1048 /** Insert ZZZ+ connectors.
1049 * This function was mainly used to support using empty-words, a concept
1050 * that has been eliminated. However, it is still used to support linking of
1051 * quotes that don't get the QUc/QUd links.
1052 */
add_empty_word(Sentence sent,X_node * x)1053 void add_empty_word(Sentence sent, X_node *x)
1054 {
1055 Exp *zn, *an;
1056 const char *ZZZ = string_set_lookup(EMPTY_CONNECTOR, sent->dict->string_set);
1057 /* This function is called only if ZZZ is in the dictionary. */
1058
1059 /* The left-wall already has ZZZ-. The right-wall will not arrive here. */
1060 if (MT_WALL == x->word->morpheme_type) return;
1061
1062 /* Replace plain-word-exp by {ZZZ+} & (plain-word-exp) in each X_node. */
1063 for(; NULL != x; x = x->next)
1064 {
1065 /* Ignore stems for now, decreases a little the overhead for
1066 * stem-suffix languages. */
1067 if (is_stem(x->string)) continue; /* Avoid an unneeded overhead. */
1068 //lgdebug(+0, "Processing '%s'\n", x->string);
1069
1070 /* zn points at {ZZZ+} */
1071 zn = Exp_create(sent->Exp_pool);
1072 zn->dir = '+';
1073 zn->condesc = condesc_add(&sent->dict->contable, ZZZ);
1074 zn->multi = false;
1075 zn->type = CONNECTOR_type;
1076 zn->operand_next = NULL; /* unused, but to be on the safe side */
1077 zn->cost = 0.0;
1078 zn = make_optional_node(sent->Exp_pool, zn);
1079
1080 /* an will be {ZZZ+} & (plain-word-exp) */
1081 an = Exp_create(sent->Exp_pool);
1082 an->type = AND_type;
1083 an->operand_next = NULL;
1084 an->cost = 0.0;
1085 an->operand_first = zn;
1086 zn->operand_next = x->exp;
1087
1088 x->exp = an;
1089 }
1090 }
1091
1092 /* ======================================================================== */
1093
1094 /**
1095 * Return true if the string is a (floating point) number.
1096 * Float points can be proceeded by a single plus or minus sign.
1097 */
is_number(const char * str)1098 static bool is_number(const char * str)
1099 {
1100 if (str[0] == '\0') return false; /* End of file. */
1101 if ('+' == str[0] || '-' == str[0]) str++;
1102 size_t numlen = strspn(str, "0123456789.");
1103
1104 return str[numlen] == '\0';
1105 }
1106
1107 /* ======================================================================== */
1108
1109 /**
1110 * Build (and return the root of) the tree for the expression beginning
1111 * with the current token. At the end, the token is the first one not
1112 * part of this expression.
1113 */
make_expression(Dictionary dict)1114 static Exp *make_expression(Dictionary dict)
1115 {
1116 Exp *nl = NULL;
1117 Exp *e_head = NULL;
1118 Exp *e_tail = NULL; /* last part of the expression */
1119 bool is_sym_and = false;
1120
1121 while (true)
1122 {
1123 if (is_equal(dict, '('))
1124 {
1125 if (!link_advance(dict)) {
1126 return NULL;
1127 }
1128 nl = make_expression(dict);
1129 if (nl == NULL) {
1130 return NULL;
1131 }
1132 if (!is_equal(dict, ')')) {
1133 dict_error(dict, "Expecting a \")\".");
1134 return NULL;
1135 }
1136 if (!link_advance(dict)) {
1137 return NULL;
1138 }
1139 }
1140 else if (is_equal(dict, '{'))
1141 {
1142 if (!link_advance(dict)) {
1143 return NULL;
1144 }
1145 nl = make_expression(dict);
1146 if (nl == NULL) {
1147 return NULL;
1148 }
1149 if (!is_equal(dict, '}')) {
1150 dict_error(dict, "Expecting a \"}\".");
1151 return NULL;
1152 }
1153 if (!link_advance(dict)) {
1154 return NULL;
1155 }
1156 nl = make_optional_node(dict->Exp_pool, nl);
1157 }
1158 else if (is_equal(dict, '['))
1159 {
1160 if (!link_advance(dict)) {
1161 return NULL;
1162 }
1163 nl = make_expression(dict);
1164 if (nl == NULL) {
1165 return NULL;
1166 }
1167 if (!is_equal(dict, ']')) {
1168 dict_error(dict, "Expecting a \"]\".");
1169 return NULL;
1170 }
1171 if (!link_advance(dict)) {
1172 return NULL;
1173 }
1174
1175 /* A square bracket can have a number or a name after it.
1176 * If a number is present, then that it is interpreted
1177 * as the cost of the bracket. If a name is present, it
1178 * is used as an expression tag. Else, the cost of a
1179 * square bracket is 1.0.
1180 */
1181 if (is_number(dict->token))
1182 {
1183 float cost;
1184
1185 if (strtodC(dict->token, &cost))
1186 {
1187 nl->cost += cost;
1188 }
1189 else
1190 {
1191 warning(dict, "Invalid cost (using 1.0)\n");
1192 nl->cost += 1.0;
1193 }
1194 if (!link_advance(dict)) {
1195 return NULL;
1196 }
1197 }
1198 else if ((strcmp(dict->token, "or") != 0) &&
1199 (strcmp(dict->token, "and") != 0) &&
1200 isalpha(dict->token[0]))
1201 {
1202 const char *bad = valid_dialect_name(dict->token);
1203 if (bad != NULL)
1204 {
1205 char badchar[] = { *bad, '\0' };
1206 dict_error2(dict, "Invalid character in dialect tag name:",
1207 badchar);
1208 return NULL;
1209 }
1210 if (nl->tag_type != Exptag_none)
1211 {
1212 nl = make_unary_node(dict->Exp_pool, nl);
1213 }
1214 nl->tag_id = exptag_dialect_add(dict, dict->token);
1215 nl->tag_type = Exptag_dialect;
1216 if (!link_advance(dict)) {
1217 return NULL;
1218 }
1219 }
1220 else
1221 {
1222 nl->cost += 1.0;
1223 }
1224 }
1225 else if (!dict->is_special)
1226 {
1227 nl = make_connector(dict);
1228 if (nl == NULL) {
1229 return NULL;
1230 }
1231 }
1232 else if (is_equal(dict, ')') || is_equal(dict, ']'))
1233 {
1234 /* allows "()" or "[]" */
1235 nl = make_zeroary_node(dict->Exp_pool);
1236 }
1237 else
1238 {
1239 dict_error(dict, "Connector, \"(\", \"[\", or \"{\" expected.");
1240 return NULL;
1241 }
1242
1243 if (is_sym_and)
1244 {
1245 /* Part 2/2 of SYM_AND processing. */
1246
1247 /* Expand A ^ B into the expr ((A & B) or (B & A)). */
1248 Exp *na = make_and_node(dict->Exp_pool,
1249 Exp_create_dup(dict->Exp_pool, e_tail),
1250 Exp_create_dup(dict->Exp_pool, nl));
1251 Exp *nb = make_and_node(dict->Exp_pool,
1252 Exp_create_dup(dict->Exp_pool, nl),
1253 Exp_create_dup(dict->Exp_pool, e_tail));
1254 Exp *or = make_or_node(dict->Exp_pool, na, nb);
1255
1256 *e_tail = *or; /* SYM_AND result */
1257 is_sym_and = false;
1258 }
1259 else if (e_tail != NULL)
1260 {
1261 /* This is Non-commuting AND or Commuting OR.
1262 * Append the just read expression (nl) to its operand chain. */
1263 e_tail->operand_next = nl;
1264 e_tail = nl;
1265 }
1266
1267 /* Extract the operator. */
1268
1269 Exp_type op;
1270
1271 /* Non-commuting AND */
1272 if (is_equal(dict, '&') || (strcmp(dict->token, "and") == 0))
1273 {
1274 op = AND_type;
1275 }
1276 /* Commuting OR */
1277 else if (is_equal(dict, '|') || (strcmp(dict->token, "or") == 0))
1278 {
1279 op = OR_type;
1280 }
1281 /* Commuting AND */
1282 else if (is_equal(dict, SYM_AND) || (strcmp(dict->token, "sym") == 0))
1283 {
1284 /* Part 1/2 of SYM_AND processing */
1285 op = AND_type; /* allow mixing with ordinary ands at the same level */
1286 is_sym_and = true; /* processing to be completed after next argument */
1287 }
1288 else
1289 {
1290 if (e_head != NULL) return e_head;
1291 return nl;
1292 }
1293
1294 /* If this is the first operand, use nl for it. Else just validate
1295 * that the current operator is consistent with that of the current
1296 * expression level. */
1297 if (e_head == NULL)
1298 {
1299 e_head = make_op_Exp(dict->Exp_pool, op);
1300 e_head->operand_first = nl;
1301 }
1302 else
1303 {
1304 if (e_head->type != op)
1305 {
1306 dict_error(dict, "\"and\" and \"or\" at the same level in an expression");
1307 return NULL;
1308 }
1309 }
1310
1311 if (!link_advance(dict)) {
1312 return NULL;
1313 }
1314
1315 if (e_tail == NULL)
1316 e_tail = e_head->operand_first;
1317 }
1318 /* unreachable */
1319 }
1320
1321 /* ======================================================================== */
1322 /* Implementation of the DSW algo for rebalancing a binary tree.
1323 * The point is -- after building the dictionary tree, we rebalance it
1324 * once at the end. This is a **LOT LOT** quicker than maintaining an
1325 * AVL tree along the way (less than quarter-of-a-second vs. about
1326 * a minute or more!) FWIW, the DSW tree is even more balanced than
1327 * the AVL tree is (it's less deep, more full).
1328 *
1329 * The DSW algo, with C++ code, is described in
1330 *
1331 * Timothy J. Rolfe, "One-Time Binary Search Tree Balancing:
1332 * The Day/Stout/Warren (DSW) Algorithm", inroads, Vol. 34, No. 4
1333 * (December 2002), pp. 85-88
1334 * http://penguin.ewu.edu/~trolfe/DSWpaper/
1335 */
1336
rotate_right(Dict_node * root)1337 static Dict_node *rotate_right(Dict_node *root)
1338 {
1339 Dict_node *pivot = root->left;
1340 root->left = pivot->right;
1341 pivot->right = root;
1342 return pivot;
1343 }
1344
dsw_tree_to_vine(Dict_node * root)1345 static Dict_node * dsw_tree_to_vine (Dict_node *root)
1346 {
1347 Dict_node *vine_tail, *vine_head, *rest;
1348 Dict_node vh;
1349
1350 vine_head = &vh;
1351 vine_head->left = NULL;
1352 vine_head->right = root;
1353 vine_tail = vine_head;
1354 rest = root;
1355
1356 while (NULL != rest)
1357 {
1358 /* If no left, we are done, do the right */
1359 if (NULL == rest->left)
1360 {
1361 vine_tail = rest;
1362 rest = rest->right;
1363 }
1364 /* eliminate the left subtree */
1365 else
1366 {
1367 rest = rotate_right(rest);
1368 vine_tail->right = rest;
1369 }
1370 }
1371
1372 return vh.right;
1373 }
1374
1375 NO_SAN_DICT
dsw_compression(Dict_node * root,unsigned int count)1376 static void dsw_compression (Dict_node *root, unsigned int count)
1377 {
1378 unsigned int j;
1379 for (j = 0; j < count; j++)
1380 {
1381 /* Compound left rotation */
1382 Dict_node * pivot = root->right;
1383 root->right = pivot->right;
1384 root = pivot->right;
1385 pivot->right = root->left;
1386 root->left = pivot;
1387 }
1388 }
1389
1390 /* Return size of the full portion of the tree
1391 * Gets the next pow(2,k)-1
1392 */
full_tree_size(unsigned int size)1393 static inline unsigned int full_tree_size (unsigned int size)
1394 {
1395 unsigned int pk = 1;
1396 while (pk < size) pk = 2*pk + 1;
1397 return pk/2;
1398 }
1399
dsw_vine_to_tree(Dict_node * root,int size)1400 static Dict_node * dsw_vine_to_tree (Dict_node *root, int size)
1401 {
1402 Dict_node vine_head;
1403 unsigned int full_count = full_tree_size(size +1);
1404
1405 vine_head.left = NULL;
1406 vine_head.right = root;
1407
1408 dsw_compression(&vine_head, size - full_count);
1409 for (size = full_count; size > 1; size /= 2)
1410 {
1411 dsw_compression(&vine_head, size / 2);
1412 }
1413 return vine_head.right;
1414 }
1415
1416 /* ======================================================================== */
1417 /**
1418 * Insert the new node into the dictionary below node n.
1419 * "newnode" left and right fields are NULL, and its string is already
1420 * there. If the string is already found in the dictionary, give an error
1421 * message and effectively ignore it.
1422 *
1423 * The resulting tree is highly unbalanced. It needs to be rebalanced
1424 * before being used. The DSW algo below is ideal for that.
1425 */
1426 NO_SAN_DICT
insert_dict(Dictionary dict,Dict_node * n,Dict_node * newnode)1427 Dict_node *insert_dict(Dictionary dict, Dict_node *n, Dict_node *newnode)
1428 {
1429 if (NULL == n) return newnode;
1430
1431 static Exp null_exp =
1432 {
1433 .type = AND_type,
1434 .operand_first = NULL,
1435 .operand_next = NULL,
1436 };
1437 int comp = dict_order_strict(newnode->string, n);
1438
1439 if (0 == comp &&
1440 /* Suppress reporting duplicate idioms until they are fixed. */
1441 (!contains_underbar(newnode->string) || test_enabled("dup-idioms")))
1442 {
1443 char t[80+MAX_TOKEN_LENGTH];
1444 snprintf(t, sizeof(t),
1445 "Ignoring word \"%s\", which has been multiply defined:",
1446 newnode->string);
1447 dict_error(dict, t);
1448 /* Too late to skip insertion - insert it with a null expression. */
1449 newnode->exp = &null_exp;
1450 comp = -1;
1451 }
1452
1453 if (comp < 0)
1454 {
1455 if (NULL == n->left)
1456 {
1457 n->left = newnode;
1458 return n;
1459 }
1460 n->left = insert_dict(dict, n->left, newnode);
1461 }
1462 else
1463 {
1464 if (NULL == n->right)
1465 {
1466 n->right = newnode;
1467 return n;
1468 }
1469 n->right = insert_dict(dict, n->right, newnode);
1470 }
1471
1472 return n;
1473 /* return rebalance(n); Uncomment to get an AVL tree */
1474 }
1475
1476 /**
1477 * Find if a warning symbol exists in the currently suppress list.
1478 * The warning symbols are constructed in a way that disallow overlap
1479 * matching.
1480 */
is_warning_suppressed(Dictionary dict,const char * warning_symbol)1481 static bool is_warning_suppressed(Dictionary dict, const char *warning_symbol)
1482 {
1483 if (NULL == dict->suppress_warning) return false;
1484 return (NULL != strstr(dict->suppress_warning, warning_symbol));
1485 }
1486
1487 /**
1488 * Remember the length_limit definitions in a list according to their order.
1489 * The order is kept to allow later more specific definitions to override
1490 * already applied ones.
1491 */
add_condesc_length_limit(Dictionary dict,Dict_node * dn,int length_limit)1492 static void add_condesc_length_limit(Dictionary dict, Dict_node *dn,
1493 int length_limit)
1494 {
1495 length_limit_def_t *lld = malloc(sizeof(*lld));
1496 lld->next = NULL;
1497 lld->length_limit = length_limit;
1498 lld->defexp = dn->exp;
1499 lld->defword = dn->string;
1500 *dict->contable.length_limit_def_next = lld;
1501 dict->contable.length_limit_def_next = &lld->next;
1502 }
1503
insert_length_limit(Dictionary dict,Dict_node * dn)1504 static void insert_length_limit(Dictionary dict, Dict_node *dn)
1505 {
1506 int length_limit;
1507
1508 if (0 == strcmp(UNLIMITED_CONNECTORS_WORD, dn->string))
1509 {
1510 length_limit = UNLIMITED_LEN;
1511 }
1512 else
1513 if (0 == strncmp(LIMITED_CONNECTORS_WORD, dn->string,
1514 sizeof(LIMITED_CONNECTORS_WORD)-1))
1515 {
1516 char *endp;
1517 length_limit =
1518 (int)strtol(dn->string + sizeof(LIMITED_CONNECTORS_WORD)-1, &endp, 10);
1519 if ((length_limit < 0) || (length_limit > MAX_SENTENCE) ||
1520 (('\0' != *endp) && (SUBSCRIPT_MARK != *endp)))
1521 {
1522 prt_error("Warning: Word \"%s\" found near line %d of \"%s\".\n"
1523 "\tThis word should end with a number (1-%d).\n"
1524 "\tThis word will be ignored.\n",
1525 dn->string, dict->line_number, dict->name, MAX_SENTENCE);
1526 return;
1527 }
1528 }
1529 else return;
1530
1531 /* We cannot set the connectors length_limit yet because the
1532 * needed data structure is not defined yet. For now, just
1533 * remember the definitions in their order. */
1534 add_condesc_length_limit(dict, dn, length_limit);
1535 }
1536
1537 /**
1538 * insert_list() -
1539 * p points to a list of dict_nodes connected by their left pointers.
1540 * l is the length of this list (the last ptr may not be NULL).
1541 * It inserts the list into the dictionary.
1542 * It does the middle one first, then the left half, then the right.
1543 *
1544 * Note: I think this "insert middle, then left, then right" algo has
1545 * its origins as a lame attempt to hack around the fact that the
1546 * resulting binary tree is rather badly unbalanced. This has been
1547 * fixed by using the DSW rebalancing algo. Now, that would seem
1548 * to render this crazy bisected-insertion algo obsolete, but ..
1549 * oddly enough, it seems to make the DSW balancing go really fast!
1550 * Faster than a simple insertion. Go figure. I think this has
1551 * something to do with the fact that the dictionaries are in
1552 * alphabetical order! This subdivision helps randomize a bit.
1553 */
insert_list(Dictionary dict,Dict_node * p,int l)1554 void insert_list(Dictionary dict, Dict_node * p, int l)
1555 {
1556 Dict_node * dn, *dn_head, *dn_second_half;
1557 int k, i; /* length of first half */
1558
1559 if (l == 0) return;
1560
1561 k = (l-1)/2;
1562 dn = p;
1563 for (i = 0; i < k; i++)
1564 {
1565 dn = dn->left;
1566 }
1567
1568 /* dn now points to the middle element */
1569 dn_second_half = dn->left;
1570 dn->left = dn->right = NULL;
1571
1572 if (is_idiom_word(dn->string))
1573 {
1574 prt_error("Warning: Word \"%s\" found near line %d of \"%s\".\n"
1575 "\tWords ending \".Ix\" (x a number) are reserved for idioms.\n"
1576 "\tThis word will be ignored.\n",
1577 dn->string, dict->line_number, dict->name);
1578 free(dn);
1579 }
1580 else
1581 {
1582 if (contains_underbar(dn->string))
1583 {
1584 insert_idiom(dict, dn);
1585 }
1586
1587 dict->root = insert_dict(dict, dict->root, dn);
1588 insert_length_limit(dict, dn);
1589 dict->num_entries++;
1590
1591 if ((verbosity_level(D_DICT+0) && !is_warning_suppressed(dict, DUP_BASE)) ||
1592 verbosity_level(D_SPEC+3))
1593 {
1594 /* Warn if there are words with a subscript that match a bare word. */
1595 const char *sm = strchr(dn->string, SUBSCRIPT_MARK);
1596 char *bareword;
1597
1598 if (NULL != sm)
1599 {
1600 bareword = strdupa(dn->string);
1601 bareword[sm - dn->string] = '\0';
1602 }
1603 else
1604 {
1605 bareword = (char *)dn->string;
1606 }
1607
1608 if ((dn_head = strict_lookup_list(dict, bareword)) != NULL)
1609 {
1610 bool match_found = false;
1611 for (Dict_node *dnx = dn_head; dnx != NULL; dnx = dnx->right) {
1612 if (0 != strcmp(dnx->string, dn->string))
1613 {
1614 if (!match_found)
1615 {
1616 prt_error("Warning: The word \"%s\" found near line "
1617 "%d of \"%s\"\n\t matches the following words:",
1618 dn->string, dict->line_number, dict->name);
1619 match_found = true;
1620 }
1621 prt_error("\t%s", dnx->string);
1622 }
1623 }
1624 if (match_found) prt_error("\n");
1625 file_free_lookup(dn_head);
1626 }
1627 }
1628 }
1629
1630 insert_list(dict, p, k);
1631 insert_list(dict, dn_second_half, l-k-1);
1632 }
1633
1634 /**
1635 * read_entry() -- read one dictionary entry
1636 * Starting with the current token, parse one dictionary entry.
1637 * A single dictionary entry must have one and only one colon in it,
1638 * and is terminated by a semi-colon.
1639 * Add these words to the dictionary.
1640 */
read_entry(Dictionary dict)1641 static bool read_entry(Dictionary dict)
1642 {
1643 Exp *n;
1644 int i;
1645
1646 Dict_node *dnx, *dn = NULL;
1647
1648 while (!is_equal(dict, ':'))
1649 {
1650 if (dict->is_special)
1651 {
1652 dict_error(dict, "I expected a word but didn\'t get it.");
1653 goto syntax_error;
1654 }
1655
1656 /* If it's a word-file name */
1657 /* However, be careful to reject "/.v" which is the division symbol
1658 * used in equations (.v means verb-like) */
1659 if ((dict->token[0] == '/') && (dict->token[1] != '.'))
1660 {
1661 dn = read_word_file(dict, dn, dict->token);
1662 if (dn == NULL)
1663 {
1664 prt_error("Error: Cannot open word file \"%s\".\n", dict->token);
1665 return false;
1666 }
1667 }
1668 else if (0 == strcmp(dict->token, "#include"))
1669 {
1670 bool rc;
1671 char* instr;
1672 char* dict_name;
1673 const char * save_name;
1674 bool save_is_special;
1675 const char * save_input;
1676 const char * save_pin;
1677 int save_already_got_it;
1678 int save_line_number;
1679 size_t skip_slash;
1680
1681 if (!link_advance(dict)) goto syntax_error;
1682
1683 skip_slash = ('/' == dict->token[0]) ? 1 : 0;
1684 dict_name = strdupa(dict->token);
1685 save_name = dict->name;
1686 save_is_special = dict->is_special;
1687 save_input = dict->input;
1688 save_pin = dict->pin;
1689 save_already_got_it = dict->already_got_it;
1690 save_line_number = dict->line_number;
1691
1692 /* OK, token contains the filename to read ... */
1693 instr = get_file_contents(dict_name + skip_slash);
1694 if (NULL == instr)
1695 {
1696 prt_error("Error: While parsing dictionary \"%s\":\n"
1697 "\t Line %d: Could not open subdictionary \"%s\"\n",
1698 dict->name, dict->line_number-1, dict_name);
1699 goto syntax_error;
1700 }
1701 dict->input = instr;
1702 dict->pin = dict->input;
1703
1704 /* The line number and dict name are used for error reporting */
1705 dict->line_number = 1;
1706 dict->name = dict_name;
1707
1708 /* Now read the thing in. */
1709 rc = read_dictionary(dict);
1710
1711 dict->name = save_name;
1712 dict->is_special = save_is_special;
1713 dict->input = save_input;
1714 dict->pin = save_pin;
1715 dict->already_got_it = save_already_got_it;
1716 dict->line_number = save_line_number;
1717
1718 free(instr);
1719 if (!rc) goto syntax_error;
1720
1721 /* when we return, point to the next entry */
1722 if (!link_advance(dict)) goto syntax_error;
1723
1724 /* If a semicolon follows the include, that's OK... ignore it. */
1725 if (';' == dict->token[0])
1726 {
1727 if (!link_advance(dict)) goto syntax_error;
1728 }
1729
1730 return true;
1731 }
1732 else
1733 {
1734 Dict_node * dn_new = dict_node_new();
1735 dn_new->left = dn;
1736 dn_new->right = NULL;
1737 dn_new->exp = NULL;
1738 dn = dn_new;
1739 dn->file = NULL;
1740
1741 /* Note: The following patches a dot in regexes appearing in
1742 * the affix file... It is corrected later. */
1743 patch_subscript(dict->token);
1744 dn->string = string_set_add(dict->token, dict->string_set);
1745 }
1746
1747 /* Advance to next entry, unless error */
1748 if (!link_advance(dict)) goto syntax_error;
1749 }
1750
1751 /* pass the : */
1752 if (!link_advance(dict))
1753 {
1754 goto syntax_error;
1755 }
1756
1757 n = make_expression(dict);
1758 if (n == NULL)
1759 {
1760 goto syntax_error;
1761 }
1762
1763 if (!is_equal(dict, ';'))
1764 {
1765 dict_error(dict, "Expecting \";\" at the end of an entry.");
1766 goto syntax_error;
1767 }
1768
1769 if (dn == NULL)
1770 {
1771 dict_error(dict, "Expecting a token before \":\".");
1772 goto syntax_error;
1773 }
1774
1775 /* At this point, dn points to a list of Dict_nodes connected by
1776 * their left pointers. These are to be inserted into the dictionary. */
1777 i = 0;
1778 for (dnx = dn; dnx != NULL; dnx = dnx->left)
1779 {
1780 dnx->exp = n;
1781 i++;
1782 }
1783 dict->insert_entry(dict, dn, i);
1784
1785 if (dict->suppress_warning)
1786 {
1787 free((void *)dict->suppress_warning);
1788 dict->suppress_warning = NULL;
1789 }
1790
1791 /* pass the ; */
1792 if (!link_advance(dict))
1793 {
1794 /* Avoid freeing dn, since it is already inserted into the dict. */
1795 return false;
1796 }
1797
1798 return true;
1799
1800 syntax_error:
1801 free_insert_list(dn);
1802 return false;
1803 }
1804
1805
rprint_dictionary_data(Dict_node * n)1806 static void rprint_dictionary_data(Dict_node * n)
1807 {
1808 if (n == NULL) return;
1809 rprint_dictionary_data(n->left);
1810 printf("%s: %s\n", n->string, lg_exp_stringify(n->exp));
1811 rprint_dictionary_data(n->right);
1812 }
1813
1814 /**
1815 * Dump the entire contents of the dictionary
1816 * XXX This is not currently called by anything, but is a "good thing
1817 * to keep around".
1818 */
print_dictionary_data(Dictionary dict)1819 void print_dictionary_data(Dictionary dict)
1820 {
1821 rprint_dictionary_data(dict->root);
1822 }
1823
read_dictionary(Dictionary dict)1824 bool read_dictionary(Dictionary dict)
1825 {
1826 if (!link_advance(dict))
1827 {
1828 return false;
1829 }
1830 /* The last character of a dictionary is NUL.
1831 * Note: At the end of reading a dictionary, dict->pin points to one
1832 * character after the input. Referring its [-1] element is safe even if
1833 * the dict file size is 0. */
1834 while ('\0' != dict->pin[-1])
1835 {
1836 if (!read_entry(dict))
1837 {
1838 return false;
1839 }
1840 }
1841 dict->root = dsw_tree_to_vine(dict->root);
1842 dict->root = dsw_vine_to_tree(dict->root, dict->num_entries);
1843 return true;
1844 }
1845
1846 /* ======================================================================= */
1847