1 /*************************************************************************/
2 /* Copyright 2013, 2014 Linas Vepstas */
3 /* Copyright 2014 Amir Plivatsky */
4 /* All rights reserved */
5 /* */
6 /* Use of the link grammar parsing system is subject to the terms of the */
7 /* license set forth in the LICENSE file included with this software. */
8 /* This license allows free redistribution and use in source and binary */
9 /* forms, with or without modification, subject to certain conditions. */
10 /* */
11 /*************************************************************************/
12
13 #include "api-structures.h" // Sentence_s
14 #include "api-types.h"
15 #include "dict-common/regex-morph.h" // match_regex
16 #include "connectors.h" // MAX_SENTENCE
17 #include "disjunct-utils.h" // Disjunct_struct
18 #include "lg_assert.h"
19 #include "linkage.h"
20 #include "sane.h"
21 #include "tokenize/tok-structures.h" // Wordgraph_pathpos_s
22 #include "tokenize/word-structures.h" // Word_struct
23 #include "tokenize/wordgraph.h"
24 #include "utilities.h"
25
26 /**
27 * Construct word paths (one or more) through the Wordgraph.
28 *
29 * Add 'current_word" to the potential path.
30 * Add "p" to the path queue, which defines the start of the next potential
31 * paths to be checked.
32 *
33 * Each path is up to the current word (not including). It doesn't actually
34 * construct a full path if there are null words - they break it. The final path
35 * is constructed when the Wordgraph termination word is encountered.
36 *
37 * Note: The final path doesn't match the linkage word indexing if the linkage
38 * contains optional words which are null, until the are eliminated from the
39 * linkage (in remove_empty_words()). Further processing of the path is done
40 * there in case morphology splits are to be hidden or there are morphemes with
41 * null linkage.
42 */
43 #define D_WPA 7
wordgraph_path_append(Wordgraph_pathpos ** nwp,const Gword ** path,Gword * current_word,Gword * p)44 static void wordgraph_path_append(Wordgraph_pathpos **nwp, const Gword **path,
45 Gword *current_word, /* add to the path */
46 Gword *p) /* add to the path queue */
47 {
48 size_t n = wordgraph_pathpos_len(*nwp);
49
50 assert(NULL != p, "Tried to add a NULL word to the word queue");
51 if (current_word == p)
52 {
53 lgdebug(D_WPA, "Adding the same word '%s' again\n", p->subword);
54 //print_lwg_path((Gword **)path, "After adding the same word");
55 }
56
57 /* Check if the path queue already contains the word to be added to it. */
58 const Wordgraph_pathpos *wpt = NULL;
59
60 if (NULL != *nwp)
61 {
62 for (wpt = *nwp; NULL != wpt->word; wpt++)
63 {
64 if (p == wpt->word)
65 {
66 lgdebug(D_WPA, "Word %s (after %zu) exists (after %d)\n",
67 p->subword,
68 wpt->path[gwordlist_len(wpt->path)-1]->sent_wordidx,
69 path ? (int)path[gwordlist_len(path)-1]->sent_wordidx : -1);
70 /* If we are here, there are 2 or more paths leading to this word
71 * (p) that end with the same number of consecutive null words that
72 * consist an entire alternative. These null words represent
73 * different ways to split the subword upward in the hierarchy.
74 * For a nicer result we choose the shorter path. */
75 if ((NULL != path) &&
76 wpt->path[gwordlist_len(wpt->path)-1]->sent_wordidx <=
77 path[gwordlist_len(path)-1]->sent_wordidx)
78 {
79 lgdebug(D_WPA, "Shorter path already queued\n");
80 return; /* The shorter path is already in the queue. */
81 }
82 lgdebug(D_WPA, "Longer path is in the queue\n");
83 //print_lwg_path((Gword **)wpt->path, "Freeing");
84 free(wpt->path); /* To be replaced by a shorter path. */
85 break;
86 }
87 }
88 }
89
90 if ((NULL == wpt) || (p != wpt->word))
91 {
92 /* Not already in the path queue - add it. */
93 *nwp = wordgraph_pathpos_resize(*nwp, n+1);
94 }
95 else
96 {
97 lgdebug(D_WPA, "Path position to be replaced (len %zu): %d\n", n,
98 (int)(wpt - *nwp));
99 n = wpt - *nwp; /* Replace this path. */
100 }
101 (*nwp)[n].word = p;
102
103 if (NULL == path)
104 {
105 (*nwp)[n].path = NULL;
106 }
107 else
108 {
109 /* Duplicate the path from the current one. */
110 size_t path_arr_size = (gwordlist_len(path)+1)*sizeof(*path);
111
112 (*nwp)[n].path = malloc(path_arr_size);
113 memcpy((*nwp)[n].path, path, path_arr_size);
114 }
115
116 if (NULL == current_word) return;
117
118 /* If we queue the same word again, its path remains the same.
119 * Else append the current word to it. */
120 if (p != current_word)
121 {
122 /* FIXME (cast) but anyway gwordlist_append() doesn't modify Gword. */
123 gwordlist_append((Gword ***)&(*nwp)[n].path, current_word);
124 }
125 }
126
127 /**
128 * Free the Wordgraph paths and the Wordgraph_pathpos array.
129 * In case of a match, the final path is still needed so this function is
130 * then invoked with free_final_path=false.
131 */
wordgraph_path_free(Wordgraph_pathpos * wp,bool free_final_path)132 static void wordgraph_path_free(Wordgraph_pathpos *wp, bool free_final_path)
133 {
134 Wordgraph_pathpos *twp;
135
136 if (NULL == wp) return;
137 for (twp = wp; NULL != twp->word; twp++)
138 {
139 if (free_final_path || (MT_INFRASTRUCTURE != twp->word->morpheme_type))
140 free(twp->path);
141 }
142 free(wp);
143 }
144
145 #define NO_WORD (MAX_SENTENCE+1)
146
147 /**
148 * Return the number of islands in a linkage.
149 * First, each word appears in its own linked list.
150 * Then all the links in the linkage are traversed, and the lists pointed
151 * by each of them are combined.
152 * Finally, the words are traversed and the lists are followed and
153 * numbered. The WG path is used to skip optional words which are null.
154 */
num_islands(const Linkage lkg,const Gword ** wg_path)155 static size_t num_islands(const Linkage lkg, const Gword **wg_path)
156 {
157 struct word
158 {
159 int prev;
160 int next;
161 int inum;
162 };
163 struct word *word = alloca(lkg->sent->length * sizeof(struct word));
164
165 /* Initially, each word is in its own island. */
166 for (WordIdx w = 0; w < lkg->sent->length; w++)
167 {
168 word[w].prev = word[w].next = w;
169 }
170
171 /* Unify the potential islands pointed by each link
172 * (if they are already unified, they remain so.) */
173 for (LinkIdx li = 0; li < lkg->num_links; li++)
174 {
175 Link *l = &lkg->link_array[li];
176
177 WordIdx iw;
178 for (iw = word[l->lw].next; (iw != l->rw) && (iw != l->lw); iw = word[iw].next)
179 ;
180
181 if (iw != l->rw)
182 {
183 int nextl = word[l->lw].next;
184 int prevr = word[l->rw].prev;
185
186 word[l->lw].next = l->rw;
187 word[l->rw].prev = l->lw;
188
189 word[prevr].next = nextl;
190 word[nextl].prev = prevr;
191 }
192
193 if (verbosity_level(+8))
194 {
195 for (WordIdx w = 0; w < lkg->sent->length; w++)
196 {
197 err_msg(lg_Debug, "%d<-%zu->%d ", word[w].prev, w, word[w].next);
198 }
199 err_msg(lg_Debug, "\n");
200 }
201 }
202
203 /* Count islands. */
204 int inum = -1;
205 Disjunct **cdj = lkg->chosen_disjuncts;
206
207 for (WordIdx w = 0; w < lkg->sent->length; w++)
208 {
209 /* Skip null words which are optional words. */
210 if ((NULL == *wg_path) || ((*wg_path)->sent_wordidx != w))
211 {
212 assert(word[w].prev == word[w].next);
213 assert((NULL == cdj[w]) && lkg->sent->word[w].optional);
214
215 word[w].prev = NO_WORD;
216 word[w].inum = -1; /* not belonging to any island */
217 continue;
218 }
219
220 wg_path++;
221 if (NO_WORD == word[w].prev) continue;
222
223 inum++;
224 for (WordIdx iw = w; NO_WORD != word[iw].prev; iw = word[iw].next)
225 {
226 word[iw].prev = NO_WORD;
227 word[iw].inum = inum;
228 }
229 }
230
231 if (verbosity_level(8))
232 {
233 err_msg(lg_Debug, "Island count %d: ", inum);
234 for (WordIdx w = 0; w < lkg->sent->length; w++)
235 {
236 err_msg(lg_Debug, "%d ", word[w].inum);
237 }
238 err_msg(lg_Debug, "\n");
239 }
240
241 return inum;
242 }
243
244 /* ============================================================== */
245 /* A kind of morphism post-processing */
246
247 /* These letters create a string that should be matched by a
248 * SANEMORPHISM regex, given in the affix file. The empty word
249 * doesn't have a letter. E.g. for the Russian dictionary: "w|ts".
250 * It is converted here to: "^((w|ts)b)+$".
251 * It matches "wbtsbwbtsbwb" but not "wbtsbwsbtsb".
252 * FIXME? In this version of the function, 'b' is not yet supported,
253 * so "w|ts" is converted to "^(w|ts)+$" for now.
254 */
255 #define AFFIXTYPE_PREFIX 'p' /* prefix */
256 #define AFFIXTYPE_STEM 't' /* stem */
257 #define AFFIXTYPE_SUFFIX 's' /* suffix */
258 #define AFFIXTYPE_MIDDLE 'm' /* middle morpheme */
259 #define AFFIXTYPE_WORD 'w' /* regular word */
260 #ifdef WORD_BOUNDARIES
261 #define AFFIXTYPE_END 'b' /* end of input word */
262 #endif
263
264 /**
265 * This routine solves the problem of mis-linked alternatives,
266 * i.e a morpheme in one alternative that is linked to a morpheme in
267 * another alternative. This can happen due to the way in which word
268 * alternatives are implemented.
269 *
270 * It does so by checking that all the chosen disjuncts in a linkage
271 * (including null words) match, in the same order, a path in the
272 * Wordgraph.
273 *
274 * An important side effect of this check is that if the linkage is
275 * good, its Wordgraph path is found.
276 *
277 * Optionally (if SANEMORPHISM regex is defined in the affix file), it
278 * also validates that the morpheme-type sequence is permitted for the
279 * language. This is a sanity check of the program and the dictionary.
280 *
281 * Return true if the linkage is good, else return false.
282 */
283 #define D_SLM 8
sane_linkage_morphism(Sentence sent,Linkage lkg,Parse_Options opts)284 bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)
285 {
286 Wordgraph_pathpos *wp_new = NULL;
287 Wordgraph_pathpos *wp_old = NULL;
288 Wordgraph_pathpos *wpp;
289 Gword **next; /* next Wordgraph words of the current word */
290 size_t i;
291 unsigned int null_count_found = 0;
292
293 bool match_found = true; /* if all the words are null - it's still a match */
294 Gword **lwg_path;
295
296 Dictionary afdict = sent->dict->affix_table; /* for SANEMORPHISM */
297 char *const affix_types = alloca(sent->length*2 + 1); /* affix types */
298 affix_types[0] = '\0';
299
300 lkg->wg_path = NULL;
301
302 /* Populate the path word queue, initializing the path to NULL. */
303 for (next = sent->wordgraph->next; *next; next++)
304 {
305 wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next);
306 }
307 assert(NULL != wp_new, "Path word queue is empty");
308
309 for (i = 0; i < lkg->num_words; i++)
310 {
311 Disjunct *cdj; /* chosen disjunct */
312
313 lgdebug(D_SLM, "lkg=%p Word %zu: ", lkg, i);
314
315 if (NULL == wp_new)
316 {
317 lgdebug(D_SLM, "- No more words in the wordgraph\n");
318 match_found = false;
319 break;
320 }
321
322 if (wp_old != wp_new)
323 {
324 wordgraph_path_free(wp_old, true);
325 wp_old = wp_new;
326 }
327 wp_new = NULL;
328 //wordgraph_pathpos_print(wp_old);
329
330 cdj = lkg->chosen_disjuncts[i];
331 /* Handle null words */
332 if (NULL == cdj)
333 {
334 lgdebug(D_SLM, "- Null word");
335 /* A null word matches any word in the Wordgraph -
336 * so, unconditionally proceed in all paths in parallel. */
337 match_found = false;
338 bool optional_word_found = false;
339 for (wpp = wp_old; NULL != wpp->word; wpp++)
340 {
341 if ((MT_INFRASTRUCTURE == wpp->word->morpheme_type) ||
342 (wpp->word->sent_wordidx > i))
343 {
344 assert(sent->word[i].optional, "wordindex=%zu", i);
345 lgdebug(D_SLM, " (Optional, index=%zu)\n", i);
346 // Retain the same word in the new path queue.
347 wordgraph_path_append(&wp_new, wpp->path, wpp->word, wpp->word);
348 match_found = true;
349 optional_word_found = true;
350 continue; /* Disregard this chosen disjunct. */
351 }
352
353 /* The null words cannot be marked here because wpp->path consists
354 * of pointers to the Wordgraph words, and these words are common to
355 * all the linkages, with potentially different null words in each
356 * of them. However, the position of the null words can be inferred
357 * from the null words in the word array of the Linkage structure.
358 */
359 for (next = wpp->word->next; NULL != *next; next++)
360 {
361 if (MT_INFRASTRUCTURE != wpp->word->morpheme_type)
362 match_found = true;
363 wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
364 }
365 }
366
367 if (!optional_word_found)
368 {
369 null_count_found++;
370 /* Note that if all the sentence words are null-words, its
371 * null_count is only sent->length-1 so this is not a mismatch. */
372 if ((null_count_found > lkg->sent->null_count) &&
373 (lkg->sent->null_count != sent->length-1))
374 {
375 lgdebug(D_SLM, " (Extra, count > %u)\n", lkg->sent->null_count);
376 match_found = false;
377 break;
378 }
379 lgdebug(D_SLM, "\n");
380 }
381
382 continue;
383 }
384
385 if (!match_found)
386 {
387 const char *e = "Internal error: Too many words in the linkage";
388 lgdebug(D_SLM, "- %s\n", e);
389 prt_error("Error: %s.\n", e);
390 break;
391 }
392
393 if (verbosity_level(D_SLM)) prt_error("%s", cdj->word_string);
394
395 match_found = false;
396 /* Proceed in all the paths in which the word is found. */
397 for (wpp = wp_old; NULL != wpp->word; wpp++)
398 {
399 for (gword_set *gl = cdj->originating_gword; NULL != gl; gl = gl->next)
400 {
401 if (gl->o_gword == wpp->word)
402 {
403 match_found = true;
404 for (next = wpp->word->next; NULL != *next; next++)
405 {
406 wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
407 }
408 break;
409 }
410 }
411 }
412
413 if (!match_found)
414 {
415 /* FIXME? A message can be added here if there are too many words
416 * in the linkage (can happen only if there is an internal error). */
417 lgdebug(D_SLM, "- No Wordgraph match\n");
418 break;
419 }
420 lgdebug(D_SLM, "\n");
421 }
422
423 if (match_found)
424 {
425 match_found = false;
426 /* Validate that there are no missing words in the linkage.
427 * It is so, if the dummy termination word is found in the
428 * new pathpos queue.
429 */
430 if (NULL != wp_new)
431 {
432 for (wpp = wp_new; NULL != wpp->word; wpp++)
433 {
434 if (MT_INFRASTRUCTURE == wpp->word->morpheme_type) {
435 match_found = true;
436 /* Exit the loop with with wpp of the termination word. */
437 break;
438 }
439 }
440 }
441 if (!match_found)
442 lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg);
443 }
444
445 /* Reject found null count that is not consistent with sent->null_count.
446 * Here islands_ok=1 is handled, and also a lower-than-expected null
447 * count when islands_ok=0. */
448 if (match_found)
449 {
450 unsigned int count_found =
451 opts->islands_ok ? num_islands(lkg, wpp->path) : null_count_found;
452
453 if ((count_found != lkg->sent->null_count) &&
454 (lkg->sent->null_count != sent->length-1) && (count_found != sent->length))
455 {
456 lgdebug(D_SLM, "Null count mismatch: Found %u != null_count %u\n",
457 count_found, lkg->sent->null_count);
458 match_found = false;
459 }
460 }
461
462 #define DEBUG_morpheme_type 0
463 /* Check the morpheme type combination.
464 * If null_count > 0, the morpheme type combination may be invalid
465 * due to null subwords, so skip this check. */
466 if (match_found && (0 == sent->null_count) &&
467 (NULL != afdict) && (NULL != afdict->regex_root))
468 {
469 const Gword **w;
470 char *affix_types_p = affix_types;
471
472 /* Construct the affix_types string. */
473 #if DEBUG_morpheme_type
474 print_lwg_path(wpp->path, "Linkage");
475 #endif
476 i = 0;
477 for (w = wpp->path; *w; w++)
478 {
479 i++;
480
481 PRAGMA_START(GCC diagnostic ignored "-Wswitch-enum")
482 switch ((*w)->morpheme_type)
483 {
484 default:
485 /* What to do with the rest? */
486 case MT_WORD:
487 *affix_types_p = AFFIXTYPE_WORD;
488 break;
489 case MT_PREFIX:
490 *affix_types_p = AFFIXTYPE_PREFIX;
491 break;
492 case MT_STEM:
493 *affix_types_p = AFFIXTYPE_STEM;
494 break;
495 case MT_MIDDLE:
496 *affix_types_p = AFFIXTYPE_MIDDLE;
497 break;
498 case MT_SUFFIX:
499 *affix_types_p = AFFIXTYPE_SUFFIX;
500 break;
501 }
502 PRAGMA_END
503
504 #if DEBUG_morpheme_type
505 lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n",
506 i, (*w)->subword, *affix_types_p);
507 #endif
508
509 affix_types_p++;
510 }
511 *affix_types_p = '\0';
512
513 #ifdef WORD_BOUNDARIES /* not yet implemented */
514 {
515 const Gword *uw;
516
517 /* If w is an "end subword", return its unsplit word, else NULL. */
518 uw = word_boundary(w); /* word_boundary() unimplemented */
519
520 if (NULL != uw)
521 {
522 *affix_types_p++ = AFFIXTYPE_END;
523 lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword);
524 }
525 }
526 #endif
527
528 /* Check if affix_types is valid according to SANEMORPHISM. */
529 if (('\0' != affix_types[0]) &&
530 (NULL == match_regex(afdict->regex_root, affix_types)))
531 {
532 /* Morpheme type combination is invalid. */
533 match_found = false;
534 /* Notify to stdout, so it will be shown along with the result.
535 * XXX We should have a better way to notify. */
536 if (0 < opts->verbosity)
537 prt_error("Warning: Invalid morpheme type combination '%s'.\n"
538 "Run with !bad and !verbosity>"STRINGIFY(D_USER_MAX)
539 " to debug\n", affix_types);
540 }
541 }
542
543 if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */
544 wordgraph_path_free(wp_old, true);
545 wordgraph_path_free(wp_new, !match_found);
546
547 if (match_found)
548 {
549 if ('\0' != affix_types[0])
550 {
551 lgdebug(D_SLM, "%p Morpheme type combination '%s'\n", lkg, affix_types);
552 }
553 lgdebug(+D_SLM-1, "%p SUCCEEDED\n", lkg);
554 lkg->wg_path = lwg_path;
555 return true;
556 }
557
558 /* Oh no ... invalid morpheme combination! */
559 lgdebug(+D_SLM-1, "%p FAILED\n", lkg);
560 return false;
561 }
562 #undef D_SLM
563