1 /*************************************************************************/
2 /* Copyright 2013, 2014 Linas Vepstas                                    */
3 /* Copyright 2014 Amir Plivatsky                                         */
4 /* All rights reserved                                                   */
5 /*                                                                       */
6 /* Use of the link grammar parsing system is subject to the terms of the */
7 /* license set forth in the LICENSE file included with this software.    */
8 /* This license allows free redistribution and use in source and binary  */
9 /* forms, with or without modification, subject to certain conditions.   */
10 /*                                                                       */
11 /*************************************************************************/
12 
13 #include "api-structures.h"             // Sentence_s
14 #include "api-types.h"
15 #include "dict-common/regex-morph.h"    // match_regex
16 #include "connectors.h"                 // MAX_SENTENCE
17 #include "disjunct-utils.h"             // Disjunct_struct
18 #include "lg_assert.h"
19 #include "linkage.h"
20 #include "sane.h"
21 #include "tokenize/tok-structures.h"    // Wordgraph_pathpos_s
22 #include "tokenize/word-structures.h"   // Word_struct
23 #include "tokenize/wordgraph.h"
24 #include "utilities.h"
25 
26 /**
27  * Construct word paths (one or more) through the Wordgraph.
28  *
29  * Add 'current_word" to the potential path.
30  * Add "p" to the path queue, which defines the start of the next potential
31  * paths to be checked.
32  *
33  * Each path is up to the current word (not including). It doesn't actually
34  * construct a full path if there are null words - they break it. The final path
35  * is constructed when the Wordgraph termination word is encountered.
36  *
37  * Note: The final path doesn't match the linkage word indexing if the linkage
38  * contains optional words which are null, until the are eliminated from the
39  * linkage (in remove_empty_words()). Further processing of the path is done
40  * there in case morphology splits are to be hidden or there are morphemes with
41  * null linkage.
42  */
43 #define D_WPA 7
wordgraph_path_append(Wordgraph_pathpos ** nwp,const Gword ** path,Gword * current_word,Gword * p)44 static void wordgraph_path_append(Wordgraph_pathpos **nwp, const Gword **path,
45                                   Gword *current_word, /* add to the path */
46                                   Gword *p)      /* add to the path queue */
47 {
48 	size_t n = wordgraph_pathpos_len(*nwp);
49 
50 	assert(NULL != p, "Tried to add a NULL word to the word queue");
51 	if (current_word == p)
52 	{
53 		lgdebug(D_WPA, "Adding the same word '%s' again\n", p->subword);
54 		//print_lwg_path((Gword **)path, "After adding the same word");
55 	}
56 
57 	/* Check if the path queue already contains the word to be added to it. */
58 	const Wordgraph_pathpos *wpt = NULL;
59 
60 	if (NULL != *nwp)
61 	{
62 		for (wpt = *nwp; NULL != wpt->word; wpt++)
63 		{
64 			if (p == wpt->word)
65 			{
66 				lgdebug(D_WPA, "Word %s (after %zu) exists (after %d)\n",
67 				        p->subword,
68 				        wpt->path[gwordlist_len(wpt->path)-1]->sent_wordidx,
69 				        path ? (int)path[gwordlist_len(path)-1]->sent_wordidx : -1);
70 				/* If we are here, there are 2 or more paths leading to this word
71 				 * (p) that end with the same number of consecutive null words that
72 				 * consist an entire alternative. These null words represent
73 				 * different ways to split the subword upward in the hierarchy.
74 				 * For a nicer result we choose the shorter path. */
75 				if ((NULL != path) &&
76 				    wpt->path[gwordlist_len(wpt->path)-1]->sent_wordidx <=
77 				    path[gwordlist_len(path)-1]->sent_wordidx)
78 				{
79 					lgdebug(D_WPA, "Shorter path already queued\n");
80 					return; /* The shorter path is already in the queue. */
81 				}
82 				lgdebug(D_WPA, "Longer path is in the queue\n");
83 				//print_lwg_path((Gword **)wpt->path, "Freeing");
84 				free(wpt->path); /* To be replaced by a shorter path. */
85 				break;
86 			}
87 		}
88 	}
89 
90 	if ((NULL == wpt) || (p != wpt->word))
91 	{
92 		/* Not already in the path queue - add it. */
93 		*nwp = wordgraph_pathpos_resize(*nwp, n+1);
94 	}
95 	else
96 	{
97 		lgdebug(D_WPA, "Path position to be replaced (len %zu): %d\n", n,
98 		                (int)(wpt - *nwp));
99 		n = wpt - *nwp; /* Replace this path. */
100 	}
101 	(*nwp)[n].word = p;
102 
103 	if (NULL == path)
104 	{
105 			(*nwp)[n].path = NULL;
106 	}
107 	else
108 	{
109 		/* Duplicate the path from the current one. */
110 		size_t path_arr_size = (gwordlist_len(path)+1)*sizeof(*path);
111 
112 		(*nwp)[n].path = malloc(path_arr_size);
113 		memcpy((*nwp)[n].path, path, path_arr_size);
114 	}
115 
116 	if (NULL == current_word) return;
117 
118 	/* If we queue the same word again, its path remains the same.
119 	 * Else append the current word to it. */
120 	if (p != current_word)
121 	{
122 		/* FIXME (cast) but anyway gwordlist_append() doesn't modify Gword. */
123 		gwordlist_append((Gword ***)&(*nwp)[n].path, current_word);
124 	}
125 }
126 
127 /**
128  * Free the Wordgraph paths and the Wordgraph_pathpos array.
129  * In case of a match, the final path is still needed so this function is
130  * then invoked with free_final_path=false.
131  */
wordgraph_path_free(Wordgraph_pathpos * wp,bool free_final_path)132 static void wordgraph_path_free(Wordgraph_pathpos *wp, bool free_final_path)
133 {
134 	Wordgraph_pathpos *twp;
135 
136 	if (NULL == wp) return;
137 	for (twp = wp; NULL != twp->word; twp++)
138 	{
139 		if (free_final_path || (MT_INFRASTRUCTURE != twp->word->morpheme_type))
140 			free(twp->path);
141 	}
142 	free(wp);
143 }
144 
145 #define NO_WORD (MAX_SENTENCE+1)
146 
147 /**
148  * Return the number of islands in a linkage.
149  * First, each word appears in its own linked list.
150  * Then all the links in the linkage are traversed, and the lists pointed
151  * by each of them are combined.
152  * Finally, the words are traversed and the lists are followed and
153  * numbered. The WG path is used to skip optional words which are null.
154  */
num_islands(const Linkage lkg,const Gword ** wg_path)155 static size_t num_islands(const Linkage lkg, const Gword **wg_path)
156 {
157 	struct word
158 	{
159 		int prev;
160 		int next;
161 		int inum;
162 	};
163 	struct word *word = alloca(lkg->sent->length * sizeof(struct word));
164 
165 	/* Initially, each word is in its own island. */
166 	for (WordIdx w = 0; w < lkg->sent->length; w++)
167 	{
168 		word[w].prev = word[w].next = w;
169 	}
170 
171 	/* Unify the potential islands pointed by each link
172 	 * (if they are already unified, they remain so.) */
173 	for (LinkIdx li = 0; li < lkg->num_links; li++)
174 	{
175 		Link *l = &lkg->link_array[li];
176 
177 		WordIdx iw;
178 		for (iw = word[l->lw].next; (iw != l->rw) && (iw != l->lw); iw = word[iw].next)
179 			;
180 
181 		if (iw != l->rw)
182 		{
183 			int nextl = word[l->lw].next;
184 			int prevr = word[l->rw].prev;
185 
186 			word[l->lw].next = l->rw;
187 			word[l->rw].prev = l->lw;
188 
189 			word[prevr].next = nextl;
190 			word[nextl].prev = prevr;
191 		}
192 
193 		if (verbosity_level(+8))
194 		{
195 			for (WordIdx w = 0; w < lkg->sent->length; w++)
196 			{
197 				err_msg(lg_Debug, "%d<-%zu->%d ", word[w].prev, w, word[w].next);
198 			}
199 			err_msg(lg_Debug, "\n");
200 		}
201 	}
202 
203 	/* Count islands. */
204 	int inum = -1;
205 	Disjunct **cdj = lkg->chosen_disjuncts;
206 
207 	for (WordIdx w = 0; w < lkg->sent->length; w++)
208 	{
209 		/* Skip null words which are optional words. */
210 		if ((NULL == *wg_path) || ((*wg_path)->sent_wordidx != w))
211 		{
212 			assert(word[w].prev == word[w].next);
213 			assert((NULL == cdj[w]) && lkg->sent->word[w].optional);
214 
215 			word[w].prev = NO_WORD;
216 			word[w].inum = -1; /* not belonging to any island */
217 			continue;
218 		}
219 
220 		wg_path++;
221 		if (NO_WORD == word[w].prev) continue;
222 
223 		inum++;
224 		for (WordIdx iw = w; NO_WORD != word[iw].prev; iw = word[iw].next)
225 		{
226 			word[iw].prev = NO_WORD;
227 			word[iw].inum = inum;
228 		}
229 	}
230 
231 	if (verbosity_level(8))
232 	{
233 		err_msg(lg_Debug, "Island count %d: ", inum);
234 		for (WordIdx w = 0; w < lkg->sent->length; w++)
235 		{
236 			err_msg(lg_Debug, "%d ", word[w].inum);
237 		}
238 		err_msg(lg_Debug, "\n");
239 	}
240 
241 	return inum;
242 }
243 
244 /* ============================================================== */
245 /* A kind of morphism post-processing */
246 
247 /* These letters create a string that should be matched by a
248  * SANEMORPHISM regex, given in the affix file. The empty word
249  * doesn't have a letter. E.g. for the Russian dictionary: "w|ts".
250  * It is converted here to: "^((w|ts)b)+$".
251  * It matches "wbtsbwbtsbwb" but not "wbtsbwsbtsb".
252  * FIXME? In this version of the function, 'b' is not yet supported,
253  * so "w|ts" is converted to "^(w|ts)+$" for now.
254  */
255 #define AFFIXTYPE_PREFIX   'p'   /* prefix */
256 #define AFFIXTYPE_STEM     't'   /* stem */
257 #define AFFIXTYPE_SUFFIX   's'   /* suffix */
258 #define AFFIXTYPE_MIDDLE   'm'   /* middle morpheme */
259 #define AFFIXTYPE_WORD     'w'   /* regular word */
260 #ifdef WORD_BOUNDARIES
261 #define AFFIXTYPE_END      'b'   /* end of input word */
262 #endif
263 
264 /**
265  * This routine solves the problem of mis-linked alternatives,
266  * i.e a morpheme in one alternative that is linked to a morpheme in
267  * another alternative. This can happen due to the way in which word
268  * alternatives are implemented.
269  *
270  * It does so by checking that all the chosen disjuncts in a linkage
271  * (including null words) match, in the same order, a path in the
272  * Wordgraph.
273  *
274  * An important side effect of this check is that if the linkage is
275  * good, its Wordgraph path is found.
276  *
277  * Optionally (if SANEMORPHISM regex is defined in the affix file), it
278  * also validates that the morpheme-type sequence is permitted for the
279  * language. This is a sanity check of the program and the dictionary.
280  *
281  * Return true if the linkage is good, else return false.
282  */
283 #define D_SLM 8
sane_linkage_morphism(Sentence sent,Linkage lkg,Parse_Options opts)284 bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)
285 {
286 	Wordgraph_pathpos *wp_new = NULL;
287 	Wordgraph_pathpos *wp_old = NULL;
288 	Wordgraph_pathpos *wpp;
289 	Gword **next; /* next Wordgraph words of the current word */
290 	size_t i;
291 	unsigned int null_count_found = 0;
292 
293 	bool match_found = true; /* if all the words are null - it's still a match */
294 	Gword **lwg_path;
295 
296 	Dictionary afdict = sent->dict->affix_table;       /* for SANEMORPHISM */
297 	char *const affix_types = alloca(sent->length*2 + 1);   /* affix types */
298 	affix_types[0] = '\0';
299 
300 	lkg->wg_path = NULL;
301 
302 	/* Populate the path word queue, initializing the path to NULL. */
303 	for (next = sent->wordgraph->next; *next; next++)
304 	{
305 		wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next);
306 	}
307 	assert(NULL != wp_new, "Path word queue is empty");
308 
309 	for (i = 0; i < lkg->num_words; i++)
310 	{
311 		Disjunct *cdj;            /* chosen disjunct */
312 
313 		lgdebug(D_SLM, "lkg=%p Word %zu: ", lkg, i);
314 
315 		if (NULL == wp_new)
316 		{
317 			lgdebug(D_SLM, "- No more words in the wordgraph\n");
318 			match_found = false;
319 			break;
320 		}
321 
322 		if (wp_old != wp_new)
323 		{
324 			wordgraph_path_free(wp_old, true);
325 			wp_old = wp_new;
326 		}
327 		wp_new = NULL;
328 		//wordgraph_pathpos_print(wp_old);
329 
330 		cdj = lkg->chosen_disjuncts[i];
331 		/* Handle null words */
332 		if (NULL == cdj)
333 		{
334 			lgdebug(D_SLM, "- Null word");
335 			/* A null word matches any word in the Wordgraph -
336 			 * so, unconditionally proceed in all paths in parallel. */
337 			match_found = false;
338 			bool optional_word_found = false;
339 			for (wpp = wp_old; NULL != wpp->word; wpp++)
340 			{
341 				if ((MT_INFRASTRUCTURE == wpp->word->morpheme_type) ||
342 				    (wpp->word->sent_wordidx > i))
343 				{
344 					assert(sent->word[i].optional, "wordindex=%zu", i);
345 					lgdebug(D_SLM, " (Optional, index=%zu)\n", i);
346 					// Retain the same word in the new path queue.
347 					wordgraph_path_append(&wp_new, wpp->path, wpp->word, wpp->word);
348 					match_found = true;
349 					optional_word_found = true;
350 					continue; /* Disregard this chosen disjunct. */
351 				}
352 
353 				/* The null words cannot be marked here because wpp->path consists
354 				 * of pointers to the Wordgraph words, and these words are common to
355 				 * all the linkages, with potentially different null words in each
356 				 * of them. However, the position of the null words can be inferred
357 				 * from the null words in the word array of the Linkage structure.
358 				 */
359 				for (next = wpp->word->next; NULL != *next; next++)
360 				{
361 					if (MT_INFRASTRUCTURE != wpp->word->morpheme_type)
362 						match_found = true;
363 					wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
364 				}
365 			}
366 
367 			if (!optional_word_found)
368 			{
369 				null_count_found++;
370 				/* Note that if all the sentence words are null-words, its
371 				 * null_count is only sent->length-1 so this is not a mismatch. */
372 				if ((null_count_found > lkg->sent->null_count) &&
373 				    (lkg->sent->null_count != sent->length-1))
374 				{
375 					lgdebug(D_SLM, " (Extra, count > %u)\n", lkg->sent->null_count);
376 					match_found = false;
377 					break;
378 				}
379 				lgdebug(D_SLM, "\n");
380 			}
381 
382 			continue;
383 		}
384 
385 		if (!match_found)
386 		{
387 			const char *e = "Internal error: Too many words in the linkage";
388 			lgdebug(D_SLM, "- %s\n", e);
389 			prt_error("Error: %s.\n", e);
390 			break;
391 		}
392 
393 		if (verbosity_level(D_SLM)) prt_error("%s", cdj->word_string);
394 
395 		match_found = false;
396 		/* Proceed in all the paths in which the word is found. */
397 		for (wpp = wp_old; NULL != wpp->word; wpp++)
398 		{
399 			for (gword_set *gl = cdj->originating_gword; NULL != gl; gl =  gl->next)
400 			{
401 				if (gl->o_gword == wpp->word)
402 				{
403 					match_found = true;
404 					for (next = wpp->word->next; NULL != *next; next++)
405 					{
406 						wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
407 					}
408 					break;
409 				}
410 			}
411 		}
412 
413 		if (!match_found)
414 		{
415 			/* FIXME? A message can be added here if there are too many words
416 			 * in the linkage (can happen only if there is an internal error). */
417 			lgdebug(D_SLM, "- No Wordgraph match\n");
418 			break;
419 		}
420 		lgdebug(D_SLM, "\n");
421 	}
422 
423 	if (match_found)
424 	{
425 		match_found = false;
426 		/* Validate that there are no missing words in the linkage.
427 		 * It is so, if the dummy termination word is found in the
428 		 * new pathpos queue.
429 		 */
430 		if (NULL != wp_new)
431 		{
432 			for (wpp = wp_new; NULL != wpp->word; wpp++)
433 			{
434 				if (MT_INFRASTRUCTURE == wpp->word->morpheme_type) {
435 					match_found = true;
436 					/* Exit the loop with with wpp of the termination word. */
437 					break;
438 				}
439 			}
440 		}
441 		if (!match_found)
442 		    lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg);
443 	}
444 
445 	/* Reject found null count that is not consistent with sent->null_count.
446 	 * Here islands_ok=1 is handled, and also a lower-than-expected null
447 	 * count when islands_ok=0. */
448 	if (match_found)
449 	{
450 		unsigned int count_found =
451 			opts->islands_ok ? num_islands(lkg, wpp->path) : null_count_found;
452 
453 		if ((count_found != lkg->sent->null_count) &&
454 		    (lkg->sent->null_count != sent->length-1) && (count_found != sent->length))
455 		{
456 			lgdebug(D_SLM, "Null count mismatch: Found %u != null_count %u\n",
457 					  count_found, lkg->sent->null_count);
458 			match_found = false;
459 		}
460 	}
461 
462 #define DEBUG_morpheme_type 0
463 	/* Check the morpheme type combination.
464 	 * If null_count > 0, the morpheme type combination may be invalid
465 	 * due to null subwords, so skip this check. */
466 	if (match_found && (0 == sent->null_count) &&
467 		(NULL != afdict) && (NULL != afdict->regex_root))
468 	{
469 		const Gword **w;
470 		char *affix_types_p = affix_types;
471 
472 		/* Construct the affix_types string. */
473 #if DEBUG_morpheme_type
474 		print_lwg_path(wpp->path, "Linkage");
475 #endif
476 		i = 0;
477 		for (w = wpp->path; *w; w++)
478 		{
479 			i++;
480 
481 			PRAGMA_START(GCC diagnostic ignored "-Wswitch-enum")
482 			switch ((*w)->morpheme_type)
483 			{
484 				default:
485 					/* What to do with the rest? */
486 				case MT_WORD:
487 					*affix_types_p = AFFIXTYPE_WORD;
488 					break;
489 				case MT_PREFIX:
490 					*affix_types_p = AFFIXTYPE_PREFIX;
491 					break;
492 				case MT_STEM:
493 					*affix_types_p = AFFIXTYPE_STEM;
494 					break;
495 				case MT_MIDDLE:
496 					*affix_types_p = AFFIXTYPE_MIDDLE;
497 					break;
498 				case MT_SUFFIX:
499 					*affix_types_p = AFFIXTYPE_SUFFIX;
500 					break;
501 			}
502 			PRAGMA_END
503 
504 #if DEBUG_morpheme_type
505 			lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n",
506 			     i, (*w)->subword,  *affix_types_p);
507 #endif
508 
509 			affix_types_p++;
510 		}
511 		*affix_types_p = '\0';
512 
513 #ifdef WORD_BOUNDARIES /* not yet implemented */
514 		{
515 			const Gword *uw;
516 
517 			/* If w is an "end subword", return its unsplit word, else NULL. */
518 			uw = word_boundary(w); /* word_boundary() unimplemented */
519 
520 			if (NULL != uw)
521 			{
522 				*affix_types_p++ = AFFIXTYPE_END;
523 				lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword);
524 			}
525 		}
526 #endif
527 
528 		/* Check if affix_types is valid according to SANEMORPHISM. */
529 		if (('\0' != affix_types[0]) &&
530 		    (NULL == match_regex(afdict->regex_root, affix_types)))
531 		{
532 			/* Morpheme type combination is invalid. */
533 			match_found = false;
534 			/* Notify to stdout, so it will be shown along with the result.
535 			 * XXX We should have a better way to notify. */
536 			if (0 < opts->verbosity)
537 				prt_error("Warning: Invalid morpheme type combination '%s'.\n"
538 				          "Run with !bad and !verbosity>"STRINGIFY(D_USER_MAX)
539 				          " to debug\n", affix_types);
540 		}
541 	}
542 
543 	if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */
544 	wordgraph_path_free(wp_old, true);
545 	wordgraph_path_free(wp_new, !match_found);
546 
547 	if (match_found)
548 	{
549 		if ('\0' != affix_types[0])
550 		{
551 			lgdebug(D_SLM, "%p Morpheme type combination '%s'\n", lkg, affix_types);
552 		}
553 		lgdebug(+D_SLM-1, "%p SUCCEEDED\n", lkg);
554 		lkg->wg_path = lwg_path;
555 		return true;
556 	}
557 
558 	/* Oh no ... invalid morpheme combination! */
559 	lgdebug(+D_SLM-1, "%p FAILED\n", lkg);
560 	return false;
561 }
562 #undef D_SLM
563