1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright (c) 2014 Linas Vepstas                                      */
5 /* All rights reserved                                                   */
6 /*                                                                       */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software.    */
9 /* This license allows free redistribution and use in source and binary  */
10 /* forms, with or without modification, subject to certain conditions.   */
11 /*                                                                       */
12 /*************************************************************************/
13 
14 /* see bottom of file for comments on post processing */
15 
16 #include <memory.h>
17 #include <stdint.h>
18 
19 #include "post-process.h"
20 
21 #include "api-structures.h"
22 #include "error.h"
23 #include "linkage/linkage.h"
24 #include "linkage/score.h"
25 #include "pp_knowledge.h"
26 #include "pp_linkset.h"
27 #include "pp-structures.h"
28 #include "resources.h"
29 #include "string-set.h"
30 
31 #define PP_MAX_DOMAINS 128
32 
33 /**
34  * post_process_match -- compare two link-types.
35  *
36  * string comparison in postprocessing. The first parameter is a
37  * post-processing symbol. The second one is a connector name from a
38  * link. The upper case parts must match. We imagine that the first
39  * arg is padded with an infinite sequence of "#" and that the 2nd one
40  * is padded with "*". "#" matches anything, but "*" is just like an
41  * ordinary char for matching purposes.
42  *
43  * For efficiency, the algo is not straightforward:
44  * 1. In the first "while", there is no check for (*t != '\0').
45  * Instead, this condition is detected by (*s != *t) when the uppercase
46  * part of "s" is longer than that of "t".
47  * 2. "t" is not checked for uppercase in the loop.
48  * Instead, if its uppercase part is longer then that of "s", the check
49  * after the loop detects that.
50  */
51 
post_process_match(const char * s,const char * t)52 bool post_process_match(const char *s, const char *t)
53 {
54 	if (NULL == t) return false;
55 	if (islower(*t)) t++; /* Skip head-dependent indicator */
56 	while (isupper(*s))
57 	{
58 		if (*s != *t) return false;
59 		s++;
60 		t++;
61 	}
62 	if (isupper(*t)) return false;
63 	while (*t != '\0')
64 	{
65 		if (*s == '\0') return true;
66 		if (*s != *t && *s != '#') return false;
67 		s++;
68 		t++;
69 	}
70 	while (*s != '\0')
71 	{
72 		if (*s != '*' && *s != '#') return false;
73 		s++;
74 	}
75 	return true;
76 }
77 
78 /***************** utility routines (not exported) ***********************/
79 
80 /**
81  * Returns false if the string s does not match anything in
82  * the array. The array elements are post-processing symbols.
83  */
string_in_list(const char * s,const char * a[])84 static bool string_in_list(const char * s, const char * a[])
85 {
86 	int i;
87 	for (i=0; a[i] != NULL; i++)
88 		if (post_process_match(a[i], s)) return true;
89 	return false;
90 }
91 
92 /**
93  * Return the name of the domain associated with the provided starting
94  * link. Return -1 if link isn't associated with a domain.
95  */
find_domain_name(Postprocessor * pp,const char * link)96 static size_t find_domain_name(Postprocessor *pp, const char *link)
97 {
98 	size_t i, domain;
99 	StartingLinkAndDomain *sllt = pp->knowledge->starting_link_lookup_table;
100 	for (i=0;;i++)
101 	{
102 		domain = sllt[i].domain;
103 		if (domain == SIZE_MAX) return SIZE_MAX;  /* hit the end-of-list sentinel */
104 		if (post_process_match(sllt[i].starting_link, link)) return domain;
105 	}
106 }
107 
108 /** Returns true if domain d1 is contained in domain d2 */
contained_in(const Domain * d1,const Domain * d2,const Linkage sublinkage)109 static bool contained_in(const Domain * d1, const Domain * d2,
110                         const Linkage sublinkage)
111 {
112 	bool *mark = alloca(sublinkage->num_links*sizeof(bool));
113 	List_o_links * lol;
114 	memset(mark, 0, sublinkage->num_links*(sizeof(bool)));
115 	for (lol=d2->lol; lol != NULL; lol = lol->next)
116 		mark[lol->link] = true;
117 	for (lol=d1->lol; lol != NULL; lol = lol->next)
118 		if (!mark[lol->link]) return false;
119 	return true;
120 }
121 
122 /** Returns the predicate "the given link is in the given domain" */
link_in_domain(size_t link,const Domain * d)123 static bool link_in_domain(size_t link, const Domain * d)
124 {
125 	List_o_links * lol;
126 	for (lol = d->lol; lol != NULL; lol = lol->next)
127 		if (lol->link == link) return true;
128 	return false;
129 }
130 
131 /* #define CHECK_DOMAIN_NESTING */
132 
133 #if defined(CHECK_DOMAIN_NESTING)
134 /* Although this is no longer used, I'm leaving the code here for future reference --DS 3/98 */
135 
136 /* Returns true if the domains actually form a properly nested structure */
check_domain_nesting(Postprocessor * pp,int num_links)137 static bool check_domain_nesting(Postprocessor *pp, int num_links)
138 {
139 	size_t id1, id2;
140 	Domain * d1, * d2;
141 	int counts[4];
142 	char mark[MAX_NUM_LINKS];
143 	List_o_links * lol;
144 	int i;
145 	PP_data *pp_data = &pp->pp_data;
146 
147 	for (id1 = 0; id1 < pp_data->N_domains; id1++)
148 	{
149 		d1 = &pp_data->domain_array[id1];
150 		for (id2 = id1+1; id2 < pp_data->N_domains; id2++)
151 		{
152 			d2 = &pp_data->domain_array[id2];
153 
154 			memset(mark, 0, num_links);
155 			for (lol=d2->lol; lol != NULL; lol = lol->next)
156 				mark[lol->link] = 1;
157 
158 			for (lol=d1->lol; lol != NULL; lol = lol->next)
159 				mark[lol->link] += 2;
160 
161 			counts[0] = counts[1] = counts[2] = counts[3] = 0;
162 			for (i=0; i<num_links; i++)
163 			{
164 				assert(mark[i] < 4, "Miscount of link marks!");
165 				counts[(size_t)mark[i]]++; /* cast avoids compiler warning */
166 			}
167 			if ((counts[1] > 0) && (counts[2] > 0) && (counts[3] > 0))
168 				return false;
169 		}
170 	}
171 	return true;
172 }
173 #endif
174 
175 /**
176  * Free the list of links pointed to by lol
177  * (does not free any strings)
178  */
free_List_o_links(List_o_links * lol)179 static void free_List_o_links(List_o_links *lol)
180 {
181 	List_o_links * xlol;
182 	while (lol != NULL)
183 	{
184 		xlol = lol->next;
185 		free(lol);
186 		lol = xlol;
187 	}
188 }
189 
free_D_tree_leaves(DTreeLeaf * dtl)190 static void free_D_tree_leaves(DTreeLeaf *dtl)
191 {
192 	DTreeLeaf * xdtl;
193 	while (dtl != NULL)
194 	{
195 		xdtl = dtl->next;
196 		free(dtl);
197 		dtl = xdtl;
198 	}
199 }
200 
pp_free_domain_array(PP_data * ppd)201 static void pp_free_domain_array(PP_data *ppd)
202 {
203 	size_t d;
204 	for (d = 0; d < ppd->domlen; d++)
205 	{
206 		free_List_o_links(ppd->domain_array[d].lol);
207 		ppd->domain_array[d].lol = NULL;
208 		free_D_tree_leaves(ppd->domain_array[d].child);
209 		ppd->domain_array[d].child = NULL;
210 	}
211 }
212 
213 /**
214  * Gets called after every invocation of post_process()
215  */
post_process_free_data(PP_data * ppd)216 void post_process_free_data(PP_data * ppd)
217 {
218 	size_t w;
219 	for (w = 0; w < ppd->wowlen; w++)
220 	{
221 		free_List_o_links(ppd->word_links[w]);
222 		ppd->word_links[w] = NULL;
223 	}
224 
225 	pp_free_domain_array(ppd);
226 	free_List_o_links(ppd->links_to_ignore);
227 	ppd->links_to_ignore = NULL;
228 	ppd->num_words = 0;
229 	ppd->N_domains = 0;
230 }
231 
232 #ifdef THIS_FUNCTION_IS_NOT_CURRENTLY_USED
connectivity_dfs(Postprocessor * pp,Linkage sublinkage,int w,pp_linkset * ls)233 static void connectivity_dfs(Postprocessor *pp, Linkage sublinkage,
234                              int w, pp_linkset *ls)
235 {
236 	List_o_links *lol;
237 	assert(w < pp_data->num_words, "Bad word index");
238 	pp_data->visited[w] = true;
239 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
240 	{
241 		if (!pp_data->visited[lol->word] &&
242 				!pp_linkset_match(ls, sublinkage->link[lol->link]->name))
243 			connectivity_dfs(pp, sublinkage, lol->word, ls);
244 	}
245 }
246 #endif /* THIS_FUNCTION_IS_NOT_CURRENTLY_USED */
247 
linkage_get_violation_name(const Linkage linkage)248 const char * linkage_get_violation_name(const Linkage linkage)
249 {
250 	return linkage->lifo.pp_violation_msg;
251 }
252 
253 /************************ rule application *******************************/
254 
clear_visited(PP_data * pp_data)255 static void clear_visited(PP_data *pp_data)
256 {
257 	memset(pp_data->visited, 0, pp_data->num_words * sizeof(bool));
258 }
259 
apply_rules(PP_data * pp_data,bool (applyfn)(PP_data *,Linkage,pp_rule *),Linkage sublinkage,pp_rule * rule_array,const char ** msg)260 static bool apply_rules(PP_data *pp_data,
261                         bool (applyfn) (PP_data *, Linkage, pp_rule *),
262                         Linkage sublinkage,
263                         pp_rule *rule_array,
264                         const char **msg)
265 {
266 	int i;
267 	for (i = 0; (*msg = rule_array[i].msg) != NULL; i++)
268 	{
269 		if (!applyfn(pp_data, sublinkage, &(rule_array[i])))
270 		{
271 			rule_array[i].use_count ++;
272 			return false;
273 		}
274 	}
275 	return true;
276 }
277 
278 static bool
apply_relevant_rules(Postprocessor * pp,bool (applyfn)(PP_data *,Linkage,pp_rule *),Linkage sublinkage,pp_rule * rule_array,int * relevant_rules,const char ** msg)279 apply_relevant_rules(Postprocessor *pp,
280                      bool (applyfn)(PP_data *, Linkage, pp_rule *),
281                      Linkage sublinkage,
282                      pp_rule *rule_array,
283                      int *relevant_rules,
284                      const char **msg)
285 {
286 	int i, idx;
287 	PP_data *pp_data = &pp->pp_data;
288 
289 	/* If we didn't accumulate link names for this sentence, we need
290 	 *  to apply all rules. */
291 	if (pp_linkset_population(pp->set_of_links_of_sentence) == 0) {
292 		return apply_rules(pp_data, applyfn, sublinkage, rule_array, msg);
293 	}
294 
295 	/* We did, and we don't. */
296 	for (i = 0; (idx = relevant_rules[i]) != -1; i++)
297 	{
298 		*msg = rule_array[idx].msg;
299 		if (!applyfn(pp_data, sublinkage, &(rule_array[idx]))) return false;
300 	}
301 	return true;
302 }
303 
304 /**
305  * returns true if and only if all groups containing the specified link
306  * contain at least one from the required list. (as determined by exact
307  * string matching)
308  */
309 static bool
apply_contains_one(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)310 apply_contains_one(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
311 {
312 	DTreeLeaf * dtl;
313 	size_t d, count;
314 
315 	for (d=0; d<pp_data->N_domains; d++)
316 	{
317 		for (dtl = pp_data->domain_array[d].child;
318 		     dtl != NULL &&
319 		        !post_process_match(rule->selector,
320 		           sublinkage->link_array[dtl->link].link_name);
321 		     dtl = dtl->next) {}
322 		if (dtl != NULL)
323 		{
324 			/* selector link of rule appears in this domain */
325 			count=0;
326 			for (dtl = pp_data->domain_array[d].child; dtl != NULL; dtl = dtl->next)
327 			{
328 				if (string_in_list(sublinkage->link_array[dtl->link].link_name,
329 				                   rule->link_array))
330 				{
331 					count=1;
332 					break;
333 				}
334 			}
335 			if (count == 0) return false;
336 		}
337 	}
338 	return true;
339 }
340 
341 
342 /**
343  * Returns true if and only if:
344  * all groups containing the selector link do not contain anything
345  * from the link_array contained in the rule. Uses exact string matching.
346  */
347 static bool
apply_contains_none(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)348 apply_contains_none(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
349 {
350 	size_t d;
351 
352 	for (d=0; d<pp_data->N_domains; d++)
353 	{
354 		DTreeLeaf * dtl;
355 		for (dtl = pp_data->domain_array[d].child;
356 		     dtl != NULL &&
357 		         !post_process_match(rule->selector,
358 		                  sublinkage->link_array[dtl->link].link_name);
359 		     dtl = dtl->next) {}
360 		if (dtl != NULL)
361 		{
362 			/* selector link of rule appears in this domain */
363 			for (dtl = pp_data->domain_array[d].child; dtl != NULL; dtl = dtl->next)
364 			{
365 				if (string_in_list(sublinkage->link_array[dtl->link].link_name,
366 				                   rule->link_array))
367 					return false;
368 			}
369 		}
370 	}
371 	return true;
372 }
373 
374 /**
375  * Returns true if and only if
376  * (1) the sentence doesn't contain the selector link for the rule, or
377  * (2) it does, and it also contains one or more from the rule's link set
378  */
379 static bool
apply_contains_one_globally(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)380 apply_contains_one_globally(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
381 {
382 	size_t i, j, count;
383 	for (i = 0; i < sublinkage->num_links; i++)
384 	{
385 		assert(sublinkage->link_array[i].lw != SIZE_MAX);
386 		if (post_process_match(rule->selector, sublinkage->link_array[i].link_name)) break;
387 	}
388 	if (i == sublinkage->num_links) return true;
389 
390 	/* selector link of rule appears in sentence */
391 	count = 0;
392 	for (j = 0; j < sublinkage->num_links && count == 0; j++)
393 	{
394 		assert(sublinkage->link_array[j].lw != SIZE_MAX);
395 		if (string_in_list(sublinkage->link_array[j].link_name, rule->link_array))
396 		{
397 			count = 1;
398 			break;
399 		}
400 	}
401 	if (count == 0) return false; else return true;
402 }
403 
404 /**
405  * For each link in the linkage that is in the must_form_a_cycle list,
406  * we want to make sure that that link is in a cycle.  We do this
407  * simply by deleting the link, then seeing if the end points of that
408  * link are still connected.
409  */
reachable_without_dfs(PP_data * pp_data,Linkage sublinkage,size_t a,size_t b,size_t w)410 static void reachable_without_dfs(PP_data *pp_data,
411                     Linkage sublinkage, size_t a, size_t b, size_t w)
412 {
413 	/* This is a depth first search of words reachable from w, excluding
414 	 * any direct edge between word a and word b. */
415 	List_o_links *lol;
416 	assert(w < pp_data->num_words, "Bad word index");
417 	pp_data->visited[w] = true;
418 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
419 	{
420 		assert(lol->word < pp_data->num_words, "Bad word index");
421 		if (!pp_data->visited[lol->word] &&
422 		    !(w == a && lol->word == b) &&
423 		    !(w == b && lol->word == a))
424 		{
425 				reachable_without_dfs(pp_data, sublinkage, a, b, lol->word);
426 		}
427 	}
428 }
429 
430 /**
431  * Returns true if the linkage is connected when ignoring the links
432  * whose names are in the given list of link names.
433  * Actually, what it does is this: it returns false if the connectivity
434  * of the subgraph reachable from word 0 changes as a result of deleting
435  * these links.
436  */
437 static bool
apply_must_form_a_cycle(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)438 apply_must_form_a_cycle(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
439 {
440 	List_o_links *lol;
441 	size_t w;
442 
443 	for (w = 0; w < pp_data->num_words; w++)
444 	{
445 		for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
446 		{
447 			if (w > lol->word) continue; /* only consider each edge once */
448 			if (!pp_linkset_match(rule->link_set, sublinkage->link_array[lol->link].link_name)) continue;
449 
450 			clear_visited(pp_data);
451 			reachable_without_dfs(pp_data, sublinkage, w, lol->word, w);
452 			if (!pp_data->visited[lol->word]) return false;
453 		}
454 	}
455 
456 	for (lol = pp_data->links_to_ignore; lol != NULL; lol = lol->next)
457 	{
458 		w = sublinkage->link_array[lol->link].lw;
459 		/* (w, lol->word) are the left and right ends of the edge we're considering */
460 		if (!pp_linkset_match(rule->link_set, sublinkage->link_array[lol->link].link_name)) continue;
461 
462 		clear_visited(pp_data);
463 		reachable_without_dfs(pp_data, sublinkage, w, lol->word, w);
464 
465 		assert(lol->word < pp_data->num_words, "Bad word index");
466 		if (!pp_data->visited[lol->word]) return false;
467 	}
468 
469 	return true;
470 }
471 
472 /**
473  * Checks to see that all domains with this name have the property that
474  * all of the words that touch a link in the domain are not to the left
475  * of the root word of the domain.
476  */
477 static bool
apply_bounded(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)478 apply_bounded(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
479 {
480 	size_t d, lw;
481 	List_o_links * lol;
482 	char d_type = rule->domain;
483 
484 	for (d = 0; d < pp_data->N_domains; d++)
485 	{
486 		if (pp_data->domain_array[d].type != d_type) continue;
487 		lw = sublinkage->link_array[pp_data->domain_array[d].start_link].lw;
488 		for (lol = pp_data->domain_array[d].lol; lol != NULL; lol = lol->next)
489 		{
490 			if (sublinkage->link_array[lol->link].lw < lw) return false;
491 		}
492 	}
493 	return true;
494 }
495 
496 /**
497  * fill in the pp->pp_data.word_links array with a list of words
498  * neighboring each word (actually a list of links).  This is an
499  * undirected graph.
500  */
build_graph(Postprocessor * pp,Linkage sublinkage)501 static void build_graph(Postprocessor *pp, Linkage sublinkage)
502 {
503 	size_t link;
504 	List_o_links * lol;
505 	PP_data *pp_data = &pp->pp_data;
506 
507 	/* Get more size, if needed */
508 	if (pp_data->wowlen <= pp_data->num_words)
509 	{
510 		size_t newsz;
511 		pp_data->wowlen += pp_data->num_words;
512 		newsz = pp_data->wowlen * sizeof(List_o_links *);
513 		pp_data->word_links = (List_o_links **) realloc(
514 			pp_data->word_links, newsz);
515 	}
516 	memset(pp_data->word_links, 0, pp_data->wowlen * sizeof(List_o_links *));
517 
518 	for (link = 0; link < sublinkage->num_links; link++)
519 	{
520 		assert (sublinkage->link_array[link].lw != SIZE_MAX);
521 		if (NULL == sublinkage->link_array[link].link_name) continue;
522 		if (pp_linkset_match(pp->knowledge->ignore_these_links,
523 		                     sublinkage->link_array[link].link_name))
524 		{
525 			lol = (List_o_links *) malloc(sizeof(List_o_links));
526 			lol->next = pp_data->links_to_ignore;
527 			pp_data->links_to_ignore = lol;
528 			lol->link = link;
529 			lol->word = sublinkage->link_array[link].rw;
530 			continue;
531 		}
532 
533 		lol = (List_o_links *) malloc(sizeof(List_o_links));
534 		lol->next = pp_data->word_links[sublinkage->link_array[link].lw];
535 		pp_data->word_links[sublinkage->link_array[link].lw] = lol;
536 		lol->link = link;
537 		lol->word = sublinkage->link_array[link].rw;
538 
539 		lol = (List_o_links *) malloc(sizeof(List_o_links));
540 		lol->next = pp_data->word_links[sublinkage->link_array[link].rw];
541 		pp_data->word_links[sublinkage->link_array[link].rw] = lol;
542 		lol->link = link;
543 		lol->word = sublinkage->link_array[link].lw;
544 	}
545 }
546 
setup_domain_array(Postprocessor * pp,const char * string,int start_link)547 static void setup_domain_array(Postprocessor *pp,
548                                const char *string, int start_link)
549 {
550 	PP_data *pp_data = &pp->pp_data;
551 	size_t n = pp_data->N_domains;
552 
553 	/* Grab more memory if needed */
554 	if (pp_data->domlen <= n)
555 	{
556 		size_t oldsz, incsz;
557 #define DOMINC 16
558 		oldsz = pp_data->domlen * sizeof(Domain);
559 		incsz = DOMINC * sizeof(Domain);
560 		pp_data->domain_array = (Domain *) realloc(pp_data->domain_array,
561 			oldsz + incsz);
562 		memset(&pp_data->domain_array[pp_data->domlen], 0, incsz);
563 		pp_data->domlen += DOMINC;
564 	}
565 
566 	pp_data->domain_array[n].string = string;
567 	pp_data->domain_array[n].lol    = NULL;
568 	pp_data->domain_array[n].size   = 0;
569 	pp_data->domain_array[n].start_link = start_link;
570 
571 	pp_data->N_domains++;
572 	assert(pp_data->N_domains<PP_MAX_DOMAINS, "raise value of PP_MAX_DOMAINS");
573 }
574 
add_link_to_domain(PP_data * pp_data,int link)575 static void add_link_to_domain(PP_data *pp_data, int link)
576 {
577 	size_t n = pp_data->N_domains - 1;  /* the very last one */
578 	List_o_links *lol = (List_o_links *) malloc(sizeof(List_o_links));
579 
580 	lol->next = pp_data->domain_array[n].lol;
581 	pp_data->domain_array[n].lol = lol;
582 	pp_data->domain_array[n].size++;
583 	lol->link = link;
584 }
585 
depth_first_search(Postprocessor * pp,Linkage sublinkage,size_t w,size_t root,size_t start_link)586 static void depth_first_search(Postprocessor *pp, Linkage sublinkage,
587                                size_t w, size_t root, size_t start_link)
588 {
589 	List_o_links *lol;
590 	PP_data *pp_data = &pp->pp_data;
591 
592 	assert(w < pp_data->num_words, "Bad word index");
593 	pp_data->visited[w] = true;
594 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
595 	{
596 		if (lol->word < w && lol->link != start_link)
597 		{
598 			add_link_to_domain(pp_data, lol->link);
599 		}
600 	}
601 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
602 	{
603 		if (!pp_data->visited[lol->word] && (lol->word != root) &&
604 		       !(lol->word < root && lol->word < w &&
605 		       pp_linkset_match(pp->knowledge->restricted_links,
606 		                sublinkage->link_array[lol->link].link_name)))
607 		{
608 			depth_first_search(pp, sublinkage, lol->word, root, start_link);
609 		}
610 	}
611 }
612 
bad_depth_first_search(Postprocessor * pp,Linkage sublinkage,size_t w,size_t root,size_t start_link)613 static void bad_depth_first_search(Postprocessor *pp, Linkage sublinkage,
614                                    size_t w, size_t root, size_t start_link)
615 {
616 	List_o_links * lol;
617 	PP_data *pp_data = &pp->pp_data;
618 
619 	assert(w < pp_data->num_words, "Bad word index");
620 	pp_data->visited[w] = true;
621 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
622 	{
623 		if ((lol->word < w) && (lol->link != start_link) && (w != root))
624 		{
625 			add_link_to_domain(pp_data, lol->link);
626 		}
627 	}
628 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
629 	{
630 		assert(lol->word < pp_data->num_words, "Bad word index");
631 		if ((!pp_data->visited[lol->word]) && !(w == root && lol->word < w) &&
632 		     !(lol->word < root && lol->word < w &&
633 		          pp_linkset_match(pp->knowledge->restricted_links,
634 		                sublinkage->link_array[lol->link].link_name)))
635 		{
636 			bad_depth_first_search(pp, sublinkage, lol->word, root, start_link);
637 		}
638 	}
639 }
640 
d_depth_first_search(Postprocessor * pp,Linkage sublinkage,size_t w,size_t root,size_t right,size_t start_link)641 static void d_depth_first_search(Postprocessor *pp, Linkage sublinkage,
642                     size_t w, size_t root, size_t right, size_t start_link)
643 {
644 	List_o_links * lol;
645 	PP_data *pp_data = &pp->pp_data;
646 
647 	assert(w < pp_data->num_words, "Bad word index");
648 	pp_data->visited[w] = true;
649 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
650 	{
651 		if ((lol->word < w) && (lol->link != start_link) && (w != root))
652 		{
653 			add_link_to_domain(pp_data, lol->link);
654 		}
655 	}
656 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
657 	{
658 		assert(lol->word < pp_data->num_words, "Bad word index");
659 		if (!pp_data->visited[lol->word] && !(w == root && lol->word >= right) &&
660 		    !(w == root && lol->word < root) &&
661 		       !(lol->word < root && lol->word < w &&
662 		          pp_linkset_match(pp->knowledge->restricted_links,
663 		                   sublinkage->link_array[lol->link].link_name)))
664 		{
665 			d_depth_first_search(pp,sublinkage,lol->word,root,right,start_link);
666 		}
667 	}
668 }
669 
left_depth_first_search(Postprocessor * pp,Linkage sublinkage,size_t w,size_t right,size_t start_link)670 static void left_depth_first_search(Postprocessor *pp, Linkage sublinkage,
671                                     size_t w, size_t right, size_t start_link)
672 {
673 	List_o_links *lol;
674 	PP_data *pp_data = &pp->pp_data;
675 
676 	assert(w < pp_data->num_words, "Bad word index");
677 	pp_data->visited[w] = true;
678 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
679 	{
680 		if (lol->word < w && lol->link != start_link)
681 		{
682 			add_link_to_domain(pp_data, lol->link);
683 		}
684 	}
685 	for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
686 	{
687 		assert(lol->word < pp_data->num_words, "Bad word index");
688 		if (!pp_data->visited[lol->word] && (lol->word != right))
689 		{
690 			depth_first_search(pp, sublinkage, lol->word, right, start_link);
691 		}
692 	}
693 }
694 
domain_compare(const Domain * d1,const Domain * d2)695 static int domain_compare(const Domain * d1, const Domain * d2)
696 {
697 	return (d1->size - d2->size); /* for sorting the domains by size */
698 }
699 
build_domains(Postprocessor * pp,Linkage sublinkage)700 static void build_domains(Postprocessor *pp, Linkage sublinkage)
701 {
702 	size_t link, i, d;
703 	const char *s;
704 	PP_data *pp_data = &pp->pp_data;
705 
706 	pp_data->N_domains = 0;
707 
708 	for (link = 0; link<sublinkage->num_links; link++)
709 	{
710 		assert (sublinkage->link_array[link].lw != SIZE_MAX);
711 		if (NULL == sublinkage->link_array[link].link_name) continue;
712 		s = sublinkage->link_array[link].link_name;
713 
714 		if (pp_linkset_match(pp->knowledge->ignore_these_links, s)) continue;
715 		if (pp_linkset_match(pp->knowledge->domain_starter_links, s))
716 		{
717 			setup_domain_array(pp, s, link);
718 			if (pp_linkset_match(pp->knowledge->domain_contains_links, s))
719 				add_link_to_domain(pp_data, link);
720 
721 			clear_visited(pp_data);
722 			depth_first_search(pp, sublinkage, sublinkage->link_array[link].rw,
723 			                   sublinkage->link_array[link].lw, link);
724 		}
725 		else
726 		if (pp_linkset_match(pp->knowledge->urfl_domain_starter_links, s))
727 		{
728 			setup_domain_array(pp, s, link);
729 			/* always add the starter link to its urfl domain */
730 			add_link_to_domain(pp_data, link);
731 
732 			clear_visited(pp_data);
733 			bad_depth_first_search(pp, sublinkage,sublinkage->link_array[link].rw,
734 			                       sublinkage->link_array[link].lw, link);
735 		}
736 		else
737 		if (pp_linkset_match(pp->knowledge->urfl_only_domain_starter_links, s))
738 		{
739 			setup_domain_array(pp, s, link);
740 			/* do not add the starter link to its urfl_only domain */
741 			clear_visited(pp_data);
742 			d_depth_first_search(pp, sublinkage, sublinkage->link_array[link].lw,
743 			                     sublinkage->link_array[link].lw,
744 			                     sublinkage->link_array[link].rw, link);
745 		}
746 		else
747 		if (pp_linkset_match(pp->knowledge->left_domain_starter_links, s))
748 		{
749 			setup_domain_array(pp, s, link);
750 			/* do not add the starter link to a left domain */
751 			clear_visited(pp_data);
752 			left_depth_first_search(pp, sublinkage, sublinkage->link_array[link].lw,
753 			                        sublinkage->link_array[link].rw, link);
754 		}
755 	}
756 
757 	/* sort the domains by size */
758 	qsort((void *) pp_data->domain_array,
759 		pp_data->N_domains,
760 		sizeof(Domain),
761 		(int (*)(const void *, const void *)) domain_compare);
762 
763 	/* sanity check: all links in all domains have a legal domain name */
764 	for (d = 0; d < pp_data->N_domains; d++)
765 	{
766 		i = find_domain_name(pp, pp_data->domain_array[d].string);
767 		if (i == SIZE_MAX)
768 			prt_error("Error: post_process(): Need an entry for %s in LINK_TYPE_TABLE\n",
769 			          pp_data->domain_array[d].string);
770 		pp_data->domain_array[d].type = i;
771 	}
772 }
773 
build_domain_forest(PP_data * pp_data,Linkage sublinkage)774 static void build_domain_forest(PP_data *pp_data, Linkage sublinkage)
775 {
776 	size_t d, d1, link;
777 	DTreeLeaf * dtl;
778 
779 	if (0 == pp_data->N_domains) return;
780 
781 	pp_data->domain_array[pp_data->N_domains-1].parent = NULL;
782 	for (d=0; d < pp_data->N_domains-1; d++)
783 	{
784 		for (d1 = d+1; d1 < pp_data->N_domains; d1++)
785 		{
786 			if (contained_in(&pp_data->domain_array[d], &pp_data->domain_array[d1], sublinkage))
787 			{
788 				pp_data->domain_array[d].parent = &pp_data->domain_array[d1];
789 				break;
790 			}
791 		}
792 		if (d1 == pp_data->N_domains)
793 		{
794 			/* we know this domain is a root of a new tree */
795 			pp_data->domain_array[d].parent = NULL;
796 		}
797 	}
798 
799 	/* The parent links of domain nodes have been established.
800 	 * Now do the leaves. */
801 	for (d = 0; d < pp_data->N_domains; d++)
802 	{
803 		pp_data->domain_array[d].child = NULL;
804 	}
805 
806 	for (link=0; link < sublinkage->num_links; link++)
807 	{
808 		assert (sublinkage->link_array[link].lw != SIZE_MAX);
809 		for (d=0; d<pp_data->N_domains; d++)
810 		{
811 			if (link_in_domain(link, &pp_data->domain_array[d]))
812 			{
813 				dtl = (DTreeLeaf *) malloc(sizeof(DTreeLeaf));
814 				dtl->link = link;
815 				dtl->parent = &pp_data->domain_array[d];
816 				dtl->next = pp_data->domain_array[d].child;
817 				pp_data->domain_array[d].child = dtl;
818 				break;
819 			}
820 		}
821 	}
822 }
823 
824 static int
internal_process(Postprocessor * pp,Linkage sublinkage,const char ** msg)825 internal_process(Postprocessor *pp, Linkage sublinkage, const char **msg)
826 {
827 	size_t i;
828 	PP_data *pp_data = &pp->pp_data;
829 
830 	/* quick test: try applying just the relevant global rules */
831 	if (!apply_relevant_rules(pp, apply_contains_one_globally,
832 	                          sublinkage,
833 	                          pp->knowledge->contains_one_rules,
834 	                          pp->relevant_contains_one_rules, msg))
835 	{
836 		for (i = 0; i < pp_data->wowlen; i++)
837 			pp_data->word_links[i] = NULL;
838 		pp_data->N_domains = 0;
839 		return -1;
840 	}
841 
842 	/* build graph; confirm that it's legally connected */
843 	build_graph(pp, sublinkage);
844 	build_domains(pp, sublinkage);
845 	build_domain_forest(&pp->pp_data, sublinkage);
846 
847 #if defined(CHECK_DOMAIN_NESTING)
848 	/* These messages were deemed to not be useful, so
849 	 * this code is commented out. See comment above. */
850 	if (!check_domain_nesting(pp, sublinkage->num_links))
851 		prt_error("Warning: The domains are not nested.\n");
852 #endif
853 
854 	/* The order below should be optimal for most cases */
855 	if (!apply_relevant_rules(pp, apply_contains_one, sublinkage,
856 	                          pp->knowledge->contains_one_rules,
857 	                          pp->relevant_contains_one_rules, msg)) return 1;
858 	if (!apply_relevant_rules(pp, apply_contains_none, sublinkage,
859 	                          pp->knowledge->contains_none_rules,
860 	                          pp->relevant_contains_none_rules, msg)) return 1;
861 	if (!apply_rules(pp_data, apply_must_form_a_cycle, sublinkage,
862 	                 pp->knowledge->form_a_cycle_rules,msg)) return 1;
863 	if (!apply_rules(pp_data, apply_bounded, sublinkage,
864 	                 pp->knowledge->bounded_rules, msg)) return 1;
865 	return 0; /* This linkage satisfied all the rules */
866 }
867 
868 
869 /**
870  * Call this (a) after having called post_process_scan_linkage() on all
871  * generated linkages, but (b) before calling post_process() on any
872  * particular linkage. Here we mark all rules which we know (from having
873  * accumulated a set of link names appearing in *any* linkage) that won't
874  * ever be needed.
875  */
prune_irrelevant_rules(Postprocessor * pp)876 static void prune_irrelevant_rules(Postprocessor *pp)
877 {
878 	pp_rule *rule;
879 	int coIDX, cnIDX, rcoIDX = 0, rcnIDX = 0;
880 
881 	/* If we didn't scan any linkages, there's no pruning to be done. */
882 	if (pp_linkset_population(pp->set_of_links_of_sentence) == 0) return;
883 
884 	for (coIDX = 0; ; coIDX++)
885 	{
886 		rule = &(pp->knowledge->contains_one_rules[coIDX]);
887 		if (rule->msg == NULL) break;
888 		if (pp_linkset_match_bw(pp->set_of_links_of_sentence, rule->selector))
889 		{
890 			/* Mark rule as being relevant to this sentence */
891 			pp->relevant_contains_one_rules[rcoIDX++] = coIDX;
892 			pp_linkset_add(pp->set_of_links_in_an_active_rule, rule->selector);
893 		}
894 	}
895 	pp->relevant_contains_one_rules[rcoIDX] = -1;  /* end sentinel */
896 
897 	for (cnIDX = 0; ; cnIDX++)
898 	{
899 		rule = &(pp->knowledge->contains_none_rules[cnIDX]);
900 		if (rule->msg == NULL) break;
901 		if (pp_linkset_match_bw(pp->set_of_links_of_sentence, rule->selector))
902 		{
903 			pp->relevant_contains_none_rules[rcnIDX++] = cnIDX;
904 			pp_linkset_add(pp->set_of_links_in_an_active_rule, rule->selector);
905 		}
906 	}
907 	pp->relevant_contains_none_rules[rcnIDX] = -1;
908 
909 	if (verbosity_level(5))
910 	{
911 		err_msg(lg_Debug, "PP: Saw %zu unique link names in all linkages.\n\\",
912 		       pp_linkset_population(pp->set_of_links_of_sentence));
913 		err_msg(lg_Debug, "PP: Using %i 'contains one' rules "
914 		                  "and %i 'contains none' rules\n",
915 		       rcoIDX, rcnIDX);
916 	}
917 }
918 
919 
920 /***************** definitions of exported functions ***********************/
921 
922 #define PP_INITLEN 60 /* just starting size, it is expanded if needed */
923 
pp_new_domain_array(PP_data * pp_data)924 static void pp_new_domain_array(PP_data *pp_data)
925 {
926 	pp_data->domlen = PP_INITLEN;
927 	pp_data->domain_array = (Domain*) malloc(pp_data->domlen * sizeof(Domain));
928 	memset(pp_data->domain_array, 0, pp_data->domlen * sizeof(Domain));
929 }
930 
931 /**
932  * read rules from path and initialize the appropriate fields in
933  * a postprocessor structure, a pointer to which is returned.
934  */
post_process_new(pp_knowledge * kno)935 Postprocessor * post_process_new(pp_knowledge * kno)
936 {
937 	Postprocessor *pp;
938 	PP_data *pp_data;
939 
940 	if (NULL == kno) return NULL;
941 
942 	pp = (Postprocessor *) malloc (sizeof(Postprocessor));
943 	pp->knowledge = kno;
944 	pp->string_set = string_set_create();
945 	pp->set_of_links_of_sentence = pp_linkset_open(1024);
946 	pp->set_of_links_in_an_active_rule = pp_linkset_open(1024);
947 	pp->relevant_contains_one_rules =
948 	      (int *) malloc ((pp->knowledge->n_contains_one_rules + 1)
949 	                      *(sizeof pp->relevant_contains_one_rules[0]));
950 	pp->relevant_contains_none_rules =
951 	      (int *) malloc ((pp->knowledge->n_contains_none_rules + 1)
952 	                      *(sizeof pp->relevant_contains_none_rules[0]));
953 	pp->relevant_contains_one_rules[0] = -1;
954 	pp->relevant_contains_none_rules[0] = -1;
955 	pp->violation = NULL;
956 	pp->n_local_rules_firing = 0;
957 	pp->n_global_rules_firing = 0;
958 
959 	pp->q_pruned_rules = false;
960 
961 	pp_data = &pp->pp_data;
962 	pp_data->vlength = PP_INITLEN;
963 	pp_data->visited = (bool*) malloc(pp_data->vlength * sizeof(bool));
964 	memset(pp_data->visited, 0, pp_data->vlength * sizeof(bool));
965 
966 	pp_data->links_to_ignore = NULL;
967 	pp_new_domain_array(pp_data);
968 
969 	pp_data->wowlen = PP_INITLEN;
970 	pp_data->word_links = (List_o_links **) malloc(pp_data->wowlen * sizeof(List_o_links*));
971 	memset(pp_data->word_links, 0, pp_data->wowlen * sizeof(List_o_links *));
972 
973 	return pp;
974 }
975 
post_process_free(Postprocessor * pp)976 void post_process_free(Postprocessor *pp)
977 {
978 	PP_data *pp_data;
979 
980 	/* frees up memory associated with pp, previously allocated by open */
981 	if (pp == NULL) return;
982 	string_set_delete(pp->string_set);
983 	pp_linkset_close(pp->set_of_links_of_sentence);
984 	pp_linkset_close(pp->set_of_links_in_an_active_rule);
985 	free(pp->relevant_contains_one_rules);
986 	free(pp->relevant_contains_none_rules);
987 	pp->knowledge = NULL;
988 	pp->violation = NULL;
989 
990 	pp_data = &pp->pp_data;
991 	post_process_free_data(pp_data);
992 	free(pp_data->visited);
993 	free(pp_data->domain_array);
994 	free(pp_data->word_links);
995 
996 	free(pp);
997 }
998 
999 /**
1000  * During a first pass (prior to actual post-processing of the linkages
1001  * of a sentence), call this once for every generated linkage. Here we
1002  * simply maintain a set of "seen" link names for rule pruning, later on.
1003  */
post_process_scan_linkage(Postprocessor * pp,Linkage linkage)1004 static void post_process_scan_linkage(Postprocessor *pp, Linkage linkage)
1005 {
1006 	size_t i;
1007 	if (pp == NULL) return;
1008 	for (i = 0; i < linkage->num_links; i++)
1009 	{
1010 		assert(linkage->link_array[i].lw != SIZE_MAX);
1011 
1012 		pp_linkset_add(pp->set_of_links_of_sentence,
1013 			linkage->link_array[i].link_name);
1014 	}
1015 }
1016 
report_rule_use(pp_rule * set)1017 static size_t report_rule_use(pp_rule *set)
1018 {
1019 	size_t cnt = 0;
1020 	size_t i;
1021 	for (i=0; set[i].msg != NULL; i++)
1022 	{
1023 		err_msg(lg_Debug, "Used: %d rule: %s\n", set[i].use_count, set[i].msg);
1024 		cnt++;
1025 	}
1026 	return cnt;
1027 }
1028 
report_unused_rule(pp_rule * set)1029 static size_t report_unused_rule(pp_rule *set)
1030 {
1031 	size_t i;
1032 	size_t cnt = 0;
1033 	for (i=0; set[i].msg != NULL; i++)
1034 	{
1035 		if (0 == set[i].use_count)
1036 		{
1037 			err_msg(lg_Debug, "Unused rule: %s\n", set[i].msg);
1038 			cnt++;
1039 		}
1040 	}
1041 	return cnt;
1042 }
1043 
report_pp_stats(Postprocessor * pp)1044 static void report_pp_stats(Postprocessor *pp)
1045 {
1046 	size_t rule_cnt = 0;
1047 	size_t unused_cnt = 0;
1048 	pp_knowledge * kno;
1049 	if (!verbosity_level(9)) return;
1050 
1051 	err_msg(lg_Debug, "PP stats: local_rules_firing=%d\n", pp->n_local_rules_firing);
1052 	kno = pp->knowledge;
1053 
1054 	err_msg(lg_Debug, "\nPP stats: form_a_cycle_rules\n");
1055 	rule_cnt += report_rule_use(kno->form_a_cycle_rules);
1056 
1057 	err_msg(lg_Debug, "\nPP stats: contains_one_rules\n");
1058 	rule_cnt += report_rule_use(kno->contains_one_rules);
1059 
1060 	err_msg(lg_Debug, "\nPP stats: contains_none_rules\n");
1061 	rule_cnt += report_rule_use(kno->contains_none_rules);
1062 
1063 	err_msg(lg_Debug, "\nPP stats: bounded_rules\n");
1064 	rule_cnt += report_rule_use(kno->bounded_rules);
1065 
1066 	err_msg(lg_Debug, "\nPP stats: Rules that were not used:\n");
1067 	unused_cnt += report_unused_rule(kno->form_a_cycle_rules);
1068 	unused_cnt += report_unused_rule(kno->contains_one_rules);
1069 	unused_cnt += report_unused_rule(kno->contains_none_rules);
1070 	unused_cnt += report_unused_rule(kno->bounded_rules);
1071 
1072 	err_msg(lg_Debug, "\nPP stats: %zu of %zu rules unused\n", unused_cnt, rule_cnt);
1073 }
1074 
1075 /**
1076  * NB: linkage->link[i]->l=-1 means that this connector is to be ignored.
1077  */
do_post_process(Postprocessor * pp,Linkage sublinkage,bool is_long)1078 void do_post_process(Postprocessor *pp, Linkage sublinkage, bool is_long)
1079 {
1080 	const char *msg;
1081 	PP_data *pp_data;
1082 
1083 	if (pp == NULL) return;
1084 	pp_data = &pp->pp_data;
1085 
1086 	// XXX wtf .. why is this not leaking memory ?
1087 	pp_data->links_to_ignore = NULL;
1088 
1089 	pp_data->num_words = sublinkage->num_words;
1090 
1091 	/* Grab more memory if needed */
1092 	if (pp_data->vlength <= pp_data->num_words)
1093 	{
1094 		size_t newsz;
1095 		pp_data->vlength += pp_data->num_words;
1096 		newsz = pp_data->vlength * sizeof(bool);
1097 		pp_data->visited = (bool *) realloc(pp_data->visited, newsz);
1098 	}
1099 	clear_visited(pp_data);
1100 
1101 	/* For long sentences, we can save some time by pruning the rules
1102 	 * which can't possibly be used during postprocessing the linkages
1103 	 * of this sentence. For short sentences, this is pointless. */
1104 	if (is_long && pp->q_pruned_rules == false)
1105 	{
1106 		prune_irrelevant_rules(pp);
1107 	}
1108 	pp->q_pruned_rules = true;
1109 
1110 	switch (internal_process(pp, sublinkage, &msg))
1111 	{
1112 		case -1:
1113 			/* some global test failed even before we had to build the domains */
1114 			pp->n_global_rules_firing++;
1115 			pp->violation = msg;
1116 			report_pp_stats(pp);
1117 			return;
1118 		case 1:
1119 			/* one of the "normal" post processing tests failed */
1120 			pp->n_local_rules_firing++;
1121 			pp->violation = msg;
1122 			break;
1123 		case 0:
1124 			/* This linkage is legal according to the post processing rules */
1125 			pp->violation = NULL;
1126 			break;
1127 	}
1128 
1129 	report_pp_stats(pp);
1130 }
1131 
1132 /**
1133  * This does basic post-processing for all linkages.
1134  */
post_process_lkgs(Sentence sent,Parse_Options opts)1135 void post_process_lkgs(Sentence sent, Parse_Options opts)
1136 {
1137 	size_t in;
1138 	size_t N_linkages_post_processed = 0;
1139 	size_t N_valid_linkages = sent->num_valid_linkages;
1140 	size_t N_linkages_alloced = sent->num_linkages_alloced;
1141 	bool twopass = sent->length >= opts->twopass_length;
1142 	Postprocessor *pp = sent->postprocessor;
1143 
1144 	/* Special-case the "amy/ady" morphology handling. */
1145 	/* More generally, it there's no post-processor, do nothing. */
1146 	/* Well, almost nothing. We still want to assign a score. */
1147 	// if (sent->dict->affix_table->anysplit)
1148 	if (NULL == pp)
1149 	{
1150 		sent->num_linkages_post_processed = sent->num_valid_linkages;
1151 		for (in=0; in < N_linkages_alloced; in++)
1152 		{
1153 			Linkage lkg = &sent->lnkages[in];
1154 			linkage_score(lkg, opts);
1155 		}
1156 		return;
1157 	}
1158 
1159 #define TCD 512 /* timer checking divisor */
1160 
1161 	/* (optional) First pass: just visit the linkages */
1162 	/* The purpose of the first pass is to make the post-processing
1163 	 * more efficient.  Because (hopefully) by the time the real work
1164 	 * is done in the 2nd pass, the relevant rule set has been pruned
1165 	 * in the first pass.
1166 	 */
1167 	if (twopass)
1168 	{
1169 		for (in=0; in < N_linkages_alloced; in++)
1170 		{
1171 			Linkage lkg = &sent->lnkages[in];
1172 			Linkage_info *lifo = &lkg->lifo;
1173 
1174 			if (lifo->N_violations) continue;
1175 
1176 			post_process_scan_linkage(pp, lkg);
1177 
1178 			if (((TCD-1) == in%TCD) && resources_exhausted(opts->resources)) break;
1179 		}
1180 	}
1181 
1182 	/* Second pass: actually perform post-processing */
1183 	for (in=0; in < N_linkages_alloced; in++)
1184 	{
1185 		Linkage lkg = &sent->lnkages[in];
1186 		Linkage_info *lifo = &lkg->lifo;
1187 
1188 		if (lifo->N_violations) continue;
1189 
1190 		do_post_process(pp, lkg, twopass);
1191 		post_process_free_data(&pp->pp_data);
1192 
1193 		if (NULL != pp->violation)
1194 		{
1195 			N_valid_linkages--;
1196 			lifo->N_violations++;
1197 
1198 			/* Set the message, only if not set */
1199 			if (NULL == lifo->pp_violation_msg)
1200 				lifo->pp_violation_msg = pp->violation;
1201 		}
1202 		N_linkages_post_processed++;
1203 
1204 		linkage_score(lkg, opts);
1205 		if (((TCD-1) == in%TCD) && resources_exhausted(opts->resources)) break;
1206 	}
1207 
1208 	/* If the timer expired, then we never finished post-processing.
1209 	 * Mark the remaining sentences as bad, as otherwise strange
1210 	 * results get reported. */
1211 	for (; in < N_linkages_alloced; in++)
1212 	{
1213 		Linkage lkg = &sent->lnkages[in];
1214 		Linkage_info *lifo = &lkg->lifo;
1215 
1216 		if (lifo->N_violations) continue;
1217 
1218 		N_valid_linkages--;
1219 		lifo->N_violations++;
1220 
1221 		/* Set the message, only if not set */
1222 		if (NULL == lifo->pp_violation_msg)
1223 			lifo->pp_violation_msg = "Timeout during postprocessing";
1224 	}
1225 
1226 	print_time(opts, "Postprocessed all linkages");
1227 
1228 	if (verbosity_level(6))
1229 	{
1230 		err_msg(lg_Info, "%zu of %zu linkages with no P.P. violations\n",
1231 		        N_valid_linkages, N_linkages_post_processed);
1232 	}
1233 
1234 	sent->num_linkages_post_processed = N_linkages_post_processed;
1235 	sent->num_valid_linkages = N_valid_linkages;
1236 }
1237 
1238 /* ================ compute the domain names ============= */
1239 /*
1240  * The code below is used in one place only: when printing the domain
1241  * names.  If the domain names are not being printed, then this is a
1242  * complete waste of CPU time.
1243  */
1244 
free_domain_names(PP_domains * ppi)1245 static void free_domain_names(PP_domains *ppi)
1246 {
1247 	if (ppi->num_domains > 0) free(ppi->domain_name);
1248 	ppi->domain_name = NULL;
1249 	ppi->num_domains = 0;
1250 }
1251 
linkage_free_pp_domains(Linkage lkg)1252 void linkage_free_pp_domains(Linkage lkg)
1253 {
1254 	size_t j;
1255 	if (!lkg || !lkg->pp_domains) return;
1256 
1257 	for (j = 0; j < lkg->num_links; ++j)
1258 		free_domain_names(&lkg->pp_domains[j]);
1259 	free(lkg->pp_domains);
1260 	lkg->pp_domains = NULL;
1261 }
1262 
1263 typedef struct D_type_list_s D_type_list;
1264 struct D_type_list_s
1265 {
1266 	D_type_list * next;
1267 	int type;
1268 };
1269 
free_d_type(D_type_list * dtl)1270 static void free_d_type(D_type_list * dtl)
1271 {
1272 	D_type_list * dtlx;
1273 	for (; dtl != NULL; dtl = dtlx)
1274 	{
1275 		dtlx = dtl->next;
1276 		free((void*) dtl);
1277 	}
1278 }
1279 
build_type_array(PP_data * pp_data,size_t numlinks)1280 static D_type_list ** build_type_array(PP_data *pp_data,
1281                                        size_t numlinks)
1282 {
1283 	size_t nbytes = numlinks * sizeof(D_type_list*);
1284 	D_type_list** dta = malloc(nbytes);
1285 	memset(dta, 0, nbytes);
1286 
1287 	for (size_t d = 0; d < pp_data->N_domains; d++)
1288 	{
1289 		List_o_links * lol;
1290 		for (lol = pp_data->domain_array[d].lol; lol != NULL; lol = lol->next)
1291 		{
1292 			assert(lol->link < numlinks, "Something wrong about link numbering!");
1293 
1294 			D_type_list * dtl;
1295 			dtl = (D_type_list *) malloc(sizeof(D_type_list));
1296 			dtl->type = pp_data->domain_array[d].type;
1297 			dtl->next = dta[lol->link];
1298 			dta[lol->link] = dtl;
1299 		}
1300 	}
1301 	return dta;
1302 }
1303 
1304 /**
1305  * Store the domain names in the linkage. These are not needed
1306  * unless the user asks the domain names to be printed!
1307  */
linkage_set_domain_names(Postprocessor * postprocessor,Linkage linkage)1308 static void linkage_set_domain_names(Postprocessor *postprocessor,
1309                                      Linkage linkage)
1310 {
1311 	if (NULL == linkage) return;
1312 	if (NULL == postprocessor) return;
1313 	if (0 == postprocessor->pp_data.N_domains) return;
1314 
1315 	/* Copy the post-processing results over into the linkage */
1316 	if (postprocessor->violation != NULL) return;
1317 
1318 	D_type_list **dta = build_type_array(&postprocessor->pp_data,
1319 	                                     linkage->num_links);
1320 
1321 	assert(NULL == linkage->pp_domains, "Not expecting pp_domains here!");
1322 
1323 	linkage->pp_domains = malloc(sizeof(PP_domains) * linkage->num_links);
1324 	memset(linkage->pp_domains, 0, sizeof(PP_domains) * linkage->num_links);
1325 
1326 	for (size_t j = 0; j < linkage->num_links; ++j)
1327 	{
1328 		D_type_list * d;
1329 		int k = 0;
1330 		for (d = dta[j]; d != NULL; d = d->next) k++;
1331 		linkage->pp_domains[j].num_domains = k;
1332 		if (k > 0)
1333 		{
1334 			linkage->pp_domains[j].domain_name =
1335 				(const char **) malloc(k * sizeof(const char *));
1336 		}
1337 		k = 0;
1338 		for (d = dta[j]; d != NULL; d = d->next)
1339 		{
1340 			char buff[] = {d->type, '\0'};
1341 
1342 			linkage->pp_domains[j].domain_name[k] =
1343 			      string_set_add (buff, postprocessor->string_set);
1344 
1345 			k++;
1346 		}
1347 	}
1348 
1349 	/* Done with the d_type_array */
1350 	for (size_t i=0; i<linkage->num_links; i++)
1351 		free_d_type(dta[i]);
1352 	free(dta);
1353 }
1354 
1355 /**
1356  * Compute linkage domain names.
1357  *
1358  * This assumes that post-processing has been done once, already;
1359  * however, it re-performs post-processing a second time, because
1360  * the data need to obtain the domain names has been lost.
1361  */
compute_domain_names(Linkage lkg)1362 void compute_domain_names(Linkage lkg)
1363 {
1364 	Postprocessor *pp = lkg->sent->postprocessor;
1365 	if (NULL == pp) return;
1366 
1367 	Linkage_info *lifo = &lkg->lifo;
1368 	if (lifo->N_violations) return;
1369 
1370 	// If pp_domains is set, its been computed already
1371 	if (NULL != lkg->pp_domains) return;
1372 
1373 	do_post_process(pp, lkg, true);
1374 	linkage_set_domain_names(pp, lkg);
1375 	post_process_free_data(&pp->pp_data);
1376 }
1377 
verify_link_index(const Linkage linkage,LinkIdx index)1378 static inline bool verify_link_index(const Linkage linkage, LinkIdx index)
1379 {
1380 	if (!linkage) return false;
1381 	if (index >= linkage->num_links) return false;
1382 	return true;
1383 }
1384 
1385 /** XXX this will not return valid data unless compute_domain_names
1386  * has been called first. FIXME? or does this matter?
1387  */
linkage_get_link_num_domains(const Linkage linkage,LinkIdx index)1388 int linkage_get_link_num_domains(const Linkage linkage, LinkIdx index)
1389 {
1390 	if (NULL == linkage->pp_domains) return -1;
1391 	if (!verify_link_index(linkage, index)) return -1;
1392 	return linkage->pp_domains[index].num_domains;
1393 }
1394 
1395 /** XXX this will not return valid data unless compute_domain_names
1396  * has been called first. FIXME? or does this matter?
1397  */
linkage_get_link_domain_names(const Linkage linkage,LinkIdx index)1398 const char ** linkage_get_link_domain_names(const Linkage linkage, LinkIdx index)
1399 {
1400 	if (NULL == linkage->pp_domains) return NULL;
1401 	if (!verify_link_index(linkage, index)) return NULL;
1402 	return linkage->pp_domains[index].domain_name;
1403 }
1404 
1405 
1406 /* OLD COMMENTS (OUT OF DATE):
1407   This file does the post-processing.
1408   The main routine is "post_process()".  It uses the link names only,
1409   and not the connectors.
1410 
1411   A domain is a set of links.  Each domain has a defining link.
1412   Only certain types of links serve to define a domain.  These
1413   parameters are set by the lists of link names in a separate,
1414   human-readable file referred to herein as the 'knowledge file.'
1415 
1416   The domains are nested: given two domains, either they're disjoint,
1417   or one contains the other, i.e. they're tree structured.  The set of links
1418   in a domain (but in no smaller domain) are called the "group" of the
1419   domain.  Data structures are built to store all this stuff.
1420   The tree structured property is not mathematically guaranteed by
1421   the domain construction algorithm.  Davy simply claims that because
1422   of how he built the dictionary, the domains will always be so
1423   structured.  The program checks this and gives an error message
1424   if it's violated.
1425 
1426   Define the "root word" of a link (or domain) to be the word at the
1427   left end of the link.  The other end of the defining link is called
1428   the "right word".
1429 
1430   The domain corresponding to a link is defined to be the set of links
1431   reachable by starting from the right word, following links and never
1432   using the root word or any word to its left.
1433 
1434   There are some minor exceptions to this.  The "restricted_link" lists
1435   those connectors that, even if they point back before the root word,
1436   are included in the domain.  Some of the starting links are included
1437   in their domain, these are listed in the "domain_contains_links" list.
1438 
1439   Such was the way it was.  Now Davy tells me there should be another type
1440   of domain that's quite different.  Let's call these "urfl" domains.
1441   Certain type of connectors start urfl domains.  They're listed below.
1442   In a urfl domain, the search includes the root word.  It does a separate
1443   search to find urfl domains.
1444 
1445   Restricted links should work just as they do with ordinary domains. If they
1446   come out of the right word, or anything to the right of it (that's
1447   in the domain), they should be included but should not be traced
1448   further. If they come out of the root word, they should not be
1449   included.
1450   */
1451 
1452 /*
1453   I also, unfortunately, want to propose a new type of domain. These
1454   would include everything that can be reached from the root word of the
1455   link, to the right, that is closer than the right word of the link.
1456   (They would not include the link itself.)
1457 
1458   In the following sentence, then, the "Urfl_Only Domain" of the G link
1459   would include only the "O" link:
1460 
1461      +-----G----+
1462      +---O--+   +-AI+
1463      |      |   |   |
1464   hitting dogs is fun.a
1465 
1466   In the following sentence it would include the "O", the "TT", the "I",
1467   the second "O", and the "A".
1468 
1469      +----------------G---------------+
1470      +-----TT-----+  +-----O-----+    |
1471      +---O---+    +-I+    +---A--+    +-AI+
1472      |       |    |  |    |      |    |   |
1473   telling people to do stupid things is fun.a
1474 
1475   This would allow us to judge the following:
1476 
1477   kicking dogs bores me
1478   *kicking dogs kicks dogs
1479   explaining the program is easy
1480   *explaining the program is running
1481 
1482   (These are distinctions that I thought we would never be able to make,
1483   so I told myself they were semantic rather than syntactic. But with
1484   domains, they should be easy.)
1485   */
1486 
1487   /* Modifications, 6/96 ALB:
1488      1) Rules and link sets are relegated to a separate, user-written
1489         file(s), herein referred to as the 'knowledge file'
1490      2) This information is read by a lexer, in pp_lexer.l (lex code)
1491         whose exported routines are all prefixed by 'pp_lexer'
1492      3) when postprocessing a sentence, the links of each domain are
1493         placed in a set for quick lookup, ('contains one' and 'contains none')
1494      4) Functions which were never called have been eliminated:
1495         link_inhabits(), match_in_list(), group_type_contains(),
1496         group_type_contains_one(),  group_type_contains_all()
1497      5) Some 'one-by-one' initializations have been replaced by faster
1498         block memory operations (memset etc.)
1499      6) The above comments are correct but incomplete! (1/97)
1500      7) observation: the 'contains one' is, empirically, by far the most
1501         violated rule, so it should come first in applying the rules.
1502 
1503     Modifications, 9/97 ALB:
1504      Deglobalization. Made code consistent with api.
1505    */
1506