1 /*************************************************************************/
2 /* Copyright (c) 2004 */
3 /* Daniel Sleator, David Temperley, and John Lafferty */
4 /* Copyright (c) 2014 Linas Vepstas */
5 /* All rights reserved */
6 /* */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software. */
9 /* This license allows free redistribution and use in source and binary */
10 /* forms, with or without modification, subject to certain conditions. */
11 /* */
12 /*************************************************************************/
13
14 /* see bottom of file for comments on post processing */
15
16 #include <memory.h>
17 #include <stdint.h>
18
19 #include "post-process.h"
20
21 #include "api-structures.h"
22 #include "error.h"
23 #include "linkage/linkage.h"
24 #include "linkage/score.h"
25 #include "pp_knowledge.h"
26 #include "pp_linkset.h"
27 #include "pp-structures.h"
28 #include "resources.h"
29 #include "string-set.h"
30
31 #define PP_MAX_DOMAINS 128
32
33 /**
34 * post_process_match -- compare two link-types.
35 *
36 * string comparison in postprocessing. The first parameter is a
37 * post-processing symbol. The second one is a connector name from a
38 * link. The upper case parts must match. We imagine that the first
39 * arg is padded with an infinite sequence of "#" and that the 2nd one
40 * is padded with "*". "#" matches anything, but "*" is just like an
41 * ordinary char for matching purposes.
42 *
43 * For efficiency, the algo is not straightforward:
44 * 1. In the first "while", there is no check for (*t != '\0').
45 * Instead, this condition is detected by (*s != *t) when the uppercase
46 * part of "s" is longer than that of "t".
47 * 2. "t" is not checked for uppercase in the loop.
48 * Instead, if its uppercase part is longer then that of "s", the check
49 * after the loop detects that.
50 */
51
post_process_match(const char * s,const char * t)52 bool post_process_match(const char *s, const char *t)
53 {
54 if (NULL == t) return false;
55 if (islower(*t)) t++; /* Skip head-dependent indicator */
56 while (isupper(*s))
57 {
58 if (*s != *t) return false;
59 s++;
60 t++;
61 }
62 if (isupper(*t)) return false;
63 while (*t != '\0')
64 {
65 if (*s == '\0') return true;
66 if (*s != *t && *s != '#') return false;
67 s++;
68 t++;
69 }
70 while (*s != '\0')
71 {
72 if (*s != '*' && *s != '#') return false;
73 s++;
74 }
75 return true;
76 }
77
78 /***************** utility routines (not exported) ***********************/
79
80 /**
81 * Returns false if the string s does not match anything in
82 * the array. The array elements are post-processing symbols.
83 */
string_in_list(const char * s,const char * a[])84 static bool string_in_list(const char * s, const char * a[])
85 {
86 int i;
87 for (i=0; a[i] != NULL; i++)
88 if (post_process_match(a[i], s)) return true;
89 return false;
90 }
91
92 /**
93 * Return the name of the domain associated with the provided starting
94 * link. Return -1 if link isn't associated with a domain.
95 */
find_domain_name(Postprocessor * pp,const char * link)96 static size_t find_domain_name(Postprocessor *pp, const char *link)
97 {
98 size_t i, domain;
99 StartingLinkAndDomain *sllt = pp->knowledge->starting_link_lookup_table;
100 for (i=0;;i++)
101 {
102 domain = sllt[i].domain;
103 if (domain == SIZE_MAX) return SIZE_MAX; /* hit the end-of-list sentinel */
104 if (post_process_match(sllt[i].starting_link, link)) return domain;
105 }
106 }
107
108 /** Returns true if domain d1 is contained in domain d2 */
contained_in(const Domain * d1,const Domain * d2,const Linkage sublinkage)109 static bool contained_in(const Domain * d1, const Domain * d2,
110 const Linkage sublinkage)
111 {
112 bool *mark = alloca(sublinkage->num_links*sizeof(bool));
113 List_o_links * lol;
114 memset(mark, 0, sublinkage->num_links*(sizeof(bool)));
115 for (lol=d2->lol; lol != NULL; lol = lol->next)
116 mark[lol->link] = true;
117 for (lol=d1->lol; lol != NULL; lol = lol->next)
118 if (!mark[lol->link]) return false;
119 return true;
120 }
121
122 /** Returns the predicate "the given link is in the given domain" */
link_in_domain(size_t link,const Domain * d)123 static bool link_in_domain(size_t link, const Domain * d)
124 {
125 List_o_links * lol;
126 for (lol = d->lol; lol != NULL; lol = lol->next)
127 if (lol->link == link) return true;
128 return false;
129 }
130
131 /* #define CHECK_DOMAIN_NESTING */
132
133 #if defined(CHECK_DOMAIN_NESTING)
134 /* Although this is no longer used, I'm leaving the code here for future reference --DS 3/98 */
135
136 /* Returns true if the domains actually form a properly nested structure */
check_domain_nesting(Postprocessor * pp,int num_links)137 static bool check_domain_nesting(Postprocessor *pp, int num_links)
138 {
139 size_t id1, id2;
140 Domain * d1, * d2;
141 int counts[4];
142 char mark[MAX_NUM_LINKS];
143 List_o_links * lol;
144 int i;
145 PP_data *pp_data = &pp->pp_data;
146
147 for (id1 = 0; id1 < pp_data->N_domains; id1++)
148 {
149 d1 = &pp_data->domain_array[id1];
150 for (id2 = id1+1; id2 < pp_data->N_domains; id2++)
151 {
152 d2 = &pp_data->domain_array[id2];
153
154 memset(mark, 0, num_links);
155 for (lol=d2->lol; lol != NULL; lol = lol->next)
156 mark[lol->link] = 1;
157
158 for (lol=d1->lol; lol != NULL; lol = lol->next)
159 mark[lol->link] += 2;
160
161 counts[0] = counts[1] = counts[2] = counts[3] = 0;
162 for (i=0; i<num_links; i++)
163 {
164 assert(mark[i] < 4, "Miscount of link marks!");
165 counts[(size_t)mark[i]]++; /* cast avoids compiler warning */
166 }
167 if ((counts[1] > 0) && (counts[2] > 0) && (counts[3] > 0))
168 return false;
169 }
170 }
171 return true;
172 }
173 #endif
174
175 /**
176 * Free the list of links pointed to by lol
177 * (does not free any strings)
178 */
free_List_o_links(List_o_links * lol)179 static void free_List_o_links(List_o_links *lol)
180 {
181 List_o_links * xlol;
182 while (lol != NULL)
183 {
184 xlol = lol->next;
185 free(lol);
186 lol = xlol;
187 }
188 }
189
free_D_tree_leaves(DTreeLeaf * dtl)190 static void free_D_tree_leaves(DTreeLeaf *dtl)
191 {
192 DTreeLeaf * xdtl;
193 while (dtl != NULL)
194 {
195 xdtl = dtl->next;
196 free(dtl);
197 dtl = xdtl;
198 }
199 }
200
pp_free_domain_array(PP_data * ppd)201 static void pp_free_domain_array(PP_data *ppd)
202 {
203 size_t d;
204 for (d = 0; d < ppd->domlen; d++)
205 {
206 free_List_o_links(ppd->domain_array[d].lol);
207 ppd->domain_array[d].lol = NULL;
208 free_D_tree_leaves(ppd->domain_array[d].child);
209 ppd->domain_array[d].child = NULL;
210 }
211 }
212
213 /**
214 * Gets called after every invocation of post_process()
215 */
post_process_free_data(PP_data * ppd)216 void post_process_free_data(PP_data * ppd)
217 {
218 size_t w;
219 for (w = 0; w < ppd->wowlen; w++)
220 {
221 free_List_o_links(ppd->word_links[w]);
222 ppd->word_links[w] = NULL;
223 }
224
225 pp_free_domain_array(ppd);
226 free_List_o_links(ppd->links_to_ignore);
227 ppd->links_to_ignore = NULL;
228 ppd->num_words = 0;
229 ppd->N_domains = 0;
230 }
231
232 #ifdef THIS_FUNCTION_IS_NOT_CURRENTLY_USED
connectivity_dfs(Postprocessor * pp,Linkage sublinkage,int w,pp_linkset * ls)233 static void connectivity_dfs(Postprocessor *pp, Linkage sublinkage,
234 int w, pp_linkset *ls)
235 {
236 List_o_links *lol;
237 assert(w < pp_data->num_words, "Bad word index");
238 pp_data->visited[w] = true;
239 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
240 {
241 if (!pp_data->visited[lol->word] &&
242 !pp_linkset_match(ls, sublinkage->link[lol->link]->name))
243 connectivity_dfs(pp, sublinkage, lol->word, ls);
244 }
245 }
246 #endif /* THIS_FUNCTION_IS_NOT_CURRENTLY_USED */
247
linkage_get_violation_name(const Linkage linkage)248 const char * linkage_get_violation_name(const Linkage linkage)
249 {
250 return linkage->lifo.pp_violation_msg;
251 }
252
253 /************************ rule application *******************************/
254
clear_visited(PP_data * pp_data)255 static void clear_visited(PP_data *pp_data)
256 {
257 memset(pp_data->visited, 0, pp_data->num_words * sizeof(bool));
258 }
259
apply_rules(PP_data * pp_data,bool (applyfn)(PP_data *,Linkage,pp_rule *),Linkage sublinkage,pp_rule * rule_array,const char ** msg)260 static bool apply_rules(PP_data *pp_data,
261 bool (applyfn) (PP_data *, Linkage, pp_rule *),
262 Linkage sublinkage,
263 pp_rule *rule_array,
264 const char **msg)
265 {
266 int i;
267 for (i = 0; (*msg = rule_array[i].msg) != NULL; i++)
268 {
269 if (!applyfn(pp_data, sublinkage, &(rule_array[i])))
270 {
271 rule_array[i].use_count ++;
272 return false;
273 }
274 }
275 return true;
276 }
277
278 static bool
apply_relevant_rules(Postprocessor * pp,bool (applyfn)(PP_data *,Linkage,pp_rule *),Linkage sublinkage,pp_rule * rule_array,int * relevant_rules,const char ** msg)279 apply_relevant_rules(Postprocessor *pp,
280 bool (applyfn)(PP_data *, Linkage, pp_rule *),
281 Linkage sublinkage,
282 pp_rule *rule_array,
283 int *relevant_rules,
284 const char **msg)
285 {
286 int i, idx;
287 PP_data *pp_data = &pp->pp_data;
288
289 /* If we didn't accumulate link names for this sentence, we need
290 * to apply all rules. */
291 if (pp_linkset_population(pp->set_of_links_of_sentence) == 0) {
292 return apply_rules(pp_data, applyfn, sublinkage, rule_array, msg);
293 }
294
295 /* We did, and we don't. */
296 for (i = 0; (idx = relevant_rules[i]) != -1; i++)
297 {
298 *msg = rule_array[idx].msg;
299 if (!applyfn(pp_data, sublinkage, &(rule_array[idx]))) return false;
300 }
301 return true;
302 }
303
304 /**
305 * returns true if and only if all groups containing the specified link
306 * contain at least one from the required list. (as determined by exact
307 * string matching)
308 */
309 static bool
apply_contains_one(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)310 apply_contains_one(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
311 {
312 DTreeLeaf * dtl;
313 size_t d, count;
314
315 for (d=0; d<pp_data->N_domains; d++)
316 {
317 for (dtl = pp_data->domain_array[d].child;
318 dtl != NULL &&
319 !post_process_match(rule->selector,
320 sublinkage->link_array[dtl->link].link_name);
321 dtl = dtl->next) {}
322 if (dtl != NULL)
323 {
324 /* selector link of rule appears in this domain */
325 count=0;
326 for (dtl = pp_data->domain_array[d].child; dtl != NULL; dtl = dtl->next)
327 {
328 if (string_in_list(sublinkage->link_array[dtl->link].link_name,
329 rule->link_array))
330 {
331 count=1;
332 break;
333 }
334 }
335 if (count == 0) return false;
336 }
337 }
338 return true;
339 }
340
341
342 /**
343 * Returns true if and only if:
344 * all groups containing the selector link do not contain anything
345 * from the link_array contained in the rule. Uses exact string matching.
346 */
347 static bool
apply_contains_none(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)348 apply_contains_none(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
349 {
350 size_t d;
351
352 for (d=0; d<pp_data->N_domains; d++)
353 {
354 DTreeLeaf * dtl;
355 for (dtl = pp_data->domain_array[d].child;
356 dtl != NULL &&
357 !post_process_match(rule->selector,
358 sublinkage->link_array[dtl->link].link_name);
359 dtl = dtl->next) {}
360 if (dtl != NULL)
361 {
362 /* selector link of rule appears in this domain */
363 for (dtl = pp_data->domain_array[d].child; dtl != NULL; dtl = dtl->next)
364 {
365 if (string_in_list(sublinkage->link_array[dtl->link].link_name,
366 rule->link_array))
367 return false;
368 }
369 }
370 }
371 return true;
372 }
373
374 /**
375 * Returns true if and only if
376 * (1) the sentence doesn't contain the selector link for the rule, or
377 * (2) it does, and it also contains one or more from the rule's link set
378 */
379 static bool
apply_contains_one_globally(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)380 apply_contains_one_globally(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
381 {
382 size_t i, j, count;
383 for (i = 0; i < sublinkage->num_links; i++)
384 {
385 assert(sublinkage->link_array[i].lw != SIZE_MAX);
386 if (post_process_match(rule->selector, sublinkage->link_array[i].link_name)) break;
387 }
388 if (i == sublinkage->num_links) return true;
389
390 /* selector link of rule appears in sentence */
391 count = 0;
392 for (j = 0; j < sublinkage->num_links && count == 0; j++)
393 {
394 assert(sublinkage->link_array[j].lw != SIZE_MAX);
395 if (string_in_list(sublinkage->link_array[j].link_name, rule->link_array))
396 {
397 count = 1;
398 break;
399 }
400 }
401 if (count == 0) return false; else return true;
402 }
403
404 /**
405 * For each link in the linkage that is in the must_form_a_cycle list,
406 * we want to make sure that that link is in a cycle. We do this
407 * simply by deleting the link, then seeing if the end points of that
408 * link are still connected.
409 */
reachable_without_dfs(PP_data * pp_data,Linkage sublinkage,size_t a,size_t b,size_t w)410 static void reachable_without_dfs(PP_data *pp_data,
411 Linkage sublinkage, size_t a, size_t b, size_t w)
412 {
413 /* This is a depth first search of words reachable from w, excluding
414 * any direct edge between word a and word b. */
415 List_o_links *lol;
416 assert(w < pp_data->num_words, "Bad word index");
417 pp_data->visited[w] = true;
418 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
419 {
420 assert(lol->word < pp_data->num_words, "Bad word index");
421 if (!pp_data->visited[lol->word] &&
422 !(w == a && lol->word == b) &&
423 !(w == b && lol->word == a))
424 {
425 reachable_without_dfs(pp_data, sublinkage, a, b, lol->word);
426 }
427 }
428 }
429
430 /**
431 * Returns true if the linkage is connected when ignoring the links
432 * whose names are in the given list of link names.
433 * Actually, what it does is this: it returns false if the connectivity
434 * of the subgraph reachable from word 0 changes as a result of deleting
435 * these links.
436 */
437 static bool
apply_must_form_a_cycle(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)438 apply_must_form_a_cycle(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
439 {
440 List_o_links *lol;
441 size_t w;
442
443 for (w = 0; w < pp_data->num_words; w++)
444 {
445 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
446 {
447 if (w > lol->word) continue; /* only consider each edge once */
448 if (!pp_linkset_match(rule->link_set, sublinkage->link_array[lol->link].link_name)) continue;
449
450 clear_visited(pp_data);
451 reachable_without_dfs(pp_data, sublinkage, w, lol->word, w);
452 if (!pp_data->visited[lol->word]) return false;
453 }
454 }
455
456 for (lol = pp_data->links_to_ignore; lol != NULL; lol = lol->next)
457 {
458 w = sublinkage->link_array[lol->link].lw;
459 /* (w, lol->word) are the left and right ends of the edge we're considering */
460 if (!pp_linkset_match(rule->link_set, sublinkage->link_array[lol->link].link_name)) continue;
461
462 clear_visited(pp_data);
463 reachable_without_dfs(pp_data, sublinkage, w, lol->word, w);
464
465 assert(lol->word < pp_data->num_words, "Bad word index");
466 if (!pp_data->visited[lol->word]) return false;
467 }
468
469 return true;
470 }
471
472 /**
473 * Checks to see that all domains with this name have the property that
474 * all of the words that touch a link in the domain are not to the left
475 * of the root word of the domain.
476 */
477 static bool
apply_bounded(PP_data * pp_data,Linkage sublinkage,pp_rule * rule)478 apply_bounded(PP_data *pp_data, Linkage sublinkage, pp_rule *rule)
479 {
480 size_t d, lw;
481 List_o_links * lol;
482 char d_type = rule->domain;
483
484 for (d = 0; d < pp_data->N_domains; d++)
485 {
486 if (pp_data->domain_array[d].type != d_type) continue;
487 lw = sublinkage->link_array[pp_data->domain_array[d].start_link].lw;
488 for (lol = pp_data->domain_array[d].lol; lol != NULL; lol = lol->next)
489 {
490 if (sublinkage->link_array[lol->link].lw < lw) return false;
491 }
492 }
493 return true;
494 }
495
496 /**
497 * fill in the pp->pp_data.word_links array with a list of words
498 * neighboring each word (actually a list of links). This is an
499 * undirected graph.
500 */
build_graph(Postprocessor * pp,Linkage sublinkage)501 static void build_graph(Postprocessor *pp, Linkage sublinkage)
502 {
503 size_t link;
504 List_o_links * lol;
505 PP_data *pp_data = &pp->pp_data;
506
507 /* Get more size, if needed */
508 if (pp_data->wowlen <= pp_data->num_words)
509 {
510 size_t newsz;
511 pp_data->wowlen += pp_data->num_words;
512 newsz = pp_data->wowlen * sizeof(List_o_links *);
513 pp_data->word_links = (List_o_links **) realloc(
514 pp_data->word_links, newsz);
515 }
516 memset(pp_data->word_links, 0, pp_data->wowlen * sizeof(List_o_links *));
517
518 for (link = 0; link < sublinkage->num_links; link++)
519 {
520 assert (sublinkage->link_array[link].lw != SIZE_MAX);
521 if (NULL == sublinkage->link_array[link].link_name) continue;
522 if (pp_linkset_match(pp->knowledge->ignore_these_links,
523 sublinkage->link_array[link].link_name))
524 {
525 lol = (List_o_links *) malloc(sizeof(List_o_links));
526 lol->next = pp_data->links_to_ignore;
527 pp_data->links_to_ignore = lol;
528 lol->link = link;
529 lol->word = sublinkage->link_array[link].rw;
530 continue;
531 }
532
533 lol = (List_o_links *) malloc(sizeof(List_o_links));
534 lol->next = pp_data->word_links[sublinkage->link_array[link].lw];
535 pp_data->word_links[sublinkage->link_array[link].lw] = lol;
536 lol->link = link;
537 lol->word = sublinkage->link_array[link].rw;
538
539 lol = (List_o_links *) malloc(sizeof(List_o_links));
540 lol->next = pp_data->word_links[sublinkage->link_array[link].rw];
541 pp_data->word_links[sublinkage->link_array[link].rw] = lol;
542 lol->link = link;
543 lol->word = sublinkage->link_array[link].lw;
544 }
545 }
546
setup_domain_array(Postprocessor * pp,const char * string,int start_link)547 static void setup_domain_array(Postprocessor *pp,
548 const char *string, int start_link)
549 {
550 PP_data *pp_data = &pp->pp_data;
551 size_t n = pp_data->N_domains;
552
553 /* Grab more memory if needed */
554 if (pp_data->domlen <= n)
555 {
556 size_t oldsz, incsz;
557 #define DOMINC 16
558 oldsz = pp_data->domlen * sizeof(Domain);
559 incsz = DOMINC * sizeof(Domain);
560 pp_data->domain_array = (Domain *) realloc(pp_data->domain_array,
561 oldsz + incsz);
562 memset(&pp_data->domain_array[pp_data->domlen], 0, incsz);
563 pp_data->domlen += DOMINC;
564 }
565
566 pp_data->domain_array[n].string = string;
567 pp_data->domain_array[n].lol = NULL;
568 pp_data->domain_array[n].size = 0;
569 pp_data->domain_array[n].start_link = start_link;
570
571 pp_data->N_domains++;
572 assert(pp_data->N_domains<PP_MAX_DOMAINS, "raise value of PP_MAX_DOMAINS");
573 }
574
add_link_to_domain(PP_data * pp_data,int link)575 static void add_link_to_domain(PP_data *pp_data, int link)
576 {
577 size_t n = pp_data->N_domains - 1; /* the very last one */
578 List_o_links *lol = (List_o_links *) malloc(sizeof(List_o_links));
579
580 lol->next = pp_data->domain_array[n].lol;
581 pp_data->domain_array[n].lol = lol;
582 pp_data->domain_array[n].size++;
583 lol->link = link;
584 }
585
depth_first_search(Postprocessor * pp,Linkage sublinkage,size_t w,size_t root,size_t start_link)586 static void depth_first_search(Postprocessor *pp, Linkage sublinkage,
587 size_t w, size_t root, size_t start_link)
588 {
589 List_o_links *lol;
590 PP_data *pp_data = &pp->pp_data;
591
592 assert(w < pp_data->num_words, "Bad word index");
593 pp_data->visited[w] = true;
594 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
595 {
596 if (lol->word < w && lol->link != start_link)
597 {
598 add_link_to_domain(pp_data, lol->link);
599 }
600 }
601 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
602 {
603 if (!pp_data->visited[lol->word] && (lol->word != root) &&
604 !(lol->word < root && lol->word < w &&
605 pp_linkset_match(pp->knowledge->restricted_links,
606 sublinkage->link_array[lol->link].link_name)))
607 {
608 depth_first_search(pp, sublinkage, lol->word, root, start_link);
609 }
610 }
611 }
612
bad_depth_first_search(Postprocessor * pp,Linkage sublinkage,size_t w,size_t root,size_t start_link)613 static void bad_depth_first_search(Postprocessor *pp, Linkage sublinkage,
614 size_t w, size_t root, size_t start_link)
615 {
616 List_o_links * lol;
617 PP_data *pp_data = &pp->pp_data;
618
619 assert(w < pp_data->num_words, "Bad word index");
620 pp_data->visited[w] = true;
621 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
622 {
623 if ((lol->word < w) && (lol->link != start_link) && (w != root))
624 {
625 add_link_to_domain(pp_data, lol->link);
626 }
627 }
628 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
629 {
630 assert(lol->word < pp_data->num_words, "Bad word index");
631 if ((!pp_data->visited[lol->word]) && !(w == root && lol->word < w) &&
632 !(lol->word < root && lol->word < w &&
633 pp_linkset_match(pp->knowledge->restricted_links,
634 sublinkage->link_array[lol->link].link_name)))
635 {
636 bad_depth_first_search(pp, sublinkage, lol->word, root, start_link);
637 }
638 }
639 }
640
d_depth_first_search(Postprocessor * pp,Linkage sublinkage,size_t w,size_t root,size_t right,size_t start_link)641 static void d_depth_first_search(Postprocessor *pp, Linkage sublinkage,
642 size_t w, size_t root, size_t right, size_t start_link)
643 {
644 List_o_links * lol;
645 PP_data *pp_data = &pp->pp_data;
646
647 assert(w < pp_data->num_words, "Bad word index");
648 pp_data->visited[w] = true;
649 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
650 {
651 if ((lol->word < w) && (lol->link != start_link) && (w != root))
652 {
653 add_link_to_domain(pp_data, lol->link);
654 }
655 }
656 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
657 {
658 assert(lol->word < pp_data->num_words, "Bad word index");
659 if (!pp_data->visited[lol->word] && !(w == root && lol->word >= right) &&
660 !(w == root && lol->word < root) &&
661 !(lol->word < root && lol->word < w &&
662 pp_linkset_match(pp->knowledge->restricted_links,
663 sublinkage->link_array[lol->link].link_name)))
664 {
665 d_depth_first_search(pp,sublinkage,lol->word,root,right,start_link);
666 }
667 }
668 }
669
left_depth_first_search(Postprocessor * pp,Linkage sublinkage,size_t w,size_t right,size_t start_link)670 static void left_depth_first_search(Postprocessor *pp, Linkage sublinkage,
671 size_t w, size_t right, size_t start_link)
672 {
673 List_o_links *lol;
674 PP_data *pp_data = &pp->pp_data;
675
676 assert(w < pp_data->num_words, "Bad word index");
677 pp_data->visited[w] = true;
678 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
679 {
680 if (lol->word < w && lol->link != start_link)
681 {
682 add_link_to_domain(pp_data, lol->link);
683 }
684 }
685 for (lol = pp_data->word_links[w]; lol != NULL; lol = lol->next)
686 {
687 assert(lol->word < pp_data->num_words, "Bad word index");
688 if (!pp_data->visited[lol->word] && (lol->word != right))
689 {
690 depth_first_search(pp, sublinkage, lol->word, right, start_link);
691 }
692 }
693 }
694
domain_compare(const Domain * d1,const Domain * d2)695 static int domain_compare(const Domain * d1, const Domain * d2)
696 {
697 return (d1->size - d2->size); /* for sorting the domains by size */
698 }
699
build_domains(Postprocessor * pp,Linkage sublinkage)700 static void build_domains(Postprocessor *pp, Linkage sublinkage)
701 {
702 size_t link, i, d;
703 const char *s;
704 PP_data *pp_data = &pp->pp_data;
705
706 pp_data->N_domains = 0;
707
708 for (link = 0; link<sublinkage->num_links; link++)
709 {
710 assert (sublinkage->link_array[link].lw != SIZE_MAX);
711 if (NULL == sublinkage->link_array[link].link_name) continue;
712 s = sublinkage->link_array[link].link_name;
713
714 if (pp_linkset_match(pp->knowledge->ignore_these_links, s)) continue;
715 if (pp_linkset_match(pp->knowledge->domain_starter_links, s))
716 {
717 setup_domain_array(pp, s, link);
718 if (pp_linkset_match(pp->knowledge->domain_contains_links, s))
719 add_link_to_domain(pp_data, link);
720
721 clear_visited(pp_data);
722 depth_first_search(pp, sublinkage, sublinkage->link_array[link].rw,
723 sublinkage->link_array[link].lw, link);
724 }
725 else
726 if (pp_linkset_match(pp->knowledge->urfl_domain_starter_links, s))
727 {
728 setup_domain_array(pp, s, link);
729 /* always add the starter link to its urfl domain */
730 add_link_to_domain(pp_data, link);
731
732 clear_visited(pp_data);
733 bad_depth_first_search(pp, sublinkage,sublinkage->link_array[link].rw,
734 sublinkage->link_array[link].lw, link);
735 }
736 else
737 if (pp_linkset_match(pp->knowledge->urfl_only_domain_starter_links, s))
738 {
739 setup_domain_array(pp, s, link);
740 /* do not add the starter link to its urfl_only domain */
741 clear_visited(pp_data);
742 d_depth_first_search(pp, sublinkage, sublinkage->link_array[link].lw,
743 sublinkage->link_array[link].lw,
744 sublinkage->link_array[link].rw, link);
745 }
746 else
747 if (pp_linkset_match(pp->knowledge->left_domain_starter_links, s))
748 {
749 setup_domain_array(pp, s, link);
750 /* do not add the starter link to a left domain */
751 clear_visited(pp_data);
752 left_depth_first_search(pp, sublinkage, sublinkage->link_array[link].lw,
753 sublinkage->link_array[link].rw, link);
754 }
755 }
756
757 /* sort the domains by size */
758 qsort((void *) pp_data->domain_array,
759 pp_data->N_domains,
760 sizeof(Domain),
761 (int (*)(const void *, const void *)) domain_compare);
762
763 /* sanity check: all links in all domains have a legal domain name */
764 for (d = 0; d < pp_data->N_domains; d++)
765 {
766 i = find_domain_name(pp, pp_data->domain_array[d].string);
767 if (i == SIZE_MAX)
768 prt_error("Error: post_process(): Need an entry for %s in LINK_TYPE_TABLE\n",
769 pp_data->domain_array[d].string);
770 pp_data->domain_array[d].type = i;
771 }
772 }
773
build_domain_forest(PP_data * pp_data,Linkage sublinkage)774 static void build_domain_forest(PP_data *pp_data, Linkage sublinkage)
775 {
776 size_t d, d1, link;
777 DTreeLeaf * dtl;
778
779 if (0 == pp_data->N_domains) return;
780
781 pp_data->domain_array[pp_data->N_domains-1].parent = NULL;
782 for (d=0; d < pp_data->N_domains-1; d++)
783 {
784 for (d1 = d+1; d1 < pp_data->N_domains; d1++)
785 {
786 if (contained_in(&pp_data->domain_array[d], &pp_data->domain_array[d1], sublinkage))
787 {
788 pp_data->domain_array[d].parent = &pp_data->domain_array[d1];
789 break;
790 }
791 }
792 if (d1 == pp_data->N_domains)
793 {
794 /* we know this domain is a root of a new tree */
795 pp_data->domain_array[d].parent = NULL;
796 }
797 }
798
799 /* The parent links of domain nodes have been established.
800 * Now do the leaves. */
801 for (d = 0; d < pp_data->N_domains; d++)
802 {
803 pp_data->domain_array[d].child = NULL;
804 }
805
806 for (link=0; link < sublinkage->num_links; link++)
807 {
808 assert (sublinkage->link_array[link].lw != SIZE_MAX);
809 for (d=0; d<pp_data->N_domains; d++)
810 {
811 if (link_in_domain(link, &pp_data->domain_array[d]))
812 {
813 dtl = (DTreeLeaf *) malloc(sizeof(DTreeLeaf));
814 dtl->link = link;
815 dtl->parent = &pp_data->domain_array[d];
816 dtl->next = pp_data->domain_array[d].child;
817 pp_data->domain_array[d].child = dtl;
818 break;
819 }
820 }
821 }
822 }
823
824 static int
internal_process(Postprocessor * pp,Linkage sublinkage,const char ** msg)825 internal_process(Postprocessor *pp, Linkage sublinkage, const char **msg)
826 {
827 size_t i;
828 PP_data *pp_data = &pp->pp_data;
829
830 /* quick test: try applying just the relevant global rules */
831 if (!apply_relevant_rules(pp, apply_contains_one_globally,
832 sublinkage,
833 pp->knowledge->contains_one_rules,
834 pp->relevant_contains_one_rules, msg))
835 {
836 for (i = 0; i < pp_data->wowlen; i++)
837 pp_data->word_links[i] = NULL;
838 pp_data->N_domains = 0;
839 return -1;
840 }
841
842 /* build graph; confirm that it's legally connected */
843 build_graph(pp, sublinkage);
844 build_domains(pp, sublinkage);
845 build_domain_forest(&pp->pp_data, sublinkage);
846
847 #if defined(CHECK_DOMAIN_NESTING)
848 /* These messages were deemed to not be useful, so
849 * this code is commented out. See comment above. */
850 if (!check_domain_nesting(pp, sublinkage->num_links))
851 prt_error("Warning: The domains are not nested.\n");
852 #endif
853
854 /* The order below should be optimal for most cases */
855 if (!apply_relevant_rules(pp, apply_contains_one, sublinkage,
856 pp->knowledge->contains_one_rules,
857 pp->relevant_contains_one_rules, msg)) return 1;
858 if (!apply_relevant_rules(pp, apply_contains_none, sublinkage,
859 pp->knowledge->contains_none_rules,
860 pp->relevant_contains_none_rules, msg)) return 1;
861 if (!apply_rules(pp_data, apply_must_form_a_cycle, sublinkage,
862 pp->knowledge->form_a_cycle_rules,msg)) return 1;
863 if (!apply_rules(pp_data, apply_bounded, sublinkage,
864 pp->knowledge->bounded_rules, msg)) return 1;
865 return 0; /* This linkage satisfied all the rules */
866 }
867
868
869 /**
870 * Call this (a) after having called post_process_scan_linkage() on all
871 * generated linkages, but (b) before calling post_process() on any
872 * particular linkage. Here we mark all rules which we know (from having
873 * accumulated a set of link names appearing in *any* linkage) that won't
874 * ever be needed.
875 */
prune_irrelevant_rules(Postprocessor * pp)876 static void prune_irrelevant_rules(Postprocessor *pp)
877 {
878 pp_rule *rule;
879 int coIDX, cnIDX, rcoIDX = 0, rcnIDX = 0;
880
881 /* If we didn't scan any linkages, there's no pruning to be done. */
882 if (pp_linkset_population(pp->set_of_links_of_sentence) == 0) return;
883
884 for (coIDX = 0; ; coIDX++)
885 {
886 rule = &(pp->knowledge->contains_one_rules[coIDX]);
887 if (rule->msg == NULL) break;
888 if (pp_linkset_match_bw(pp->set_of_links_of_sentence, rule->selector))
889 {
890 /* Mark rule as being relevant to this sentence */
891 pp->relevant_contains_one_rules[rcoIDX++] = coIDX;
892 pp_linkset_add(pp->set_of_links_in_an_active_rule, rule->selector);
893 }
894 }
895 pp->relevant_contains_one_rules[rcoIDX] = -1; /* end sentinel */
896
897 for (cnIDX = 0; ; cnIDX++)
898 {
899 rule = &(pp->knowledge->contains_none_rules[cnIDX]);
900 if (rule->msg == NULL) break;
901 if (pp_linkset_match_bw(pp->set_of_links_of_sentence, rule->selector))
902 {
903 pp->relevant_contains_none_rules[rcnIDX++] = cnIDX;
904 pp_linkset_add(pp->set_of_links_in_an_active_rule, rule->selector);
905 }
906 }
907 pp->relevant_contains_none_rules[rcnIDX] = -1;
908
909 if (verbosity_level(5))
910 {
911 err_msg(lg_Debug, "PP: Saw %zu unique link names in all linkages.\n\\",
912 pp_linkset_population(pp->set_of_links_of_sentence));
913 err_msg(lg_Debug, "PP: Using %i 'contains one' rules "
914 "and %i 'contains none' rules\n",
915 rcoIDX, rcnIDX);
916 }
917 }
918
919
920 /***************** definitions of exported functions ***********************/
921
922 #define PP_INITLEN 60 /* just starting size, it is expanded if needed */
923
pp_new_domain_array(PP_data * pp_data)924 static void pp_new_domain_array(PP_data *pp_data)
925 {
926 pp_data->domlen = PP_INITLEN;
927 pp_data->domain_array = (Domain*) malloc(pp_data->domlen * sizeof(Domain));
928 memset(pp_data->domain_array, 0, pp_data->domlen * sizeof(Domain));
929 }
930
931 /**
932 * read rules from path and initialize the appropriate fields in
933 * a postprocessor structure, a pointer to which is returned.
934 */
post_process_new(pp_knowledge * kno)935 Postprocessor * post_process_new(pp_knowledge * kno)
936 {
937 Postprocessor *pp;
938 PP_data *pp_data;
939
940 if (NULL == kno) return NULL;
941
942 pp = (Postprocessor *) malloc (sizeof(Postprocessor));
943 pp->knowledge = kno;
944 pp->string_set = string_set_create();
945 pp->set_of_links_of_sentence = pp_linkset_open(1024);
946 pp->set_of_links_in_an_active_rule = pp_linkset_open(1024);
947 pp->relevant_contains_one_rules =
948 (int *) malloc ((pp->knowledge->n_contains_one_rules + 1)
949 *(sizeof pp->relevant_contains_one_rules[0]));
950 pp->relevant_contains_none_rules =
951 (int *) malloc ((pp->knowledge->n_contains_none_rules + 1)
952 *(sizeof pp->relevant_contains_none_rules[0]));
953 pp->relevant_contains_one_rules[0] = -1;
954 pp->relevant_contains_none_rules[0] = -1;
955 pp->violation = NULL;
956 pp->n_local_rules_firing = 0;
957 pp->n_global_rules_firing = 0;
958
959 pp->q_pruned_rules = false;
960
961 pp_data = &pp->pp_data;
962 pp_data->vlength = PP_INITLEN;
963 pp_data->visited = (bool*) malloc(pp_data->vlength * sizeof(bool));
964 memset(pp_data->visited, 0, pp_data->vlength * sizeof(bool));
965
966 pp_data->links_to_ignore = NULL;
967 pp_new_domain_array(pp_data);
968
969 pp_data->wowlen = PP_INITLEN;
970 pp_data->word_links = (List_o_links **) malloc(pp_data->wowlen * sizeof(List_o_links*));
971 memset(pp_data->word_links, 0, pp_data->wowlen * sizeof(List_o_links *));
972
973 return pp;
974 }
975
post_process_free(Postprocessor * pp)976 void post_process_free(Postprocessor *pp)
977 {
978 PP_data *pp_data;
979
980 /* frees up memory associated with pp, previously allocated by open */
981 if (pp == NULL) return;
982 string_set_delete(pp->string_set);
983 pp_linkset_close(pp->set_of_links_of_sentence);
984 pp_linkset_close(pp->set_of_links_in_an_active_rule);
985 free(pp->relevant_contains_one_rules);
986 free(pp->relevant_contains_none_rules);
987 pp->knowledge = NULL;
988 pp->violation = NULL;
989
990 pp_data = &pp->pp_data;
991 post_process_free_data(pp_data);
992 free(pp_data->visited);
993 free(pp_data->domain_array);
994 free(pp_data->word_links);
995
996 free(pp);
997 }
998
999 /**
1000 * During a first pass (prior to actual post-processing of the linkages
1001 * of a sentence), call this once for every generated linkage. Here we
1002 * simply maintain a set of "seen" link names for rule pruning, later on.
1003 */
post_process_scan_linkage(Postprocessor * pp,Linkage linkage)1004 static void post_process_scan_linkage(Postprocessor *pp, Linkage linkage)
1005 {
1006 size_t i;
1007 if (pp == NULL) return;
1008 for (i = 0; i < linkage->num_links; i++)
1009 {
1010 assert(linkage->link_array[i].lw != SIZE_MAX);
1011
1012 pp_linkset_add(pp->set_of_links_of_sentence,
1013 linkage->link_array[i].link_name);
1014 }
1015 }
1016
report_rule_use(pp_rule * set)1017 static size_t report_rule_use(pp_rule *set)
1018 {
1019 size_t cnt = 0;
1020 size_t i;
1021 for (i=0; set[i].msg != NULL; i++)
1022 {
1023 err_msg(lg_Debug, "Used: %d rule: %s\n", set[i].use_count, set[i].msg);
1024 cnt++;
1025 }
1026 return cnt;
1027 }
1028
report_unused_rule(pp_rule * set)1029 static size_t report_unused_rule(pp_rule *set)
1030 {
1031 size_t i;
1032 size_t cnt = 0;
1033 for (i=0; set[i].msg != NULL; i++)
1034 {
1035 if (0 == set[i].use_count)
1036 {
1037 err_msg(lg_Debug, "Unused rule: %s\n", set[i].msg);
1038 cnt++;
1039 }
1040 }
1041 return cnt;
1042 }
1043
report_pp_stats(Postprocessor * pp)1044 static void report_pp_stats(Postprocessor *pp)
1045 {
1046 size_t rule_cnt = 0;
1047 size_t unused_cnt = 0;
1048 pp_knowledge * kno;
1049 if (!verbosity_level(9)) return;
1050
1051 err_msg(lg_Debug, "PP stats: local_rules_firing=%d\n", pp->n_local_rules_firing);
1052 kno = pp->knowledge;
1053
1054 err_msg(lg_Debug, "\nPP stats: form_a_cycle_rules\n");
1055 rule_cnt += report_rule_use(kno->form_a_cycle_rules);
1056
1057 err_msg(lg_Debug, "\nPP stats: contains_one_rules\n");
1058 rule_cnt += report_rule_use(kno->contains_one_rules);
1059
1060 err_msg(lg_Debug, "\nPP stats: contains_none_rules\n");
1061 rule_cnt += report_rule_use(kno->contains_none_rules);
1062
1063 err_msg(lg_Debug, "\nPP stats: bounded_rules\n");
1064 rule_cnt += report_rule_use(kno->bounded_rules);
1065
1066 err_msg(lg_Debug, "\nPP stats: Rules that were not used:\n");
1067 unused_cnt += report_unused_rule(kno->form_a_cycle_rules);
1068 unused_cnt += report_unused_rule(kno->contains_one_rules);
1069 unused_cnt += report_unused_rule(kno->contains_none_rules);
1070 unused_cnt += report_unused_rule(kno->bounded_rules);
1071
1072 err_msg(lg_Debug, "\nPP stats: %zu of %zu rules unused\n", unused_cnt, rule_cnt);
1073 }
1074
1075 /**
1076 * NB: linkage->link[i]->l=-1 means that this connector is to be ignored.
1077 */
do_post_process(Postprocessor * pp,Linkage sublinkage,bool is_long)1078 void do_post_process(Postprocessor *pp, Linkage sublinkage, bool is_long)
1079 {
1080 const char *msg;
1081 PP_data *pp_data;
1082
1083 if (pp == NULL) return;
1084 pp_data = &pp->pp_data;
1085
1086 // XXX wtf .. why is this not leaking memory ?
1087 pp_data->links_to_ignore = NULL;
1088
1089 pp_data->num_words = sublinkage->num_words;
1090
1091 /* Grab more memory if needed */
1092 if (pp_data->vlength <= pp_data->num_words)
1093 {
1094 size_t newsz;
1095 pp_data->vlength += pp_data->num_words;
1096 newsz = pp_data->vlength * sizeof(bool);
1097 pp_data->visited = (bool *) realloc(pp_data->visited, newsz);
1098 }
1099 clear_visited(pp_data);
1100
1101 /* For long sentences, we can save some time by pruning the rules
1102 * which can't possibly be used during postprocessing the linkages
1103 * of this sentence. For short sentences, this is pointless. */
1104 if (is_long && pp->q_pruned_rules == false)
1105 {
1106 prune_irrelevant_rules(pp);
1107 }
1108 pp->q_pruned_rules = true;
1109
1110 switch (internal_process(pp, sublinkage, &msg))
1111 {
1112 case -1:
1113 /* some global test failed even before we had to build the domains */
1114 pp->n_global_rules_firing++;
1115 pp->violation = msg;
1116 report_pp_stats(pp);
1117 return;
1118 case 1:
1119 /* one of the "normal" post processing tests failed */
1120 pp->n_local_rules_firing++;
1121 pp->violation = msg;
1122 break;
1123 case 0:
1124 /* This linkage is legal according to the post processing rules */
1125 pp->violation = NULL;
1126 break;
1127 }
1128
1129 report_pp_stats(pp);
1130 }
1131
1132 /**
1133 * This does basic post-processing for all linkages.
1134 */
post_process_lkgs(Sentence sent,Parse_Options opts)1135 void post_process_lkgs(Sentence sent, Parse_Options opts)
1136 {
1137 size_t in;
1138 size_t N_linkages_post_processed = 0;
1139 size_t N_valid_linkages = sent->num_valid_linkages;
1140 size_t N_linkages_alloced = sent->num_linkages_alloced;
1141 bool twopass = sent->length >= opts->twopass_length;
1142 Postprocessor *pp = sent->postprocessor;
1143
1144 /* Special-case the "amy/ady" morphology handling. */
1145 /* More generally, it there's no post-processor, do nothing. */
1146 /* Well, almost nothing. We still want to assign a score. */
1147 // if (sent->dict->affix_table->anysplit)
1148 if (NULL == pp)
1149 {
1150 sent->num_linkages_post_processed = sent->num_valid_linkages;
1151 for (in=0; in < N_linkages_alloced; in++)
1152 {
1153 Linkage lkg = &sent->lnkages[in];
1154 linkage_score(lkg, opts);
1155 }
1156 return;
1157 }
1158
1159 #define TCD 512 /* timer checking divisor */
1160
1161 /* (optional) First pass: just visit the linkages */
1162 /* The purpose of the first pass is to make the post-processing
1163 * more efficient. Because (hopefully) by the time the real work
1164 * is done in the 2nd pass, the relevant rule set has been pruned
1165 * in the first pass.
1166 */
1167 if (twopass)
1168 {
1169 for (in=0; in < N_linkages_alloced; in++)
1170 {
1171 Linkage lkg = &sent->lnkages[in];
1172 Linkage_info *lifo = &lkg->lifo;
1173
1174 if (lifo->N_violations) continue;
1175
1176 post_process_scan_linkage(pp, lkg);
1177
1178 if (((TCD-1) == in%TCD) && resources_exhausted(opts->resources)) break;
1179 }
1180 }
1181
1182 /* Second pass: actually perform post-processing */
1183 for (in=0; in < N_linkages_alloced; in++)
1184 {
1185 Linkage lkg = &sent->lnkages[in];
1186 Linkage_info *lifo = &lkg->lifo;
1187
1188 if (lifo->N_violations) continue;
1189
1190 do_post_process(pp, lkg, twopass);
1191 post_process_free_data(&pp->pp_data);
1192
1193 if (NULL != pp->violation)
1194 {
1195 N_valid_linkages--;
1196 lifo->N_violations++;
1197
1198 /* Set the message, only if not set */
1199 if (NULL == lifo->pp_violation_msg)
1200 lifo->pp_violation_msg = pp->violation;
1201 }
1202 N_linkages_post_processed++;
1203
1204 linkage_score(lkg, opts);
1205 if (((TCD-1) == in%TCD) && resources_exhausted(opts->resources)) break;
1206 }
1207
1208 /* If the timer expired, then we never finished post-processing.
1209 * Mark the remaining sentences as bad, as otherwise strange
1210 * results get reported. */
1211 for (; in < N_linkages_alloced; in++)
1212 {
1213 Linkage lkg = &sent->lnkages[in];
1214 Linkage_info *lifo = &lkg->lifo;
1215
1216 if (lifo->N_violations) continue;
1217
1218 N_valid_linkages--;
1219 lifo->N_violations++;
1220
1221 /* Set the message, only if not set */
1222 if (NULL == lifo->pp_violation_msg)
1223 lifo->pp_violation_msg = "Timeout during postprocessing";
1224 }
1225
1226 print_time(opts, "Postprocessed all linkages");
1227
1228 if (verbosity_level(6))
1229 {
1230 err_msg(lg_Info, "%zu of %zu linkages with no P.P. violations\n",
1231 N_valid_linkages, N_linkages_post_processed);
1232 }
1233
1234 sent->num_linkages_post_processed = N_linkages_post_processed;
1235 sent->num_valid_linkages = N_valid_linkages;
1236 }
1237
1238 /* ================ compute the domain names ============= */
1239 /*
1240 * The code below is used in one place only: when printing the domain
1241 * names. If the domain names are not being printed, then this is a
1242 * complete waste of CPU time.
1243 */
1244
free_domain_names(PP_domains * ppi)1245 static void free_domain_names(PP_domains *ppi)
1246 {
1247 if (ppi->num_domains > 0) free(ppi->domain_name);
1248 ppi->domain_name = NULL;
1249 ppi->num_domains = 0;
1250 }
1251
linkage_free_pp_domains(Linkage lkg)1252 void linkage_free_pp_domains(Linkage lkg)
1253 {
1254 size_t j;
1255 if (!lkg || !lkg->pp_domains) return;
1256
1257 for (j = 0; j < lkg->num_links; ++j)
1258 free_domain_names(&lkg->pp_domains[j]);
1259 free(lkg->pp_domains);
1260 lkg->pp_domains = NULL;
1261 }
1262
1263 typedef struct D_type_list_s D_type_list;
1264 struct D_type_list_s
1265 {
1266 D_type_list * next;
1267 int type;
1268 };
1269
free_d_type(D_type_list * dtl)1270 static void free_d_type(D_type_list * dtl)
1271 {
1272 D_type_list * dtlx;
1273 for (; dtl != NULL; dtl = dtlx)
1274 {
1275 dtlx = dtl->next;
1276 free((void*) dtl);
1277 }
1278 }
1279
build_type_array(PP_data * pp_data,size_t numlinks)1280 static D_type_list ** build_type_array(PP_data *pp_data,
1281 size_t numlinks)
1282 {
1283 size_t nbytes = numlinks * sizeof(D_type_list*);
1284 D_type_list** dta = malloc(nbytes);
1285 memset(dta, 0, nbytes);
1286
1287 for (size_t d = 0; d < pp_data->N_domains; d++)
1288 {
1289 List_o_links * lol;
1290 for (lol = pp_data->domain_array[d].lol; lol != NULL; lol = lol->next)
1291 {
1292 assert(lol->link < numlinks, "Something wrong about link numbering!");
1293
1294 D_type_list * dtl;
1295 dtl = (D_type_list *) malloc(sizeof(D_type_list));
1296 dtl->type = pp_data->domain_array[d].type;
1297 dtl->next = dta[lol->link];
1298 dta[lol->link] = dtl;
1299 }
1300 }
1301 return dta;
1302 }
1303
1304 /**
1305 * Store the domain names in the linkage. These are not needed
1306 * unless the user asks the domain names to be printed!
1307 */
linkage_set_domain_names(Postprocessor * postprocessor,Linkage linkage)1308 static void linkage_set_domain_names(Postprocessor *postprocessor,
1309 Linkage linkage)
1310 {
1311 if (NULL == linkage) return;
1312 if (NULL == postprocessor) return;
1313 if (0 == postprocessor->pp_data.N_domains) return;
1314
1315 /* Copy the post-processing results over into the linkage */
1316 if (postprocessor->violation != NULL) return;
1317
1318 D_type_list **dta = build_type_array(&postprocessor->pp_data,
1319 linkage->num_links);
1320
1321 assert(NULL == linkage->pp_domains, "Not expecting pp_domains here!");
1322
1323 linkage->pp_domains = malloc(sizeof(PP_domains) * linkage->num_links);
1324 memset(linkage->pp_domains, 0, sizeof(PP_domains) * linkage->num_links);
1325
1326 for (size_t j = 0; j < linkage->num_links; ++j)
1327 {
1328 D_type_list * d;
1329 int k = 0;
1330 for (d = dta[j]; d != NULL; d = d->next) k++;
1331 linkage->pp_domains[j].num_domains = k;
1332 if (k > 0)
1333 {
1334 linkage->pp_domains[j].domain_name =
1335 (const char **) malloc(k * sizeof(const char *));
1336 }
1337 k = 0;
1338 for (d = dta[j]; d != NULL; d = d->next)
1339 {
1340 char buff[] = {d->type, '\0'};
1341
1342 linkage->pp_domains[j].domain_name[k] =
1343 string_set_add (buff, postprocessor->string_set);
1344
1345 k++;
1346 }
1347 }
1348
1349 /* Done with the d_type_array */
1350 for (size_t i=0; i<linkage->num_links; i++)
1351 free_d_type(dta[i]);
1352 free(dta);
1353 }
1354
1355 /**
1356 * Compute linkage domain names.
1357 *
1358 * This assumes that post-processing has been done once, already;
1359 * however, it re-performs post-processing a second time, because
1360 * the data need to obtain the domain names has been lost.
1361 */
compute_domain_names(Linkage lkg)1362 void compute_domain_names(Linkage lkg)
1363 {
1364 Postprocessor *pp = lkg->sent->postprocessor;
1365 if (NULL == pp) return;
1366
1367 Linkage_info *lifo = &lkg->lifo;
1368 if (lifo->N_violations) return;
1369
1370 // If pp_domains is set, its been computed already
1371 if (NULL != lkg->pp_domains) return;
1372
1373 do_post_process(pp, lkg, true);
1374 linkage_set_domain_names(pp, lkg);
1375 post_process_free_data(&pp->pp_data);
1376 }
1377
verify_link_index(const Linkage linkage,LinkIdx index)1378 static inline bool verify_link_index(const Linkage linkage, LinkIdx index)
1379 {
1380 if (!linkage) return false;
1381 if (index >= linkage->num_links) return false;
1382 return true;
1383 }
1384
1385 /** XXX this will not return valid data unless compute_domain_names
1386 * has been called first. FIXME? or does this matter?
1387 */
linkage_get_link_num_domains(const Linkage linkage,LinkIdx index)1388 int linkage_get_link_num_domains(const Linkage linkage, LinkIdx index)
1389 {
1390 if (NULL == linkage->pp_domains) return -1;
1391 if (!verify_link_index(linkage, index)) return -1;
1392 return linkage->pp_domains[index].num_domains;
1393 }
1394
1395 /** XXX this will not return valid data unless compute_domain_names
1396 * has been called first. FIXME? or does this matter?
1397 */
linkage_get_link_domain_names(const Linkage linkage,LinkIdx index)1398 const char ** linkage_get_link_domain_names(const Linkage linkage, LinkIdx index)
1399 {
1400 if (NULL == linkage->pp_domains) return NULL;
1401 if (!verify_link_index(linkage, index)) return NULL;
1402 return linkage->pp_domains[index].domain_name;
1403 }
1404
1405
1406 /* OLD COMMENTS (OUT OF DATE):
1407 This file does the post-processing.
1408 The main routine is "post_process()". It uses the link names only,
1409 and not the connectors.
1410
1411 A domain is a set of links. Each domain has a defining link.
1412 Only certain types of links serve to define a domain. These
1413 parameters are set by the lists of link names in a separate,
1414 human-readable file referred to herein as the 'knowledge file.'
1415
1416 The domains are nested: given two domains, either they're disjoint,
1417 or one contains the other, i.e. they're tree structured. The set of links
1418 in a domain (but in no smaller domain) are called the "group" of the
1419 domain. Data structures are built to store all this stuff.
1420 The tree structured property is not mathematically guaranteed by
1421 the domain construction algorithm. Davy simply claims that because
1422 of how he built the dictionary, the domains will always be so
1423 structured. The program checks this and gives an error message
1424 if it's violated.
1425
1426 Define the "root word" of a link (or domain) to be the word at the
1427 left end of the link. The other end of the defining link is called
1428 the "right word".
1429
1430 The domain corresponding to a link is defined to be the set of links
1431 reachable by starting from the right word, following links and never
1432 using the root word or any word to its left.
1433
1434 There are some minor exceptions to this. The "restricted_link" lists
1435 those connectors that, even if they point back before the root word,
1436 are included in the domain. Some of the starting links are included
1437 in their domain, these are listed in the "domain_contains_links" list.
1438
1439 Such was the way it was. Now Davy tells me there should be another type
1440 of domain that's quite different. Let's call these "urfl" domains.
1441 Certain type of connectors start urfl domains. They're listed below.
1442 In a urfl domain, the search includes the root word. It does a separate
1443 search to find urfl domains.
1444
1445 Restricted links should work just as they do with ordinary domains. If they
1446 come out of the right word, or anything to the right of it (that's
1447 in the domain), they should be included but should not be traced
1448 further. If they come out of the root word, they should not be
1449 included.
1450 */
1451
1452 /*
1453 I also, unfortunately, want to propose a new type of domain. These
1454 would include everything that can be reached from the root word of the
1455 link, to the right, that is closer than the right word of the link.
1456 (They would not include the link itself.)
1457
1458 In the following sentence, then, the "Urfl_Only Domain" of the G link
1459 would include only the "O" link:
1460
1461 +-----G----+
1462 +---O--+ +-AI+
1463 | | | |
1464 hitting dogs is fun.a
1465
1466 In the following sentence it would include the "O", the "TT", the "I",
1467 the second "O", and the "A".
1468
1469 +----------------G---------------+
1470 +-----TT-----+ +-----O-----+ |
1471 +---O---+ +-I+ +---A--+ +-AI+
1472 | | | | | | | |
1473 telling people to do stupid things is fun.a
1474
1475 This would allow us to judge the following:
1476
1477 kicking dogs bores me
1478 *kicking dogs kicks dogs
1479 explaining the program is easy
1480 *explaining the program is running
1481
1482 (These are distinctions that I thought we would never be able to make,
1483 so I told myself they were semantic rather than syntactic. But with
1484 domains, they should be easy.)
1485 */
1486
1487 /* Modifications, 6/96 ALB:
1488 1) Rules and link sets are relegated to a separate, user-written
1489 file(s), herein referred to as the 'knowledge file'
1490 2) This information is read by a lexer, in pp_lexer.l (lex code)
1491 whose exported routines are all prefixed by 'pp_lexer'
1492 3) when postprocessing a sentence, the links of each domain are
1493 placed in a set for quick lookup, ('contains one' and 'contains none')
1494 4) Functions which were never called have been eliminated:
1495 link_inhabits(), match_in_list(), group_type_contains(),
1496 group_type_contains_one(), group_type_contains_all()
1497 5) Some 'one-by-one' initializations have been replaced by faster
1498 block memory operations (memset etc.)
1499 6) The above comments are correct but incomplete! (1/97)
1500 7) observation: the 'contains one' is, empirically, by far the most
1501 violated rule, so it should come first in applying the rules.
1502
1503 Modifications, 9/97 ALB:
1504 Deglobalization. Made code consistent with api.
1505 */
1506