1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright (c) 2014 Linas Vepstas                                      */
5 /* All rights reserved                                                   */
6 /*                                                                       */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software.    */
9 /* This license allows free redistribution and use in source and binary  */
10 /* forms, with or without modification, subject to certain conditions.   */
11 /*                                                                       */
12 /*************************************************************************/
13 
14 #include <stdint.h>
15 #include <string.h>
16 
17 #include "api-structures.h"
18 #include "dict-common/dict-defines.h"    // For RIGHT_WALL_WORD
19 #include "dict-common/dict-common.h"     // For Dictionary_s
20 #include "dict-common/dict-defines.h"    // For MAX_WORD
21 #include "error.h"
22 #include "linkage/linkage.h"
23 #include "post-process/post-process.h"
24 #include "post-process/pp-structures.h"
25 #include "string-set.h"
26 #include "utilities.h"
27 
28 #define D_CONST 8 /* debug level for this file */
29 
30 #define OPEN_BRACKET '['
31 #define CLOSE_BRACKET ']'
32 
33 typedef enum {OPEN_TOK, CLOSE_TOK, WORD_TOK} CType;
34 typedef enum {NONE, STYPE, PTYPE, QTYPE, QDTYPE} WType;
35 
36 typedef struct
37 {
38 	const char * type;
39 	const char * start_link;
40 	size_t left;      /* leftmost word */
41 	size_t right;     /* rightmost word */
42 	int canon;
43 	bool valid;
44 	char domain_type;
45 } constituent_t;
46 
47 /*
48  * Context used to store assorted intermediate data
49  * when the constituent string is being generated.
50  */
51 typedef struct
52 {
53 	String_set * phrase_ss;
54 	WType * wordtype;
55 	constituent_t * constituent;
56 	int conlen;
57 } con_context_t;
58 
59 
60 typedef struct CNode_s CNode;
61 
62 /* Invariant: Leaf if child==NULL */
63 struct CNode_s
64 {
65 	char  * label;
66 	CNode * child;
67 	CNode * next;
68 	int   start, end;
69 };
70 
71 /* ================================================================ */
72 
73 /**
74  * @param s A Label
75  * @param t A Label
76  * @return True if the uppercase parts of s and t are equal, else false.
77  */
uppercompare(const char * s,const char * t)78 static bool uppercompare(const char * s, const char * t)
79 {
80 	while (isupper(*s) || isupper(*t))
81 	{
82 		if (*s++ != *t++) return false;
83 	}
84 	return true;
85 }
86 
87 /**
88  * If a constituent c has a comma at either end, we exclude the
89  * comma.
90  */
adjust_for_left_comma(con_context_t * ctxt,Linkage linkage,int c)91 static void adjust_for_left_comma(con_context_t * ctxt, Linkage linkage, int c)
92 {
93 	int w;
94 	w = ctxt->constituent[c].left;
95 	if (strcmp(linkage->word[w], ",") == 0)
96 		w++;
97 	ctxt->constituent[c].left = w;
98 }
99 
adjust_for_right_comma(con_context_t * ctxt,Linkage linkage,int c)100 static void adjust_for_right_comma(con_context_t *ctxt, Linkage linkage, int c)
101 {
102 	int w;
103 	w = ctxt->constituent[c].right;
104 	if ((strcmp(linkage->word[w], ",") == 0) ||
105 	    (strcmp(linkage->word[w], RIGHT_WALL_WORD) == 0))
106 	{
107 		w--;
108 	}
109 	ctxt->constituent[c].right = w;
110 }
111 
print_constituent(con_context_t * ctxt,Linkage linkage,int c)112 static void print_constituent(con_context_t *ctxt, Linkage linkage, int c)
113 {
114 	size_t w;
115 
116 	err_msg(lg_Debug, "  c %2d %4s [%c] (%2zu-%2zu): ",
117 		   c, ctxt->constituent[c].type, ctxt->constituent[c].domain_type,
118 		   ctxt->constituent[c].left, ctxt->constituent[c].right);
119 	for (w = ctxt->constituent[c].left; w <= ctxt->constituent[c].right; w++) {
120 		err_msg(lg_Debug, "%s ", linkage->word[w]); /**PV**/
121 	}
122 	err_msg(lg_Debug, "\n");
123 }
124 
125 /******************************************************
126  * These functions do the bulk of the actual
127  * constituent-generating; they're called once.
128  *********************************************************/
129 
130 typedef enum
131 {
132 	CASE_S=1,
133 	CASE_UNUSED=2,  /* XXX not used anywhere... */
134 	CASE_REL_CLAUSE=3,
135 	CASE_APPOS=4,
136 	CASE_OPENER=5,
137 	CASE_PPOPEN=6,
138 	CASE_SVINV=7,
139 	CASE_PART_MOD=8,
140 	CASE_PART_OPEN=9,
141 
142 } case_type;
143 
144 /**
145  * This function looks for constituents of type ctype1. Say it finds
146  * one, call it c1. It searches for the next larger constituent of
147  * type ctype2, call it c2. It then generates a new constituent of
148  * ctype3, containing all the words in c2 but not c1.
149  */
gen_comp(con_context_t * ctxt,Linkage linkage,int numcon_total,int numcon_subl,const char * ctype1,const char * ctype2,const char * ctype3,case_type x)150 static int gen_comp(con_context_t *ctxt, Linkage linkage,
151                     int numcon_total, int numcon_subl,
152                     const char * ctype1, const char * ctype2,
153                     const char * ctype3, case_type x)
154 {
155 	size_t w, w2, w3;
156 	int c, c1, c2;
157 	bool done;
158 	c = numcon_total + numcon_subl;
159 
160 	for (c1=numcon_total; c1<numcon_total + numcon_subl; c1++)
161 	{
162 		/* If ctype1 is NP, it has to be an appositive to continue */
163 		if ((x==CASE_APPOS) && (post_process_match("MX#*", ctxt->constituent[c1].start_link)==0))
164 			continue;
165 
166 #ifdef REVIVE_DEAD_CODE
167 		/* If ctype1 is X, and domain_type is t, it's an infinitive - skip it */
168 		if ((x==CASE_UNUSED) && (ctxt->constituent[c1].domain_type=='t'))
169 			continue;
170 #endif /* REVIVE_DEAD_CODE */
171 
172 		/* If it's domain-type z, it's a subject-relative clause;
173 		   the VP doesn't need an NP */
174 		if (ctxt->constituent[c1].domain_type=='z')
175 			continue;
176 
177 		/* If ctype1 is X or VP, and it's not started by an S, don't generate an NP
178 		 (Neither of the two previous checks are necessary now, right?) */
179 #ifdef REVIVE_DEAD_CODE
180 		/* use this ... if ((x==CASE_S || x==CASE_UNUSED) && */
181 #endif /* REVIVE_DEAD_CODE */
182 		if ((x==CASE_S) &&
183 			(((post_process_match("S", ctxt->constituent[c1].start_link) == 0) &&
184 			  (post_process_match("SX", ctxt->constituent[c1].start_link) == 0) &&
185 			  (post_process_match("SF", ctxt->constituent[c1].start_link) == 0)) ||
186 			 (post_process_match("S##w", ctxt->constituent[c1].start_link) != 0)))
187 			continue;
188 
189 		/* If it's an SBAR (relative clause case), it has to be a relative clause */
190 		if ((x==CASE_REL_CLAUSE) &&
191 			((post_process_match("Rn", ctxt->constituent[c1].start_link) == 0) &&
192 			 (post_process_match("R*", ctxt->constituent[c1].start_link) == 0) &&
193 			 (post_process_match("MX#r", ctxt->constituent[c1].start_link) == 0) &&
194 			 (post_process_match("Mr", ctxt->constituent[c1].start_link) == 0) &&
195 			 (post_process_match("MX#d", ctxt->constituent[c1].start_link) == 0)))
196 			continue;
197 
198 		/* If ctype1 is SBAR (clause opener case), it has to be an f domain */
199 		if ((x==CASE_OPENER) && (ctxt->constituent[c1].domain_type!='f'))
200 			continue;
201 
202 		/* If ctype1 is SBAR (pp opener case), it has to be a g domain */
203 		if ((x==CASE_PPOPEN) && (ctxt->constituent[c1].domain_type!='g'))
204 			continue;
205 
206 		/* If ctype1 is NP (paraphrase case), it has to be started by an SI */
207 		if ((x==CASE_SVINV) && (post_process_match("SI", ctxt->constituent[c1].start_link)==0))
208 			continue;
209 
210 		/* If ctype1 is VP (participle modifier case), it has to be
211 		   started by an Mv or Mg */
212 		if ((x==CASE_PART_MOD) && (post_process_match("M", ctxt->constituent[c1].start_link)==0))
213 			continue;
214 
215 		/* If ctype1 is VP (participle opener case), it has
216 		   to be started by a COp */
217 		if ((x==CASE_PART_OPEN) && (post_process_match("COp", ctxt->constituent[c1].start_link)==0))
218 			continue;
219 
220 		/* Now start at the bounds of c1, and work outwards until you
221 		   find a larger constituent of type ctype2 */
222 		if (!(strcmp(ctxt->constituent[c1].type, ctype1)==0))
223 			continue;
224 
225 		if (verbosity_level(D_CONST))
226 			err_msg(lg_Debug, "Generating complement constituent for c %d of type %s\n\\",
227 				   c1, ctype1);
228 		done = false;
229 		for (w2 = ctxt->constituent[c1].left; (done == false) && (w2 != (size_t)-1); w2--)
230 		{
231 			for (w3 = ctxt->constituent[c1].right; w3<linkage->num_words; w3++)
232 			{
233 				for (c2 = numcon_total; (done == false) &&
234 						 (c2 < numcon_total + numcon_subl); c2++) {
235 					if (!((ctxt->constituent[c2].left == w2) &&
236 						  (ctxt->constituent[c2].right == w3)) || (c2==c1))
237 						continue;
238 					if (!(strcmp(ctxt->constituent[c2].type, ctype2)==0))
239 						continue;
240 
241 					/* if the new constituent (c) is to the left
242 					   of c1, its right edge should be adjacent to the
243 					   left edge of c1 - or as close as possible. */
244 					if ((x==CASE_OPENER) || (x==CASE_PPOPEN) || (x==CASE_PART_OPEN))
245 					{
246 								/* This is the case where c is to the
247 								   RIGHT of c1 */
248 						w = ctxt->constituent[c1].right + 1;
249 						if (w > ctxt->constituent[c2].right)
250 						{
251 							done = true;
252 							continue;
253 						}
254 						ctxt->constituent[c].left = w;
255 						ctxt->constituent[c].right = ctxt->constituent[c2].right;
256 					}
257 					else
258 					{
259 						w = ctxt->constituent[c1].left - 1;
260 						if (w < ctxt->constituent[c2].left) {
261 							done = true;
262 							continue;
263 						}
264 						ctxt->constituent[c].right = w;
265 						ctxt->constituent[c].left = ctxt->constituent[c2].left;
266 					}
267 
268 					adjust_for_left_comma(ctxt, linkage, c1);
269 					adjust_for_right_comma(ctxt, linkage, c1);
270 
271 					ctxt->constituent[c].type =
272 						string_set_add(ctype3, ctxt->phrase_ss);
273 					ctxt->constituent[c].domain_type = 'x';
274 					ctxt->constituent[c].start_link =
275 						string_set_add("XX", ctxt->phrase_ss);
276 					if (verbosity_level(D_CONST))
277 					{
278 						err_msg(lg_Debug, "Larger c found: c %d (%s); ", c2, ctype2);
279 						err_msg(lg_Debug, "Adding constituent:\n\\");
280 						print_constituent(ctxt, linkage, c);
281 					}
282 					c++;
283 					assert (c < ctxt->conlen, "Too many constituents");
284 					done = true;
285 				}
286 			}
287 		}
288 		if (verbosity_level(D_CONST))
289 		{
290 			if (done == false)
291 				err_msg(lg_Debug, "No constituent added, because no larger %s"
292 					               " was found\n", ctype2);
293 			else
294 				lg_error_flush();
295 		}
296 	}
297 	numcon_subl = c - numcon_total;
298 	return numcon_subl;
299 }
300 
301 /**
302  * Look for a constituent started by an MVs or MVg.
303  * Find any VP's or ADJP's that contain it (without going
304  * beyond a larger S or NP). Adjust them so that
305  * they end right before the m domain starts.
306  */
adjust_subordinate_clauses(con_context_t * ctxt,Linkage linkage,int numcon_total,int numcon_subl)307 static void adjust_subordinate_clauses(con_context_t *ctxt, Linkage linkage,
308                                        int numcon_total,
309                                        int numcon_subl)
310 {
311 	int c, c2;
312 	size_t w, w2;
313 	bool done;
314 
315 	for (c=numcon_total; c<numcon_total + numcon_subl; c++)
316 	{
317 		if ((post_process_match("MVs", ctxt->constituent[c].start_link) == 1) ||
318 			 (post_process_match("MVg", ctxt->constituent[c].start_link) == 1))
319 		{
320 			done = false;
321 			for (w2 = ctxt->constituent[c].left-1; (false == done) && w2 != (size_t) -1; w2--)
322 			{
323 				for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++)
324 				{
325 					if (!((ctxt->constituent[c2].left == w2) &&
326 						  (ctxt->constituent[c2].right >= ctxt->constituent[c].right)))
327 						continue;
328 					if ((strcmp(ctxt->constituent[c2].type, "S") == 0) ||
329 						(strcmp(ctxt->constituent[c2].type, "NP") == 0)) {
330 						done = true;
331 						break;
332 					}
333 					if ((ctxt->constituent[c2].domain_type == 'v') ||
334 						(ctxt->constituent[c2].domain_type == 'a'))
335 					{
336 						w = ctxt->constituent[c].left - 1;
337 						ctxt->constituent[c2].right = w;
338 
339 						if (verbosity_level(D_CONST))
340 						{
341 							err_msg(lg_Debug, "Adjusting constituent %d:\n\\", c2);
342 							print_constituent(ctxt, linkage, c2);
343 						}
344 					}
345 				}
346 			}
347 			if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
348 				ctxt->constituent[c].left++;
349 		}
350 	}
351 }
352 
353 /******************************************************
354  * These functions are called once, after constituents
355  * have been generated, to merge them together and fix up
356  * some other things.
357  *
358  ********************************************************/
359 
merge_constituents(con_context_t * ctxt,Linkage linkage,int numcon_total)360 static int merge_constituents(con_context_t *ctxt, Linkage linkage, int numcon_total)
361 {
362 	int c1, c2=0;
363 
364 	/* First go through and give each constituent a canonical number
365 	   (the index number of the lowest-numbered constituent
366 	   identical to it) */
367 	for (c1 = 0; c1 < numcon_total; c1++)
368 	{
369 		ctxt->constituent[c1].valid = true;
370 		ctxt->constituent[c1].canon = c1;
371 		for (c2 = c1 + 1; c2 < numcon_total; c2++)
372 		{
373 			if ((ctxt->constituent[c1].left == ctxt->constituent[c2].left) &&
374 				(ctxt->constituent[c1].right == ctxt->constituent[c2].right) &&
375 				(strcmp(ctxt->constituent[c1].type, ctxt->constituent[c2].type) == 0))
376 			{
377 				ctxt->constituent[c2].canon = c1;
378 			}
379 		}
380 	}
381 
382 	/* Now go through and find duplicates; if a pair is found,
383 	 * mark one as invalid.
384 	 */
385 	for (c1 = 0; c1 < numcon_total; c1++)
386 	{
387 		for (c2 = c1 + 1; c2 < numcon_total; c2++)
388 		{
389 			if (ctxt->constituent[c2].canon == ctxt->constituent[c1].canon)
390 				ctxt->constituent[c2].valid = false;
391 		}
392 	}
393 
394 	return numcon_total;
395 }
396 
397 /**
398  * Go through all the words. If a word is on the right end of
399  * an S (or SF or SX), wordtype[w]=STYPE.  If it's also on the left end of a
400  * Pg*b, I, PP, or Pv, wordtype[w]=PTYPE. If it's a question-word
401  * used in an indirect question, wordtype[w]=QTYPE. If it's a
402  * question-word determiner,  wordtype[w]=QDTYPE. Else wordtype[w]=NONE.
403  * (This function is called once.)
404  */
generate_misc_word_info(con_context_t * ctxt,Linkage linkage)405 static void generate_misc_word_info(con_context_t * ctxt, Linkage linkage)
406 {
407 	size_t w1, w2, l1, l2;
408 	const char * label1, * label2;
409 
410 	for (w1 = 0; w1 < linkage->num_words; w1++)
411 		ctxt->wordtype[w1] = NONE;
412 
413 	for (l1 = 0; l1 < linkage_get_num_links(linkage); l1++) {
414 		w1=linkage_get_link_rword(linkage, l1);
415 		label1 = linkage_get_link_label(linkage, l1);
416 		if ((uppercompare(label1, "S")) ||
417 			(uppercompare(label1, "SX")) ||
418 			(uppercompare(label1, "SF"))) {
419 			ctxt->wordtype[w1] = STYPE;
420 			for (l2 = 0; l2 < linkage_get_num_links(linkage); l2++) {
421 				w2=linkage_get_link_lword(linkage, l2);
422 				label2 = linkage_get_link_label(linkage, l2);
423 				if ((w1 == w2) &&
424 					((post_process_match("Pg#b", label2)==1) ||
425 					 (uppercompare(label2, "I")) ||
426 					 (uppercompare(label2, "PP")) ||
427 					 (post_process_match("Pv", label2)==1))) {
428 					/* Pvf, Pgf? */
429 					ctxt->wordtype[w1] = PTYPE;
430 				}
431 			}
432 		}
433 		if (post_process_match("QI#d", label1)==1) {
434 			ctxt->wordtype[w1] = QTYPE;
435 			for (l2 = 0; l2 < linkage_get_num_links(linkage); l2++) {
436 				w2 = linkage_get_link_lword(linkage, l2);
437 				label2 = linkage_get_link_label(linkage, l2);
438 				if ((w1 == w2) && (post_process_match("D##w", label2)==1)) {
439 					ctxt->wordtype[w1] = QDTYPE;
440 				}
441 			}
442 		}
443 		if (post_process_match("Mr", label1)==1) ctxt->wordtype[w1] = QDTYPE;
444 		if (post_process_match("MX#d", label1)==1) ctxt->wordtype[w1] = QDTYPE;
445 	}
446 }
447 
new_style_conjunctions(con_context_t * ctxt,Linkage linkage,int numcon_total)448 static int new_style_conjunctions(con_context_t *ctxt, Linkage linkage, int numcon_total)
449 {
450 #ifdef DEBUG
451 	int c;
452 	for (c = 0; c < numcon_total; c++)
453 	{
454 		constituent_t *ct = &ctxt->constituent[c];
455 		lgdebug(6, "ola %d valid=%d %s start=%s lr=%zu %zu\n", c,
456 			ct->valid, ct->type, ct->start_link, ct->left, ct->right);
457 	}
458 #endif
459 	return numcon_total;
460 }
461 
last_minute_fixes(con_context_t * ctxt,Linkage linkage,int numcon_total)462 static int last_minute_fixes(con_context_t *ctxt, Linkage linkage, int numcon_total)
463 {
464 	int c;
465 	bool global_leftend_found, global_rightend_found;
466 	size_t lastword;
467 
468 	for (c = 0; c < numcon_total; c++)
469 	{
470 		/* In a paraphrase construction ("John ran, he said"),
471 		   the paraphrasing clause doesn't get
472 		   an S. (This is true in Treebank II, not Treebank I) */
473 
474 		if (uppercompare(ctxt->constituent[c].start_link, "CP"))
475 		{
476 			ctxt->constituent[c].valid = false;
477 		}
478 
479 		/* If it's a possessive with an "'s", the NP on the left
480 		   should be extended to include the "'s". */
481 		if ((uppercompare(ctxt->constituent[c].start_link, "YS")) ||
482 			(uppercompare(ctxt->constituent[c].start_link, "YP")))
483 		{
484 			ctxt->constituent[c].right++;
485 		}
486 
487 		/* If a constituent has starting link MVpn, it's a time
488 		   expression like "last week"; label it as a noun phrase
489 		   (incorrectly) */
490 
491 		if (strcmp(ctxt->constituent[c].start_link, "MVpn") == 0)
492 		{
493 			ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss);
494 		}
495 		if (strcmp(ctxt->constituent[c].start_link, "COn") == 0)
496 		{
497 			ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss);
498 		}
499 		if (strcmp(ctxt->constituent[c].start_link, "Mpn") == 0)
500 		{
501 			ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss);
502 		}
503 
504 		/* If the constituent is an S started by "but" or "and" at
505 		   the beginning of the 67411142sentence, it should be ignored. */
506 
507 		if ((strcmp(ctxt->constituent[c].start_link, "Wdc") == 0) &&
508 			(ctxt->constituent[c].left == 2))
509 		{
510 			ctxt->constituent[c].valid = false;
511 		}
512 
513 		/* For prenominal adjectives, an ADJP constituent is assigned
514 		   if it's a hyphenated (Ah) or comparative (Am) adjective;
515 		   otherwise no ADJP is assigned, unless the phrase is more
516 		   than one word long (e.g. "very big"). The same with certain
517 		   types of adverbs. */
518 		/* That was for Treebank I. For Treebank II, the rule only
519 		   seems to apply to prenominal adjectives (of all kinds).
520 		   However, it also applies to number expressions ("QP"). */
521 
522 		if ((post_process_match("A", ctxt->constituent[c].start_link) == 1) ||
523 			(ctxt->constituent[c].domain_type == 'd') ||
524 			(ctxt->constituent[c].domain_type == 'h')) {
525 			if (ctxt->constituent[c].right-ctxt->constituent[c].left == 0)
526 			{
527 				ctxt->constituent[c].valid = false;
528 			}
529 		}
530 
531 		if ((ctxt->constituent[c].domain_type == 'h') &&
532 			(strcmp(linkage->word[ctxt->constituent[c].left - 1], "$") == 0))
533 		{
534 			ctxt->constituent[c].left--;
535 		}
536 	}
537 
538 	/* If there's a global S constituent that includes everything
539 	   except a final terminating punctuation (period or question mark),
540 	   extend it by one word. We know its the terminating punctuation,
541 	   because it links to the right wall with an RW link.  If its
542 	   not, then that final link is not there...
543 	 */
544 	for (c = 0; c < numcon_total; c++)
545 	{
546 		if ((ctxt->constituent[c].right == linkage->num_words - 3) &&
547 			(ctxt->constituent[c].left == 1) &&
548 			(strcmp(ctxt->constituent[c].type, "S") == 0))
549 		{
550 			size_t ln;
551 			for (ln = 0; ln < linkage->num_links; ln++)
552 			{
553 				if ((linkage->link_array[ln].lw == linkage->num_words - 2) &&
554 				    (linkage->link_array[ln].rw == linkage->num_words - 1))
555 				{
556 					ctxt->constituent[c].right++;
557 					break;
558 				}
559 			}
560 		}
561 	}
562 
563 	/* If there's no S boundary at the very left end of the sentence,
564 	   or the very right end, create a new S spanning the entire sentence */
565 
566 	lastword = linkage->num_words - 2;
567 	global_leftend_found = false;
568 	global_rightend_found = false;
569 	for (c = 0; c < numcon_total; c++)
570 	{
571 		if ((ctxt->constituent[c].left == 1) &&
572 		   (strcmp(ctxt->constituent[c].type, "S") == 0) &&
573 			ctxt->constituent[c].valid)
574 		{
575 			global_leftend_found = true;
576 		}
577 	}
578 
579 	for (c = 0; c < numcon_total; c++)
580 	{
581 		if ((ctxt->constituent[c].right >= lastword) &&
582 			(strcmp(ctxt->constituent[c].type, "S") == 0) &&
583 		   ctxt->constituent[c].valid)
584 		{
585 			global_rightend_found = true;
586 		}
587 	}
588 
589 	if ((global_leftend_found == false) || (global_rightend_found == false))
590 	{
591 		c = numcon_total;
592 		ctxt->constituent[c].left = 1;
593 		ctxt->constituent[c].right = linkage->num_words-1;
594 		ctxt->constituent[c].type = string_set_add("S", ctxt->phrase_ss);
595 		ctxt->constituent[c].valid = true;
596 		ctxt->constituent[c].domain_type = 'x';
597 		numcon_total++;
598 		if (verbosity_level(D_CONST))
599 		{
600 			err_msg(lg_Debug, "Adding global sentence constituent:\n\\");
601 			print_constituent(ctxt, linkage, c);
602 		}
603 	}
604 
605 	return numcon_total;
606 }
607 
add_constituent(con_context_t * ctxt,int c,const Linkage linkage,const Domain * domain,int l,int r,const char * name)608 static int add_constituent(con_context_t *ctxt, int c, const Linkage linkage,
609                            const Domain *domain,
610                            int l, int r, const char * name)
611 {
612 	int nwords = linkage->num_words-2;
613 	c++;
614 
615 	/* Avoid running off end, to walls. */
616 	if (l < 1) l=1;
617 	if (r > nwords) r = nwords;
618 	if (l > nwords) l = nwords;
619 	assert(l <= r, "negative constituent length!");
620 
621 	ctxt->constituent[c].type = string_set_add(name, ctxt->phrase_ss);
622 	ctxt->constituent[c].left = l;
623 	ctxt->constituent[c].right = r;
624 	ctxt->constituent[c].domain_type = domain->type;
625 	ctxt->constituent[c].start_link =
626 		linkage_get_link_label(linkage, domain->start_link);
627 	return c;
628 }
629 
cons_of_domain(const Linkage linkage,char domain_type)630 static const char * cons_of_domain(const Linkage linkage, char domain_type)
631 {
632 	switch (domain_type) {
633 	case 'a':
634 		return "ADJP";
635 	case 'b':
636 		return "SBAR";
637 	case 'c':
638 		return "VP";
639 	case 'd':
640 		return "QP";
641 	case 'e':
642 		return "ADVP";
643 	case 'f':
644 		return "SBAR";
645 	case 'g':
646 		return "PP";
647 	case 'h':
648 		return "QP";
649 	case 'i':
650 		return "ADVP";
651 	case 'k':
652 		return "PRT";
653 	case 'n':
654 		return "NP";
655 	case 'p':
656 		return "PP";
657 	case 'q':
658 		return "SINV";
659 	case 's':
660 		return "S";
661 	case 't':
662 		return "VP";
663 	case 'u':
664 		return "ADJP";
665 	case 'v':
666 		return "VP";
667 	case 'y':
668 		return "NP";
669 	case 'z':
670 		return "VP";
671 	default:
672 		{
673 			err_ctxt ec = { linkage->sent };
674 			err_msgc(&ec, lg_Error, "Illegal domain: %c\n", domain_type);
675 			return "";
676 		}
677 	}
678 }
679 
read_constituents_from_domains(con_context_t * ctxt,Linkage linkage,int numcon_total)680 static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
681                                           int numcon_total)
682 {
683 	size_t d, l, w2;
684 	int c, w, c2, numcon_subl = 0;
685 	PP_data *pp_data = &linkage->sent->constituent_pp->pp_data;
686 
687 	for (d = 0, c = numcon_total; d < pp_data->N_domains; d++, c++)
688 	{
689 		size_t leftmost, rightmost, leftlimit;
690 		int rootleft;
691 		List_o_links * dlink;
692 
693 		Domain domain = pp_data->domain_array[d];
694 
695 		// rootright = linkage_get_link_rword(linkage, domain.start_link);
696 		rootleft =  linkage_get_link_lword(linkage, domain.start_link);
697 
698 		if ((domain.type=='c') ||
699 			(domain.type=='d') ||
700 			(domain.type=='e') ||
701 			(domain.type=='f') ||
702 			(domain.type=='g') ||
703 			(domain.type=='u') ||
704 			(domain.type=='y'))
705 		{
706 			leftlimit = 0;
707 			leftmost = linkage_get_link_lword(linkage, domain.start_link);
708 			rightmost = linkage_get_link_lword(linkage, domain.start_link);
709 		}
710 		else
711 		{
712 			leftlimit = linkage_get_link_lword(linkage, domain.start_link) + 1;
713 			leftmost = linkage_get_link_rword(linkage, domain.start_link);
714 			rightmost = linkage_get_link_rword(linkage, domain.start_link);
715 		}
716 
717 		/* Start by assigning both left and right limits to the
718 		 * right word of the start link. This will always be contained
719 		 * in the constituent. This will also handle the case
720 		 * where the domain contains no links.
721 		 */
722 		for (dlink = domain.lol; dlink != NULL; dlink = dlink->next)
723 		{
724 			l = dlink->link;
725 
726 			if ((linkage_get_link_lword(linkage, l) < leftmost) &&
727 				(linkage_get_link_lword(linkage, l) >= leftlimit))
728 			{
729 				leftmost = linkage_get_link_lword(linkage, l);
730 			}
731 
732 			if (linkage_get_link_rword(linkage, l) > rightmost)
733 			{
734 				rightmost = linkage_get_link_rword(linkage, l);
735 			}
736 		}
737 
738 		c--;
739 		c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost,
740 						cons_of_domain(linkage, domain.type));
741 
742 		if (domain.type == 'z')
743 		{
744 			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
745 		}
746 		if (domain.type=='c')
747 		{
748 			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
749 		}
750 		if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) ||
751 			(post_process_match("Rn", ctxt->constituent[c].start_link)==1))
752 		{
753 			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "SBAR");
754 		}
755 		if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) ||
756 			(post_process_match("MX#r", ctxt->constituent[c].start_link)==1))
757 		{
758 			w = leftmost;
759 			if (strcmp(linkage->word[w], ",") == 0) w++;
760 			c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHNP");
761 		}
762 		if (post_process_match("Mj", ctxt->constituent[c].start_link) == 1)
763 		{
764 			w = leftmost;
765 			if (strcmp(linkage->word[w], ",") == 0) w++;
766 			c = add_constituent(ctxt, c, linkage, &domain, w, w+1, "WHPP");
767 			c = add_constituent(ctxt, c, linkage, &domain, w+1, w+1, "WHNP");
768 		}
769 		if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) ||
770 			(post_process_match("B#d", ctxt->constituent[c].start_link)==1))
771 		{
772 			c = add_constituent(ctxt, c, linkage, &domain, rootleft, rootleft, "WHNP");
773 			c = add_constituent(ctxt, c, linkage, &domain,
774 							rootleft, ctxt->constituent[c-1].right, "SBAR");
775 		}
776 		if (post_process_match("CP", ctxt->constituent[c].start_link)==1)
777 		{
778 			if (strcmp(linkage->word[leftmost], ",") == 0)
779 				ctxt->constituent[c].left++;
780 			c = add_constituent(ctxt, c, linkage, &domain, 1, linkage->num_words-1, "S");
781 		}
782 		if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) ||
783 			(domain.type=='f'))
784 		{
785 			w = ctxt->constituent[c].left;
786 			if (strcmp(linkage->word[w], ",") == 0)
787 				w++;
788 			if (strcmp(linkage->word[w], "when") == 0)
789 			{
790 				c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHADVP");
791 			}
792 		}
793 		if (domain.type=='t')
794 		{
795 			c = add_constituent(ctxt, c, linkage, &domain, leftmost, rightmost, "S");
796 		}
797 		if ((post_process_match("QI", ctxt->constituent[c].start_link) == 1) ||
798 			(post_process_match("Mr", ctxt->constituent[c].start_link) == 1) ||
799 			(post_process_match("MX#d", ctxt->constituent[c].start_link) == 1))
800 		{
801 			const char * name = "";
802 			w = leftmost;
803 			if (strcmp(linkage->word[w], ",") == 0) w++;
804 			if (ctxt->wordtype[w] == NONE)
805 				name = "WHADVP";
806 			else if (ctxt->wordtype[w] == QTYPE)
807 				name = "WHNP";
808 			else if (ctxt->wordtype[w] == QDTYPE)
809 				name = "WHNP";
810 			else
811 				assert(0, "Unexpected word type");
812 			c = add_constituent(ctxt, c, linkage, &domain, w, w, name);
813 
814 			if (ctxt->wordtype[w] == QDTYPE)
815 			{
816 				/* Now find the finite verb to the right, start an S */
817 				/* Limit w2 to sentence length. */
818 				// for( w2=w+1; w2 < ctxt->r_limit-1; w2++ )
819 				for (w2 = w+1; w2 < rightmost; w2++)
820 				  if ((ctxt->wordtype[w2] == STYPE) || (ctxt->wordtype[w2] == PTYPE)) break;
821 
822 				/* Adjust the right boundary of previous constituent */
823 				ctxt->constituent[c].right = w2 - 1;
824 				c = add_constituent(ctxt, c, linkage, &domain, w2, rightmost, "S");
825 			}
826 		}
827 
828 		if (ctxt->constituent[c].domain_type == '\0')
829 		{
830 			err_ctxt ec = { linkage->sent };
831 			err_msgc(&ec, lg_Error, "No domain type assigned to constituent\n");
832 		}
833 		if (ctxt->constituent[c].start_link == NULL)
834 		{
835 			err_ctxt ec = { linkage->sent };
836 			err_msgc(&ec, lg_Error, "No type assigned to constituent\n");
837 		}
838 	}
839 
840 	numcon_subl = c - numcon_total;
841 	/* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl);  */
842 
843 	if (verbosity_level(D_CONST))
844 	{
845 		err_msg(lg_Debug, "Constituents added at first stage:\n\\");
846 		for (c = numcon_total; c < numcon_total + numcon_subl; c++)
847 		{
848 			/* FIXME: Here it cannot be printed as one debug message because
849 			 * a newline is printed at the end. */
850 			print_constituent(ctxt, linkage, c);
851 		}
852 	}
853 
854 	/* Opener case - generates S around main clause.
855 	   (This must be done first; the S generated will be needed for
856 	   later cases.) */
857 	numcon_subl =
858 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", CASE_OPENER);
859 
860 	/* pp opener case */
861 	numcon_subl =
862 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", CASE_PPOPEN);
863 
864 	/* participle opener case */
865 	numcon_subl =
866 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", CASE_PART_OPEN);
867 
868 	/* Subject-phrase case; every main VP generates an S */
869 	numcon_subl =
870 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", CASE_S);
871 
872 	/* Relative clause case; an SBAR generates a complement NP */
873 	numcon_subl =
874 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", CASE_REL_CLAUSE);
875 
876 	/* Participle modifier case */
877 	numcon_subl =
878 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", CASE_PART_MOD);
879 
880 	/* PP modifying NP */
881 	numcon_subl =
882 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", CASE_PART_MOD);
883 
884 	/* Appositive case */
885 	numcon_subl =
886 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", CASE_APPOS);
887 
888 	/* S-V inversion case; an NP generates a complement VP */
889 	numcon_subl =
890 		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", CASE_SVINV);
891 
892 	adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);
893 	for (c = numcon_total; c < numcon_total + numcon_subl; c++)
894 	{
895 		if ((ctxt->constituent[c].domain_type=='p') &&
896 			(strcmp(linkage->word[ctxt->constituent[c].left], ",")==0))
897 		{
898 			ctxt->constituent[c].left++;
899 		}
900 	}
901 
902 	/* Make sure the constituents are nested. If two constituents
903 	 * are not nested: whichever constituent has the furthest left
904 	 * boundary, shift that boundary rightwards to the left boundary
905 	 * of the other one.
906 	 */
907 	while (true)
908 	{
909 		bool adjustment_made = false;
910 		for (c = numcon_total; c < numcon_total + numcon_subl; c++)
911 		{
912 			for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++)
913 			{
914 				if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
915 					(ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
916 					(ctxt->constituent[c].right >= ctxt->constituent[c2].left))
917 				{
918 					/* We've found two overlapping constituents.
919 					   If one is larger, except the smaller one
920 					   includes an extra comma, adjust the smaller one
921 					   to exclude the comma */
922 
923 					if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",") == 0) ||
924 						(strcmp(linkage->word[ctxt->constituent[c2].right],
925 								RIGHT_WALL_WORD) == 0))
926 					{
927 						if (verbosity_level(D_CONST))
928 							err_msg(lg_Debug, "Adjusting %d to fix comma overlap\n", c2);
929 						adjust_for_right_comma(ctxt, linkage, c2);
930 						adjustment_made = true;
931 					}
932 					else if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
933 					{
934 						if (verbosity_level(D_CONST))
935 							err_msg(lg_Debug, "Adjusting c %d to fix comma overlap\n", c);
936 						adjust_for_left_comma(ctxt, linkage, c);
937 						adjustment_made = true;
938 					}
939 					else
940 					{
941 						if (verbosity_level(D_CONST))
942 						{
943 							err_ctxt ec = { linkage->sent };
944 							err_msgc(&ec, lg_Warn,
945 							      "Warning: the constituents aren't nested! "
946 							      "Adjusting them. (%d, %d)\n", c, c2);
947 					  }
948 					  ctxt->constituent[c].left = ctxt->constituent[c2].left;
949 					}
950 				}
951 			}
952 		}
953 		if (adjustment_made == false) break;
954 	}
955 
956 	assert (numcon_total + numcon_subl < ctxt->conlen, "Too many constituents");
957 	return numcon_subl;
958 }
959 
960 static char *
exprint_constituent_structure(con_context_t * ctxt,Linkage linkage,int numcon_total)961 exprint_constituent_structure(con_context_t *ctxt,
962                               Linkage linkage, int numcon_total)
963 {
964 	size_t w;
965 	int c;
966 	bool *leftdone = alloca(numcon_total * sizeof(bool));
967 	bool *rightdone = alloca(numcon_total * sizeof(bool));
968 	int best, bestright, bestleft;
969 	char s[MAX_WORD];
970 	dyn_str * cs = dyn_str_new();
971 
972 	assert (numcon_total < ctxt->conlen, "Too many constituents (b)");
973 
974 	for (c = 0; c < numcon_total; c++)
975 	{
976 		leftdone[c] = false;
977 		rightdone[c] = false;
978 	}
979 
980 	/* Skip left wall; don't skip right wall, since it may
981 	 * have constituent boundaries. */
982 	for (w = 1; w < linkage->num_words; w++)
983 	{
984 		while (1)
985 		{
986 			best = -1;
987 			bestright = -1;
988 			for (c = 0; c < numcon_total; c++)
989 			{
990 				if ((ctxt->constituent[c].left == w) &&
991 					(leftdone[c] == false) && ctxt->constituent[c].valid &&
992 					((int) ctxt->constituent[c].right >= bestright))
993 				{
994 					best = c;
995 					bestright = ctxt->constituent[c].right;
996 				}
997 			}
998 			if (best == -1)
999 				break;
1000 
1001 			leftdone[best] = true;
1002 			dyn_strcat(cs, "[");
1003 			dyn_strcat(cs, ctxt->constituent[best].type);
1004 			dyn_strcat(cs, " ");
1005 		}
1006 
1007 		/* Don't print out right wall */
1008 		if (w < linkage->num_words - 1)
1009 		{
1010 			char *p;
1011 			strncpy(s, linkage->word[w], MAX_WORD);
1012 			s[MAX_WORD-1] = 0;
1013 
1014 			/* Constituent processing will crash if the sentence contains
1015 			 * square brackets, so we have to do something ... replace
1016 			 * them with curly braces ... this is a terrible hack, but
1017 			 * will have to do; for now.  A better solution would be to
1018 			 * allow the user to specify some reserved char as the
1019 			 * bracket symbol, e.g. SOH and EOT or something like that.
1020 			 */
1021 			p = strchr(s, OPEN_BRACKET);
1022 			while (p)
1023 			{
1024 				*p = '{';
1025 				p = strchr(p, OPEN_BRACKET);
1026 			}
1027 
1028 			p = strchr(s, CLOSE_BRACKET);
1029 			while (p)
1030 			{
1031 				*p = '}';
1032 				p = strchr(p, CLOSE_BRACKET);
1033 			}
1034 
1035 #if 0 /* firstupper check removed in 0c8107a */
1036 			/* Now, if the first character of the word was
1037 			   originally uppercase, we put it back that way */
1038 			if (linkage->chosen_disjuncts[w]->word[0]->status & WS_FIRSTUPPER)
1039 				upcase_utf8_str(s, s, MAX_WORD);
1040 #endif
1041 			dyn_strcat(cs, s);
1042 			dyn_strcat(cs, " ");
1043 		}
1044 
1045 		while (1)
1046 		{
1047 			best = -1;
1048 			bestleft = -1;
1049 			for(c = 0; c < numcon_total; c++)
1050 			{
1051 				if ((ctxt->constituent[c].right == w) &&
1052 					(rightdone[c] == false) && ctxt->constituent[c].valid &&
1053 					((int) ctxt->constituent[c].left > bestleft)) {
1054 					best = c;
1055 					bestleft = ctxt->constituent[c].left;
1056 				}
1057 			}
1058 			if (best == -1)
1059 				break;
1060 			rightdone[best] = true;
1061 			dyn_strcat(cs, ctxt->constituent[best].type);
1062 			dyn_strcat(cs, "] ");
1063 		}
1064 	}
1065 
1066 	dyn_strcat(cs, "\n");
1067 	return dyn_str_take(cs);
1068 }
1069 
do_print_flat_constituents(con_context_t * ctxt,Linkage linkage)1070 static char * do_print_flat_constituents(con_context_t *ctxt, Linkage linkage)
1071 {
1072 	int numcon_total= 0, numcon_subl;
1073 	char * q;
1074 	Sentence sent = linkage->sent;
1075 
1076 	ctxt->phrase_ss = string_set_create();
1077 	generate_misc_word_info(ctxt, linkage);
1078 
1079 	if (NULL ==  sent->constituent_pp)         /* First time for this sentence */
1080 		sent->constituent_pp = post_process_new(sent->dict->hpsg_knowledge);
1081 
1082 	do_post_process(sent->constituent_pp, linkage, linkage->is_sent_long);
1083 
1084 	/** No-op. If we wanted to debug domain names, we could do this...
1085 	 * linkage_free_pp_info(linkage);
1086 	 * linkage_set_domain_names(sent->constituent_pp, linkage);
1087 	 */
1088 	numcon_subl = read_constituents_from_domains(ctxt, linkage, numcon_total);
1089 	numcon_total += numcon_subl;
1090 	assert (numcon_total < ctxt->conlen, "Too many constituents (c)");
1091 	numcon_total = merge_constituents(ctxt, linkage, numcon_total);
1092 	assert (numcon_total < ctxt->conlen, "Too many constituents (d)");
1093 	numcon_total = new_style_conjunctions(ctxt, linkage, numcon_total);
1094 	assert (numcon_total < ctxt->conlen, "Too many constituents (e)");
1095 	numcon_total = last_minute_fixes(ctxt, linkage, numcon_total);
1096 	assert (numcon_total < ctxt->conlen, "Too many constituents (f)");
1097 	q = exprint_constituent_structure(ctxt, linkage, numcon_total);
1098 	string_set_delete(ctxt->phrase_ss);
1099 	ctxt->phrase_ss = NULL;
1100 
1101 	post_process_free_data(&sent->constituent_pp->pp_data);
1102 
1103 	return q;
1104 }
1105 
print_flat_constituents(Linkage linkage)1106 static char * print_flat_constituents(Linkage linkage)
1107 {
1108 	size_t wts = linkage->num_words * sizeof(WType);
1109 	size_t cns = (linkage->num_links + linkage->num_words) * sizeof(constituent_t);
1110 
1111 	con_context_t ctxt;
1112 	memset(&ctxt, 0, sizeof(con_context_t));
1113 	ctxt.wordtype = (WType *) alloca(wts);
1114 	memset(ctxt.wordtype, 0, wts);
1115 	ctxt.conlen = linkage->num_links + linkage->num_words;
1116 	ctxt.constituent = (constituent_t *) alloca(cns);
1117 	memset(ctxt.constituent, 0, cns);
1118 
1119 	return do_print_flat_constituents(&ctxt, linkage);
1120 }
1121 
token_type(char * token)1122 static CType token_type (char *token)
1123 {
1124 	if ((token[0] == OPEN_BRACKET) && (strlen(token) > 1))
1125 		return OPEN_TOK;
1126 	if ((strlen(token) > 1) && (token[strlen(token) - 1] == CLOSE_BRACKET))
1127 		return CLOSE_TOK;
1128 	return WORD_TOK;
1129 }
1130 
make_CNode(char * q)1131 static CNode * make_CNode(char *q)
1132 {
1133 	CNode * cn;
1134 	cn = (CNode *) malloc(sizeof(CNode));
1135 	cn->label = strdup(q);
1136 	cn->child = cn->next = (CNode *) NULL;
1137 	cn->next = (CNode *) NULL;
1138 	cn->start = cn->end = -1;
1139 	return cn;
1140 }
1141 
parse_string(CNode * n,char ** saveptr)1142 static CNode * parse_string(CNode * n, char **saveptr)
1143 {
1144 	char *q;
1145 	CNode *m, *last_child=NULL;
1146 
1147 	while ((q = strtok_r(NULL, " ", saveptr))) {
1148 		switch (token_type(q)) {
1149 		case CLOSE_TOK :
1150 			q[strlen(q)-1]='\0';
1151 			assert(strcmp(q, n->label)==0,
1152 				   "Constituent tree: Labels do not match.");
1153 			return n;
1154 			break;
1155 		case OPEN_TOK:
1156 			m = make_CNode(q+1);
1157 			m = parse_string(m, saveptr);
1158 			break;
1159 		case WORD_TOK:
1160 			m = make_CNode(q);
1161 			break;
1162 		default:
1163 			assert(0, "Constituent tree: Illegal token type");
1164 		}
1165 		if (n->child == NULL) {
1166 			last_child = n->child = m;
1167 		}
1168 		else {
1169 			last_child->next = m;
1170 			last_child = m;
1171 		}
1172 	}
1173 	assert(0, "Constituent tree: Constituent did not close");
1174 	return NULL;
1175 }
1176 
print_tree(dyn_str * cs,int indent,CNode * n,int o1,int o2)1177 static void print_tree(dyn_str * cs, int indent, CNode * n, int o1, int o2)
1178 {
1179 	int i, child_offset;
1180 	CNode * m;
1181 
1182 	if (n == NULL) return;
1183 
1184 	if (indent)
1185 		for (i = 0; i < o1; ++i) dyn_strcat(cs, " ");
1186 
1187 	dyn_strcat(cs, "(");
1188 	dyn_strcat(cs, n->label);
1189 	dyn_strcat(cs, " ");
1190 	child_offset = o2 + strlen(n->label) + 2;
1191 
1192 	for (m = n->child; m != NULL; m = m->next)
1193 	{
1194 		if (m->child == NULL)
1195 		{
1196 			char * p;
1197 			/* If the original string has left or right parens in it,
1198 			 * the printed string will be messed up by these ...
1199 			 * so replace them by curly braces. What else can one do?
1200 			 */
1201 			p = strchr(m->label, '(');
1202 			while(p)
1203 			{
1204 				*p = '{';
1205 				p = strchr(p, '(');
1206 			}
1207 
1208 			p = strchr(m->label, ')');
1209 			while(p)
1210 			{
1211 				*p = '}';
1212 				p = strchr(p, ')');
1213 			}
1214 
1215 			dyn_strcat(cs, m->label);
1216 			if ((m->next != NULL) && (m->next->child == NULL))
1217 				dyn_strcat(cs, " ");
1218 		}
1219 		else
1220 		{
1221 			if (m != n->child)
1222 			{
1223 				if (indent) dyn_strcat(cs, "\n");
1224 				else dyn_strcat(cs, " ");
1225 				print_tree(cs, indent, m, child_offset, child_offset);
1226 			}
1227 			else
1228 			{
1229 				print_tree(cs, indent, m, 0, child_offset);
1230 			}
1231 			if ((m->next != NULL) && (m->next->child == NULL))
1232 			{
1233 				if (indent)
1234 				{
1235 					dyn_strcat(cs, "\n");
1236 					for (i = 0; i < child_offset; ++i)
1237 						dyn_strcat(cs, " ");
1238 				}
1239 				else dyn_strcat(cs, " ");
1240 			}
1241 		}
1242 	}
1243 	dyn_strcat(cs, ")");
1244 }
1245 
assign_spans(CNode * n,int start)1246 static int assign_spans(CNode * n, int start)
1247 {
1248 	int num_words=0;
1249 	CNode * m=NULL;
1250 	if (n==NULL) return 0;
1251 	n->start = start;
1252 	if (n->child == NULL) {
1253 		n->end = start;
1254 		return 1;
1255 	}
1256 	else {
1257 		for (m=n->child; m!=NULL; m=m->next) {
1258 			num_words += assign_spans(m, start+num_words);
1259 		}
1260 		n->end = start+num_words-1;
1261 	}
1262 	return num_words;
1263 }
1264 
linkage_constituent_tree(Linkage linkage)1265 static CNode * linkage_constituent_tree(Linkage linkage)
1266 {
1267 	char *p, *q, *saveptr;
1268 	CNode * root;
1269 
1270 	p = print_flat_constituents(linkage);
1271 
1272 	q = strtok_r(p, " ", &saveptr);
1273 	assert(token_type(q) == OPEN_TOK, "Illegal beginning of string");
1274 	root = make_CNode(q+1);
1275 	root = parse_string(root, &saveptr);
1276 	assign_spans(root, 0);
1277 	free(p);
1278 	return root;
1279 }
1280 
1281 /* Make the compiler shut up about the deprecated functions */
1282 /*
1283 #pragma GCC diagnostic push
1284 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1285 */
1286 
linkage_free_constituent_tree(CNode * n)1287 static void linkage_free_constituent_tree(CNode * n)
1288 {
1289 	CNode *m, *x;
1290 	for (m=n->child; m!=NULL; m=x) {
1291 		x=m->next;
1292 		linkage_free_constituent_tree(m);
1293 	}
1294 	free(n->label);
1295 	free(n);
1296 }
1297 
1298 /**
1299  * Print out the constituent tree.
1300  * mode 1: treebank-style constituent tree
1301  * mode 2: flat, bracketed tree [A like [B this B] A]
1302  * mode 3: flat, treebank-style tree (A like (B this))
1303  */
linkage_print_constituent_tree(Linkage linkage,ConstituentDisplayStyle mode)1304 char * linkage_print_constituent_tree(Linkage linkage, ConstituentDisplayStyle mode)
1305 {
1306 	CNode * root;
1307 
1308 	if (!linkage) return NULL;
1309 	if (!linkage->sent->dict->hpsg_knowledge) return NULL;
1310 	if (mode == NO_DISPLAY)
1311 	{
1312 		return NULL;
1313 	}
1314 	else if (mode == MULTILINE || mode == SINGLE_LINE)
1315 	{
1316 		dyn_str * cs;
1317 		cs = dyn_str_new();
1318 		root = linkage_constituent_tree(linkage);
1319 		print_tree(cs, (mode==1), root, 0, 0);
1320 		linkage_free_constituent_tree(root);
1321 		dyn_strcat(cs, "\n");
1322 		return dyn_str_take(cs);
1323 	}
1324 	else if (mode == BRACKET_TREE)
1325 	{
1326 		return print_flat_constituents(linkage);
1327 	}
1328 	prt_error("Warning: Illegal mode %u for printing constituents\n"
1329 	          "Allowed values: %d to %d\n", mode, NO_DISPLAY, MAX_STYLES);
1330 	return NULL;
1331 }
1332 
linkage_free_constituent_tree_str(char * s)1333 void linkage_free_constituent_tree_str(char * s)
1334 {
1335 	free(s);
1336 }
1337