1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright (c) 2010, 2014 Linas Vepstas                                */
5 /* All rights reserved                                                   */
6 /*                                                                       */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software.    */
9 /* This license allows free redistribution and use in source and binary  */
10 /* forms, with or without modification, subject to certain conditions.   */
11 /*                                                                       */
12 /*************************************************************************/
13 
14 #include <limits.h>                     // INT_MAX
15 
16 #include "connectors.h"
17 #include "count.h"
18 #include "disjunct-utils.h"             // Disjunct
19 #include "extract-links.h"
20 #include "fast-match.h"
21 #include "utilities.h"                  // Windows rand_r()
22 #include "linkage/linkage.h"
23 #include "tokenize/word-structures.h"   // Word_Struct
24 
25 //#define RECOUNT
26 
27 //#define DEBUG_X_TABLE
28 #ifdef DEBUG_X_TABLE
29 #undef DEBUG_X_TABLE
30 #define DEBUG_X_TABLE(...) __VA_ARGS__
31 #else
32 #define DEBUG_X_TABLE(...)
33 #endif
34 
35 typedef struct Parse_choice_struct Parse_choice;
36 
37 /* The parse_choice is used to extract links for a given parse */
38 typedef struct Parse_set_struct Parse_set;
39 struct Parse_choice_struct
40 {
41 	Parse_choice * next;
42 	Parse_set * set[2];
43 	Link        link[2]; /* the lc fields of these is NULL if there is no link used */
44 	Disjunct    *md;     /* the chosen disjunct for the middle word */
45 };
46 
47 struct Parse_set_struct
48 {
49 	short          lw, rw; /* left and right word index */
50 	unsigned int   null_count; /* number of island words */
51 	int            l_id, r_id; /* pending, unconnected connectors */
52 
53 	s64 count;      /* The number of ways to parse. */
54 #ifdef RECOUNT
55 	s64 recount;  /* Exactly the same as above, but counted at a later stage. */
56 	s64 cut_count;  /* Count only low-cost parses, i.e. below the cost cutoff */
57 	//double cost_cutoff;
58 #undef RECOUNT
59 #define RECOUNT(X) X
60 #else
61 #define RECOUNT(X)  /* Make it disappear... */
62 #endif
63 	Parse_choice * first;
64 	Parse_choice * tail;
65 };
66 
67 typedef struct Pset_bucket_struct Pset_bucket;
68 struct Pset_bucket_struct
69 {
70 	Parse_set set;
71 	Pset_bucket *next;
72 };
73 
74 struct extractor_s
75 {
76 	unsigned int   x_table_size;
77 	unsigned int   log2_x_table_size;
78 	Pset_bucket ** x_table;  /* Hash table */
79 	Parse_set *    parse_set;
80 	Word           *words;
81 	bool           islands_ok;
82 	bool           sort_match_list;
83 
84 	/* thread-safe random number state */
85 	unsigned int rand_state;
86 };
87 
88 /**
89  * The first thing we do is we build a data structure to represent the
90  * result of the entire parse search.  There will be a set of nodes
91  * built for each call to the count() function that returned a non-zero
92  * value, AND which is part of a valid linkage.  Each of these nodes
93  * represents a valid continuation, and contains pointers to two other
94  * sets (one for the left continuation and one for the right
95  * continuation).
96  */
97 
free_set(Parse_set * s)98 static void free_set(Parse_set *s)
99 {
100 	Parse_choice *p, *xp;
101 	if (s == NULL) return;
102 	for (p=s->first; p != NULL; p = xp)
103 	{
104 		xp = p->next;
105 		xfree((void *)p, sizeof(*p));
106 	}
107 }
108 
109 static Parse_choice *
make_choice(Parse_set * lset,Connector * llc,Connector * lrc,Parse_set * rset,Connector * rlc,Connector * rrc,Disjunct * md)110 make_choice(Parse_set *lset, Connector * llc, Connector * lrc,
111             Parse_set *rset, Connector * rlc, Connector * rrc,
112             Disjunct *md)
113 {
114 	Parse_choice *pc;
115 	pc = (Parse_choice *) xalloc(sizeof(*pc));
116 	pc->next = NULL;
117 	pc->set[0] = lset;
118 	pc->link[0].link_name = NULL;
119 	pc->link[0].lw = lset->lw;
120 	pc->link[0].rw = lset->rw;
121 	pc->link[0].lc = llc;
122 	pc->link[0].rc = lrc;
123 	pc->set[1] = rset;
124 	pc->link[1].link_name = NULL;
125 	pc->link[1].lw = rset->lw;
126 	pc->link[1].rw = rset->rw;
127 	pc->link[1].lc = rlc;
128 	pc->link[1].rc = rrc;
129 	pc->md = md;
130 	return pc;
131 }
132 
133 /**
134  * Put this parse_choice into a given set.  The tail pointer is always
135  * left pointing to the end of the list.
136  */
put_choice_in_set(Parse_set * s,Parse_choice * pc)137 static void put_choice_in_set(Parse_set *s, Parse_choice *pc)
138 {
139 	if (s->first == NULL)
140 	{
141 		s->first = pc;
142 	}
143 	else
144 	{
145 		s->tail->next = pc;
146 	}
147 	s->tail = pc;
148 	pc->next = NULL;
149 }
150 
record_choice(Parse_set * lset,Connector * llc,Connector * lrc,Parse_set * rset,Connector * rlc,Connector * rrc,Disjunct * md,Parse_set * s)151 static void record_choice(
152     Parse_set *lset, Connector * llc, Connector * lrc,
153     Parse_set *rset, Connector * rlc, Connector * rrc,
154     Disjunct *md, Parse_set *s)
155 {
156 	put_choice_in_set(s, make_choice(lset, llc, lrc,
157 	                                 rset, rlc, rrc,
158 	                                 md));
159 }
160 
161 /**
162  * Allocate the parse info struct
163  *
164  * A piecewise exponential function determines the size of the hash
165  * table.  Probably should make use of the actual number of disjuncts,
166  * rather than just the number of words.
167  */
extractor_new(int nwords,unsigned int ranstat)168 extractor_t * extractor_new(int nwords, unsigned int ranstat)
169 {
170 	int log2_table_size;
171 	extractor_t * pex;
172 
173 	pex = (extractor_t *) xalloc(sizeof(extractor_t));
174 	memset(pex, 0, sizeof(extractor_t));
175 	pex->rand_state = ranstat;
176 
177 	/* Alloc the x_table */
178 	if (nwords > 96) {
179 		log2_table_size = 15 + nwords / 48;
180 	} else if (nwords >= 10) {
181 		log2_table_size = 14 + nwords / 24;
182 	} else if (nwords >= 4) {
183 		log2_table_size = 1 + nwords + nwords / 2;
184 	} else {
185 		log2_table_size = 5;
186 	}
187 	/* if (log2_table_size > 21) log2_table_size = 21; */
188 	pex->log2_x_table_size = log2_table_size;
189 	pex->x_table_size = (1 << log2_table_size);
190 
191 	DEBUG_X_TABLE(
192 		printf("Allocating x_table of size %u (nwords %d)\n",
193 		       pex->x_table_size, nwords);
194 	)
195 	pex->x_table = (Pset_bucket**) xalloc(pex->x_table_size * sizeof(Pset_bucket*));
196 	memset(pex->x_table, 0, pex->x_table_size * sizeof(Pset_bucket*));
197 
198 	return pex;
199 }
200 
201 /**
202  * This is the function that should be used to free the set structure. Since
203  * it's a dag, a recursive free function won't work.  Every time we create
204  * a set element, we put it in the hash table, so this is OK.
205  */
free_extractor(extractor_t * pex)206 void free_extractor(extractor_t * pex)
207 {
208 	unsigned int i;
209 	Pset_bucket *t, *x;
210 	if (!pex) return;
211 
212 	DEBUG_X_TABLE(int N = 0;)
213 	for (i=0; i<pex->x_table_size; i++)
214 	{
215 		DEBUG_X_TABLE(int c = 0;)
216 		for (t = pex->x_table[i]; t!= NULL; t=x)
217 		{
218 			DEBUG_X_TABLE(c++;)
219 			x = t->next;
220 			free_set(&t->set);
221 			xfree((void *) t, sizeof(Pset_bucket));
222 		}
223 		DEBUG_X_TABLE(
224 			if (c > 0)
225 				;//printf("I %d: chain %d\n", i, c);
226 			else
227 				N++;
228 		)
229 	}
230 	DEBUG_X_TABLE(
231 		printf("Used x_table %u/%u %.2f%%\n",
232 				 pex->x_table_size-N, pex->x_table_size,
233 				 100.0f*(pex->x_table_size-N)/pex->x_table_size);
234 	)
235 	pex->parse_set = NULL;
236 
237 	//printf("Freeing x_table of size %d\n", pex->x_table_size);
238 	xfree((void *) pex->x_table, pex->x_table_size * sizeof(Pset_bucket*));
239 	pex->x_table_size = 0;
240 	pex->x_table = NULL;
241 
242 	xfree((void *) pex, sizeof(extractor_t));
243 }
244 
245 /**
246  * Returns the pointer to this info, NULL if not there.
247  * Note that there is no need to use (lw, rw) as keys because tracon_id
248  * values are not shared between words.
249  */
x_table_pointer(int lw,int rw,Connector * le,Connector * re,unsigned int null_count,extractor_t * pex)250 static Pset_bucket * x_table_pointer(int lw, int rw,
251                               Connector *le, Connector *re,
252                               unsigned int null_count, extractor_t * pex)
253 {
254 	Pset_bucket *t;
255 	int l_id = (NULL != le) ? le->tracon_id : lw;
256 	int r_id = (NULL != re) ? re->tracon_id : rw;
257 	t = pex->x_table[pair_hash(pex->x_table_size, lw, rw, l_id, r_id, null_count)];
258 
259 	for (; t != NULL; t = t->next) {
260 		if ((t->set.l_id == l_id) && (t->set.r_id == r_id) &&
261 		    (t->set.null_count == null_count)) return t;
262 	}
263 	return NULL;
264 }
265 
266 /**
267  * Stores the value in the x_table.  Assumes it's not already there.
268  */
x_table_store(int lw,int rw,Connector * le,Connector * re,unsigned int null_count,extractor_t * pex)269 static Pset_bucket * x_table_store(int lw, int rw,
270                                   Connector *le, Connector *re,
271                                   unsigned int null_count, extractor_t * pex)
272 {
273 	Pset_bucket *t, *n;
274 	unsigned int h;
275 
276 	n = (Pset_bucket *) xalloc(sizeof(Pset_bucket));
277 	n->set.lw = lw;
278 	n->set.rw = rw;
279 	n->set.null_count = null_count;
280 	n->set.l_id = (NULL != le) ? le->tracon_id : lw;
281 	n->set.r_id = (NULL != re) ? re->tracon_id : rw;
282 	n->set.count = 0;
283 	n->set.first = NULL;
284 	n->set.tail = NULL;
285 
286 	h = pair_hash(pex->x_table_size, lw, rw, n->set.l_id, n->set.r_id, null_count);
287 	t = pex->x_table[h];
288 	n->next = t;
289 	pex->x_table[h] = n;
290 	return n;
291 }
292 
293 /** Create a bogus parse set that only holds lw, rw. */
dummy_set(int lw,int rw,unsigned int null_count,extractor_t * pex)294 static Parse_set* dummy_set(int lw, int rw,
295                             unsigned int null_count, extractor_t * pex)
296 {
297 	Pset_bucket *dummy;
298 	dummy = x_table_pointer(lw, rw, NULL, NULL, null_count, pex);
299 	if (dummy) return &dummy->set;
300 
301 	dummy = x_table_store(lw, rw, NULL, NULL, null_count, pex);
302 	dummy->set.count = 1;
303 	return &dummy->set;
304 }
305 
cost_compare(const void * a,const void * b)306 static int cost_compare(const void *a, const void *b)
307 {
308 	const Disjunct * const * da = a;
309 	const Disjunct * const * db = b;
310 	if ((*da)->cost < (*db)->cost) return -1;
311 	if ((*da)->cost > (*db)->cost) return 1;
312 	return 0;
313 }
314 
315 /**
316  * Sort the match-list into ascending disjunct cost. The goal here
317  * is to issue the lowest-cost disjuncts first, so that the parse
318  * set ends up quasi-sorted. This is not enough to get us a totally
319  * sorted parse set, but it does guarantee that at least the very
320  * first parse really will be the lowest cost.
321  */
sort_match_list(fast_matcher_t * mchxt,size_t mlb)322 static void sort_match_list(fast_matcher_t *mchxt, size_t mlb)
323 {
324 	size_t i = mlb;
325 
326 	while (get_match_list_element(mchxt, i) != NULL)
327 		i++;
328 
329 	if (i - mlb < 2) return;
330 
331 	qsort(get_match_list(mchxt, mlb), i - mlb, sizeof(Disjunct *), cost_compare);
332 }
333 
334 /**
335  * returns NULL if there are no ways to parse, or returns a pointer
336  * to a set structure representing all the ways to parse.
337  *
338  * This code is similar to do_count() in count.c -- for a good reason:
339  * the do_count() function did a full parse, but didn't actually
340  * allocate a memory structures to hold the parse.  This also does
341  * a full parse, but it also allocates and fills out the various
342  * parse structures.
343  */
344 static
mk_parse_set(fast_matcher_t * mchxt,count_context_t * ctxt,int lw,int rw,Connector * le,Connector * re,unsigned int null_count,extractor_t * pex)345 Parse_set * mk_parse_set(fast_matcher_t *mchxt,
346                  count_context_t * ctxt,
347                  int lw, int rw,
348                  Connector *le, Connector *re, unsigned int null_count,
349                  extractor_t * pex)
350 {
351 	int start_word, end_word, w;
352 	Pset_bucket *xt;
353 	Count_bin * count;
354 
355 	assert(null_count < 0x7fff, "mk_parse_set() called with null_count < 0.");
356 
357 	count = table_lookup(ctxt, lw, rw, le, re, null_count);
358 
359 	/* If there's no counter, then there's no way to parse. */
360 	if (NULL == count) return NULL;
361 	if (hist_total(count) == 0) return NULL;
362 
363 	xt = x_table_pointer(lw, rw, le, re, null_count, pex);
364 
365 	/* Perhaps we've already computed it; if so, return it. */
366 	if (xt != NULL) return &xt->set;
367 
368 	/* Start it out with the empty set of parse choices. */
369 	/* This entry must be updated before we return. */
370 	xt = x_table_store(lw, rw, le, re, null_count, pex);
371 
372 	/* The count we previously computed; it's non-zero. */
373 	xt->set.count = hist_total(count);
374 
375 	//#define NUM_PARSES 4
376 	// xt->set.cost_cutoff = hist_cost_cutoff(count, NUM_PARSES);
377 	// xt->set.cut_count = hist_cut_total(count, NUM_PARSES);
378 
379 	RECOUNT({xt->set.recount = 1;})
380 
381 	/* If the two words are next to each other, the count == 1 */
382 	if (lw + 1 == rw) return &xt->set;
383 
384 	/* The left and right connectors are null, but the two words are
385 	 * NOT next to each-other.  */
386 	if ((le == NULL) && (re == NULL))
387 	{
388 		Parse_set* pset;
389 		Parse_set* dummy;
390 		Disjunct* dis;
391 
392 		if (!pex->islands_ok && (lw != -1) && (pex->words[lw].d != NULL))
393 			return &xt->set;
394 		if (null_count == 0) return &xt->set;
395 
396 		RECOUNT({xt->set.recount = 0;})
397 
398 		w = lw + 1;
399 		for (int opt = 0; opt <= (int)pex->words[w].optional; opt++)
400 		{
401 			null_count += opt;
402 			for (dis = pex->words[w].d; dis != NULL; dis = dis->next)
403 			{
404 				if (dis->left == NULL)
405 				{
406 					pset = mk_parse_set(mchxt, ctxt,
407 											  w, rw, dis->right, NULL,
408 											  null_count-1, pex);
409 					if (pset == NULL) continue;
410 					dummy = dummy_set(lw, w, null_count-1, pex);
411 					record_choice(dummy, NULL, NULL,
412 									  pset, dis->right, NULL,
413 									  dis, &xt->set);
414 					RECOUNT({xt->set.recount += pset->recount;})
415 				}
416 			}
417 			pset = mk_parse_set(mchxt, ctxt,
418 									  w, rw, NULL, NULL,
419 									  null_count-1, pex);
420 			if (pset != NULL)
421 			{
422 				dummy = dummy_set(lw, w, null_count-1, pex);
423 				record_choice(dummy, NULL, NULL,
424 								  pset,  NULL, NULL,
425 								  NULL, &xt->set);
426 				RECOUNT({xt->set.recount += pset->recount;})
427 			}
428 		}
429 		return &xt->set;
430 	}
431 
432 	if (le == NULL)
433 	{
434 		start_word = MAX(lw+1, re->farthest_word);
435 	}
436 	else
437 	{
438 		start_word = le->nearest_word;
439 	}
440 
441 	if (re == NULL)
442 	{
443 		end_word = MIN(rw, le->farthest_word+1);
444 	}
445 	else
446 	{
447 		if ((le != NULL) && (re->nearest_word > le->farthest_word))
448 			end_word = le->farthest_word + 1;
449 		else
450 			end_word = re->nearest_word + 1;
451 	}
452 
453 	/* This condition can never be true here. It is included so GCC
454 	 * will be able to optimize the loop over "null_count".  Without
455 	 * this check, GCC thinks this loop may be an infinite loop and
456 	 * it may omit some optimizations. */
457 	if (UINT_MAX == null_count) return NULL;
458 
459 	RECOUNT({xt->set.recount = 0;})
460 	for (w = start_word; w < end_word; w++)
461 	{
462 		size_t mlb = form_match_list(mchxt, w, le, lw, re, rw);
463 		if (pex->sort_match_list) sort_match_list(mchxt, mlb);
464 
465 		for (size_t mle = mlb; get_match_list_element(mchxt, mle) != NULL; mle++)
466 		{
467 			Disjunct *d = get_match_list_element(mchxt, mle);
468 			bool Lmatch = d->match_left;
469 			bool Rmatch = d->match_right;
470 
471 			for (unsigned int lnull_count = 0; lnull_count <= null_count; lnull_count++)
472 			{
473 				int i, j;
474 				Parse_set *ls[4], *rs[4];
475 				bool ls_exists = false;
476 
477 				/* Here, lnull_count and rnull_count are the null_counts
478 				 * we're assigning to those parts respectively. */
479 				unsigned int rnull_count = null_count - lnull_count;
480 
481 				/* Now, we determine if (based on table only) we can see that
482 				   the current range is not parsable. */
483 
484 				for (i=0; i<4; i++) { ls[i] = rs[i] = NULL; }
485 				if (Lmatch)
486 				{
487 					ls[0] = mk_parse_set(mchxt, ctxt,
488 					             lw, w, le->next, d->left->next,
489 					             lnull_count, pex);
490 
491 					if (le->multi)
492 						ls[1] = mk_parse_set(mchxt, ctxt,
493 						              lw, w, le, d->left->next,
494 						              lnull_count, pex);
495 
496 					if (d->left->multi)
497 						ls[2] = mk_parse_set(mchxt, ctxt,
498 						              lw, w, le->next, d->left,
499 						              lnull_count, pex);
500 
501 					if (le->multi && d->left->multi)
502 						ls[3] = mk_parse_set(mchxt, ctxt,
503 						              lw, w, le, d->left,
504 						              lnull_count, pex);
505 
506 					ls_exists =
507 						ls[0] != NULL || ls[1] != NULL || ls[2] != NULL || ls[3] != NULL;
508 				}
509 
510 
511 				if (Rmatch && (ls_exists || le == NULL))
512 				{
513 					rs[0] = mk_parse_set(mchxt, ctxt,
514 					                 w, rw, d->right->next, re->next,
515 					                 rnull_count, pex);
516 
517 					if (d->right->multi)
518 						rs[1] = mk_parse_set(mchxt, ctxt,
519 					                 w, rw, d->right, re->next,
520 						              rnull_count, pex);
521 
522 					if (re->multi)
523 						rs[2] = mk_parse_set(mchxt, ctxt,
524 						              w, rw, d->right->next, re,
525 						              rnull_count, pex);
526 
527 					if (d->right->multi && re->multi)
528 						rs[3] = mk_parse_set(mchxt, ctxt,
529 						              w, rw, d->right, re,
530 						              rnull_count, pex);
531 				}
532 
533 				for (i=0; i<4; i++)
534 				{
535 					/* This ordering is probably not consistent with that
536 					 * needed to use list_links. (??) */
537 					if (ls[i] == NULL) continue;
538 					for (j=0; j<4; j++)
539 					{
540 						if (rs[j] == NULL) continue;
541 						record_choice(ls[i], le, d->left,
542 						              rs[j], d->right, re,
543 						              d, &xt->set);
544 						RECOUNT({xt->set.recount += ls[i]->recount * rs[j]->recount;})
545 					}
546 				}
547 
548 				if (ls_exists)
549 				{
550 					/* Evaluate using the left match, but not the right */
551 					Parse_set* rset = mk_parse_set(mchxt, ctxt,
552 					                        w, rw, d->right, re,
553 					                        rnull_count, pex);
554 					if (rset != NULL)
555 					{
556 						for (i=0; i<4; i++)
557 						{
558 							if (ls[i] == NULL) continue;
559 							/* this ordering is probably not consistent with
560 							 * that needed to use list_links */
561 							record_choice(ls[i], le, d->left,
562 							              rset,  NULL /* d->right */,
563 							              re,  /* the NULL indicates no link*/
564 							              d, &xt->set);
565 							RECOUNT({xt->set.recount += ls[i]->recount * rset->recount;})
566 						}
567 					}
568 				}
569 				else if ((le == NULL) && (rs[0] != NULL ||
570 				     rs[1] != NULL || rs[2] != NULL || rs[3] != NULL))
571 				{
572 					/* Evaluate using the right match, but not the left */
573 					Parse_set* lset = mk_parse_set(mchxt, ctxt,
574 					                        lw, w, le, d->left,
575 					                        lnull_count, pex);
576 
577 					if (lset != NULL)
578 					{
579 						for (j=0; j<4; j++)
580 						{
581 							if (rs[j] == NULL) continue;
582 							/* this ordering is probably not consistent with
583 							 * that needed to use list_links */
584 							record_choice(lset, NULL /* le */,
585 							              d->left,  /* NULL indicates no link */
586 							              rs[j], d->right, re,
587 							              d, &xt->set);
588 							RECOUNT({xt->set.recount += lset->recount * rs[j]->recount;})
589 						}
590 					}
591 				}
592 			}
593 		}
594 		pop_match_list(mchxt, mlb);
595 	}
596 	return &xt->set;
597 }
598 
599 /**
600  * Return TRUE if and only if an overflow in the number of parses
601  * occurred. Use a 64-bit int for counting.
602  */
set_node_overflowed(Parse_set * set)603 static bool set_node_overflowed(Parse_set *set)
604 {
605 	Parse_choice *pc;
606 	s64 n = 0;
607 	if (set == NULL || set->first == NULL) return false;
608 
609 	for (pc = set->first; pc != NULL; pc = pc->next)
610 	{
611 		n  += pc->set[0]->count * pc->set[1]->count;
612 		if (PARSE_NUM_OVERFLOW < n) return true;
613 	}
614 	return false;
615 }
616 
set_overflowed(extractor_t * pex)617 static bool set_overflowed(extractor_t * pex)
618 {
619 	unsigned int i;
620 
621 	assert(pex->x_table != NULL, "called set_overflowed with x_table==NULL");
622 	for (i=0; i<pex->x_table_size; i++)
623 	{
624 		Pset_bucket *t;
625 		for (t = pex->x_table[i]; t != NULL; t = t->next)
626 		{
627 			if (set_node_overflowed(&t->set)) return true;
628 		}
629 	}
630 	return false;
631 }
632 
633 /**
634  * This is the top level call that computes the whole parse_set.  It
635  * points whole_set at the result.  It creates the necessary hash
636  * table (x_table) which will be freed at the same time the
637  * whole_set is freed.
638  *
639  * This assumes that do_parse() has been run, and that the count_context
640  * is filled with the values thus computed.  This function is structured
641  * much like do_parse(), which wraps the main workhorse do_count().
642  *
643  * If the number of linkages gets huge, then the counts can overflow.
644  * We check if this has happened when verifying the parse set.
645  * This routine returns TRUE iff an overflow occurred.
646  */
647 
build_parse_set(extractor_t * pex,Sentence sent,fast_matcher_t * mchxt,count_context_t * ctxt,unsigned int null_count,Parse_Options opts)648 bool build_parse_set(extractor_t* pex, Sentence sent,
649                     fast_matcher_t *mchxt,
650                     count_context_t *ctxt,
651                     unsigned int null_count, Parse_Options opts)
652 {
653 	pex->words = sent->word;
654 	pex->islands_ok = opts->islands_ok;
655 	pex->sort_match_list = test_enabled("sort-match-list");
656 
657 	pex->parse_set =
658 		mk_parse_set(mchxt, ctxt,
659 		             -1, sent->length, NULL, NULL, null_count+1, pex);
660 
661 
662 	return set_overflowed(pex);
663 }
664 
665 // Cannot be static, also called by SAT-solver.
check_link_size(Linkage lkg)666 void check_link_size(Linkage lkg)
667 {
668 	if (lkg->lasz <= lkg->num_links)
669 	{
670 		lkg->lasz = 2 * lkg->lasz + 10;
671 		lkg->link_array = realloc(lkg->link_array, lkg->lasz * sizeof(Link));
672 	}
673 }
674 
675 /**
676  * Assemble the link array and the chosen_disjuncts of a linkage.
677  */
issue_link(Linkage lkg,bool lr,Disjunct * md,Link * link)678 static void issue_link(Linkage lkg, bool lr, Disjunct *md, Link *link)
679 {
680 	if (link->rc != NULL)
681 	{
682 		check_link_size(lkg);
683 		lkg->link_array[lkg->num_links] = *link;
684 		lkg->num_links++;
685 	}
686 
687 	lkg->chosen_disjuncts[lr ? link->lw : link->rw] = md;
688 }
689 
issue_links_for_choice(Linkage lkg,Parse_choice * pc)690 static void issue_links_for_choice(Linkage lkg, Parse_choice *pc)
691 {
692 	if (pc->link[0].lc != NULL) { /* there is a link to generate */
693 		issue_link(lkg, /*lr*/false, pc->md, &pc->link[0]);
694 	}
695 	if (pc->link[1].lc != NULL) {
696 		issue_link(lkg, /*lr*/true, pc->md, &pc->link[1]);
697 	}
698 }
699 
list_links(Linkage lkg,const Parse_set * set,int index)700 static void list_links(Linkage lkg, const Parse_set * set, int index)
701 {
702 	 Parse_choice *pc;
703 	 s64 n;
704 
705 	 if (set == NULL || set->first == NULL) return;
706 	 for (pc = set->first; pc != NULL; pc = pc->next) {
707 		  n = pc->set[0]->count * pc->set[1]->count;
708 		  if (index < n) break;
709 		  index -= n;
710 	 }
711 	 assert(pc != NULL, "walked off the end in list_links");
712 	 issue_links_for_choice(lkg, pc);
713 	 list_links(lkg, pc->set[0], index % pc->set[0]->count);
714 	 list_links(lkg, pc->set[1], index / pc->set[0]->count);
715 }
716 
list_random_links(Linkage lkg,unsigned int * rand_state,const Parse_set * set)717 static void list_random_links(Linkage lkg, unsigned int *rand_state,
718                               const Parse_set * set)
719 {
720 	Parse_choice *pc;
721 	int num_pc, new_index;
722 
723 	if (set == NULL || set->first == NULL) return;
724 
725 	/* Most of the times there is only one list element. */
726 	if (set->first->next == NULL)
727 	{
728 		pc = set->first;
729 	}
730 	else
731 	{
732 		num_pc = 0;
733 		for (pc = set->first; pc != NULL; pc = pc->next) {
734 			num_pc++;
735 		}
736 
737 		new_index = rand_r(rand_state) % num_pc;
738 
739 		num_pc = 0;
740 		for (pc = set->first; pc != NULL; pc = pc->next) {
741 			if (new_index == num_pc) break;
742 			num_pc++;
743 		}
744 	}
745 
746 	issue_links_for_choice(lkg, pc);
747 	list_random_links(lkg, rand_state, pc->set[0]);
748 	list_random_links(lkg, rand_state, pc->set[1]);
749 }
750 
751 /**
752  * Generate the list of all links of the index'th parsing of the
753  * sentence.  For this to work, you must have already called parse, and
754  * already built the whole_set.
755  */
extract_links(extractor_t * pex,Linkage lkg)756 void extract_links(extractor_t * pex, Linkage lkg)
757 {
758 	int index = lkg->lifo.index;
759 	if (index < 0)
760 	{
761 		bool repeatable = false;
762 		if (0 == pex->rand_state) repeatable = true;
763 		if (repeatable) pex->rand_state = index;
764 		list_random_links(lkg, &pex->rand_state, pex->parse_set);
765 		if (repeatable)
766 			pex->rand_state = 0;
767 		else
768 			lkg->sent->rand_state = pex->rand_state;
769 	}
770 	else {
771 		list_links(lkg, pex->parse_set, index);
772 	}
773 }
774