1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright 2018-2020, Amir Plivatsky                                  */
5 /* All rights reserved                                                   */
6 /*                                                                       */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software.    */
9 /* This license allows free redistribution and use in source and binary  */
10 /* forms, with or without modification, subject to certain conditions.   */
11 /*                                                                       */
12 /*************************************************************************/
13 #include <string.h>
14 
15 #include "api-structures.h"             // Sentence
16 #include "connectors.h"
17 #include "dict-common/dict-structures.h"
18 #include "dict-common/dict-utils.h"     // copy_Exp
19 #include "dict-common/regex-morph.h"    // match_regex
20 #include "disjunct-utils.h"
21 #include "memory-pool.h"
22 #include "prepare/build-disjuncts.h"
23 #include "print/print-util.h"
24 #include "tokenize/tok-structures.h"    // XXX TODO provide gword access methods!
25 #include "tokenize/word-structures.h"
26 #include "tracon-set.h"
27 #include "utilities.h"
28 
29 /* Disjunct utilities ... */
30 
31 #define D_DISJ 5                        /* Verbosity level for this file. */
32 
33 /**
34  * free_disjuncts() -- free the list of disjuncts pointed to by c
35  * (does not free any strings)
36  */
free_disjuncts(Disjunct * c)37 void free_disjuncts(Disjunct *c)
38 {
39 	Disjunct *c1;
40 	for (;c != NULL; c = c1) {
41 		c1 = c->next;
42 		free_connectors(c->left);
43 		free_connectors(c->right);
44 		xfree((char *)c, sizeof(Disjunct));
45 	}
46 }
47 
free_sentence_disjuncts(Sentence sent)48 void free_sentence_disjuncts(Sentence sent)
49 {
50 	if (NULL != sent->dc_memblock)
51 	{
52 		free(sent->dc_memblock);
53 		sent->dc_memblock = NULL;
54 	}
55 	else if (NULL != sent->Disjunct_pool)
56 	{
57 		pool_delete(sent->Disjunct_pool);
58 		pool_delete(sent->Connector_pool);
59 		sent->Disjunct_pool = NULL;
60 	}
61 }
62 
63 /**
64  * Destructively catenates the two disjunct lists d1 followed by d2.
65  * Doesn't change the contents of the disjuncts.
66  * Traverses the first list, but not the second.
67  */
catenate_disjuncts(Disjunct * d1,Disjunct * d2)68 Disjunct * catenate_disjuncts(Disjunct *d1, Disjunct *d2)
69 {
70 	Disjunct * dis = d1;
71 
72 	if (d1 == NULL) return d2;
73 	if (d2 == NULL) return d1;
74 	while (dis->next != NULL) dis = dis->next;
75 	dis->next = d2;
76 	return d1;
77 }
78 
79 /** Returns the number of disjuncts in the list pointed to by d */
count_disjuncts(Disjunct * d)80 unsigned int count_disjuncts(Disjunct * d)
81 {
82 	unsigned int count = 0;
83 	for (; d != NULL; d = d->next)
84 	{
85 		count++;
86 	}
87 	return count;
88 }
89 
90 /** Returns the number of connectors in the sentence. */
count_connectors(Sentence sent)91 static unsigned int count_connectors(Sentence sent)
92 {
93 	unsigned int ccnt = 0;
94 
95 	for (WordIdx w = 0; w < sent->length; w++)
96 	{
97 		for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next)
98 		{
99 			for (Connector *c = d->left; c != NULL; c = c->next) ccnt++;
100 			for (Connector *c = d->right; c !=NULL; c = c->next) ccnt++;
101 		}
102 	}
103 
104 	return ccnt;
105 }
106 /* ============================================================= */
107 
108 typedef struct disjunct_dup_table_s disjunct_dup_table;
109 struct disjunct_dup_table_s
110 {
111 	size_t dup_table_size;
112 	Disjunct *dup_table[];
113 };
114 
115 /**
116  * This is a hash function for disjuncts
117  *
118  * This is the old version that doesn't check for domination, just
119  * equality.
120  */
old_hash_disjunct(disjunct_dup_table * dt,Disjunct * d)121 static inline unsigned int old_hash_disjunct(disjunct_dup_table *dt, Disjunct * d)
122 {
123 	unsigned int i;
124 	i = 0;
125 	for (Connector *e = d->left; e != NULL; e = e->next) {
126 		i = (41 * (i + e->desc->uc_num)) + (unsigned int)e->desc->lc_letters + 7;
127 	}
128 	for (Connector *e = d->right; e != NULL; e = e->next) {
129 		i = (41 * (i + e->desc->uc_num)) + (unsigned int)e->desc->lc_letters + 7;
130 	}
131 #if 0 /* Redundant - the connector hashing has enough entropy. */
132 	i += string_hash(d->word_string);
133 #endif
134 	i += (i>>10);
135 
136 	d->dup_hash = i;
137 	return (i & (dt->dup_table_size-1));
138 }
139 
140 /**
141  * The connectors must be exactly equal.
142  */
connectors_equal_prune(Connector * c1,Connector * c2)143 static bool connectors_equal_prune(Connector *c1, Connector *c2)
144 {
145 	return c1->desc == c2->desc && (c1->multi == c2->multi);
146 }
147 
148 /** returns TRUE if the disjuncts are exactly the same */
disjuncts_equal(Disjunct * d1,Disjunct * d2)149 static bool disjuncts_equal(Disjunct * d1, Disjunct * d2)
150 {
151 	Connector *e1, *e2;
152 
153 	e1 = d1->left;
154 	e2 = d2->left;
155 	while ((e1 != NULL) && (e2 != NULL)) {
156 		if (!connectors_equal_prune(e1, e2)) return false;
157 		e1 = e1->next;
158 		e2 = e2->next;
159 	}
160 	if ((e1 != NULL) || (e2 != NULL)) return false;
161 
162 	e1 = d1->right;
163 	e2 = d2->right;
164 	while ((e1 != NULL) && (e2 != NULL)) {
165 		if (!connectors_equal_prune(e1, e2)) return false;
166 		e1 = e1->next;
167 		e2 = e2->next;
168 	}
169 	if ((e1 != NULL) || (e2 != NULL)) return false;
170 
171 	/* Save CPU time by comparing this last, since this will
172 	 * almost always be true. Rarely, the strings are not from
173 	 * the same string_set and hence the 2-step comparison. */
174 	if (d1->word_string == d2->word_string) return true;
175 	return (strcmp(d1->word_string, d2->word_string) == 0);
176 }
177 
178 #if 0
179 int de_fp = 0;
180 int de_total = 0;
181 static void disjuncts_equal_stat(void)
182 {
183 		fprintf(stderr, "disjuncts_equal FP %d/%d\n", de_fp, de_total);
184 }
185 
186 static bool disjuncts_equal(Disjunct * d1, Disjunct * d2)
187 {
188 	if (de_total == 0) atexit(disjuncts_equal_stat);
189 	de_total++;
190 
191 	bool rc = disjuncts_equal1(d1, d2);
192 	if (!rc) de_fp++;
193 
194 	return rc;
195 }
196 #endif
197 
disjunct_dup_table_new(size_t sz)198 static disjunct_dup_table * disjunct_dup_table_new(size_t sz)
199 {
200 	disjunct_dup_table *dt;
201 
202 	dt = malloc(sz * sizeof(Disjunct *) + sizeof(disjunct_dup_table));
203 	dt->dup_table_size = sz;
204 
205 	memset(dt->dup_table, 0, sz * sizeof(Disjunct *));
206 
207 	return dt;
208 }
209 
disjunct_dup_table_delete(disjunct_dup_table * dt)210 static void disjunct_dup_table_delete(disjunct_dup_table *dt)
211 {
212 	free(dt);
213 }
214 
215 #ifdef DEBUG
gword_set_len(const gword_set * gl)216 GNUC_UNUSED static int gword_set_len(const gword_set *gl)
217 {
218 	int len = 0;
219 	for (; NULL != gl; gl = gl->next) len++;
220 	return len;
221 }
222 #endif
223 
224 /**
225  * Return a new gword_set element, initialized from the given element.
226  * @param old_e Existing element.
227  */
gword_set_element_new(gword_set * old_e)228 static gword_set *gword_set_element_new(gword_set *old_e)
229 {
230 	gword_set *new_e = malloc(sizeof(gword_set));
231 	*new_e = (gword_set){0};
232 
233 	new_e->o_gword = old_e->o_gword;
234 	gword_set *chain_next = old_e->chain_next;
235 	old_e->chain_next = new_e;
236 	new_e->chain_next = chain_next;
237 
238 	return new_e;
239 }
240 
241 /**
242  * Add an element to existing gword_set. Uniqueness is assumed.
243  * @return A new set with the element.
244  */
gword_set_add(gword_set * gset,gword_set * ge)245 static gword_set *gword_set_add(gword_set *gset, gword_set *ge)
246 {
247 	gword_set *n = gword_set_element_new(ge);
248 	n->next = gset;
249 	gset = n;
250 
251 	return gset;
252 }
253 
254 /**
255  * Combine the given gword sets.
256  * The gword sets are not modified.
257  * This function is used for adding the gword pointers of an eliminated
258  * disjunct to the ones of the kept disjuncts, with no duplicates.
259  *
260  * @param kept gword_set of the kept disjunct.
261  * @param eliminated gword_set of the eliminated disjunct.
262  * @return Use copy-on-write semantics - the gword_set of the kept disjunct
263  * just gets returned if there is nothing to add to it. Else - a new gword
264  * set is returned.
265  */
gword_set_union(gword_set * kept,gword_set * eliminated)266 static gword_set *gword_set_union(gword_set *kept, gword_set *eliminated)
267 {
268 	/* Preserve the gword pointers of the eliminated disjunct if different. */
269 	gword_set *preserved_set = NULL;
270 	for (gword_set *e = eliminated; NULL != e; e = e->next)
271 	{
272 		gword_set *k;
273 
274 		/* Ensure uniqueness. */
275 		for (k = kept; NULL != k; k = k->next)
276 			if (e->o_gword == k->o_gword) break;
277 		if (NULL != k) continue;
278 
279 		preserved_set = gword_set_add(preserved_set, e);
280 	}
281 
282 	if (preserved_set)
283 	{
284 		/* Preserve the originating gword pointers of the remaining disjunct. */
285 		for (gword_set *k = kept; NULL != k; k = k->next)
286 			preserved_set = gword_set_add(preserved_set, k);
287 		kept = preserved_set;
288 	}
289 
290 	return kept;
291 }
292 
293 /**
294  * Takes the list of disjuncts pointed to by d, eliminates all
295  * duplicates, and returns a pointer to a new list.
296  */
eliminate_duplicate_disjuncts(Disjunct * dw)297 Disjunct *eliminate_duplicate_disjuncts(Disjunct *dw)
298 {
299 	unsigned int count = 0;
300 	disjunct_dup_table *dt;
301 	/* This initialization is unneeded because the first disjunct is never
302 	 * eliminated. However, omitting it generates "uninitialized" compiler
303 	 * warning. Setting it to NULL generates clang-analyzer error on
304 	 * possible NULL dereference. */
305 	Disjunct *prev = dw;
306 
307 	dt = disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(dw)));
308 
309 	for (Disjunct *d = dw; d != NULL; d = d->next)
310 	{
311 		Disjunct *dx;
312 		unsigned int h = old_hash_disjunct(dt, d);
313 
314 		for (dx = dt->dup_table[h]; dx != NULL; dx = dx->dup_table_next)
315 		{
316 			if (d->dup_hash != dx->dup_hash) continue;
317 			if (disjuncts_equal(dx, d)) break;
318 		}
319 
320 		if (dx != NULL)
321 		{
322 			/* Discard the current disjunct. */
323 			if (d->cost < dx->cost) dx->cost = d->cost;
324 
325 			dx->originating_gword =
326 				gword_set_union(dx->originating_gword, d->originating_gword);
327 
328 			count++;
329 			prev->next = d->next;
330 		}
331 		else
332 		{
333 			d->dup_table_next = dt->dup_table[h];
334 			dt->dup_table[h] = d;
335 			prev = d;
336 		}
337 	}
338 
339 	lgdebug(+D_DISJ+(0==count)*1000, "Killed %u duplicates\n", count);
340 
341 	disjunct_dup_table_delete(dt);
342 	return dw;
343 }
344 
345 /* ============================================================= */
346 
347 /* Return the stringified disjunct.
348  * Be sure to free the string upon return.
349  */
350 
prt_con(Connector * c,dyn_str * p,char dir)351 static void prt_con(Connector *c, dyn_str * p, char dir)
352 {
353 	if (NULL == c) return;
354 	prt_con (c->next, p, dir);
355 
356 	if (c->multi)
357 	{
358 		append_string(p, "@%s%c ", connector_string(c), dir);
359 	}
360 	else
361 	{
362 		append_string(p, "%s%c ", connector_string(c), dir);
363 	}
364 }
365 
print_one_disjunct(Disjunct * dj)366 char * print_one_disjunct(Disjunct *dj)
367 {
368 	dyn_str *p = dyn_str_new();
369 
370 	prt_con(dj->left, p, '-');
371 	prt_con(dj->right, p, '+');
372 
373 	return dyn_str_take(p);
374 }
375 
376 /* ============================================================= */
377 
378 /**
379  * returns the number of connectors in the left lists of the disjuncts.
380  */
left_connector_count(Disjunct * d)381 int left_connector_count(Disjunct * d)
382 {
383 	int i=0;
384 	for (;d!=NULL; d=d->next) {
385 		for (Connector *c = d->left; c!=NULL; c = c->next) i++;
386 	}
387 	return i;
388 }
389 
right_connector_count(Disjunct * d)390 int right_connector_count(Disjunct * d)
391 {
392 	int i=0;
393 	for (;d!=NULL; d=d->next) {
394 	  for (Connector *c = d->right; c!=NULL; c = c->next) i++;
395 	}
396 	return i;
397 }
398 
399 /** Returns the number of disjuncts and connectors in the sentence. */
count_disjuncts_and_connectors(Sentence sent,unsigned int * dca,unsigned int * cca)400 void count_disjuncts_and_connectors(Sentence sent, unsigned int *dca,
401                                     unsigned int *cca)
402 {
403 	unsigned int ccnt = 0, dcnt = 0;
404 
405 	for (WordIdx w = 0; w < sent->length; w++)
406 	{
407 		for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next)
408 		{
409 			dcnt++;
410 			for (Connector *c = d->left; c != NULL; c = c->next) ccnt++;
411 			for (Connector *c = d->right; c !=NULL; c = c->next) ccnt++;
412 		}
413 	}
414 
415 	*cca = ccnt;
416 	*dca = dcnt;
417 }
418 
419 /* ============= Connector encoding, sharing and packing ============= */
420 
421 /*
422  * sentence_pack() copies the disjuncts and connectors to a contiguous
423  * memory. This facilitate a better memory caching for long sentences.
424  *
425  * In addition, it shares the memory of identical trailing connector
426  * sequences, aka "tracons". Tracons are considered identical if they
427  * belong to the same Gword (or same word for the pruning step) and
428  * contain identical connectors in the same order (with one exception:
429  * shallow connectors must have the same nearest_word as tracon leading
430  * deep connectors). Connectors are considered identical if they have
431  * the same string representation (including "multi" and the direction
432  * mark) with an additional requirement if the packing is done for the
433  * pruning step - shallow and deep connectors are then not considered
434  * identical. In both cases the exception regarding shallow connectors
435  * is because shallow connectors can match any connector, while deep
436  * connectors can match only shallow connectors. Note: For efficiency,
437  * the actual connector string representation is not used for connector
438  * comparison.
439  *
440  * For the parsing step, identical tracons are assigned a unique tracon
441  * ID, which is kept in their first connector tracon_id field. The rest of
442  * their connectors also have tracon IDs, which belong to tracons starting
443  * with that connectors. The tracon_id is not used for pruning.
444  *
445  * For the pruning step, more things are done:
446  * Additional data structure - a tracon list - is constructed, which
447  * includes a tracon table and per-word prune table sizes. These data
448  * structure consists of 2 identical parts - one for each tracon
449  * direction (left/right). The tracon table is indexed by (tracon_id -
450  * 1), and it indexes the connectors memory block (it doesn't use
451  * pointers in order to save memory on 64-bit CPUs because it may
452  * contain in the order of 100K entries for very long sentences).
453  * Also, a refcount field is set for each tracon to tell how many
454  * tracons are memory-shared at that connector address.
455  *
456  * Tracons are used differently in the pruning and parsing steps.
457  *
458  * Power Pruning:
459  * The first connector of each tracon is inserted into the power table,
460  * along with its reference count. When a connector cannot be matched,
461  * this means that all the disjuncts that contain its tracon also cannot
462  * be matched. It is then marked as bad (by nearest_word=BAD_WORD) and
463  * due to the tracon memory sharing all the connectors that share the same
464  * memory are marked simultaneously, and thus are detected when the next
465  * disjuncts are examined without a need to further process them.
466  * This drastically reduces the "power_cost" and saves much processing.
467  * Setting the nearest_word field is hence done only once per tracon on
468  * each pass. The pass_number field is used to detect already-processed
469  * good tracons - they are assigned the pass number so each tracon is
470  * processed only once per pass. The connector refcount field is used to
471  * discard connectors from the power table when all the disjuncts that
472  * contain them are discarded.
473  *
474  * PP pruning:
475  * Here too only the first connector in each tracon needs to be
476  * examined. Marking a connector with BAD_WORD simultaneously leaves
477  * a mark in the corresponding connector in the cms table and in all
478  * the disjuncts that share it.
479  *
480  * Parsing:
481  * Originally, the classic parser memoized the number of possible
482  * linkages per a given connector-pair using connector addresses. Since
483  * an exhaustive search is done, such an approach has two main problems
484  * for long sentences:
485  * 1. A very big count hash table (Table_connector in count.c) is used
486  * due to the huge number of connectors (100Ks) in long sentences, a
487  * thing that causes a severe CPU cache trash (to the degree that
488  * absolutely most of the memory accesses are L3 misses).
489  * 2. Repeated linkage detailed calculation for what seems identical
490  * connectors. A hint for the tracon idea was the use of 0 hash values
491  * for NULL connectors, which is the same for all the disjuncts of the
492  * same word (they can be considered a private case of a tracon - a
493  * "null tracon").
494  *
495  * The idea that is implemented here is based on the fact that the
496  * number of linkages between the same words using any of their
497  * connector-pair endpoints is governed only by these connectors and the
498  * connectors after them (since cross links are not permitted). Using
499  * tracon IDs as the hash keys allows to share the memoizing table
500  * counts between connectors that start the same tracons. As a
501  * result, long sentences have significantly less different connector
502  * hash values than their total number of connectors.
503  *
504  * In order to save the need to cache and check the endpoint word
505  * numbers the tracon IDs should not be shared between words. They also
506  * should not be shared between alternatives since connectors that belong
507  * to disjuncts of different alternatives may have different linkage
508  * counts because some alternatives-connectivity checks (to the middle
509  * disjunct) are done in the fast-matcher. These restrictions are
510  * implemented by using a different tracon ID per Gword (FIXME - this is
511  * more strict then needed - a different tracon IDs per alternative would
512  * suffice).
513  * The tracon memory sharing is currently not directly used in the
514  * parsing algo besides reducing the needed CPU cache by a large factor.
515  *
516  * Algo of generating tracon Ids, shared tracons and the tracon list:
517  * The string-set code has been adapted (see tracon-set.c) to hash
518  * tracons. The tracon-set hash table slots are Connector pointers which
519  * point to the memory block of the sentence connectors. When a tracon
520  * is not found in the hash table, a new tracon ID is assigned to it,
521  * and the tracon is copied to the said connector memory block. However,
522  * if it is found, its address is used instead of copying the
523  * connectors, thus sharing its memory with identical tracons. The
524  * tracon-set hash table is cleared after each word (for pruning tracons)
525  * or Gword (for parsing tracons), thus ensuring that the tracons IDs are
526  * not shared between words (or Gwords).
527  *
528  * Some tracon features:
529  * - Each connector starts some tracon.
530  * - Connectors of identical tracons share their memory.
531  *
532  * Jets:
533  * A jet is a (whole) ordered set of connectors all pointing in the same
534  * direction (left, or right). Every disjunct can be split into two jets;
535  * that is, a disjunct is a pair of jets, and so each word consists of a
536  * collection of pairs of jets. The first connector in a jet called
537  * a "shallow" connector. Connectors that are not shallow are deep.
538  * See the comments in prune.c for their connection properties.
539  * A jet is also a tracon.
540  *
541  * Note: This comment is referred-to in disjunct-utils.h, so changes
542  * here may need to be reflected in the comments there too.
543  */
544 
tlsz_check(Tracon_list * tl,unsigned int index,int dir)545 static void tlsz_check(Tracon_list *tl, unsigned int index, int dir)
546 {
547 
548 	if (index >= tl->table_size[dir])
549 	{
550 		size_t new_id_table_size = (0 == tl->table_size[dir]) ?
551 			index : tl->table_size[dir] * 2;
552 		size_t new_bytes = new_id_table_size * sizeof(uint32_t *);
553 
554 		tl->table[dir] = realloc(tl->table[dir], new_bytes);
555 		tl->table_size[dir] = new_id_table_size;
556 	}
557 }
558 
559 /**
560  * Pack the connectors in an array; memory-share and enumerate tracons.
561  */
pack_connectors(Tracon_sharing * ts,Connector * origc,int dir,int w)562 static Connector *pack_connectors(Tracon_sharing *ts, Connector *origc, int dir,
563                                   int w)
564 {
565 	if (NULL == origc) return NULL;
566 
567 	Connector head;
568 	Connector *prevc = &head;
569 	Connector *newc = &head;
570 	Connector *lcblock = ts->cblock;     /* For convenience. */
571 	Tracon_list *tl = ts->tracon_list;   /* If non-NULL - encode for pruning. */
572 
573 	for (Connector *o = origc; NULL != o;  o = o->next)
574 	{
575 		newc = NULL;
576 
577 		if (NULL != ts->csid[dir])
578 		{
579 			/* Encoding is used - share tracons. */
580 			Connector **tracon = tracon_set_add(o, ts->csid[dir]);
581 
582 			if (NULL == *tracon)
583 			{
584 				/* The first time we encounter this tracon. */
585 				*tracon = lcblock; /* Save its future location in the tracon_set. */
586 
587 				if (NULL != tl)
588 				{
589 					tlsz_check(tl, tl->entries[dir], dir);
590 					uint32_t cblock_index = (uint32_t)(lcblock - ts->cblock_base);
591 					tl->table[dir][tl->entries[dir]] = cblock_index;
592 					tl->entries[dir]++;
593 				}
594 			}
595 			else
596 			{
597 				newc = *tracon;
598 				if (!ts->is_pruning)
599 				{
600 					if ((o->nearest_word != newc->nearest_word) ||
601 					    (o->farthest_word != newc->farthest_word))
602 					{
603 						/* This is a rare case in which a shallow and deep
604 						 * connectors don't have the same nearest_word, because
605 						 * a shallow connector may mach a deep connector
606 						 * earlier. Because the nearest word is different, we
607 						 * cannot share it. (Such shallow and deep tracons could
608 						 * be shared separately, but because this is a rare
609 						 * event there is no benefit to do that.)
610 						 * Note:
611 						 * In case the parsing ever depends on other Connector
612 						 * fields, their will be a need to add a check for them
613 						 * here.
614 						 * Update: farthest_word added. */
615 						newc = NULL; /* Don't share it. */
616 					}
617 				}
618 			}
619 		}
620 
621 		if (newc == NULL)
622 		{
623 			/* No sharing yet. */
624 			newc = lcblock++;
625 			*newc = *o;
626 
627 			if (ts->is_pruning)
628 			{
629 				/* Initialize for the pruning step when no sharing is done yet. */
630 				newc->refcount = 1;  /* No sharing yet. */
631 				if (NULL != tl)
632 					tl->num_cnctrs_per_word[dir][w]++;
633 			}
634 			else
635 			{
636 				/* For the parsing step we need a unique ID. */
637 				newc->tracon_id = ts->next_id[dir]++;
638 			}
639 		}
640 		else
641 		{
642 			if (NULL != tl)
643 			{
644 				for (Connector *n = newc; NULL != n; n = n->next)
645 					n->refcount++;
646 			}
647 			prevc->next = newc;
648 
649 			/* Just shared a tracon, nothing more to do. */
650 			ts->cblock = lcblock;
651 			return head.next;
652 		}
653 
654 		prevc->next = newc;
655 		prevc = newc;
656 	}
657 	newc->next = NULL;
658 
659 	ts->cblock = lcblock;
660 	return head.next;
661 }
662 
663 /**
664  * Pack the given disjunct chain in a contiguous memory block.
665  * If the disjunct is NULL, return NULL.
666  */
pack_disjunct(Tracon_sharing * ts,Disjunct * d,int w)667 static Disjunct *pack_disjunct(Tracon_sharing *ts, Disjunct *d, int w)
668 {
669 	Disjunct *newd;
670 	uintptr_t token = (uintptr_t)w;
671 
672 	newd = (ts->dblock)++;
673 	newd->word_string = d->word_string;
674 	newd->cost = d->cost;
675 	newd->originating_gword = d->originating_gword;
676 
677 	if (NULL == ts->tracon_list)
678 		 token = (uintptr_t)d->originating_gword;
679 
680 	if ((token != ts->last_token) && (NULL != ts->csid[0]))
681 	{
682 		ts->last_token = token;
683 		//printf("Token %ld\n", token);
684 		tracon_set_reset(ts->csid[0]);
685 		tracon_set_reset(ts->csid[1]);
686 	}
687 	newd->left = pack_connectors(ts, d->left, 0, w);
688 	newd->right = pack_connectors(ts, d->right, 1,  w);
689 
690 	return newd;
691 }
692 
pack_disjuncts(Sentence sent,Tracon_sharing * ts,Disjunct * origd,int w)693 static Disjunct *pack_disjuncts(Sentence sent, Tracon_sharing *ts,
694                                 Disjunct *origd, int w)
695 {
696 	Disjunct head;
697 	Disjunct *prevd = &head;
698 
699 	for (Disjunct *d = origd; NULL != d; d = d->next)
700 	{
701 		prevd->next = pack_disjunct(ts, d, w);
702 		prevd = prevd->next;
703 	}
704 	prevd->next = NULL;
705 
706 	return head.next;
707 }
708 
709 #define TLSZ 8192         /* Initial size of the tracon list table */
710 
711 /* Reserved tracon ID space for NULL connectors (zero-length tracons).
712  * Currently, tracons are unique per word. So this is actually the max.
713  * number of words in a sentence rounded up to a power of 2.
714  * FIXME: Derive it from MAX_SENTENCE. */
715 #define WORD_OFFSET 256
716 
717 /** Create a context descriptor for disjuncts & connector memory "packing".
718  *   Allocate a memory block for all the disjuncts & connectors.
719  *   The current Connector struct size is 32 bytes, and the intention is
720  *   to keep it with a power-of-2 size. The idea is to put an integral
721  *   number of connectors in each cache line (assumed to be >= Connector
722  *   struct size, e.g. 64 bytes), so one connector will not need 2 cache
723  *   lines.
724  *
725  *   The current Disjunct struct size is 64 bytes, and the intention is
726  *   to keep it at this size for performance reasons.
727  *
728  *   The allocated memory block includes 2 sections, in that order:
729  *   1. A block for disjuncts.
730  *   2. A block of connectors.
731  *
732  *   If encoding is done for the pruning step, allocate tracon list
733  *   stuff too. In that case also call tracon_set_shallow() so tracons
734  *   starting with a shallow connector will be considered different than
735  *   similar ones starting with a deep connector.
736  *
737  * Note:
738  * In order to save overhead, sentences shorter than
739  * sent->min_len_encoding don't undergo encoding - only packing.
740  * This can also be used for library tests that totally bypass the use of
741  * connector encoding (to validate that the tracon_id/sharing/refcount
742  * implementation didn't introduce bugs in the pruning and parsing steps).
743  * E.g. when using link-parser:
744  * - To entirely disable connector encoding:
745  * link-parser -test=min-len-encoding:254
746  * - To use connector encoding even for short sentences:
747  * link-parser -test=min-len-encoding:0
748  * Any different result (e.g. number of discarded disjuncts in the pruning
749  * step or different parsing results) indicates a bug.
750  *
751  * @param is_pruning TRUE if invoked for pruning, FALSE if invoked for parsing.
752  * @return The said context descriptor.
753  */
pack_sentence_init(Sentence sent,bool is_pruning)754 static Tracon_sharing *pack_sentence_init(Sentence sent, bool is_pruning)
755 {
756 	unsigned int dcnt = 0, ccnt = 0;
757 	count_disjuncts_and_connectors(sent, &dcnt, &ccnt);
758 
759 	size_t dsize = dcnt * sizeof(Disjunct);
760 	if (sizeof(Disjunct) != 64)
761 		dsize = ALIGN(dsize, sizeof(Connector));
762 	size_t csize = ccnt * sizeof(Connector);
763 	size_t memblock_sz = dsize + csize;
764 	void *memblock = malloc(memblock_sz);
765 	Disjunct *dblock = memblock;
766 	Connector *cblock = (Connector *)((char *)memblock + dsize);
767 
768 	Tracon_sharing *ts = malloc(sizeof(Tracon_sharing));
769 	memset(ts, 0, sizeof(Tracon_sharing));
770 
771 	ts->memblock = memblock;
772 	ts->memblock_sz = memblock_sz;
773 	ts->cblock_base = cblock;
774 	ts->cblock = cblock;
775 	ts->dblock = dblock;
776 	ts->num_connectors = ccnt;
777 	ts->num_disjuncts = dcnt;
778 	ts->word_offset = is_pruning ? 1 : WORD_OFFSET;
779 	ts->is_pruning = is_pruning;
780 	ts->next_id[0] = ts->next_id[1] = ts->word_offset;
781 	ts->last_token = (uintptr_t)-1;
782 
783 	/* Encode connectors only for long-enough sentences. */
784 	if (sent->length >= sent->min_len_encoding)
785 	{
786 		ts->csid[0] = tracon_set_create();
787 		ts->csid[1] = tracon_set_create();
788 
789 		if (is_pruning)
790 		{
791 			ts->tracon_list = malloc(sizeof(Tracon_list));
792 			memset(ts->tracon_list, 0, sizeof(Tracon_list));
793 			unsigned int **ncpw = ts->tracon_list->num_cnctrs_per_word;
794 			for (int dir = 0; dir < 2; dir++)
795 			{
796 				ncpw[dir] = malloc(sent->length * sizeof(**ncpw));
797 				memset(ncpw[dir], 0, sent->length * sizeof(**ncpw));
798 
799 				tracon_set_shallow(true, ts->csid[dir]);
800 				tlsz_check(ts->tracon_list, TLSZ, dir); /* Allocate table. */
801 			}
802 		}
803 	}
804 
805 	if (!is_pruning && (ts->memblock != sent->dc_memblock))
806 	{
807 		/* The disjunct & connector content is stored in dc_memblock.
808 		 * It will be freed at sentence_delete(). */
809 		if (sent->dc_memblock) free(sent->dc_memblock);
810 		sent->dc_memblock = ts->memblock;
811 		sent->num_disjuncts = ts->num_disjuncts;
812 	}
813 
814 	return ts;
815 }
816 
free_tracon_sharing(Tracon_sharing * ts)817 void free_tracon_sharing(Tracon_sharing *ts)
818 {
819 	if (NULL == ts) return;
820 
821 	for (int dir = 0; dir < 2; dir++)
822 	{
823 		if (NULL != ts->tracon_list)
824 		{
825 			free(ts->tracon_list->num_cnctrs_per_word[dir]);
826 			free(ts->tracon_list->table[dir]);
827 		}
828 		if (NULL != ts->csid[dir])
829 		{
830 			tracon_set_delete(ts->csid[dir]);
831 			ts->csid[dir] = NULL;
832 		}
833 	}
834 
835 	if (NULL != ts->d) free(ts->d);
836 	free(ts->tracon_list);
837 	ts->tracon_list = NULL;
838 
839 	free(ts);
840 }
841 
842 /**
843  * Pack all disjunct and connectors into a one big memory block, share
844  * tracon memory and generate tracon IDs (for parsing) or tracon lists
845  * with reference count (for pruning). Aka "connector encoding".
846  *
847  * The disjunct and connectors packing in a contiguous memory facilitate a
848  * better memory caching for long sentences (a performance gain of a few
849  * percents in the initial implementation, in which this was the sole
850  * purpose of this packing.) In addition, tracon memory sharing
851  * drastically reduces the memory used for connectors.
852  *
853  * The tracon IDs (if invoked for the parsing step) or tracon lists (if
854  * invoked for pruning step) allow for a huge performance boost at these
855  * steps.
856  */
pack_sentence(Sentence sent,bool is_pruning)857 static Tracon_sharing *pack_sentence(Sentence sent, bool is_pruning)
858 {
859 	Tracon_sharing *ts = pack_sentence_init(sent, is_pruning);
860 
861 	for (WordIdx w = 0; w < sent->length; w++)
862 	{
863 		sent->word[w].d = pack_disjuncts(sent, ts, sent->word[w].d, w);
864 	}
865 
866 	return ts;
867 }
868 
869 /**
870  * Pack the sentence for pruning.
871  * @return New tracon sharing descriptor.
872  */
pack_sentence_for_pruning(Sentence sent)873 Tracon_sharing *pack_sentence_for_pruning(Sentence sent)
874 {
875 	unsigned int ccnt_before = 0;
876 	if (verbosity_level(D_DISJ)) ccnt_before = count_connectors(sent);
877 
878 	Tracon_sharing *ts = pack_sentence(sent, true);
879 
880 	if (NULL == ts->csid[0])
881 	{
882 		lgdebug(D_DISJ, "Debug: Encode for pruning (len %zu): None\n",
883 		        sent->length);
884 	}
885 	else
886 	{
887 		lgdebug(D_DISJ, "Debug: Encode for pruning (len %zu): "
888 		        "tracon_id %zu (%zu+,%zu-), shared connectors %d\n",
889 		        sent->length,
890 		        ts->tracon_list->entries[0]+ts->tracon_list->entries[1],
891 		        ts->tracon_list->entries[0], ts->tracon_list->entries[1],
892 				  (int)(&ts->cblock_base[ccnt_before] - ts->cblock));
893 	}
894 
895 	return ts;
896 }
897 
898 /**
899  * Pack the sentence for parsing.
900  * @return New tracon sharing descriptor.
901  */
pack_sentence_for_parsing(Sentence sent)902 Tracon_sharing *pack_sentence_for_parsing(Sentence sent)
903 {
904 	unsigned int ccnt_before = 0;
905 	if (verbosity_level(D_DISJ)) ccnt_before = count_connectors(sent);
906 
907 	Tracon_sharing *ts = pack_sentence(sent, false);
908 
909 	if (verbosity_level(D_SPEC+2))
910 	{
911 		printf("pack_sentence_for_parsing (null_count %u):\n", sent->null_count);
912 		print_all_disjuncts(sent);
913 	}
914 
915 	if (NULL == ts->csid[0])
916 	{
917 		lgdebug(D_DISJ, "Debug: Encode for parsing (len %zu): None\n",
918 		        sent->length);
919 	}
920 	else
921 	{
922 		lgdebug(D_DISJ, "Debug: Encode for parsing (len %zu): "
923 		        "tracon_id %d (%d+,%d-), shared connectors %d\n",
924 		        sent->length,
925 		        (ts->next_id[0]-ts->word_offset)+(ts->next_id[1]-ts->word_offset),
926 		        ts->next_id[0]-ts->word_offset, ts->next_id[1]-ts->word_offset,
927 		        (int)(&ts->cblock_base[ccnt_before] - ts->cblock));
928 	}
929 
930 	return ts;
931 }
932 
933 /* ============ Save and restore sentence disjuncts ============ */
save_disjuncts(Sentence sent,Tracon_sharing * ts)934 void *save_disjuncts(Sentence sent, Tracon_sharing *ts)
935 {
936 	void *saved_memblock = malloc(ts->memblock_sz);
937 	memcpy(saved_memblock, ts->memblock, ts->memblock_sz);
938 
939 	if (NULL == ts->d)
940 		ts->d = malloc(sent->length * sizeof(Disjunct *));
941 	for (WordIdx w = 0; w < sent->length; w++)
942 		ts->d[w] = sent->word[w].d;
943 
944 	return saved_memblock;
945 }
946 
restore_disjuncts(Sentence sent,void * saved_memblock,Tracon_sharing * ts)947 void restore_disjuncts(Sentence sent, void *saved_memblock, Tracon_sharing *ts)
948 {
949 	if (NULL == saved_memblock) return;
950 
951 	for (WordIdx w = 0; w < sent->length; w++)
952 		sent->word[w].d = ts->d[w];
953 
954 	memcpy(ts->memblock, saved_memblock, ts->memblock_sz);
955 }
956