1 /*************************************************************************/
2 /* Copyright (c) 2004 */
3 /* Daniel Sleator, David Temperley, and John Lafferty */
4 /* Copyright 2018-2020, Amir Plivatsky */
5 /* All rights reserved */
6 /* */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software. */
9 /* This license allows free redistribution and use in source and binary */
10 /* forms, with or without modification, subject to certain conditions. */
11 /* */
12 /*************************************************************************/
13 #include <string.h>
14
15 #include "api-structures.h" // Sentence
16 #include "connectors.h"
17 #include "dict-common/dict-structures.h"
18 #include "dict-common/dict-utils.h" // copy_Exp
19 #include "dict-common/regex-morph.h" // match_regex
20 #include "disjunct-utils.h"
21 #include "memory-pool.h"
22 #include "prepare/build-disjuncts.h"
23 #include "print/print-util.h"
24 #include "tokenize/tok-structures.h" // XXX TODO provide gword access methods!
25 #include "tokenize/word-structures.h"
26 #include "tracon-set.h"
27 #include "utilities.h"
28
29 /* Disjunct utilities ... */
30
31 #define D_DISJ 5 /* Verbosity level for this file. */
32
33 /**
34 * free_disjuncts() -- free the list of disjuncts pointed to by c
35 * (does not free any strings)
36 */
void free_disjuncts(Disjunct *c)
{
	/* Walk the list, freeing each disjunct's connector lists and
	 * then the disjunct itself. The strings are NOT freed. */
	while (c != NULL)
	{
		Disjunct *next = c->next;
		free_connectors(c->left);
		free_connectors(c->right);
		xfree((char *)c, sizeof(Disjunct));
		c = next;
	}
}
47
void free_sentence_disjuncts(Sentence sent)
{
	/* If the disjuncts were packed, everything lives in one memory
	 * block; otherwise they came from the memory pools. */
	if (sent->dc_memblock != NULL)
	{
		free(sent->dc_memblock);
		sent->dc_memblock = NULL;
		return;
	}

	if (sent->Disjunct_pool != NULL)
	{
		pool_delete(sent->Disjunct_pool);
		pool_delete(sent->Connector_pool);
		sent->Disjunct_pool = NULL;
	}
}
62
63 /**
64 * Destructively catenates the two disjunct lists d1 followed by d2.
65 * Doesn't change the contents of the disjuncts.
66 * Traverses the first list, but not the second.
67 */
Disjunct * catenate_disjuncts(Disjunct *d1, Disjunct *d2)
{
	Disjunct *tail;

	if (NULL == d1) return d2;
	if (NULL == d2) return d1;

	/* Find the end of the first list and splice the second onto it. */
	for (tail = d1; tail->next != NULL; tail = tail->next)
		;
	tail->next = d2;

	return d1;
}
78
79 /** Returns the number of disjuncts in the list pointed to by d */
unsigned int count_disjuncts(Disjunct * d)
{
	/* Simple list-length count. */
	unsigned int n = 0;
	while (d != NULL)
	{
		n++;
		d = d->next;
	}
	return n;
}
89
90 /** Returns the number of connectors in the sentence. */
static unsigned int count_connectors(Sentence sent)
{
	unsigned int n = 0;

	/* Sum the lengths of both connector lists of every disjunct
	 * of every word. */
	for (WordIdx w = 0; w < sent->length; w++)
	{
		for (Disjunct *d = sent->word[w].d; NULL != d; d = d->next)
		{
			for (Connector *c = d->left; NULL != c; c = c->next) n++;
			for (Connector *c = d->right; NULL != c; c = c->next) n++;
		}
	}

	return n;
}
106 /* ============================================================= */
107
typedef struct disjunct_dup_table_s disjunct_dup_table;

/* Hash table used by eliminate_duplicate_disjuncts() to detect
 * duplicates. Buckets are chains of disjuncts linked through their
 * dup_table_next field. */
struct disjunct_dup_table_s
{
	size_t dup_table_size;  /* Number of buckets; a power of 2 (masked in
	                         * old_hash_disjunct()). */
	Disjunct *dup_table[];  /* Flexible array of bucket heads. */
};
114
115 /**
116 * This is a hash function for disjuncts
117 *
118 * This is the old version that doesn't check for domination, just
119 * equality.
120 */
static inline unsigned int old_hash_disjunct(disjunct_dup_table *dt, Disjunct * d)
{
	unsigned int h = 0;

	/* Mix in both connector lists, left side first. */
	for (int side = 0; side < 2; side++)
	{
		Connector *e = (0 == side) ? d->left : d->right;
		for (; e != NULL; e = e->next)
			h = (41 * (h + e->desc->uc_num)) + (unsigned int)e->desc->lc_letters + 7;
	}
	/* The word string is not hashed - the connector hashing has
	 * enough entropy. */
	h += (h>>10);

	d->dup_hash = h; /* Cache the full hash for cheap inequality tests. */
	return (h & (dt->dup_table_size-1));
}
139
140 /**
141 * The connectors must be exactly equal.
142 */
static bool connectors_equal_prune(Connector *c1, Connector *c2)
{
	/* Same descriptor and same multi-connector flag. */
	if (c1->desc != c2->desc) return false;
	return c1->multi == c2->multi;
}
147
148 /** returns TRUE if the disjuncts are exactly the same */
/** returns TRUE if the disjuncts are exactly the same */
static bool disjuncts_equal(Disjunct * d1, Disjunct * d2)
{
	Connector *side1[2] = { d1->left, d1->right };
	Connector *side2[2] = { d2->left, d2->right };

	/* Both connector lists must match element-by-element, and have
	 * the same length. */
	for (int side = 0; side < 2; side++)
	{
		Connector *e1 = side1[side];
		Connector *e2 = side2[side];
		while ((e1 != NULL) && (e2 != NULL))
		{
			if (!connectors_equal_prune(e1, e2)) return false;
			e1 = e1->next;
			e2 = e2->next;
		}
		if ((e1 != NULL) || (e2 != NULL)) return false;
	}

	/* Save CPU time by comparing this last, since this will
	 * almost always be true. Rarely, the strings are not from
	 * the same string_set and hence the 2-step comparison. */
	if (d1->word_string == d2->word_string) return true;
	return (strcmp(d1->word_string, d2->word_string) == 0);
}
177
178 #if 0
179 int de_fp = 0;
180 int de_total = 0;
181 static void disjuncts_equal_stat(void)
182 {
183 fprintf(stderr, "disjuncts_equal FP %d/%d\n", de_fp, de_total);
184 }
185
186 static bool disjuncts_equal(Disjunct * d1, Disjunct * d2)
187 {
188 if (de_total == 0) atexit(disjuncts_equal_stat);
189 de_total++;
190
191 bool rc = disjuncts_equal1(d1, d2);
192 if (!rc) de_fp++;
193
194 return rc;
195 }
196 #endif
197
disjunct_dup_table_new(size_t sz)198 static disjunct_dup_table * disjunct_dup_table_new(size_t sz)
199 {
200 disjunct_dup_table *dt;
201
202 dt = malloc(sz * sizeof(Disjunct *) + sizeof(disjunct_dup_table));
203 dt->dup_table_size = sz;
204
205 memset(dt->dup_table, 0, sz * sizeof(Disjunct *));
206
207 return dt;
208 }
209
static void disjunct_dup_table_delete(disjunct_dup_table *dt)
{
	/* A single allocation - a single free. */
	free(dt);
}
214
#ifdef DEBUG
/** Return the number of elements in the given gword_set chain. */
GNUC_UNUSED static int gword_set_len(const gword_set *gl)
{
	int n = 0;
	while (NULL != gl)
	{
		n++;
		gl = gl->next;
	}
	return n;
}
#endif
223
224 /**
225 * Return a new gword_set element, initialized from the given element.
226 * @param old_e Existing element.
227 */
static gword_set *gword_set_element_new(gword_set *old_e)
{
	gword_set *new_e = malloc(sizeof(gword_set));

	*new_e = (gword_set){0};
	new_e->o_gword = old_e->o_gword;

	/* Splice the new element into old_e's chain, right after old_e. */
	new_e->chain_next = old_e->chain_next;
	old_e->chain_next = new_e;

	return new_e;
}
240
241 /**
242 * Add an element to existing gword_set. Uniqueness is assumed.
243 * @return A new set with the element.
244 */
static gword_set *gword_set_add(gword_set *gset, gword_set *ge)
{
	/* Prepend a copy of ge; the caller guarantees uniqueness. */
	gword_set *head = gword_set_element_new(ge);
	head->next = gset;
	return head;
}
253
254 /**
255 * Combine the given gword sets.
256 * The gword sets are not modified.
257 * This function is used for adding the gword pointers of an eliminated
258 * disjunct to the ones of the kept disjuncts, with no duplicates.
259 *
260 * @param kept gword_set of the kept disjunct.
261 * @param eliminated gword_set of the eliminated disjunct.
262 * @return Use copy-on-write semantics - the gword_set of the kept disjunct
263 * just gets returned if there is nothing to add to it. Else - a new gword
264 * set is returned.
265 */
static gword_set *gword_set_union(gword_set *kept, gword_set *eliminated)
{
	gword_set *merged = NULL;

	/* Collect the elements of "eliminated" that are not already
	 * present in "kept". */
	for (gword_set *e = eliminated; NULL != e; e = e->next)
	{
		bool present = false;
		for (gword_set *k = kept; NULL != k; k = k->next)
		{
			if (e->o_gword == k->o_gword)
			{
				present = true;
				break;
			}
		}
		if (!present) merged = gword_set_add(merged, e);
	}

	/* Copy-on-write: if nothing needed to be added, return the kept
	 * set untouched. */
	if (NULL == merged) return kept;

	/* Otherwise build a new set that also preserves the kept elements. */
	for (gword_set *k = kept; NULL != k; k = k->next)
		merged = gword_set_add(merged, k);

	return merged;
}
292
293 /**
294 * Takes the list of disjuncts pointed to by d, eliminates all
295 * duplicates, and returns a pointer to a new list.
296 */
Disjunct *eliminate_duplicate_disjuncts(Disjunct *dw)
{
	unsigned int count = 0;
	disjunct_dup_table *dt;
	/* This initialization is unneeded because the first disjunct is never
	 * eliminated. However, omitting it generates "uninitialized" compiler
	 * warning. Setting it to NULL generates clang-analyzer error on
	 * possible NULL dereference. */
	Disjunct *prev = dw;

	/* Size the table at 2x the disjunct count, rounded up to a power
	 * of 2 as required by the bucket mask in old_hash_disjunct(). */
	dt = disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(dw)));

	for (Disjunct *d = dw; d != NULL; d = d->next)
	{
		Disjunct *dx;
		unsigned int h = old_hash_disjunct(dt, d);

		/* Scan the bucket chain; compare the cached full hash first
		 * to avoid most of the expensive full comparisons. */
		for (dx = dt->dup_table[h]; dx != NULL; dx = dx->dup_table_next)
		{
			if (d->dup_hash != dx->dup_hash) continue;
			if (disjuncts_equal(dx, d)) break;
		}

		if (dx != NULL)
		{
			/* Discard the current disjunct: keep the lower cost, and
			 * merge its originating gwords into the surviving one. */
			if (d->cost < dx->cost) dx->cost = d->cost;

			dx->originating_gword =
				gword_set_union(dx->originating_gword, d->originating_gword);

			count++;
			prev->next = d->next; /* Unlink d from the list. */
		}
		else
		{
			/* First occurrence - keep it, and enter it in the table. */
			d->dup_table_next = dt->dup_table[h];
			dt->dup_table[h] = d;
			prev = d;
		}
	}

	lgdebug(+D_DISJ+(0==count)*1000, "Killed %u duplicates\n", count);

	disjunct_dup_table_delete(dt);
	return dw;
}
344
345 /* ============================================================= */
346
347 /* Return the stringified disjunct.
348 * Be sure to free the string upon return.
349 */
350
static void prt_con(Connector *c, dyn_str * p, char dir)
{
	if (NULL == c) return;

	/* Recurse first, so the connectors are printed in reverse
	 * list order. */
	prt_con(c->next, p, dir);

	/* Multi-connectors get an '@' prefix. */
	append_string(p, c->multi ? "@%s%c " : "%s%c ", connector_string(c), dir);
}
365
char * print_one_disjunct(Disjunct *dj)
{
	dyn_str *s = dyn_str_new();

	/* Left connectors (marked '-') first, then right ones (marked '+'). */
	prt_con(dj->left, s, '-');
	prt_con(dj->right, s, '+');

	/* Caller owns (and must free) the returned string. */
	return dyn_str_take(s);
}
375
376 /* ============================================================= */
377
378 /**
379 * returns the number of connectors in the left lists of the disjuncts.
380 */
int left_connector_count(Disjunct * d)
{
	int n = 0;
	while (d != NULL)
	{
		for (Connector *c = d->left; c != NULL; c = c->next) n++;
		d = d->next;
	}
	return n;
}
389
int right_connector_count(Disjunct * d)
{
	int n = 0;
	while (d != NULL)
	{
		for (Connector *c = d->right; c != NULL; c = c->next) n++;
		d = d->next;
	}
	return n;
}
398
399 /** Returns the number of disjuncts and connectors in the sentence. */
void count_disjuncts_and_connectors(Sentence sent, unsigned int *dca,
                                    unsigned int *cca)
{
	*dca = 0;
	*cca = 0;

	/* One pass over all the words: count every disjunct, and every
	 * connector on both sides of each disjunct. */
	for (WordIdx w = 0; w < sent->length; w++)
	{
		Disjunct *d = sent->word[w].d;
		while (d != NULL)
		{
			(*dca)++;
			for (Connector *c = d->left; NULL != c; c = c->next) (*cca)++;
			for (Connector *c = d->right; NULL != c; c = c->next) (*cca)++;
			d = d->next;
		}
	}
}
418
419 /* ============= Connector encoding, sharing and packing ============= */
420
/*
 * pack_sentence() copies the disjuncts and connectors to contiguous
 * memory. This facilitates better memory caching for long sentences.
424 *
425 * In addition, it shares the memory of identical trailing connector
426 * sequences, aka "tracons". Tracons are considered identical if they
427 * belong to the same Gword (or same word for the pruning step) and
428 * contain identical connectors in the same order (with one exception:
429 * shallow connectors must have the same nearest_word as tracon leading
430 * deep connectors). Connectors are considered identical if they have
431 * the same string representation (including "multi" and the direction
432 * mark) with an additional requirement if the packing is done for the
433 * pruning step - shallow and deep connectors are then not considered
434 * identical. In both cases the exception regarding shallow connectors
435 * is because shallow connectors can match any connector, while deep
436 * connectors can match only shallow connectors. Note: For efficiency,
437 * the actual connector string representation is not used for connector
438 * comparison.
439 *
440 * For the parsing step, identical tracons are assigned a unique tracon
441 * ID, which is kept in their first connector tracon_id field. The rest of
442 * their connectors also have tracon IDs, which belong to tracons starting
443 * with that connectors. The tracon_id is not used for pruning.
444 *
445 * For the pruning step, more things are done:
446 * Additional data structure - a tracon list - is constructed, which
447 * includes a tracon table and per-word prune table sizes. These data
448 * structure consists of 2 identical parts - one for each tracon
449 * direction (left/right). The tracon table is indexed by (tracon_id -
450 * 1), and it indexes the connectors memory block (it doesn't use
451 * pointers in order to save memory on 64-bit CPUs because it may
452 * contain in the order of 100K entries for very long sentences).
453 * Also, a refcount field is set for each tracon to tell how many
454 * tracons are memory-shared at that connector address.
455 *
456 * Tracons are used differently in the pruning and parsing steps.
457 *
458 * Power Pruning:
459 * The first connector of each tracon is inserted into the power table,
460 * along with its reference count. When a connector cannot be matched,
461 * this means that all the disjuncts that contain its tracon also cannot
462 * be matched. It is then marked as bad (by nearest_word=BAD_WORD) and
463 * due to the tracon memory sharing all the connectors that share the same
464 * memory are marked simultaneously, and thus are detected when the next
465 * disjuncts are examined without a need to further process them.
466 * This drastically reduces the "power_cost" and saves much processing.
467 * Setting the nearest_word field is hence done only once per tracon on
468 * each pass. The pass_number field is used to detect already-processed
469 * good tracons - they are assigned the pass number so each tracon is
470 * processed only once per pass. The connector refcount field is used to
471 * discard connectors from the power table when all the disjuncts that
472 * contain them are discarded.
473 *
474 * PP pruning:
475 * Here too only the first connector in each tracon needs to be
476 * examined. Marking a connector with BAD_WORD simultaneously leaves
477 * a mark in the corresponding connector in the cms table and in all
478 * the disjuncts that share it.
479 *
480 * Parsing:
481 * Originally, the classic parser memoized the number of possible
482 * linkages per a given connector-pair using connector addresses. Since
483 * an exhaustive search is done, such an approach has two main problems
484 * for long sentences:
485 * 1. A very big count hash table (Table_connector in count.c) is used
486 * due to the huge number of connectors (100Ks) in long sentences, a
487 * thing that causes a severe CPU cache trash (to the degree that
488 * absolutely most of the memory accesses are L3 misses).
489 * 2. Repeated linkage detailed calculation for what seems identical
490 * connectors. A hint for the tracon idea was the use of 0 hash values
491 * for NULL connectors, which is the same for all the disjuncts of the
492 * same word (they can be considered a private case of a tracon - a
493 * "null tracon").
494 *
495 * The idea that is implemented here is based on the fact that the
496 * number of linkages between the same words using any of their
497 * connector-pair endpoints is governed only by these connectors and the
498 * connectors after them (since cross links are not permitted). Using
499 * tracon IDs as the hash keys allows to share the memoizing table
500 * counts between connectors that start the same tracons. As a
501 * result, long sentences have significantly less different connector
502 * hash values than their total number of connectors.
503 *
504 * In order to save the need to cache and check the endpoint word
505 * numbers the tracon IDs should not be shared between words. They also
506 * should not be shared between alternatives since connectors that belong
507 * to disjuncts of different alternatives may have different linkage
508 * counts because some alternatives-connectivity checks (to the middle
509 * disjunct) are done in the fast-matcher. These restrictions are
 * implemented by using a different tracon ID per Gword (FIXME - this is
 * stricter than needed - a different tracon ID per alternative would
 * suffice).
513 * The tracon memory sharing is currently not directly used in the
514 * parsing algo besides reducing the needed CPU cache by a large factor.
515 *
516 * Algo of generating tracon Ids, shared tracons and the tracon list:
517 * The string-set code has been adapted (see tracon-set.c) to hash
518 * tracons. The tracon-set hash table slots are Connector pointers which
519 * point to the memory block of the sentence connectors. When a tracon
520 * is not found in the hash table, a new tracon ID is assigned to it,
521 * and the tracon is copied to the said connector memory block. However,
522 * if it is found, its address is used instead of copying the
523 * connectors, thus sharing its memory with identical tracons. The
524 * tracon-set hash table is cleared after each word (for pruning tracons)
525 * or Gword (for parsing tracons), thus ensuring that the tracons IDs are
526 * not shared between words (or Gwords).
527 *
528 * Some tracon features:
529 * - Each connector starts some tracon.
530 * - Connectors of identical tracons share their memory.
531 *
532 * Jets:
533 * A jet is a (whole) ordered set of connectors all pointing in the same
534 * direction (left, or right). Every disjunct can be split into two jets;
535 * that is, a disjunct is a pair of jets, and so each word consists of a
536 * collection of pairs of jets. The first connector in a jet called
537 * a "shallow" connector. Connectors that are not shallow are deep.
538 * See the comments in prune.c for their connection properties.
539 * A jet is also a tracon.
540 *
541 * Note: This comment is referred-to in disjunct-utils.h, so changes
542 * here may need to be reflected in the comments there too.
543 */
544
/**
 * Ensure that slot \p index of the per-direction tracon table exists,
 * growing the table (by doubling) when needed.
 * The first call (with index == TLSZ) just pre-allocates the table.
 */
static void tlsz_check(Tracon_list *tl, unsigned int index, int dir)
{
	if (index < tl->table_size[dir]) return;

	size_t new_id_table_size = (0 == tl->table_size[dir]) ?
		index : tl->table_size[dir] * 2;
	/* Size by the table's element type, not by pointer size: the
	 * previous sizeof(uint32_t *) over-allocated 2x on 64-bit CPUs. */
	size_t new_bytes = new_id_table_size * sizeof(*tl->table[dir]);

	tl->table[dir] = realloc(tl->table[dir], new_bytes);
	tl->table_size[dir] = new_id_table_size;
}
558
559 /**
560 * Pack the connectors in an array; memory-share and enumerate tracons.
561 */
/**
 * Pack the connectors in an array; memory-share and enumerate tracons.
 * Copies the connector chain \p origc into the contiguous connector
 * block of \p ts. When encoding is enabled (ts->csid[dir] != NULL),
 * a tracon that was already seen is shared instead of copied.
 * @return The head of the packed (possibly partially shared) chain.
 */
static Connector *pack_connectors(Tracon_sharing *ts, Connector *origc, int dir,
                                  int w)
{
	if (NULL == origc) return NULL;

	Connector head;
	Connector *prevc = &head;
	Connector *newc = &head;
	Connector *lcblock = ts->cblock; /* For convenience. */
	Tracon_list *tl = ts->tracon_list; /* If non-NULL - encode for pruning. */

	for (Connector *o = origc; NULL != o; o = o->next)
	{
		newc = NULL;

		if (NULL != ts->csid[dir])
		{
			/* Encoding is used - share tracons. */
			Connector **tracon = tracon_set_add(o, ts->csid[dir]);

			if (NULL == *tracon)
			{
				/* The first time we encounter this tracon. */
				*tracon = lcblock; /* Save its future location in the tracon_set. */

				if (NULL != tl)
				{
					/* Record the tracon's index into the connector
					 * block in the per-direction tracon table. */
					tlsz_check(tl, tl->entries[dir], dir);
					uint32_t cblock_index = (uint32_t)(lcblock - ts->cblock_base);
					tl->table[dir][tl->entries[dir]] = cblock_index;
					tl->entries[dir]++;
				}
			}
			else
			{
				newc = *tracon;
				if (!ts->is_pruning)
				{
					if ((o->nearest_word != newc->nearest_word) ||
					    (o->farthest_word != newc->farthest_word))
					{
						/* This is a rare case in which a shallow and deep
						 * connectors don't have the same nearest_word, because
						 * a shallow connector may match a deep connector
						 * earlier. Because the nearest word is different, we
						 * cannot share it. (Such shallow and deep tracons could
						 * be shared separately, but because this is a rare
						 * event there is no benefit to do that.)
						 * Note:
						 * In case the parsing ever depends on other Connector
						 * fields, there will be a need to add a check for them
						 * here.
						 * Update: farthest_word added. */
						newc = NULL; /* Don't share it. */
					}
				}
			}
		}

		if (newc == NULL)
		{
			/* No sharing yet - copy the connector into the block. */
			newc = lcblock++;
			*newc = *o;

			if (ts->is_pruning)
			{
				/* Initialize for the pruning step when no sharing is done yet. */
				newc->refcount = 1; /* No sharing yet. */
				if (NULL != tl)
					tl->num_cnctrs_per_word[dir][w]++;
			}
			else
			{
				/* For the parsing step we need a unique ID. */
				newc->tracon_id = ts->next_id[dir]++;
			}
		}
		else
		{
			if (NULL != tl)
			{
				/* Account for the new user of this shared tracon. */
				for (Connector *n = newc; NULL != n; n = n->next)
					n->refcount++;
			}
			prevc->next = newc;

			/* Just shared a tracon, nothing more to do. */
			ts->cblock = lcblock;
			return head.next;
		}

		prevc->next = newc;
		prevc = newc;
	}
	newc->next = NULL;

	ts->cblock = lcblock;
	return head.next;
}
662
663 /**
664 * Pack the given disjunct chain in a contiguous memory block.
665 * If the disjunct is NULL, return NULL.
666 */
/**
 * Pack one disjunct into the contiguous disjunct block of \p ts,
 * packing (and possibly tracon-sharing) its connector chains too.
 */
static Disjunct *pack_disjunct(Tracon_sharing *ts, Disjunct *d, int w)
{
	Disjunct *newd;
	uintptr_t token = (uintptr_t)w;

	/* Take the next slot from the contiguous disjunct block. */
	newd = (ts->dblock)++;
	newd->word_string = d->word_string;
	newd->cost = d->cost;
	newd->originating_gword = d->originating_gword;

	/* Tracons must not be shared across tokens: the token is the word
	 * when encoding for pruning (tracon_list != NULL), and the Gword
	 * when encoding for parsing (see the big comment above). */
	if (NULL == ts->tracon_list)
		token = (uintptr_t)d->originating_gword;

	/* On a token change, reset both tracon-set hash tables so tracons
	 * of different tokens never get identified with each other. */
	if ((token != ts->last_token) && (NULL != ts->csid[0]))
	{
		ts->last_token = token;
		//printf("Token %ld\n", token);
		tracon_set_reset(ts->csid[0]);
		tracon_set_reset(ts->csid[1]);
	}
	newd->left = pack_connectors(ts, d->left, 0, w);
	newd->right = pack_connectors(ts, d->right, 1, w);

	return newd;
}
692
/** Pack a whole disjunct chain; return the head of the packed chain. */
static Disjunct *pack_disjuncts(Sentence sent, Tracon_sharing *ts,
                                Disjunct *origd, int w)
{
	Disjunct head;
	Disjunct *tail = &head;

	/* Pack each disjunct in turn, preserving the list order. */
	for (Disjunct *d = origd; NULL != d; d = d->next)
	{
		tail->next = pack_disjunct(ts, d, w);
		tail = tail->next;
	}
	tail->next = NULL;

	return head.next;
}
708
709 #define TLSZ 8192 /* Initial size of the tracon list table */
710
711 /* Reserved tracon ID space for NULL connectors (zero-length tracons).
712 * Currently, tracons are unique per word. So this is actually the max.
713 * number of words in a sentence rounded up to a power of 2.
714 * FIXME: Derive it from MAX_SENTENCE. */
715 #define WORD_OFFSET 256
716
717 /** Create a context descriptor for disjuncts & connector memory "packing".
718 * Allocate a memory block for all the disjuncts & connectors.
719 * The current Connector struct size is 32 bytes, and the intention is
720 * to keep it with a power-of-2 size. The idea is to put an integral
721 * number of connectors in each cache line (assumed to be >= Connector
722 * struct size, e.g. 64 bytes), so one connector will not need 2 cache
723 * lines.
724 *
725 * The current Disjunct struct size is 64 bytes, and the intention is
726 * to keep it at this size for performance reasons.
727 *
728 * The allocated memory block includes 2 sections, in that order:
729 * 1. A block for disjuncts.
730 * 2. A block of connectors.
731 *
732 * If encoding is done for the pruning step, allocate tracon list
733 * stuff too. In that case also call tracon_set_shallow() so tracons
734 * starting with a shallow connector will be considered different than
735 * similar ones starting with a deep connector.
736 *
737 * Note:
738 * In order to save overhead, sentences shorter than
739 * sent->min_len_encoding don't undergo encoding - only packing.
740 * This can also be used for library tests that totally bypass the use of
741 * connector encoding (to validate that the tracon_id/sharing/refcount
742 * implementation didn't introduce bugs in the pruning and parsing steps).
743 * E.g. when using link-parser:
744 * - To entirely disable connector encoding:
745 * link-parser -test=min-len-encoding:254
746 * - To use connector encoding even for short sentences:
747 * link-parser -test=min-len-encoding:0
748 * Any different result (e.g. number of discarded disjuncts in the pruning
749 * step or different parsing results) indicates a bug.
750 *
751 * @param is_pruning TRUE if invoked for pruning, FALSE if invoked for parsing.
752 * @return The said context descriptor.
753 */
static Tracon_sharing *pack_sentence_init(Sentence sent, bool is_pruning)
{
	unsigned int dcnt = 0, ccnt = 0;
	count_disjuncts_and_connectors(sent, &dcnt, &ccnt);

	/* One block holds the disjunct section followed by the connector
	 * section; align the connector section when Disjunct is not
	 * exactly 64 bytes. */
	size_t dsize = dcnt * sizeof(Disjunct);
	if (sizeof(Disjunct) != 64)
		dsize = ALIGN(dsize, sizeof(Connector));
	size_t csize = ccnt * sizeof(Connector);
	size_t memblock_sz = dsize + csize;
	void *memblock = malloc(memblock_sz);
	Disjunct *dblock = memblock;
	Connector *cblock = (Connector *)((char *)memblock + dsize);

	Tracon_sharing *ts = malloc(sizeof(Tracon_sharing));
	memset(ts, 0, sizeof(Tracon_sharing));

	ts->memblock = memblock;
	ts->memblock_sz = memblock_sz;
	ts->cblock_base = cblock;
	ts->cblock = cblock;
	ts->dblock = dblock;
	ts->num_connectors = ccnt;
	ts->num_disjuncts = dcnt;
	/* Parsing reserves low tracon IDs for NULL connectors (see
	 * WORD_OFFSET above); pruning needs no such reservation. */
	ts->word_offset = is_pruning ? 1 : WORD_OFFSET;
	ts->is_pruning = is_pruning;
	ts->next_id[0] = ts->next_id[1] = ts->word_offset;
	ts->last_token = (uintptr_t)-1; /* Force a reset on the first token. */

	/* Encode connectors only for long-enough sentences. */
	if (sent->length >= sent->min_len_encoding)
	{
		ts->csid[0] = tracon_set_create();
		ts->csid[1] = tracon_set_create();

		if (is_pruning)
		{
			/* Allocate the tracon list: per-direction tracon tables
			 * and per-word connector counters. */
			ts->tracon_list = malloc(sizeof(Tracon_list));
			memset(ts->tracon_list, 0, sizeof(Tracon_list));
			unsigned int **ncpw = ts->tracon_list->num_cnctrs_per_word;
			for (int dir = 0; dir < 2; dir++)
			{
				ncpw[dir] = malloc(sent->length * sizeof(**ncpw));
				memset(ncpw[dir], 0, sent->length * sizeof(**ncpw));

				/* For pruning, tracons starting with a shallow connector
				 * are considered different from deep-starting ones. */
				tracon_set_shallow(true, ts->csid[dir]);
				tlsz_check(ts->tracon_list, TLSZ, dir); /* Allocate table. */
			}
		}
	}

	if (!is_pruning && (ts->memblock != sent->dc_memblock))
	{
		/* The disjunct & connector content is stored in dc_memblock.
		 * It will be freed at sentence_delete(). */
		if (sent->dc_memblock) free(sent->dc_memblock);
		sent->dc_memblock = ts->memblock;
		sent->num_disjuncts = ts->num_disjuncts;
	}

	return ts;
}
816
/**
 * Release everything owned by the given tracon-sharing descriptor,
 * including the descriptor itself. A NULL argument is a no-op.
 * Note: ts->memblock is NOT freed here - for the parsing step its
 * ownership was transferred to sent->dc_memblock (see
 * pack_sentence_init()).
 */
void free_tracon_sharing(Tracon_sharing *ts)
{
	if (NULL == ts) return;

	for (int dir = 0; dir < 2; dir++)
	{
		if (NULL != ts->tracon_list)
		{
			free(ts->tracon_list->num_cnctrs_per_word[dir]);
			free(ts->tracon_list->table[dir]);
		}
		if (NULL != ts->csid[dir])
			tracon_set_delete(ts->csid[dir]);
	}

	/* free(NULL) is a no-op, so no guard is needed; and NULLing the
	 * fields of ts just before freeing ts itself would be dead stores. */
	free(ts->d);
	free(ts->tracon_list);
	free(ts);
}
841
842 /**
843 * Pack all disjunct and connectors into a one big memory block, share
844 * tracon memory and generate tracon IDs (for parsing) or tracon lists
845 * with reference count (for pruning). Aka "connector encoding".
846 *
 * Packing the disjuncts and connectors into contiguous memory facilitates
 * better memory caching for long sentences (a performance gain of a few
 * percent in the initial implementation, in which this was the sole
 * purpose of the packing.) In addition, tracon memory sharing
851 * drastically reduces the memory used for connectors.
852 *
853 * The tracon IDs (if invoked for the parsing step) or tracon lists (if
854 * invoked for pruning step) allow for a huge performance boost at these
855 * steps.
856 */
pack_sentence(Sentence sent,bool is_pruning)857 static Tracon_sharing *pack_sentence(Sentence sent, bool is_pruning)
858 {
859 Tracon_sharing *ts = pack_sentence_init(sent, is_pruning);
860
861 for (WordIdx w = 0; w < sent->length; w++)
862 {
863 sent->word[w].d = pack_disjuncts(sent, ts, sent->word[w].d, w);
864 }
865
866 return ts;
867 }
868
869 /**
870 * Pack the sentence for pruning.
871 * @return New tracon sharing descriptor.
872 */
Tracon_sharing *pack_sentence_for_pruning(Sentence sent)
{
	/* Count connectors beforehand only if the debug trace below will
	 * actually be printed. */
	unsigned int ccnt_before = 0;
	if (verbosity_level(D_DISJ)) ccnt_before = count_connectors(sent);

	Tracon_sharing *ts = pack_sentence(sent, true);

	if (NULL == ts->csid[0])
	{
		/* Encoding was skipped (sentence shorter than min_len_encoding). */
		lgdebug(D_DISJ, "Debug: Encode for pruning (len %zu): None\n",
		        sent->length);
	}
	else
	{
		/* "shared connectors" is the number of connector slots saved
		 * by tracon memory sharing. */
		lgdebug(D_DISJ, "Debug: Encode for pruning (len %zu): "
		        "tracon_id %zu (%zu+,%zu-), shared connectors %d\n",
		        sent->length,
		        ts->tracon_list->entries[0]+ts->tracon_list->entries[1],
		        ts->tracon_list->entries[0], ts->tracon_list->entries[1],
		        (int)(&ts->cblock_base[ccnt_before] - ts->cblock));
	}

	return ts;
}
897
898 /**
899 * Pack the sentence for parsing.
900 * @return New tracon sharing descriptor.
901 */
Tracon_sharing *pack_sentence_for_parsing(Sentence sent)
{
	/* Count connectors beforehand only if the debug trace below will
	 * actually be printed. */
	unsigned int ccnt_before = 0;
	if (verbosity_level(D_DISJ)) ccnt_before = count_connectors(sent);

	Tracon_sharing *ts = pack_sentence(sent, false);

	if (verbosity_level(D_SPEC+2))
	{
		printf("pack_sentence_for_parsing (null_count %u):\n", sent->null_count);
		print_all_disjuncts(sent);
	}

	if (NULL == ts->csid[0])
	{
		/* Encoding was skipped (sentence shorter than min_len_encoding). */
		lgdebug(D_DISJ, "Debug: Encode for parsing (len %zu): None\n",
		        sent->length);
	}
	else
	{
		/* Report the number of distinct tracon IDs assigned in each
		 * direction (offsets by word_offset are the reserved ID space). */
		lgdebug(D_DISJ, "Debug: Encode for parsing (len %zu): "
		        "tracon_id %d (%d+,%d-), shared connectors %d\n",
		        sent->length,
		        (ts->next_id[0]-ts->word_offset)+(ts->next_id[1]-ts->word_offset),
		        ts->next_id[0]-ts->word_offset, ts->next_id[1]-ts->word_offset,
		        (int)(&ts->cblock_base[ccnt_before] - ts->cblock));
	}

	return ts;
}
932
933 /* ============ Save and restore sentence disjuncts ============ */
void *save_disjuncts(Sentence sent, Tracon_sharing *ts)
{
	/* Snapshot the packed disjunct/connector memory block. */
	void *saved_memblock = malloc(ts->memblock_sz);
	memcpy(saved_memblock, ts->memblock, ts->memblock_sz);

	/* Remember each word's disjunct-list head for a later restore. */
	if (NULL == ts->d)
		ts->d = malloc(sent->length * sizeof(Disjunct *));
	for (WordIdx w = 0; w < sent->length; w++)
		ts->d[w] = sent->word[w].d;

	/* Caller owns (and must free) the returned snapshot. */
	return saved_memblock;
}
946
void restore_disjuncts(Sentence sent, void *saved_memblock, Tracon_sharing *ts)
{
	if (NULL == saved_memblock) return;

	/* Copy the saved snapshot back over the packed memory block. */
	memcpy(ts->memblock, saved_memblock, ts->memblock_sz);

	/* Re-attach each word's saved disjunct-list head. */
	for (WordIdx w = 0; w < sent->length; w++)
		sent->word[w].d = ts->d[w];
}
956