1 /*************************************************************************/
2 /* Copyright (c) 2004 */
3 /* Daniel Sleator, David Temperley, and John Lafferty */
4 /* Copyright (c) 2013 Linas Vepstas */
5 /* All rights reserved */
6 /* */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software. */
9 /* This license allows free redistribution and use in source and binary */
10 /* forms, with or without modification, subject to certain conditions. */
11 /* */
12 /*************************************************************************/
13 /*
14 * Miscellaneous utilities for dealing with word types.
15 */
16
17 #include <limits.h> // CHAR_BIT
18
19 #include "dict-common/dict-utils.h" // size_of_expression
20 #include "api-structures.h" // Parse_Options_s
21 #include "connectors.h"
22 #include "link-includes.h" // Parse_Options
23
24 #define WILD_TYPE '*'
25 #define LENGTH_LINIT_WILD_TYPE WILD_TYPE
26
27 /**
28 * free_connectors() -- free the list of connectors pointed to by e
29 * (does not free any strings)
30 */
free_connectors(Connector * e)31 void free_connectors(Connector *e)
32 {
33 Connector * n;
34 for (; e != NULL; e = n)
35 {
36 n = e->next;
37 free(e);
38 }
39 }
40
41 void
set_connector_length_limit(Connector * c,Parse_Options opts)42 set_connector_length_limit(Connector *c, Parse_Options opts)
43 {
44 if (NULL == opts)
45 {
46 c->length_limit = UNLIMITED_LEN;
47 return;
48 }
49
50 int short_len = opts->short_length;
51 bool all_short = opts->all_short;
52 int length_limit = c->desc->length_limit;
53
54 if ((all_short && (length_limit > short_len)) || (0 == length_limit))
55 c->length_limit = short_len;
56 else
57 c->length_limit = length_limit;
58 }
59
connector_new(Pool_desc * connector_pool,const condesc_t * desc,Parse_Options opts)60 Connector * connector_new(Pool_desc *connector_pool, const condesc_t *desc,
61 Parse_Options opts)
62 {
63 Connector *c;
64
65 if (NULL == connector_pool) /* For the SAT-parser, until fixed. */
66 {
67 c = malloc(sizeof(Connector));
68 memset(c, 0, sizeof(Connector));
69 }
70 else
71 c = pool_alloc(connector_pool); /* Memory-pool has zero_out attribute.*/
72
73 c->desc = desc;
74 set_connector_length_limit(c, opts);
75 //assert(0 != c->length_limit, "Connector_new(): Zero length_limit");
76
77 return c;
78 }
79
80 /* ======================================================== */
81 /* Connector length limit handling - UNLIMITED-CONNECTORS and LENGTH-LIMIT-n. */
82
get_connectors_from_expression(condesc_t ** conlist,const Exp * e)83 static size_t get_connectors_from_expression(condesc_t **conlist, const Exp *e)
84 {
85 if (e->type == CONNECTOR_type)
86 {
87 if (NULL != conlist) *conlist = e->condesc;
88 return 1;
89 }
90
91 size_t cl_size = 0;
92 for (Exp *opd = e->operand_first; opd != NULL; opd = opd->operand_next)
93 {
94 cl_size += get_connectors_from_expression(conlist, opd);
95 if (NULL != conlist) conlist++;
96 }
97
98 return cl_size;
99 }
100
condesc_by_uc_num(const void * a,const void * b)101 static int condesc_by_uc_num(const void *a, const void *b)
102 {
103 const condesc_t * const * cda = a;
104 const condesc_t * const * cdb = b;
105
106 if ((*cda)->uc_num < (*cdb)->uc_num) return -1;
107 if ((*cda)->uc_num > (*cdb)->uc_num) return 1;
108
109 return 0;
110 }
111
112 /**
113 * Set the length limit of all the connectors that match those in e.
114 * XXX A connector in e that doesn't match any other connector cannot
115 * be detected, because it has been inserted into the connector table and
116 * hence matches at least itself.
117 */
set_condesc_length_limit(Dictionary dict,const Exp * e,int length_limit)118 static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_limit)
119 {
120 size_t exp_num_con;
121 ConTable *ct = &dict->contable;
122 condesc_t **sdesc = ct->sdesc;
123 condesc_t **econlist;
124
125 /* Create a connector list from the given expression. */
126 exp_num_con = get_connectors_from_expression(NULL, e);
127 if (0 == exp_num_con) return; /* Empty connector list. */
128 econlist = alloca(exp_num_con * sizeof(*econlist));
129 get_connectors_from_expression(econlist, e);
130
131 qsort(econlist, exp_num_con, sizeof(*econlist), condesc_by_uc_num);
132
133 /* Scan the expression connector list and set length_limit.
134 * restart_cn is needed because several connectors in this list
135 * may match a given uppercase part. */
136 size_t restart_cn = 0, cn, en;
137 for (en = 0; en < exp_num_con; en++)
138 {
139 for (cn = restart_cn; cn < ct->num_con; cn++)
140 if (sdesc[cn]->uc_num >= econlist[en]->uc_num) break;
141
142 for (; en < exp_num_con; en++)
143 if (econlist[en]->uc_num >= sdesc[cn]->uc_num) break;
144 if (en == exp_num_con) break;
145
146 if (econlist[en]->uc_num != sdesc[cn]->uc_num) continue;
147 restart_cn = cn;
148
149 const char *wc_str = econlist[en]->string;
150 char *uc_wildcard = strchr(wc_str, LENGTH_LINIT_WILD_TYPE);
151
152 for (; cn < ct->num_con; cn++)
153 {
154 if (NULL == uc_wildcard)
155 {
156 if (econlist[en]->uc_num != sdesc[cn]->uc_num)
157 break;
158 /* The uppercase parts are equal - match only the lowercase ones. */
159 if (!lc_easy_match(econlist[en], sdesc[cn]))
160 continue;
161 }
162 else
163 {
164 /* The uppercase part is a prefix. */
165 if (0 != strncmp(wc_str, sdesc[cn]->string, uc_wildcard - wc_str))
166 break;
167 }
168
169 sdesc[cn]->length_limit = length_limit;
170 }
171 }
172 }
173
condesc_length_limit_def_delete(ConTable * ct)174 static void condesc_length_limit_def_delete(ConTable *ct)
175 {
176 length_limit_def_t *l_next;
177
178 for (length_limit_def_t *l = ct->length_limit_def; NULL != l; l = l_next)
179 {
180 l_next = l->next;
181 free(l);
182 }
183 ct->length_limit_def = NULL;
184 }
185
set_all_condesc_length_limit(Dictionary dict)186 static void set_all_condesc_length_limit(Dictionary dict)
187 {
188 ConTable *ct = &dict->contable;
189 bool unlimited_len_found = false;
190
191 for (length_limit_def_t *l = ct->length_limit_def; NULL != l; l = l->next)
192 {
193 set_condesc_length_limit(dict, l->defexp, l->length_limit);
194 if (UNLIMITED_LEN == l->length_limit) unlimited_len_found = true;
195 }
196
197 if (!unlimited_len_found)
198 {
199 /* If no connectors are defined as UNLIMITED_LEN, set all the
200 * connectors with no defined length-limit to UNLIMITED_LEN. */
201 condesc_t **sdesc = ct->sdesc;
202
203 for (size_t en = 0; en < ct->num_con; en++)
204 {
205 if (0 == sdesc[en]->length_limit)
206 sdesc[en]->length_limit = UNLIMITED_LEN;
207 }
208 }
209
210 condesc_length_limit_def_delete(&dict->contable);
211
212 if (verbosity_level(D_SPEC+1))
213 {
214 prt_error("Debug:\n%5s %-6s %3s\n\\", "num", "uc_num", "ll");
215 for (size_t n = 0; n < ct->num_con; n++)
216 {
217 prt_error("%5zu %6u %3d %s\n\\", n, ct->sdesc[n]->uc_num,
218 ct->sdesc[n]->length_limit, ct->sdesc[n]->string);
219 }
220 prt_error("\n");
221 }
222 }
223
224 /* ======================================================== */
225
226 /**
227 * Pack the LC part of a connector into 64 bits, and compute a wild-card mask.
228 * Up to 9 characters can be so packed.
229 *
230 * Because we pack by shifts, we can do it using 7-bit per original
231 * character at the same overhead needed for 8-bit packing.
232 *
233 * Note: The LC part may consist of chars in the range [a-z0-9]
234 * (total 36) so a 6-bit packing is possible (by abs(value-60) on each
235 * character value).
236 */
connector_encode_lc(const char * lc_string,condesc_t * desc)237 static void connector_encode_lc(const char *lc_string, condesc_t *desc)
238 {
239 lc_enc_t lc_mask = 0;
240 lc_enc_t lc_value = 0;
241 lc_enc_t wildcard = LC_MASK;
242 const char *s;
243
244 for (s = lc_string; '\0' != *s; s++)
245 {
246 if (*s != WILD_TYPE)
247 {
248 lc_value |= (lc_enc_t)(*s & LC_MASK) << ((s-lc_string)*LC_BITS);
249 lc_mask |= wildcard;
250 }
251 wildcard <<= LC_BITS;
252 };
253
254 /* FIXME: Check on dict read. */
255 if ((size_t)(s-lc_string) > (sizeof(lc_value)/LC_BITS)*CHAR_BIT)
256 {
257 prt_error("Warning: Lower-case part '%s' is too long (%d>%d)\n",
258 lc_string, (int)(s-lc_string), MAX_CONNECTOR_LC_LENGTH);
259 }
260
261 desc->lc_mask = (lc_mask << 1) + !!(desc->flags & CD_HEAD_DEPENDENT);
262 desc->lc_letters = (lc_value << 1) + !!(desc->flags & CD_HEAD);
263 }
264
265 /**
266 * Calculate fixed connector information that only depend on its string.
267 * This information is used to speed up the parsing stage. It is
268 * calculated during the directory creation and doesn't change afterward.
269 */
calculate_connector_info(condesc_t * c)270 static void calculate_connector_info(condesc_t * c)
271 {
272 const char *s;
273
274 s = c->string;
275 if (islower(*s)) s++; /* Ignore head-dependent indicator. */
276 if ((c->string[0] == 'h') || (c->string[0] == 'd'))
277 c->flags |= CD_HEAD_DEPENDENT;
278 if (c->string[0] == 'h')
279 c->flags |= CD_HEAD;
280
281 c->uc_start = (uint8_t)(s - c->string);
282 while (isupper(*++s)) {} /* Skip the uppercase part. */
283 c->uc_length = (uint8_t)(s - c->string - c->uc_start);
284
285 connector_encode_lc(s, c);
286 }
287
288 /* ================= Connector descriptor table. ====================== */
289
connector_str_hash(const char * s)290 static uint32_t connector_str_hash(const char *s)
291 {
292 /* From an old-code comment:
293 * For most situations, all three hashes are very nearly equal.
294 * For both English and Russian, there are about 100 pre-defined
295 * connectors, and another 2K-4K autogen'ed ones (the IDxxx idiom
296 * connectors, and the LLxxx suffix connectors for Russian). */
297 #ifdef USE_DJB2
298 /* djb2 hash. */
299 uint32_t i = 5381;
300 while (isupper(*s)) /* Connector tables cannot contain UTF8. */
301 {
302 i = ((i << 5) + i) + *s;
303 s++;
304 }
305 i += i>>14;
306 #endif /* USE_DJB2 */
307
308 #define USE_JENKINS
309 #ifdef USE_JENKINS
310 /* Jenkins one-at-a-time hash. */
311 uint32_t i = 0;
312 while (isupper(*s)) /* Connector tables cannot contain UTF8. */
313 {
314 i += *s;
315 i += (i<<10);
316 i ^= (i>>6);
317 s++;
318 }
319 i += (i << 3);
320 i ^= (i >> 11);
321 i += (i << 15);
322 #endif /* USE_JENKINS */
323
324 #ifdef USE_SDBM
325 /* sdbm hash. */
326 uint32_t i = 0;
327 c->uc_start = s - c->string;
328 while (isupper(*s))
329 {
330 i = *s + (i << 6) + (i << 16) - i;
331 s++;
332 }
333 #endif /* USE_SDBM */
334
335 return i;
336 }
337
338 /**
339 * Compare connector UC parts, for qsort.
340 */
condesc_by_uc_constring(const void * a,const void * b)341 static int condesc_by_uc_constring(const void * a, const void * b)
342 {
343 const condesc_t * const * cda = a;
344 const condesc_t * const * cdb = b;
345
346 /* Move the empty slots to the end. */
347 if (NULL == *cda) return (NULL != *cdb);
348 if (NULL == *cdb) return -1;
349
350 const char *sa = &(*cda)->string[(*cda)->uc_start];
351 const char *sb = &(*cdb)->string[(*cdb)->uc_start];
352
353 int la = (*cda)->uc_length;
354 int lb = (*cdb)->uc_length;
355
356 if (la == lb)
357 {
358 //printf("la==lb A=%s b=%s, la=%d lb=%d len=%d\n",sa,sb,la,lb,la);
359 return strncmp(sa, sb, la);
360 }
361
362 if (la < lb)
363 {
364 char *uca = strdupa(sa);
365 uca[la] = '\0';
366 //printf("la<lb A=%s b=%s, la=%d lb=%d len=%d\n",uca,sb,la,lb,lb);
367 return strncmp(uca, sb, lb);
368 }
369 else
370 {
371 char *ucb = strdupa(sb);
372 ucb[lb] = '\0';
373 //printf("la>lb A=%s b=%s, la=%d lb=%d len=%d\n",sa,ucb,la,lb,la);
374 return strncmp(sa, ucb, la);
375 }
376 }
377
378 /**
379 * Enumerate the connectors by their UC parts - equal parts get the same number.
380 * It can later serve as a table index, as if it was a perfect hash.
381 */
sort_condesc_by_uc_constring(Dictionary dict)382 bool sort_condesc_by_uc_constring(Dictionary dict)
383 {
384 if ((0 == dict->contable.num_con) && !IS_DB_DICT(dict))
385 {
386 prt_error("Error: Dictionary %s: No connectors found.\n", dict->name);
387 return false;
388 }
389
390 /* An SQL dict without <UNKNOWN-WORD> may have 0 connectors here. */
391 if (0 == dict->contable.num_con)
392 return true;
393
394 condesc_t **sdesc = malloc(dict->contable.num_con * sizeof(condesc_t *));
395 size_t i = 0;
396 for (size_t n = 0; n < dict->contable.size; n++)
397 {
398 condesc_t *condesc = dict->contable.hdesc[n].desc;
399
400 if (NULL == condesc) continue;
401 calculate_connector_info(condesc);
402 sdesc[i++] = dict->contable.hdesc[n].desc;
403 }
404
405 qsort(sdesc, dict->contable.num_con, sizeof(*dict->contable.sdesc),
406 condesc_by_uc_constring);
407
408 /* Enumerate the connectors according to their UC part. */
409 int uc_num = 0;
410
411 sdesc[0]->uc_num = uc_num;
412 for (size_t n = 1; n < dict->contable.num_con; n++)
413 {
414 condesc_t **condesc = &sdesc[n];
415
416 if (condesc[0]->uc_length != condesc[-1]->uc_length)
417
418 {
419 /* We know that the UC part has been changed. */
420 uc_num++;
421 }
422 else
423 {
424 const char *uc1 = &condesc[0]->string[condesc[0]->uc_start];
425 const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start];
426 if (0 != strncmp(uc1, uc2, condesc[0]->uc_length))
427 {
428 uc_num++;
429 }
430 }
431
432 //printf("%5d constring=%s\n", uc_num, condesc[0]->string);
433 condesc[0]->uc_num = uc_num;
434 }
435
436 lgdebug(+11, "Dictionary %s: %zu different connectors "
437 "(%d with a different UC part)\n",
438 dict->name, dict->contable.num_con, uc_num+1);
439
440 dict->contable.sdesc = sdesc;
441 dict->contable.num_uc = uc_num + 1;
442
443 /* hdesc is not freed here because it is needed for finding ZZZ.
444 * It could be freed here if we have ZZZ cached in the dict structure. */
445 return true;
446 }
447
condesc_delete(Dictionary dict)448 void condesc_delete(Dictionary dict)
449 {
450 ConTable *ct = &dict->contable;
451
452 free(ct->hdesc);
453 pool_delete(ct->mempool);
454 condesc_length_limit_def_delete(ct);
455 }
456
condesc_reuse(Dictionary dict)457 void condesc_reuse(Dictionary dict)
458 {
459 ConTable *ct = &dict->contable;
460
461 ct->num_con = 0;
462 ct->num_uc = 0;
463 memset(ct->hdesc, 0, ct->size * sizeof(hdesc_t));
464 pool_reuse(ct->mempool);
465 }
466
condesc_find(ConTable * ct,const char * constring,uint32_t hash)467 static hdesc_t *condesc_find(ConTable *ct, const char *constring, uint32_t hash)
468 {
469 uint32_t i = hash & (ct->size-1);
470
471 while ((NULL != ct->hdesc[i].desc) &&
472 !string_set_cmp(constring, ct->hdesc[i].desc->string))
473 {
474 i = (i + 1) & (ct->size-1);
475 }
476
477 return &ct->hdesc[i];
478 }
479
condesc_table_alloc(ConTable * ct,size_t size)480 static void condesc_table_alloc(ConTable *ct, size_t size)
481 {
482 ct->hdesc = malloc(size * sizeof(hdesc_t));
483 memset(ct->hdesc, 0, size * sizeof(hdesc_t));
484 ct->size = size;
485 }
486
487 #define CONDESC_TABLE_GROWTH_FACTOR 2
488
condesc_grow(ConTable * ct)489 static bool condesc_grow(ConTable *ct)
490 {
491 size_t old_size = ct->size;
492 hdesc_t *old_hdesc = ct->hdesc;
493
494 lgdebug(+11, "Growing ConTable from %zu\n", old_size);
495 condesc_table_alloc(ct, ct->size * CONDESC_TABLE_GROWTH_FACTOR);
496
497 for (size_t i = 0; i < old_size; i++)
498 {
499 hdesc_t *old_h = &old_hdesc[i];
500 if (NULL == old_h->desc) continue;
501 hdesc_t *new_h = condesc_find(ct, old_h->desc->string, old_h->str_hash);
502
503 if (NULL != new_h->desc)
504 {
505 prt_error("Fatal Error: condesc_grow(): Internal error\n");
506 free(old_hdesc);
507 return false;
508 }
509 *new_h = *old_h;
510 }
511
512 free(old_hdesc);
513 return true;
514 }
515
condesc_add(ConTable * ct,const char * constring)516 condesc_t *condesc_add(ConTable *ct, const char *constring)
517 {
518 uint32_t hash = (connector_hash_t)connector_str_hash(constring);
519 hdesc_t *h = condesc_find(ct, constring, hash);
520
521 if (NULL == h->desc)
522 {
523 assert(0 == ct->num_uc, "Trying to add a connector (%s) "
524 "after reading the dict.\n", constring);
525 lgdebug(+11, "Creating connector '%s' (%zu)\n", constring, ct->num_con);
526 h->desc = pool_alloc(ct->mempool);
527 h->desc->string = constring;
528 h->str_hash = hash;
529 ct->num_con++;
530
531 if ((8 * ct->num_con) > (3 * ct->size))
532 {
533 if (!condesc_grow(ct)) return NULL;
534 h = condesc_find(ct, constring, hash);
535 }
536 }
537
538 return h->desc;
539 }
540
condesc_init(Dictionary dict,size_t num_con)541 void condesc_init(Dictionary dict, size_t num_con)
542 {
543 ConTable *ct = &dict->contable;
544
545 condesc_table_alloc(ct, num_con);
546 ct->mempool = pool_new(__func__, "ConTable",
547 /*num_elements*/1024, sizeof(condesc_t),
548 /*zero_out*/true, /*align*/true, /*exact*/false);
549
550 ct->length_limit_def = NULL;
551 ct->length_limit_def_next = &ct->length_limit_def;
552 }
553
condesc_setup(Dictionary dict)554 void condesc_setup(Dictionary dict)
555 {
556 sort_condesc_by_uc_constring(dict);
557 set_all_condesc_length_limit(dict);
558 free(dict->contable.sdesc);
559 }
560 /* ========================= END OF FILE ============================== */
561