1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright (c) 2013 Linas Vepstas                                      */
5 /* All rights reserved                                                   */
6 /*                                                                       */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software.    */
9 /* This license allows free redistribution and use in source and binary  */
10 /* forms, with or without modification, subject to certain conditions.   */
11 /*                                                                       */
12 /*************************************************************************/
13 /*
14  * Miscellaneous utilities for dealing with word types.
15  */
16 
17 #include <limits.h>                     // CHAR_BIT
18 
19 #include "dict-common/dict-utils.h"     // size_of_expression
20 #include "api-structures.h"             // Parse_Options_s
21 #include "connectors.h"
22 #include "link-includes.h"              // Parse_Options
23 
24 #define WILD_TYPE '*'
25 #define LENGTH_LINIT_WILD_TYPE WILD_TYPE
26 
27 /**
28  * free_connectors() -- free the list of connectors pointed to by e
29  * (does not free any strings)
30  */
free_connectors(Connector * e)31 void free_connectors(Connector *e)
32 {
33 	Connector * n;
34 	for (; e != NULL; e = n)
35 	{
36 		n = e->next;
37 		free(e);
38 	}
39 }
40 
41 void
set_connector_length_limit(Connector * c,Parse_Options opts)42 set_connector_length_limit(Connector *c, Parse_Options opts)
43 {
44 	if (NULL == opts)
45 	{
46 		c->length_limit = UNLIMITED_LEN;
47 		return;
48 	}
49 
50 	int short_len = opts->short_length;
51 	bool all_short = opts->all_short;
52 	int length_limit = c->desc->length_limit;
53 
54 	if ((all_short && (length_limit > short_len)) || (0 == length_limit))
55 		c->length_limit = short_len;
56 	else
57 		c->length_limit = length_limit;
58 }
59 
connector_new(Pool_desc * connector_pool,const condesc_t * desc,Parse_Options opts)60 Connector * connector_new(Pool_desc *connector_pool, const condesc_t *desc,
61                           Parse_Options opts)
62 {
63 	Connector *c;
64 
65 	if (NULL == connector_pool) /* For the SAT-parser, until fixed. */
66 	{
67 		c = malloc(sizeof(Connector));
68 		memset(c, 0, sizeof(Connector));
69 	}
70 	else
71 		c = pool_alloc(connector_pool); /* Memory-pool has zero_out attribute.*/
72 
73 	c->desc = desc;
74 	set_connector_length_limit(c, opts);
75 	//assert(0 != c->length_limit, "Connector_new(): Zero length_limit");
76 
77 	return c;
78 }
79 
80 /* ======================================================== */
81 /* Connector length limit handling - UNLIMITED-CONNECTORS and LENGTH-LIMIT-n. */
82 
get_connectors_from_expression(condesc_t ** conlist,const Exp * e)83 static size_t get_connectors_from_expression(condesc_t **conlist, const Exp *e)
84 {
85 	if (e->type == CONNECTOR_type)
86 	{
87 		if (NULL != conlist) *conlist = e->condesc;
88 		return 1;
89 	}
90 
91 	size_t cl_size = 0;
92 	for (Exp *opd = e->operand_first; opd != NULL; opd = opd->operand_next)
93 	{
94 		cl_size += get_connectors_from_expression(conlist, opd);
95 		if (NULL != conlist) conlist++;
96 	}
97 
98 	return cl_size;
99 }
100 
condesc_by_uc_num(const void * a,const void * b)101 static int condesc_by_uc_num(const void *a, const void *b)
102 {
103 	const condesc_t * const * cda = a;
104 	const condesc_t * const * cdb = b;
105 
106 	if ((*cda)->uc_num < (*cdb)->uc_num) return -1;
107 	if ((*cda)->uc_num > (*cdb)->uc_num) return 1;
108 
109 	return 0;
110 }
111 
112 /**
113  * Set the length limit of all the connectors that match those in e.
114  * XXX A connector in e that doesn't match any other connector cannot
115  * be detected, because it has been inserted into the connector table and
116  * hence matches at least itself.
117  */
set_condesc_length_limit(Dictionary dict,const Exp * e,int length_limit)118 static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_limit)
119 {
120 	size_t exp_num_con;
121 	ConTable *ct = &dict->contable;
122 	condesc_t **sdesc = ct->sdesc;
123 	condesc_t **econlist;
124 
125 	/* Create a connector list from the given expression. */
126 	exp_num_con = get_connectors_from_expression(NULL, e);
127 	if (0 == exp_num_con) return; /* Empty connector list. */
128 	econlist = alloca(exp_num_con * sizeof(*econlist));
129 	get_connectors_from_expression(econlist, e);
130 
131 	qsort(econlist, exp_num_con, sizeof(*econlist), condesc_by_uc_num);
132 
133 	/* Scan the expression connector list and set length_limit.
134 	 * restart_cn is needed because several connectors in this list
135 	 * may match a given uppercase part. */
136 	size_t restart_cn = 0, cn, en;
137 	for (en = 0; en < exp_num_con; en++)
138 	{
139 		for (cn = restart_cn; cn < ct->num_con; cn++)
140 			if (sdesc[cn]->uc_num >= econlist[en]->uc_num) break;
141 
142 		for (; en < exp_num_con; en++)
143 			if (econlist[en]->uc_num >= sdesc[cn]->uc_num) break;
144 		if (en == exp_num_con) break;
145 
146 		if (econlist[en]->uc_num != sdesc[cn]->uc_num) continue;
147 		restart_cn = cn;
148 
149 		const char *wc_str = econlist[en]->string;
150 		char *uc_wildcard = strchr(wc_str, LENGTH_LINIT_WILD_TYPE);
151 
152 		for (; cn < ct->num_con; cn++)
153 		{
154 			if (NULL == uc_wildcard)
155 			{
156 				if (econlist[en]->uc_num != sdesc[cn]->uc_num)
157 					break;
158 				/* The uppercase parts are equal - match only the lowercase ones. */
159 				if (!lc_easy_match(econlist[en], sdesc[cn]))
160 					continue;
161 			}
162 			else
163 			{
164 				/* The uppercase part is a prefix. */
165 				if (0 != strncmp(wc_str, sdesc[cn]->string, uc_wildcard - wc_str))
166 					break;
167 			}
168 
169 			sdesc[cn]->length_limit = length_limit;
170 		}
171 	}
172 }
173 
condesc_length_limit_def_delete(ConTable * ct)174 static void condesc_length_limit_def_delete(ConTable *ct)
175 {
176 	length_limit_def_t *l_next;
177 
178 	for (length_limit_def_t *l = ct->length_limit_def; NULL != l; l = l_next)
179 	{
180 		l_next = l->next;
181 		free(l);
182 	}
183 	ct->length_limit_def = NULL;
184 }
185 
set_all_condesc_length_limit(Dictionary dict)186 static void set_all_condesc_length_limit(Dictionary dict)
187 {
188 	ConTable *ct = &dict->contable;
189 	bool unlimited_len_found = false;
190 
191 	for (length_limit_def_t *l = ct->length_limit_def; NULL != l; l = l->next)
192 	{
193 		set_condesc_length_limit(dict, l->defexp, l->length_limit);
194 		if (UNLIMITED_LEN == l->length_limit) unlimited_len_found = true;
195 	}
196 
197 	if (!unlimited_len_found)
198 	{
199 		/* If no connectors are defined as UNLIMITED_LEN, set all the
200 		 * connectors with no defined length-limit to UNLIMITED_LEN. */
201 		condesc_t **sdesc = ct->sdesc;
202 
203 		for (size_t en = 0; en < ct->num_con; en++)
204 		{
205 			if (0 == sdesc[en]->length_limit)
206 				sdesc[en]->length_limit = UNLIMITED_LEN;
207 		}
208 	}
209 
210 	condesc_length_limit_def_delete(&dict->contable);
211 
212 	if (verbosity_level(D_SPEC+1))
213 	{
214 		prt_error("Debug:\n%5s %-6s %3s\n\\", "num", "uc_num", "ll");
215 		for (size_t n = 0; n < ct->num_con; n++)
216 		{
217 			prt_error("%5zu %6u %3d %s\n\\", n, ct->sdesc[n]->uc_num,
218 			       ct->sdesc[n]->length_limit, ct->sdesc[n]->string);
219 		}
220 		prt_error("\n");
221 	}
222 }
223 
224 /* ======================================================== */
225 
226 /**
227  * Pack the LC part of a connector into 64 bits, and compute a wild-card mask.
228  * Up to 9 characters can be so packed.
229  *
230  * Because we pack by shifts, we can do it using 7-bit per original
231  * character at the same overhead needed for 8-bit packing.
232  *
233  * Note: The LC part may consist of chars in the range [a-z0-9]
234  * (total 36) so a 6-bit packing is possible (by abs(value-60) on each
235  * character value).
236  */
connector_encode_lc(const char * lc_string,condesc_t * desc)237 static void connector_encode_lc(const char *lc_string, condesc_t *desc)
238 {
239 	lc_enc_t lc_mask = 0;
240 	lc_enc_t lc_value = 0;
241 	lc_enc_t wildcard = LC_MASK;
242 	const char *s;
243 
244 	for (s = lc_string; '\0' != *s; s++)
245 	{
246 		if (*s != WILD_TYPE)
247 		{
248 			lc_value |= (lc_enc_t)(*s & LC_MASK) << ((s-lc_string)*LC_BITS);
249 			lc_mask |= wildcard;
250 		}
251 		wildcard <<= LC_BITS;
252 	};
253 
254 	/* FIXME: Check on dict read. */
255 	if ((size_t)(s-lc_string) > (sizeof(lc_value)/LC_BITS)*CHAR_BIT)
256 	{
257 		prt_error("Warning: Lower-case part '%s' is too long (%d>%d)\n",
258 					 lc_string, (int)(s-lc_string), MAX_CONNECTOR_LC_LENGTH);
259 	}
260 
261 	desc->lc_mask = (lc_mask << 1) + !!(desc->flags & CD_HEAD_DEPENDENT);
262 	desc->lc_letters = (lc_value << 1) + !!(desc->flags & CD_HEAD);
263 }
264 
265 /**
266  * Calculate fixed connector information that only depend on its string.
267  * This information is used to speed up the parsing stage. It is
268  * calculated during the directory creation and doesn't change afterward.
269  */
calculate_connector_info(condesc_t * c)270 static void calculate_connector_info(condesc_t * c)
271 {
272 	const char *s;
273 
274 	s = c->string;
275 	if (islower(*s)) s++; /* Ignore head-dependent indicator. */
276 	if ((c->string[0] == 'h') || (c->string[0] == 'd'))
277 		c->flags |= CD_HEAD_DEPENDENT;
278 	if (c->string[0] == 'h')
279 		c->flags |= CD_HEAD;
280 
281 	c->uc_start = (uint8_t)(s - c->string);
282 	while (isupper(*++s)) {} /* Skip the uppercase part. */
283 	c->uc_length = (uint8_t)(s - c->string - c->uc_start);
284 
285 	connector_encode_lc(s, c);
286 }
287 
288 /* ================= Connector descriptor table. ====================== */
289 
connector_str_hash(const char * s)290 static uint32_t connector_str_hash(const char *s)
291 {
292 	/* From an old-code comment:
293 	 * For most situations, all three hashes are very nearly equal.
294 	 * For both English and Russian, there are about 100 pre-defined
295 	 * connectors, and another 2K-4K autogen'ed ones (the IDxxx idiom
296 	 * connectors, and the LLxxx suffix connectors for Russian). */
297 #ifdef USE_DJB2
298 	/* djb2 hash. */
299 	uint32_t i = 5381;
300 	while (isupper(*s)) /* Connector tables cannot contain UTF8. */
301 	{
302 		i = ((i << 5) + i) + *s;
303 		s++;
304 	}
305 	i += i>>14;
306 #endif /* USE_DJB2 */
307 
308 #define USE_JENKINS
309 #ifdef USE_JENKINS
310 	/* Jenkins one-at-a-time hash. */
311 	uint32_t i = 0;
312 	while (isupper(*s)) /* Connector tables cannot contain UTF8. */
313 	{
314 		i += *s;
315 		i += (i<<10);
316 		i ^= (i>>6);
317 		s++;
318 	}
319 	i += (i << 3);
320 	i ^= (i >> 11);
321 	i += (i << 15);
322 #endif /* USE_JENKINS */
323 
324 #ifdef USE_SDBM
325 	/* sdbm hash. */
326 	uint32_t i = 0;
327 	c->uc_start = s - c->string;
328 	while (isupper(*s))
329 	{
330 		i = *s + (i << 6) + (i << 16) - i;
331 		s++;
332 	}
333 #endif /* USE_SDBM */
334 
335 	return i;
336 }
337 
338 /**
339  * Compare connector UC parts, for qsort.
340  */
condesc_by_uc_constring(const void * a,const void * b)341 static int condesc_by_uc_constring(const void * a, const void * b)
342 {
343 	const condesc_t * const * cda = a;
344 	const condesc_t * const * cdb = b;
345 
346 	/* Move the empty slots to the end. */
347 	if (NULL == *cda) return (NULL != *cdb);
348 	if (NULL == *cdb) return -1;
349 
350 	const char *sa = &(*cda)->string[(*cda)->uc_start];
351 	const char *sb = &(*cdb)->string[(*cdb)->uc_start];
352 
353 	int la = (*cda)->uc_length;
354 	int lb = (*cdb)->uc_length;
355 
356 	if (la == lb)
357 	{
358 		//printf("la==lb A=%s b=%s, la=%d lb=%d len=%d\n",sa,sb,la,lb,la);
359 		return strncmp(sa, sb, la);
360 	}
361 
362 	if (la < lb)
363 	{
364 		char *uca = strdupa(sa);
365 		uca[la] = '\0';
366 		//printf("la<lb A=%s b=%s, la=%d lb=%d len=%d\n",uca,sb,la,lb,lb);
367 		return strncmp(uca, sb, lb);
368 	}
369 	else
370 	{
371 		char *ucb = strdupa(sb);
372 		ucb[lb] = '\0';
373 		//printf("la>lb A=%s b=%s, la=%d lb=%d len=%d\n",sa,ucb,la,lb,la);
374 		return strncmp(sa, ucb, la);
375 	}
376 }
377 
378 /**
379  * Enumerate the connectors by their UC parts - equal parts get the same number.
380  * It can later serve as a table index, as if it was a perfect hash.
381  */
sort_condesc_by_uc_constring(Dictionary dict)382 bool sort_condesc_by_uc_constring(Dictionary dict)
383 {
384 	if ((0 == dict->contable.num_con) && !IS_DB_DICT(dict))
385 	{
386 		prt_error("Error: Dictionary %s: No connectors found.\n", dict->name);
387 		return false;
388 	}
389 
390 	/* An SQL dict without <UNKNOWN-WORD> may have 0 connectors here. */
391 	if (0 == dict->contable.num_con)
392 		return true;
393 
394 	condesc_t **sdesc = malloc(dict->contable.num_con * sizeof(condesc_t *));
395 	size_t i = 0;
396 	for (size_t n = 0; n < dict->contable.size; n++)
397 	{
398 		condesc_t *condesc = dict->contable.hdesc[n].desc;
399 
400 		if (NULL == condesc) continue;
401 		calculate_connector_info(condesc);
402 		sdesc[i++] = dict->contable.hdesc[n].desc;
403 	}
404 
405 	qsort(sdesc, dict->contable.num_con, sizeof(*dict->contable.sdesc),
406 	      condesc_by_uc_constring);
407 
408 	/* Enumerate the connectors according to their UC part. */
409 	int uc_num = 0;
410 
411 	sdesc[0]->uc_num = uc_num;
412 	for (size_t n = 1; n < dict->contable.num_con; n++)
413 	{
414 		condesc_t **condesc = &sdesc[n];
415 
416 		if (condesc[0]->uc_length != condesc[-1]->uc_length)
417 
418 		{
419 			/* We know that the UC part has been changed. */
420 			uc_num++;
421 		}
422 		else
423 		{
424 			const char *uc1 = &condesc[0]->string[condesc[0]->uc_start];
425 			const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start];
426 			if (0 != strncmp(uc1, uc2, condesc[0]->uc_length))
427 			{
428 				uc_num++;
429 			}
430 		}
431 
432 		//printf("%5d constring=%s\n", uc_num, condesc[0]->string);
433 		condesc[0]->uc_num = uc_num;
434 	}
435 
436 	lgdebug(+11, "Dictionary %s: %zu different connectors "
437 	        "(%d with a different UC part)\n",
438 	        dict->name, dict->contable.num_con, uc_num+1);
439 
440 	dict->contable.sdesc = sdesc;
441 	dict->contable.num_uc = uc_num + 1;
442 
443 	/* hdesc is not freed here because it is needed for finding ZZZ.
444 	 * It could be freed here if we have ZZZ cached in the dict structure. */
445 	return true;
446 }
447 
condesc_delete(Dictionary dict)448 void condesc_delete(Dictionary dict)
449 {
450 	ConTable *ct = &dict->contable;
451 
452 	free(ct->hdesc);
453 	pool_delete(ct->mempool);
454 	condesc_length_limit_def_delete(ct);
455 }
456 
condesc_reuse(Dictionary dict)457 void condesc_reuse(Dictionary dict)
458 {
459 	ConTable *ct = &dict->contable;
460 
461 	ct->num_con = 0;
462 	ct->num_uc = 0;
463 	memset(ct->hdesc, 0, ct->size * sizeof(hdesc_t));
464 	pool_reuse(ct->mempool);
465 }
466 
condesc_find(ConTable * ct,const char * constring,uint32_t hash)467 static hdesc_t *condesc_find(ConTable *ct, const char *constring, uint32_t hash)
468 {
469 	uint32_t i = hash & (ct->size-1);
470 
471 	while ((NULL != ct->hdesc[i].desc) &&
472 	       !string_set_cmp(constring, ct->hdesc[i].desc->string))
473 	{
474 		i = (i + 1) & (ct->size-1);
475 	}
476 
477 	return &ct->hdesc[i];
478 }
479 
condesc_table_alloc(ConTable * ct,size_t size)480 static void condesc_table_alloc(ConTable *ct, size_t size)
481 {
482 	ct->hdesc = malloc(size * sizeof(hdesc_t));
483 	memset(ct->hdesc, 0, size * sizeof(hdesc_t));
484 	ct->size = size;
485 }
486 
487 #define CONDESC_TABLE_GROWTH_FACTOR 2
488 
condesc_grow(ConTable * ct)489 static bool condesc_grow(ConTable *ct)
490 {
491 	size_t old_size = ct->size;
492 	hdesc_t *old_hdesc = ct->hdesc;
493 
494 	lgdebug(+11, "Growing ConTable from %zu\n", old_size);
495 	condesc_table_alloc(ct, ct->size * CONDESC_TABLE_GROWTH_FACTOR);
496 
497 	for (size_t i = 0; i < old_size; i++)
498 	{
499 		hdesc_t *old_h = &old_hdesc[i];
500 		if (NULL == old_h->desc) continue;
501 		hdesc_t *new_h = condesc_find(ct, old_h->desc->string, old_h->str_hash);
502 
503 		if (NULL != new_h->desc)
504 		{
505 			prt_error("Fatal Error: condesc_grow(): Internal error\n");
506 			free(old_hdesc);
507 			return false;
508 		}
509 		*new_h = *old_h;
510 	}
511 
512 	free(old_hdesc);
513 	return true;
514 }
515 
condesc_add(ConTable * ct,const char * constring)516 condesc_t *condesc_add(ConTable *ct, const char *constring)
517 {
518 	uint32_t hash = (connector_hash_t)connector_str_hash(constring);
519 	hdesc_t *h = condesc_find(ct, constring, hash);
520 
521 	if (NULL == h->desc)
522 	{
523 		assert(0 == ct->num_uc, "Trying to add a connector (%s) "
524 				 "after reading the dict.\n", constring);
525 		lgdebug(+11, "Creating connector '%s' (%zu)\n", constring, ct->num_con);
526 		h->desc = pool_alloc(ct->mempool);
527 		h->desc->string = constring;
528 		h->str_hash = hash;
529 		ct->num_con++;
530 
531 		if ((8 * ct->num_con) > (3 * ct->size))
532 		{
533 			if (!condesc_grow(ct)) return NULL;
534 			h = condesc_find(ct, constring, hash);
535 		}
536 	}
537 
538 	return h->desc;
539 }
540 
condesc_init(Dictionary dict,size_t num_con)541 void condesc_init(Dictionary dict, size_t num_con)
542 {
543 	ConTable *ct = &dict->contable;
544 
545 	condesc_table_alloc(ct, num_con);
546 	ct->mempool = pool_new(__func__, "ConTable",
547 								  /*num_elements*/1024, sizeof(condesc_t),
548 								  /*zero_out*/true, /*align*/true, /*exact*/false);
549 
550 	ct->length_limit_def = NULL;
551 	ct->length_limit_def_next = &ct->length_limit_def;
552 }
553 
condesc_setup(Dictionary dict)554 void condesc_setup(Dictionary dict)
555 {
556 	sort_condesc_by_uc_constring(dict);
557 	set_all_condesc_length_limit(dict);
558 	free(dict->contable.sdesc);
559 }
560 /* ========================= END OF FILE ============================== */
561