1 /*-------------------------------------------------------------------------
2  *
3  * spell.c
4  *		Normalizing word with ISpell
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  *
8  * Ispell dictionary
9  * -----------------
10  *
11  * Rules of dictionaries are defined in two files with .affix and .dict
12  * extensions. They are used by spell checker programs Ispell and Hunspell.
13  *
14  * An .affix file declares morphological rules to get a basic form of words.
15  * The format of an .affix file has different structure for Ispell and Hunspell
16  * dictionaries. The Hunspell format is more complicated. But when an .affix
17  * file is imported and compiled, it is stored in the same structure AffixNode.
18  *
19  * A .dict file stores a list of basic forms of words with references to
20  * affix rules. The format of a .dict file has the same structure for Ispell
21  * and Hunspell dictionaries.
22  *
23  * Compilation of a dictionary
24  * ---------------------------
25  *
26  * A compiled dictionary is stored in the IspellDict structure. Compilation of
27  * a dictionary is divided into several steps:
28  *	- NIImportDictionary() - stores each word of a .dict file in the
29  *	  temporary Spell field.
30  *	- NIImportAffixes() - stores affix rules of an .affix file in the
31  *	  Affix field (not temporary) if an .affix file has the Ispell format.
32  *	  -> NIImportOOAffixes() - stores affix rules if an .affix file has the
33  *		 Hunspell format. The AffixData field is initialized if AF parameter
34  *		 is defined.
35  *	- NISortDictionary() - builds a prefix tree (Trie) from the word list
36  *	  and stores it in the Dictionary field. The word list is taken from the
37  *	  Spell field. The AffixData field is initialized if AF parameter is not
38  *	  defined.
39  *	- NISortAffixes():
40  *	  - builds a list of compound affixes from the affix list and stores it
41  *		in the CompoundAffix.
42  *	  - builds prefix trees (Trie) from the affix list for prefixes and suffixes
43  *		and stores them in Suffix and Prefix fields.
44  *	  The affix list is taken from the Affix field.
45  *
46  * Memory management
47  * -----------------
48  *
49  * The IspellDict structure has the Spell field, which is used only at compile
50  * time. The Spell field stores the word list and can take a lot of memory.
51  * Therefore, once a dictionary has been compiled, this field is cleared by
52  * NIFinishBuild().
53  *
54  * All resources that should be released by NIFinishBuild() are allocated
55  * using tmpalloc() and tmpalloc0().
56  *
57  * IDENTIFICATION
58  *	  src/backend/tsearch/spell.c
59  *
60  *-------------------------------------------------------------------------
61  */
62 
63 #include "postgres.h"
64 
65 #include "catalog/pg_collation.h"
66 #include "tsearch/dicts/spell.h"
67 #include "tsearch/ts_locale.h"
68 #include "utils/memutils.h"
69 
70 
/*
 * Build-time scratch allocation.
 *
 * Building a dictionary needs a lot of memory that is of no use once the
 * build has finished.  While building, CurrentMemoryContext is the
 * long-lived context that belongs to the dictionary cache entry, so the
 * short-lived data is allocated in Conf->buildCxt through these macros.
 *
 * Both macros expect a variable named "Conf" to be in scope where they are
 * used.  tmpalloc0() additionally zero-fills the returned memory.
 */
#define tmpalloc(sz)	MemoryContextAlloc(Conf->buildCxt, (sz))
#define tmpalloc0(sz)	MemoryContextAllocZero(Conf->buildCxt, (sz))
80 
81 /*
82  * Prepare for constructing an ISpell dictionary.
83  *
84  * The IspellDict struct is assumed to be zeroed when allocated.
85  */
86 void
NIStartBuild(IspellDict * Conf)87 NIStartBuild(IspellDict *Conf)
88 {
89 	/*
90 	 * The temp context is a child of CurTransactionContext, so that it will
91 	 * go away automatically on error.
92 	 */
93 	Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
94 										   "Ispell dictionary init context",
95 										   ALLOCSET_DEFAULT_SIZES);
96 }
97 
98 /*
99  * Clean up when dictionary construction is complete.
100  */
101 void
NIFinishBuild(IspellDict * Conf)102 NIFinishBuild(IspellDict *Conf)
103 {
104 	/* Release no-longer-needed temp memory */
105 	MemoryContextDelete(Conf->buildCxt);
106 	/* Just for cleanliness, zero the now-dangling pointers */
107 	Conf->buildCxt = NULL;
108 	Conf->Spell = NULL;
109 	Conf->firstfree = NULL;
110 	Conf->CompoundAffixFlags = NULL;
111 }
112 
113 
114 /*
115  * "Compact" palloc: allocate without extra palloc overhead.
116  *
117  * Since we have no need to free the ispell data items individually, there's
118  * not much value in the per-chunk overhead normally consumed by palloc.
119  * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
120  *
121  * We currently pre-zero all data allocated this way, even though some of it
122  * doesn't need that.  The cpalloc and cpalloc0 macros are just documentation
123  * to indicate which allocations actually require zeroing.
124  */
125 #define COMPACT_ALLOC_CHUNK 8192	/* amount to get from palloc at once */
126 #define COMPACT_MAX_REQ		1024	/* must be < COMPACT_ALLOC_CHUNK */
127 
128 static void *
compact_palloc0(IspellDict * Conf,size_t size)129 compact_palloc0(IspellDict *Conf, size_t size)
130 {
131 	void	   *result;
132 
133 	/* Should only be called during init */
134 	Assert(Conf->buildCxt != NULL);
135 
136 	/* No point in this for large chunks */
137 	if (size > COMPACT_MAX_REQ)
138 		return palloc0(size);
139 
140 	/* Keep everything maxaligned */
141 	size = MAXALIGN(size);
142 
143 	/* Need more space? */
144 	if (size > Conf->avail)
145 	{
146 		Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
147 		Conf->avail = COMPACT_ALLOC_CHUNK;
148 	}
149 
150 	result = (void *) Conf->firstfree;
151 	Conf->firstfree += size;
152 	Conf->avail -= size;
153 
154 	return result;
155 }
156 
/*
 * Shorthands for compact_palloc0().  Both expand to the same call and need
 * a variable named "Conf" in scope; the two spellings only record whether
 * the call site depends on the memory being zeroed.
 */
#define cpalloc(size)	compact_palloc0(Conf, size)
#define cpalloc0(size)	compact_palloc0(Conf, size)
159 
160 static char *
cpstrdup(IspellDict * Conf,const char * str)161 cpstrdup(IspellDict *Conf, const char *str)
162 {
163 	char	   *res = cpalloc(strlen(str) + 1);
164 
165 	strcpy(res, str);
166 	return res;
167 }
168 
169 
170 /*
171  * Apply lowerstr(), producing a temporary result (in the buildCxt).
172  */
173 static char *
lowerstr_ctx(IspellDict * Conf,const char * src)174 lowerstr_ctx(IspellDict *Conf, const char *src)
175 {
176 	MemoryContext saveCtx;
177 	char	   *dst;
178 
179 	saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
180 	dst = lowerstr(src);
181 	MemoryContextSwitchTo(saveCtx);
182 
183 	return dst;
184 }
185 
/* Size limits; NOTE(review): their users are outside this part of the file */
#define MAX_NORM 1024
#define MAXNORMLEN 256

/*
 * Compare the first strlen(p) bytes of s with p, i.e. test whether s starts
 * with p.  Note that p is evaluated twice.
 */
#define STRNCMP(s,p)	strncmp( (s), (p), strlen(p) )

/*
 * Fetch byte number N of the L-byte string W.  For FF_PREFIX the bytes are
 * counted from the start of the string, for any other type T they are
 * counted backwards from its last byte.
 */
#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )

/* Same as GETWCHAR, applied to the repl string of the affix A */
#define GETCHAR(A,N,T)	  GETWCHAR( (A)->repl, (A)->replen, N, T )

/* Shared empty string, used where an empty value would otherwise be copied */
static char *VoidString = "";
194 
195 static int
cmpspell(const void * s1,const void * s2)196 cmpspell(const void *s1, const void *s2)
197 {
198 	return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word);
199 }
200 
201 static int
cmpspellaffix(const void * s1,const void * s2)202 cmpspellaffix(const void *s1, const void *s2)
203 {
204 	return strcmp((*(SPELL *const *) s1)->p.flag,
205 				  (*(SPELL *const *) s2)->p.flag);
206 }
207 
208 static int
cmpcmdflag(const void * f1,const void * f2)209 cmpcmdflag(const void *f1, const void *f2)
210 {
211 	CompoundAffixFlag *fv1 = (CompoundAffixFlag *) f1,
212 			   *fv2 = (CompoundAffixFlag *) f2;
213 
214 	Assert(fv1->flagMode == fv2->flagMode);
215 
216 	if (fv1->flagMode == FM_NUM)
217 	{
218 		if (fv1->flag.i == fv2->flag.i)
219 			return 0;
220 
221 		return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
222 	}
223 
224 	return strcmp(fv1->flag.s, fv2->flag.s);
225 }
226 
/*
 * Find the first character of str that equals c, stepping through the
 * string one (possibly multibyte) character at a time.
 *
 * Returns a pointer to the matching character, or NULL if there is none.
 */
static char *
findchar(char *str, int c)
{
	char	   *p;

	for (p = str; *p != '\0'; p += pg_mblen(p))
	{
		if (t_iseq(p, c))
			return p;
	}

	return NULL;
}
239 
/*
 * Find the first character of str that equals either c1 or c2, stepping
 * through the string one (possibly multibyte) character at a time.
 *
 * Returns a pointer to the matching character, or NULL if there is none.
 */
static char *
findchar2(char *str, int c1, int c2)
{
	char	   *p;

	for (p = str; *p != '\0'; p += pg_mblen(p))
	{
		if (t_iseq(p, c1) || t_iseq(p, c2))
			return p;
	}

	return NULL;
}
252 
253 
/*
 * Backward string comparison, for suffix tree operations.
 *
 * The two strings are compared byte by byte starting from their last bytes
 * and moving towards the front.  If one string runs out first (it is a
 * suffix of the other), the shorter one sorts first.
 *
 * Returns -1, 0 or 1.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* All compared bytes matched; whichever string is exhausted is smaller */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}
277 
/*
 * Backward string comparison limited to "count" bytes.
 *
 * Works like strbcmp(), but stops after comparing count bytes from the ends
 * of the strings.  If that many bytes matched, the strings are reported as
 * equal; otherwise, if one string ran out first, the shorter one sorts
 * first.
 *
 * Returns -1, 0 or 1.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;
	int			left = count;

	for (; i1 >= 0 && i2 >= 0 && left > 0; i1--, i2--, left--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* The requested number of bytes was compared without a difference */
	if (left == 0)
		return 0;

	/* Otherwise one string was exhausted early; the shorter one is smaller */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}
303 
304 /*
305  * Compares affixes.
306  * First compares the type of an affix. Prefixes should go before affixes.
307  * If types are equal then compares replaceable string.
308  */
309 static int
cmpaffix(const void * s1,const void * s2)310 cmpaffix(const void *s1, const void *s2)
311 {
312 	const AFFIX *a1 = (const AFFIX *) s1;
313 	const AFFIX *a2 = (const AFFIX *) s2;
314 
315 	if (a1->type < a2->type)
316 		return -1;
317 	if (a1->type > a2->type)
318 		return 1;
319 	if (a1->type == FF_PREFIX)
320 		return strcmp(a1->repl, a2->repl);
321 	else
322 		return strbcmp((const unsigned char *) a1->repl,
323 					   (const unsigned char *) a2->repl);
324 }
325 
326 /*
327  * Gets an affix flag from the set of affix flags (sflagset).
328  *
329  * Several flags can be stored in a single string. Flags can be represented by:
330  * - 1 character (FM_CHAR). A character may be Unicode.
331  * - 2 characters (FM_LONG). A character may be Unicode.
332  * - numbers from 1 to 65000 (FM_NUM).
333  *
334  * Depending on the flagMode an affix string can have the following format:
335  * - FM_CHAR: ABCD
336  *	 Here we have 4 flags: A, B, C and D
337  * - FM_LONG: ABCDE*
338  *	 Here we have 3 flags: AB, CD and E*
339  * - FM_NUM: 200,205,50
340  *	 Here we have 3 flags: 200, 205 and 50
341  *
342  * Conf: current dictionary.
343  * sflagset: the set of affix flags. Returns a reference to the start of a next
344  *			 affix flag.
345  * sflag: returns an affix flag from sflagset.
346  */
347 static void
getNextFlagFromString(IspellDict * Conf,char ** sflagset,char * sflag)348 getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
349 {
350 	int32		s;
351 	char	   *next,
352 			   *sbuf = *sflagset;
353 	int			maxstep;
354 	bool		stop = false;
355 	bool		met_comma = false;
356 
357 	maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
358 
359 	while (**sflagset)
360 	{
361 		switch (Conf->flagMode)
362 		{
363 			case FM_LONG:
364 			case FM_CHAR:
365 				COPYCHAR(sflag, *sflagset);
366 				sflag += pg_mblen(*sflagset);
367 
368 				/* Go to start of the next flag */
369 				*sflagset += pg_mblen(*sflagset);
370 
371 				/* Check if we get all characters of flag */
372 				maxstep--;
373 				stop = (maxstep == 0);
374 				break;
375 			case FM_NUM:
376 				s = strtol(*sflagset, &next, 10);
377 				if (*sflagset == next || errno == ERANGE)
378 					ereport(ERROR,
379 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
380 							 errmsg("invalid affix flag \"%s\"", *sflagset)));
381 				if (s < 0 || s > FLAGNUM_MAXSIZE)
382 					ereport(ERROR,
383 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
384 							 errmsg("affix flag \"%s\" is out of range",
385 									*sflagset)));
386 				sflag += sprintf(sflag, "%0d", s);
387 
388 				/* Go to start of the next flag */
389 				*sflagset = next;
390 				while (**sflagset)
391 				{
392 					if (t_isdigit(*sflagset))
393 					{
394 						if (!met_comma)
395 							ereport(ERROR,
396 									(errcode(ERRCODE_CONFIG_FILE_ERROR),
397 									 errmsg("invalid affix flag \"%s\"",
398 											*sflagset)));
399 						break;
400 					}
401 					else if (t_iseq(*sflagset, ','))
402 					{
403 						if (met_comma)
404 							ereport(ERROR,
405 									(errcode(ERRCODE_CONFIG_FILE_ERROR),
406 									 errmsg("invalid affix flag \"%s\"",
407 											*sflagset)));
408 						met_comma = true;
409 					}
410 					else if (!t_isspace(*sflagset))
411 					{
412 						ereport(ERROR,
413 								(errcode(ERRCODE_CONFIG_FILE_ERROR),
414 								 errmsg("invalid character in affix flag \"%s\"",
415 										*sflagset)));
416 					}
417 
418 					*sflagset += pg_mblen(*sflagset);
419 				}
420 				stop = true;
421 				break;
422 			default:
423 				elog(ERROR, "unrecognized type of Conf->flagMode: %d",
424 					 Conf->flagMode);
425 		}
426 
427 		if (stop)
428 			break;
429 	}
430 
431 	if (Conf->flagMode == FM_LONG && maxstep > 0)
432 		ereport(ERROR,
433 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
434 				 errmsg("invalid affix flag \"%s\" with \"long\" flag value",
435 						sbuf)));
436 
437 	*sflag = '\0';
438 }
439 
440 /*
441  * Checks if the affix set Conf->AffixData[affix] contains affixflag.
442  * Conf->AffixData[affix] does not contain affixflag if this flag is not used
443  * actually by the .dict file.
444  *
445  * Conf: current dictionary.
446  * affix: index of the Conf->AffixData array.
447  * affixflag: the affix flag.
448  *
449  * Returns true if the string Conf->AffixData[affix] contains affixflag,
450  * otherwise returns false.
451  */
452 static bool
IsAffixFlagInUse(IspellDict * Conf,int affix,const char * affixflag)453 IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
454 {
455 	char	   *flagcur;
456 	char		flag[BUFSIZ];
457 
458 	if (*affixflag == 0)
459 		return true;
460 
461 	Assert(affix < Conf->nAffixData);
462 
463 	flagcur = Conf->AffixData[affix];
464 
465 	while (*flagcur)
466 	{
467 		getNextFlagFromString(Conf, &flagcur, flag);
468 		/* Compare first affix flag in flagcur with affixflag */
469 		if (strcmp(flag, affixflag) == 0)
470 			return true;
471 	}
472 
473 	/* Could not find affixflag */
474 	return false;
475 }
476 
477 /*
478  * Adds the new word into the temporary array Spell.
479  *
480  * Conf: current dictionary.
481  * word: new word.
482  * flag: set of affix flags. Single flag can be get by getNextFlagFromString().
483  */
484 static void
NIAddSpell(IspellDict * Conf,const char * word,const char * flag)485 NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
486 {
487 	if (Conf->nspell >= Conf->mspell)
488 	{
489 		if (Conf->mspell)
490 		{
491 			Conf->mspell *= 2;
492 			Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
493 		}
494 		else
495 		{
496 			Conf->mspell = 1024 * 20;
497 			Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
498 		}
499 	}
500 	Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
501 	strcpy(Conf->Spell[Conf->nspell]->word, word);
502 	Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
503 		? cpstrdup(Conf, flag) : VoidString;
504 	Conf->nspell++;
505 }
506 
507 /*
508  * Imports dictionary into the temporary array Spell.
509  *
510  * Note caller must already have applied get_tsearch_config_filename.
511  *
512  * Conf: current dictionary.
513  * filename: path to the .dict file.
514  */
515 void
NIImportDictionary(IspellDict * Conf,const char * filename)516 NIImportDictionary(IspellDict *Conf, const char *filename)
517 {
518 	tsearch_readline_state trst;
519 	char	   *line;
520 
521 	if (!tsearch_readline_begin(&trst, filename))
522 		ereport(ERROR,
523 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
524 				 errmsg("could not open dictionary file \"%s\": %m",
525 						filename)));
526 
527 	while ((line = tsearch_readline(&trst)) != NULL)
528 	{
529 		char	   *s,
530 				   *pstr;
531 
532 		/* Set of affix flags */
533 		const char *flag;
534 
535 		/* Extract flag from the line */
536 		flag = NULL;
537 		if ((s = findchar(line, '/')))
538 		{
539 			*s++ = '\0';
540 			flag = s;
541 			while (*s)
542 			{
543 				/* we allow only single encoded flags for faster works */
544 				if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
545 					s++;
546 				else
547 				{
548 					*s = '\0';
549 					break;
550 				}
551 			}
552 		}
553 		else
554 			flag = "";
555 
556 		/* Remove trailing spaces */
557 		s = line;
558 		while (*s)
559 		{
560 			if (t_isspace(s))
561 			{
562 				*s = '\0';
563 				break;
564 			}
565 			s += pg_mblen(s);
566 		}
567 		pstr = lowerstr_ctx(Conf, line);
568 
569 		NIAddSpell(Conf, pstr, flag);
570 		pfree(pstr);
571 
572 		pfree(line);
573 	}
574 	tsearch_readline_end(&trst);
575 }
576 
577 /*
578  * Searches a basic form of word in the prefix tree. This word was generated
579  * using an affix rule. This rule may not be presented in an affix set of
580  * a basic form of word.
581  *
582  * For example, we have the entry in the .dict file:
583  * meter/GMD
584  *
585  * The affix rule with the flag S:
586  * SFX S   y	 ies		[^aeiou]y
587  * is not presented here.
588  *
589  * The affix rule with the flag M:
590  * SFX M   0	 's         .
591  * is presented here.
592  *
593  * Conf: current dictionary.
594  * word: basic form of word.
595  * affixflag: affix flag, by which a basic form of word was generated.
596  * flag: compound flag used to compare with StopMiddle->compoundflag.
597  *
598  * Returns 1 if the word was found in the prefix tree, else returns 0.
599  */
600 static int
FindWord(IspellDict * Conf,const char * word,const char * affixflag,int flag)601 FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
602 {
603 	SPNode	   *node = Conf->Dictionary;
604 	SPNodeData *StopLow,
605 			   *StopHigh,
606 			   *StopMiddle;
607 	const uint8 *ptr = (const uint8 *) word;
608 
609 	flag &= FF_COMPOUNDFLAGMASK;
610 
611 	while (node && *ptr)
612 	{
613 		StopLow = node->data;
614 		StopHigh = node->data + node->length;
615 		while (StopLow < StopHigh)
616 		{
617 			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
618 			if (StopMiddle->val == *ptr)
619 			{
620 				if (*(ptr + 1) == '\0' && StopMiddle->isword)
621 				{
622 					if (flag == 0)
623 					{
624 						/*
625 						 * The word can be formed only with another word. And
626 						 * in the flag parameter there is not a sign that we
627 						 * search compound words.
628 						 */
629 						if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
630 							return 0;
631 					}
632 					else if ((flag & StopMiddle->compoundflag) == 0)
633 						return 0;
634 
635 					/*
636 					 * Check if this affix rule is presented in the affix set
637 					 * with index StopMiddle->affix.
638 					 */
639 					if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
640 						return 1;
641 				}
642 				node = StopMiddle->node;
643 				ptr++;
644 				break;
645 			}
646 			else if (StopMiddle->val < *ptr)
647 				StopLow = StopMiddle + 1;
648 			else
649 				StopHigh = StopMiddle;
650 		}
651 		if (StopLow >= StopHigh)
652 			break;
653 	}
654 	return 0;
655 }
656 
657 /*
658  * Context reset/delete callback for a regular expression used in an affix
659  */
660 static void
regex_affix_deletion_callback(void * arg)661 regex_affix_deletion_callback(void *arg)
662 {
663 	aff_regex_struct *pregex = (aff_regex_struct *) arg;
664 
665 	pg_regfree(&(pregex->regex));
666 }
667 
668 /*
669  * Adds a new affix rule to the Affix field.
670  *
671  * Conf: current dictionary.
672  * flag: affix flag ('\' in the below example).
673  * flagflags: set of flags from the flagval field for this affix rule. This set
674  *			  is listed after '/' character in the added string (repl).
675  *
676  *			  For example L flag in the hunspell_sample.affix:
677  *			  SFX \   0 Y/L [^Y]
678  *
679  * mask: condition for search ('[^Y]' in the above example).
680  * find: stripping characters from beginning (at prefix) or end (at suffix)
681  *		 of the word ('0' in the above example, 0 means that there is not
682  *		 stripping character).
683  * repl: adding string after stripping ('Y' in the above example).
684  * type: FF_SUFFIX or FF_PREFIX.
685  */
686 static void
NIAddAffix(IspellDict * Conf,const char * flag,char flagflags,const char * mask,const char * find,const char * repl,int type)687 NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
688 		   const char *find, const char *repl, int type)
689 {
690 	AFFIX	   *Affix;
691 
692 	if (Conf->naffixes >= Conf->maffixes)
693 	{
694 		if (Conf->maffixes)
695 		{
696 			Conf->maffixes *= 2;
697 			Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
698 		}
699 		else
700 		{
701 			Conf->maffixes = 16;
702 			Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
703 		}
704 	}
705 
706 	Affix = Conf->Affix + Conf->naffixes;
707 
708 	/* This affix rule can be applied for words with any ending */
709 	if (strcmp(mask, ".") == 0 || *mask == '\0')
710 	{
711 		Affix->issimple = 1;
712 		Affix->isregis = 0;
713 	}
714 	/* This affix rule will use regis to search word ending */
715 	else if (RS_isRegis(mask))
716 	{
717 		Affix->issimple = 0;
718 		Affix->isregis = 1;
719 		RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
720 				   *mask ? mask : VoidString);
721 	}
722 	/* This affix rule will use regex_t to search word ending */
723 	else
724 	{
725 		int			masklen;
726 		int			wmasklen;
727 		int			err;
728 		pg_wchar   *wmask;
729 		char	   *tmask;
730 		aff_regex_struct *pregex;
731 
732 		Affix->issimple = 0;
733 		Affix->isregis = 0;
734 		tmask = (char *) tmpalloc(strlen(mask) + 3);
735 		if (type == FF_SUFFIX)
736 			sprintf(tmask, "%s$", mask);
737 		else
738 			sprintf(tmask, "^%s", mask);
739 
740 		masklen = strlen(tmask);
741 		wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
742 		wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
743 
744 		/*
745 		 * The regex engine stores its stuff using malloc not palloc, so we
746 		 * must arrange to explicitly clean up the regex when the dictionary's
747 		 * context is cleared.  That means the regex_t has to stay in a fixed
748 		 * location within the context; we can't keep it directly in the AFFIX
749 		 * struct, since we may sort and resize the array of AFFIXes.
750 		 */
751 		Affix->reg.pregex = pregex = palloc(sizeof(aff_regex_struct));
752 
753 		err = pg_regcomp(&(pregex->regex), wmask, wmasklen,
754 						 REG_ADVANCED | REG_NOSUB,
755 						 DEFAULT_COLLATION_OID);
756 		if (err)
757 		{
758 			char		errstr[100];
759 
760 			pg_regerror(err, &(pregex->regex), errstr, sizeof(errstr));
761 			ereport(ERROR,
762 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
763 					 errmsg("invalid regular expression: %s", errstr)));
764 		}
765 
766 		pregex->mcallback.func = regex_affix_deletion_callback;
767 		pregex->mcallback.arg = (void *) pregex;
768 		MemoryContextRegisterResetCallback(CurrentMemoryContext,
769 										   &pregex->mcallback);
770 	}
771 
772 	Affix->flagflags = flagflags;
773 	if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
774 	{
775 		if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
776 			Affix->flagflags |= FF_COMPOUNDFLAG;
777 	}
778 	Affix->flag = cpstrdup(Conf, flag);
779 	Affix->type = type;
780 
781 	Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
782 	if ((Affix->replen = strlen(repl)) > 0)
783 		Affix->repl = cpstrdup(Conf, repl);
784 	else
785 		Affix->repl = VoidString;
786 	Conf->naffixes++;
787 }
788 
/*
 * States of the parsers for .affix file lines: parse_affentry(),
 * parse_ooaffentry() and get_nextfield().  A WAIT state means that the
 * start of the named field has not been seen yet, an IN state means that
 * the parser is inside that field.
 */
#define PAE_WAIT_MASK	0
#define PAE_INMASK		1
#define PAE_WAIT_FIND	2
#define PAE_INFIND		3
#define PAE_WAIT_REPL	4
#define PAE_INREPL		5
#define PAE_WAIT_TYPE	6
#define PAE_WAIT_FLAG	7
798 
799 /*
800  * Parse next space-separated field of an .affix file line.
801  *
802  * *str is the input pointer (will be advanced past field)
803  * next is where to copy the field value to, with null termination
804  *
805  * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
806  *
807  * Returns true if we found a field, false if not.
808  */
809 static bool
get_nextfield(char ** str,char * next)810 get_nextfield(char **str, char *next)
811 {
812 	int			state = PAE_WAIT_MASK;
813 	int			avail = BUFSIZ;
814 
815 	while (**str)
816 	{
817 		if (state == PAE_WAIT_MASK)
818 		{
819 			if (t_iseq(*str, '#'))
820 				return false;
821 			else if (!t_isspace(*str))
822 			{
823 				int			clen = pg_mblen(*str);
824 
825 				if (clen < avail)
826 				{
827 					COPYCHAR(next, *str);
828 					next += clen;
829 					avail -= clen;
830 				}
831 				state = PAE_INMASK;
832 			}
833 		}
834 		else					/* state == PAE_INMASK */
835 		{
836 			if (t_isspace(*str))
837 			{
838 				*next = '\0';
839 				return true;
840 			}
841 			else
842 			{
843 				int			clen = pg_mblen(*str);
844 
845 				if (clen < avail)
846 				{
847 					COPYCHAR(next, *str);
848 					next += clen;
849 					avail -= clen;
850 				}
851 			}
852 		}
853 		*str += pg_mblen(*str);
854 	}
855 
856 	*next = '\0';
857 
858 	return (state == PAE_INMASK);	/* OK if we got a nonempty field */
859 }
860 
861 /*
862  * Parses entry of an .affix file of MySpell or Hunspell format.
863  *
864  * An .affix file entry has the following format:
865  * - header
866  *	 <type>  <flag>  <cross_flag>  <flag_count>
867  * - fields after header:
868  *	 <type>  <flag>  <find>  <replace>	<mask>
869  *
870  * str is the input line
871  * field values are returned to type etc, which must be buffers of size BUFSIZ.
872  *
873  * Returns number of fields found; any omitted fields are set to empty strings.
874  */
875 static int
parse_ooaffentry(char * str,char * type,char * flag,char * find,char * repl,char * mask)876 parse_ooaffentry(char *str, char *type, char *flag, char *find,
877 				 char *repl, char *mask)
878 {
879 	int			state = PAE_WAIT_TYPE;
880 	int			fields_read = 0;
881 	bool		valid = false;
882 
883 	*type = *flag = *find = *repl = *mask = '\0';
884 
885 	while (*str)
886 	{
887 		switch (state)
888 		{
889 			case PAE_WAIT_TYPE:
890 				valid = get_nextfield(&str, type);
891 				state = PAE_WAIT_FLAG;
892 				break;
893 			case PAE_WAIT_FLAG:
894 				valid = get_nextfield(&str, flag);
895 				state = PAE_WAIT_FIND;
896 				break;
897 			case PAE_WAIT_FIND:
898 				valid = get_nextfield(&str, find);
899 				state = PAE_WAIT_REPL;
900 				break;
901 			case PAE_WAIT_REPL:
902 				valid = get_nextfield(&str, repl);
903 				state = PAE_WAIT_MASK;
904 				break;
905 			case PAE_WAIT_MASK:
906 				valid = get_nextfield(&str, mask);
907 				state = -1;		/* force loop exit */
908 				break;
909 			default:
910 				elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
911 					 state);
912 				break;
913 		}
914 		if (valid)
915 			fields_read++;
916 		else
917 			break;				/* early EOL */
918 		if (state < 0)
919 			break;				/* got all fields */
920 	}
921 
922 	return fields_read;
923 }
924 
925 /*
926  * Parses entry of an .affix file of Ispell format
927  *
928  * An .affix file entry has the following format:
929  * <mask>  >  [-<find>,]<replace>
930  */
931 static bool
parse_affentry(char * str,char * mask,char * find,char * repl)932 parse_affentry(char *str, char *mask, char *find, char *repl)
933 {
934 	int			state = PAE_WAIT_MASK;
935 	char	   *pmask = mask,
936 			   *pfind = find,
937 			   *prepl = repl;
938 
939 	*mask = *find = *repl = '\0';
940 
941 	while (*str)
942 	{
943 		if (state == PAE_WAIT_MASK)
944 		{
945 			if (t_iseq(str, '#'))
946 				return false;
947 			else if (!t_isspace(str))
948 			{
949 				COPYCHAR(pmask, str);
950 				pmask += pg_mblen(str);
951 				state = PAE_INMASK;
952 			}
953 		}
954 		else if (state == PAE_INMASK)
955 		{
956 			if (t_iseq(str, '>'))
957 			{
958 				*pmask = '\0';
959 				state = PAE_WAIT_FIND;
960 			}
961 			else if (!t_isspace(str))
962 			{
963 				COPYCHAR(pmask, str);
964 				pmask += pg_mblen(str);
965 			}
966 		}
967 		else if (state == PAE_WAIT_FIND)
968 		{
969 			if (t_iseq(str, '-'))
970 			{
971 				state = PAE_INFIND;
972 			}
973 			else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
974 			{
975 				COPYCHAR(prepl, str);
976 				prepl += pg_mblen(str);
977 				state = PAE_INREPL;
978 			}
979 			else if (!t_isspace(str))
980 				ereport(ERROR,
981 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
982 						 errmsg("syntax error")));
983 		}
984 		else if (state == PAE_INFIND)
985 		{
986 			if (t_iseq(str, ','))
987 			{
988 				*pfind = '\0';
989 				state = PAE_WAIT_REPL;
990 			}
991 			else if (t_isalpha(str))
992 			{
993 				COPYCHAR(pfind, str);
994 				pfind += pg_mblen(str);
995 			}
996 			else if (!t_isspace(str))
997 				ereport(ERROR,
998 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
999 						 errmsg("syntax error")));
1000 		}
1001 		else if (state == PAE_WAIT_REPL)
1002 		{
1003 			if (t_iseq(str, '-'))
1004 			{
1005 				break;			/* void repl */
1006 			}
1007 			else if (t_isalpha(str))
1008 			{
1009 				COPYCHAR(prepl, str);
1010 				prepl += pg_mblen(str);
1011 				state = PAE_INREPL;
1012 			}
1013 			else if (!t_isspace(str))
1014 				ereport(ERROR,
1015 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
1016 						 errmsg("syntax error")));
1017 		}
1018 		else if (state == PAE_INREPL)
1019 		{
1020 			if (t_iseq(str, '#'))
1021 			{
1022 				*prepl = '\0';
1023 				break;
1024 			}
1025 			else if (t_isalpha(str))
1026 			{
1027 				COPYCHAR(prepl, str);
1028 				prepl += pg_mblen(str);
1029 			}
1030 			else if (!t_isspace(str))
1031 				ereport(ERROR,
1032 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
1033 						 errmsg("syntax error")));
1034 		}
1035 		else
1036 			elog(ERROR, "unrecognized state in parse_affentry: %d", state);
1037 
1038 		str += pg_mblen(str);
1039 	}
1040 
1041 	*pmask = *pfind = *prepl = '\0';
1042 
1043 	return (*mask && (*find || *repl));
1044 }
1045 
1046 /*
1047  * Sets a Hunspell options depending on flag type.
1048  */
1049 static void
setCompoundAffixFlagValue(IspellDict * Conf,CompoundAffixFlag * entry,char * s,uint32 val)1050 setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry,
1051 						  char *s, uint32 val)
1052 {
1053 	if (Conf->flagMode == FM_NUM)
1054 	{
1055 		char	   *next;
1056 		int			i;
1057 
1058 		i = strtol(s, &next, 10);
1059 		if (s == next || errno == ERANGE)
1060 			ereport(ERROR,
1061 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
1062 					 errmsg("invalid affix flag \"%s\"", s)));
1063 		if (i < 0 || i > FLAGNUM_MAXSIZE)
1064 			ereport(ERROR,
1065 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
1066 					 errmsg("affix flag \"%s\" is out of range", s)));
1067 
1068 		entry->flag.i = i;
1069 	}
1070 	else
1071 		entry->flag.s = cpstrdup(Conf, s);
1072 
1073 	entry->flagMode = Conf->flagMode;
1074 	entry->value = val;
1075 }
1076 
1077 /*
1078  * Sets up a correspondence for the affix parameter with the affix flag.
1079  *
1080  * Conf: current dictionary.
1081  * s: affix flag in string.
1082  * val: affix parameter.
1083  */
1084 static void
addCompoundAffixFlagValue(IspellDict * Conf,char * s,uint32 val)1085 addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
1086 {
1087 	CompoundAffixFlag *newValue;
1088 	char		sbuf[BUFSIZ];
1089 	char	   *sflag;
1090 	int			clen;
1091 
1092 	while (*s && t_isspace(s))
1093 		s += pg_mblen(s);
1094 
1095 	if (!*s)
1096 		ereport(ERROR,
1097 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
1098 				 errmsg("syntax error")));
1099 
1100 	/* Get flag without \n */
1101 	sflag = sbuf;
1102 	while (*s && !t_isspace(s) && *s != '\n')
1103 	{
1104 		clen = pg_mblen(s);
1105 		COPYCHAR(sflag, s);
1106 		sflag += clen;
1107 		s += clen;
1108 	}
1109 	*sflag = '\0';
1110 
1111 	/* Resize array or allocate memory for array CompoundAffixFlag */
1112 	if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
1113 	{
1114 		if (Conf->mCompoundAffixFlag)
1115 		{
1116 			Conf->mCompoundAffixFlag *= 2;
1117 			Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1118 				repalloc((void *) Conf->CompoundAffixFlags,
1119 						 Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1120 		}
1121 		else
1122 		{
1123 			Conf->mCompoundAffixFlag = 10;
1124 			Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1125 				tmpalloc(Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1126 		}
1127 	}
1128 
1129 	newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
1130 
1131 	setCompoundAffixFlagValue(Conf, newValue, sbuf, val);
1132 
1133 	Conf->usecompound = true;
1134 	Conf->nCompoundAffixFlag++;
1135 }
1136 
1137 /*
1138  * Returns a set of affix parameters which correspondence to the set of affix
1139  * flags s.
1140  */
1141 static int
getCompoundAffixFlagValue(IspellDict * Conf,char * s)1142 getCompoundAffixFlagValue(IspellDict *Conf, char *s)
1143 {
1144 	uint32		flag = 0;
1145 	CompoundAffixFlag *found,
1146 				key;
1147 	char		sflag[BUFSIZ];
1148 	char	   *flagcur;
1149 
1150 	if (Conf->nCompoundAffixFlag == 0)
1151 		return 0;
1152 
1153 	flagcur = s;
1154 	while (*flagcur)
1155 	{
1156 		getNextFlagFromString(Conf, &flagcur, sflag);
1157 		setCompoundAffixFlagValue(Conf, &key, sflag, 0);
1158 
1159 		found = (CompoundAffixFlag *)
1160 			bsearch(&key, (void *) Conf->CompoundAffixFlags,
1161 					Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag),
1162 					cmpcmdflag);
1163 		if (found != NULL)
1164 			flag |= found->value;
1165 	}
1166 
1167 	return flag;
1168 }
1169 
1170 /*
1171  * Returns a flag set using the s parameter.
1172  *
1173  * If Conf->useFlagAliases is true then the s parameter is index of the
1174  * Conf->AffixData array and function returns its entry.
1175  * Else function returns the s parameter.
1176  */
1177 static char *
getAffixFlagSet(IspellDict * Conf,char * s)1178 getAffixFlagSet(IspellDict *Conf, char *s)
1179 {
1180 	if (Conf->useFlagAliases && *s != '\0')
1181 	{
1182 		int			curaffix;
1183 		char	   *end;
1184 
1185 		curaffix = strtol(s, &end, 10);
1186 		if (s == end || errno == ERANGE)
1187 			ereport(ERROR,
1188 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
1189 					 errmsg("invalid affix alias \"%s\"", s)));
1190 
1191 		if (curaffix > 0 && curaffix < Conf->nAffixData)
1192 
1193 			/*
1194 			 * Do not subtract 1 from curaffix because empty string was added
1195 			 * in NIImportOOAffixes
1196 			 */
1197 			return Conf->AffixData[curaffix];
1198 		else if (curaffix > Conf->nAffixData)
1199 			ereport(ERROR,
1200 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
1201 					 errmsg("invalid affix alias \"%s\"", s)));
1202 		return VoidString;
1203 	}
1204 	else
1205 		return s;
1206 }
1207 
/*
 * Import an affix file that follows MySpell or Hunspell format.
 *
 * Conf: current dictionary.
 * filename: path to the .affix file.
 *
 * The file is read twice.  The first pass only collects the compound-related
 * flag declarations (COMPOUNDFLAG, COMPOUNDBEGIN, ...) and the FLAG mode; the
 * collected flags are then sorted with cmpcmdflag.  The second pass parses
 * AF (alias) lines into Conf->AffixData and PFX/SFX lines into affix rules
 * that are registered with NIAddAffix().
 *
 * Errors (unreadable file, bad FLAG value, bad alias count, too many aliases)
 * are reported with ereport(ERROR).
 */
static void
NIImportOOAffixes(IspellDict *Conf, const char *filename)
{
	char		type[BUFSIZ],
			   *ptype = NULL;
	char		sflag[BUFSIZ];
	char		mask[BUFSIZ],
			   *pmask;
	char		find[BUFSIZ],
			   *pfind;
	char		repl[BUFSIZ],
			   *prepl;
	bool		isSuffix = false;
	int			naffix = 0,
				curaffix = 0;
	int			sflaglen = 0;
	char		flagflags = 0;
	tsearch_readline_state trst;
	char	   *recoded;

	/* read file to find any flag */
	Conf->usecompound = false;
	Conf->useFlagAliases = false;
	Conf->flagMode = FM_CHAR;

	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open affix file \"%s\": %m",
						filename)));

	/*
	 * First pass: register every compound-related flag together with the FF_*
	 * value it stands for, and pick up the flag mode from a FLAG line.
	 */
	while ((recoded = tsearch_readline(&trst)) != NULL)
	{
		/* Skip empty lines, lines starting with whitespace, and comments */
		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
		{
			pfree(recoded);
			continue;
		}

		if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
									  FF_COMPOUNDFLAG);
		else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
									  FF_COMPOUNDBEGIN);
		else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
									  FF_COMPOUNDLAST);
		/* COMPOUNDLAST and COMPOUNDEND are synonyms */
		else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
									  FF_COMPOUNDLAST);
		else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
									  FF_COMPOUNDMIDDLE);
		else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
			addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
									  FF_COMPOUNDONLY);
		else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
			addCompoundAffixFlagValue(Conf,
									  recoded + strlen("COMPOUNDPERMITFLAG"),
									  FF_COMPOUNDPERMITFLAG);
		else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
			addCompoundAffixFlagValue(Conf,
									  recoded + strlen("COMPOUNDFORBIDFLAG"),
									  FF_COMPOUNDFORBIDFLAG);
		else if (STRNCMP(recoded, "FLAG") == 0)
		{
			char	   *s = recoded + strlen("FLAG");

			while (*s && t_isspace(s))
				s += pg_mblen(s);

			/* A FLAG line without a value leaves the mode unchanged */
			if (*s)
			{
				if (STRNCMP(s, "long") == 0)
					Conf->flagMode = FM_LONG;
				else if (STRNCMP(s, "num") == 0)
					Conf->flagMode = FM_NUM;
				else if (STRNCMP(s, "default") != 0)
					ereport(ERROR,
							(errcode(ERRCODE_CONFIG_FILE_ERROR),
							 errmsg("Ispell dictionary supports only "
									"\"default\", \"long\", "
									"and \"num\" flag values")));
			}
		}

		pfree(recoded);
	}
	tsearch_readline_end(&trst);

	/* Order the flags with cmpcmdflag, the comparator later used by bsearch() */
	if (Conf->nCompoundAffixFlag > 1)
		qsort((void *) Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
			  sizeof(CompoundAffixFlag), cmpcmdflag);

	/* Second pass: re-read the file for aliases and affix rules */
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open affix file \"%s\": %m",
						filename)));

	while ((recoded = tsearch_readline(&trst)) != NULL)
	{
		int			fields_read;

		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
			goto nextline;

		/*
		 * Split the line into its fields.  NOTE(review): fields_read is
		 * presumably the number of fields found -- confirm against
		 * parse_ooaffentry().
		 */
		fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);

		/* Keep only the lowercased type of the current line */
		if (ptype)
			pfree(ptype);
		ptype = lowerstr_ctx(Conf, type);

		/* First try to parse AF parameter (alias compression) */
		if (STRNCMP(ptype, "af") == 0)
		{
			/* First line is the number of aliases */
			if (!Conf->useFlagAliases)
			{
				Conf->useFlagAliases = true;
				naffix = atoi(sflag);
				if (naffix <= 0)
					ereport(ERROR,
							(errcode(ERRCODE_CONFIG_FILE_ERROR),
							 errmsg("invalid number of flag vector aliases")));

				/* Also reserve place for empty flag set */
				naffix++;

				Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
				Conf->lenAffixData = Conf->nAffixData = naffix;

				/* Add empty flag set into AffixData */
				Conf->AffixData[curaffix] = VoidString;
				curaffix++;
			}
			/* Other lines are aliases */
			else
			{
				if (curaffix < naffix)
				{
					Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
					curaffix++;
				}
				else
					ereport(ERROR,
							(errcode(ERRCODE_CONFIG_FILE_ERROR),
							 errmsg("number of aliases exceeds specified number %d",
									naffix - 1)));
			}
			goto nextline;
		}
		/* Else try to parse prefixes and suffixes */
		if (fields_read < 4 ||
			(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
			goto nextline;

		/*
		 * Ignore the line if the flag is empty or longer than the flag mode
		 * permits (one byte for FM_CHAR, two bytes for FM_LONG).
		 */
		sflaglen = strlen(sflag);
		if (sflaglen == 0
			|| (sflaglen > 1 && Conf->flagMode == FM_CHAR)
			|| (sflaglen > 2 && Conf->flagMode == FM_LONG))
			goto nextline;

		/*--------
		 * Affix header. For example:
		 * SFX \ N 1
		 *--------
		 */
		if (fields_read == 4)
		{
			/*
			 * Remember the kind of affix and the cross-product setting; both
			 * apply to the rule lines that follow this header.
			 */
			isSuffix = (STRNCMP(ptype, "sfx") == 0);
			if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
				flagflags = FF_CROSSPRODUCT;
			else
				flagflags = 0;
		}
		/*--------
		 * Affix fields. For example:
		 * SFX \   0	Y/L [^Y]
		 *--------
		 */
		else
		{
			char	   *ptr;
			int			aflg = 0;

			/* Get flags after '/' (flags are case sensitive) */
			if ((ptr = strchr(repl, '/')) != NULL)
				aflg |= getCompoundAffixFlagValue(Conf,
												  getAffixFlagSet(Conf,
																  ptr + 1));
			/* Get lowercased version of string before '/' */
			prepl = lowerstr_ctx(Conf, repl);
			if ((ptr = strchr(prepl, '/')) != NULL)
				*ptr = '\0';
			pfind = lowerstr_ctx(Conf, find);
			pmask = lowerstr_ctx(Conf, mask);
			/* A field starting with '0' stands for the empty string */
			if (t_iseq(find, '0'))
				*pfind = '\0';
			if (t_iseq(repl, '0'))
				*prepl = '\0';

			NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
					   isSuffix ? FF_SUFFIX : FF_PREFIX);
			pfree(prepl);
			pfree(pfind);
			pfree(pmask);
		}

nextline:
		pfree(recoded);
	}

	tsearch_readline_end(&trst);
	if (ptype)
		pfree(ptype);
}
1433 
1434 /*
1435  * import affixes
1436  *
1437  * Note caller must already have applied get_tsearch_config_filename
1438  *
1439  * This function is responsible for parsing ispell ("old format") affix files.
1440  * If we realize that the file contains new-format commands, we pass off the
1441  * work to NIImportOOAffixes(), which will re-read the whole file.
1442  */
1443 void
NIImportAffixes(IspellDict * Conf,const char * filename)1444 NIImportAffixes(IspellDict *Conf, const char *filename)
1445 {
1446 	char	   *pstr = NULL;
1447 	char		flag[BUFSIZ];
1448 	char		mask[BUFSIZ];
1449 	char		find[BUFSIZ];
1450 	char		repl[BUFSIZ];
1451 	char	   *s;
1452 	bool		suffixes = false;
1453 	bool		prefixes = false;
1454 	char		flagflags = 0;
1455 	tsearch_readline_state trst;
1456 	bool		oldformat = false;
1457 	char	   *recoded = NULL;
1458 
1459 	if (!tsearch_readline_begin(&trst, filename))
1460 		ereport(ERROR,
1461 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
1462 				 errmsg("could not open affix file \"%s\": %m",
1463 						filename)));
1464 
1465 	Conf->usecompound = false;
1466 	Conf->useFlagAliases = false;
1467 	Conf->flagMode = FM_CHAR;
1468 
1469 	while ((recoded = tsearch_readline(&trst)) != NULL)
1470 	{
1471 		pstr = lowerstr(recoded);
1472 
1473 		/* Skip comments and empty lines */
1474 		if (*pstr == '#' || *pstr == '\n')
1475 			goto nextline;
1476 
1477 		if (STRNCMP(pstr, "compoundwords") == 0)
1478 		{
1479 			/* Find case-insensitive L flag in non-lowercased string */
1480 			s = findchar2(recoded, 'l', 'L');
1481 			if (s)
1482 			{
1483 				while (*s && !t_isspace(s))
1484 					s += pg_mblen(s);
1485 				while (*s && t_isspace(s))
1486 					s += pg_mblen(s);
1487 
1488 				if (*s && pg_mblen(s) == 1)
1489 				{
1490 					addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
1491 					Conf->usecompound = true;
1492 				}
1493 				oldformat = true;
1494 				goto nextline;
1495 			}
1496 		}
1497 		if (STRNCMP(pstr, "suffixes") == 0)
1498 		{
1499 			suffixes = true;
1500 			prefixes = false;
1501 			oldformat = true;
1502 			goto nextline;
1503 		}
1504 		if (STRNCMP(pstr, "prefixes") == 0)
1505 		{
1506 			suffixes = false;
1507 			prefixes = true;
1508 			oldformat = true;
1509 			goto nextline;
1510 		}
1511 		if (STRNCMP(pstr, "flag") == 0)
1512 		{
1513 			s = recoded + 4;	/* we need non-lowercased string */
1514 			flagflags = 0;
1515 
1516 			while (*s && t_isspace(s))
1517 				s += pg_mblen(s);
1518 
1519 			if (*s == '*')
1520 			{
1521 				flagflags |= FF_CROSSPRODUCT;
1522 				s++;
1523 			}
1524 			else if (*s == '~')
1525 			{
1526 				flagflags |= FF_COMPOUNDONLY;
1527 				s++;
1528 			}
1529 
1530 			if (*s == '\\')
1531 				s++;
1532 
1533 			/*
1534 			 * An old-format flag is a single ASCII character; we expect it to
1535 			 * be followed by EOL, whitespace, or ':'.  Otherwise this is a
1536 			 * new-format flag command.
1537 			 */
1538 			if (*s && pg_mblen(s) == 1)
1539 			{
1540 				COPYCHAR(flag, s);
1541 				flag[1] = '\0';
1542 
1543 				s++;
1544 				if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
1545 					t_isspace(s))
1546 				{
1547 					oldformat = true;
1548 					goto nextline;
1549 				}
1550 			}
1551 			goto isnewformat;
1552 		}
1553 		if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
1554 			STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
1555 			STRNCMP(recoded, "PFX") == 0 ||
1556 			STRNCMP(recoded, "SFX") == 0)
1557 			goto isnewformat;
1558 
1559 		if ((!suffixes) && (!prefixes))
1560 			goto nextline;
1561 
1562 		if (!parse_affentry(pstr, mask, find, repl))
1563 			goto nextline;
1564 
1565 		NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
1566 
1567 nextline:
1568 		pfree(recoded);
1569 		pfree(pstr);
1570 	}
1571 	tsearch_readline_end(&trst);
1572 	return;
1573 
1574 isnewformat:
1575 	if (oldformat)
1576 		ereport(ERROR,
1577 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
1578 				 errmsg("affix file contains both old-style and new-style commands")));
1579 	tsearch_readline_end(&trst);
1580 
1581 	NIImportOOAffixes(Conf, filename);
1582 }
1583 
1584 /*
1585  * Merges two affix flag sets and stores a new affix flag set into
1586  * Conf->AffixData.
1587  *
1588  * Returns index of a new affix flag set.
1589  */
1590 static int
MergeAffix(IspellDict * Conf,int a1,int a2)1591 MergeAffix(IspellDict *Conf, int a1, int a2)
1592 {
1593 	char	  **ptr;
1594 
1595 	Assert(a1 < Conf->nAffixData && a2 < Conf->nAffixData);
1596 
1597 	/* Do not merge affix flags if one of affix flags is empty */
1598 	if (*Conf->AffixData[a1] == '\0')
1599 		return a2;
1600 	else if (*Conf->AffixData[a2] == '\0')
1601 		return a1;
1602 
1603 	while (Conf->nAffixData + 1 >= Conf->lenAffixData)
1604 	{
1605 		Conf->lenAffixData *= 2;
1606 		Conf->AffixData = (char **) repalloc(Conf->AffixData,
1607 											 sizeof(char *) * Conf->lenAffixData);
1608 	}
1609 
1610 	ptr = Conf->AffixData + Conf->nAffixData;
1611 	if (Conf->flagMode == FM_NUM)
1612 	{
1613 		*ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1614 					   strlen(Conf->AffixData[a2]) +
1615 					   1 /* comma */ + 1 /* \0 */ );
1616 		sprintf(*ptr, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1617 	}
1618 	else
1619 	{
1620 		*ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1621 					   strlen(Conf->AffixData[a2]) +
1622 					   1 /* \0 */ );
1623 		sprintf(*ptr, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1624 	}
1625 	ptr++;
1626 	*ptr = NULL;
1627 	Conf->nAffixData++;
1628 
1629 	return Conf->nAffixData - 1;
1630 }
1631 
1632 /*
1633  * Returns a set of affix parameters which correspondence to the set of affix
1634  * flags with the given index.
1635  */
1636 static uint32
makeCompoundFlags(IspellDict * Conf,int affix)1637 makeCompoundFlags(IspellDict *Conf, int affix)
1638 {
1639 	Assert(affix < Conf->nAffixData);
1640 
1641 	return (getCompoundAffixFlagValue(Conf, Conf->AffixData[affix]) &
1642 			FF_COMPOUNDFLAGMASK);
1643 }
1644 
1645 /*
1646  * Makes a prefix tree for the given level.
1647  *
1648  * Conf: current dictionary.
1649  * low: lower index of the Conf->Spell array.
1650  * high: upper index of the Conf->Spell array.
1651  * level: current prefix tree level.
1652  */
1653 static SPNode *
mkSPNode(IspellDict * Conf,int low,int high,int level)1654 mkSPNode(IspellDict *Conf, int low, int high, int level)
1655 {
1656 	int			i;
1657 	int			nchar = 0;
1658 	char		lastchar = '\0';
1659 	SPNode	   *rs;
1660 	SPNodeData *data;
1661 	int			lownew = low;
1662 
1663 	for (i = low; i < high; i++)
1664 		if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
1665 		{
1666 			nchar++;
1667 			lastchar = Conf->Spell[i]->word[level];
1668 		}
1669 
1670 	if (!nchar)
1671 		return NULL;
1672 
1673 	rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
1674 	rs->length = nchar;
1675 	data = rs->data;
1676 
1677 	lastchar = '\0';
1678 	for (i = low; i < high; i++)
1679 		if (Conf->Spell[i]->p.d.len > level)
1680 		{
1681 			if (lastchar != Conf->Spell[i]->word[level])
1682 			{
1683 				if (lastchar)
1684 				{
1685 					/* Next level of the prefix tree */
1686 					data->node = mkSPNode(Conf, lownew, i, level + 1);
1687 					lownew = i;
1688 					data++;
1689 				}
1690 				lastchar = Conf->Spell[i]->word[level];
1691 			}
1692 			data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
1693 			if (Conf->Spell[i]->p.d.len == level + 1)
1694 			{
1695 				bool		clearCompoundOnly = false;
1696 
1697 				if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
1698 				{
1699 					/*
1700 					 * MergeAffix called a few times. If one of word is
1701 					 * allowed to be in compound word and another isn't, then
1702 					 * clear FF_COMPOUNDONLY flag.
1703 					 */
1704 
1705 					clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
1706 										 & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
1707 						? false : true;
1708 					data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
1709 				}
1710 				else
1711 					data->affix = Conf->Spell[i]->p.d.affix;
1712 				data->isword = 1;
1713 
1714 				data->compoundflag = makeCompoundFlags(Conf, data->affix);
1715 
1716 				if ((data->compoundflag & FF_COMPOUNDONLY) &&
1717 					(data->compoundflag & FF_COMPOUNDFLAG) == 0)
1718 					data->compoundflag |= FF_COMPOUNDFLAG;
1719 
1720 				if (clearCompoundOnly)
1721 					data->compoundflag &= ~FF_COMPOUNDONLY;
1722 			}
1723 		}
1724 
1725 	/* Next level of the prefix tree */
1726 	data->node = mkSPNode(Conf, lownew, high, level + 1);
1727 
1728 	return rs;
1729 }
1730 
1731 /*
1732  * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
1733  * and affixes.
1734  */
1735 void
NISortDictionary(IspellDict * Conf)1736 NISortDictionary(IspellDict *Conf)
1737 {
1738 	int			i;
1739 	int			naffix = 0;
1740 	int			curaffix;
1741 
1742 	/* compress affixes */
1743 
1744 	/*
1745 	 * If we use flag aliases then we need to use Conf->AffixData filled in
1746 	 * the NIImportOOAffixes().
1747 	 */
1748 	if (Conf->useFlagAliases)
1749 	{
1750 		for (i = 0; i < Conf->nspell; i++)
1751 		{
1752 			char	   *end;
1753 
1754 			if (*Conf->Spell[i]->p.flag != '\0')
1755 			{
1756 				curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
1757 				if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
1758 					ereport(ERROR,
1759 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
1760 							 errmsg("invalid affix alias \"%s\"",
1761 									Conf->Spell[i]->p.flag)));
1762 				if (curaffix < 0 || curaffix >= Conf->nAffixData)
1763 					ereport(ERROR,
1764 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
1765 							 errmsg("invalid affix alias \"%s\"",
1766 									Conf->Spell[i]->p.flag)));
1767 				if (*end != '\0' && !t_isdigit(end) && !t_isspace(end))
1768 					ereport(ERROR,
1769 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
1770 							 errmsg("invalid affix alias \"%s\"",
1771 									Conf->Spell[i]->p.flag)));
1772 			}
1773 			else
1774 			{
1775 				/*
1776 				 * If Conf->Spell[i]->p.flag is empty, then get empty value of
1777 				 * Conf->AffixData (0 index).
1778 				 */
1779 				curaffix = 0;
1780 			}
1781 
1782 			Conf->Spell[i]->p.d.affix = curaffix;
1783 			Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1784 		}
1785 	}
1786 	/* Otherwise fill Conf->AffixData here */
1787 	else
1788 	{
1789 		/* Count the number of different flags used in the dictionary */
1790 		qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
1791 			  cmpspellaffix);
1792 
1793 		naffix = 0;
1794 		for (i = 0; i < Conf->nspell; i++)
1795 		{
1796 			if (i == 0
1797 				|| strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
1798 				naffix++;
1799 		}
1800 
1801 		/*
1802 		 * Fill in Conf->AffixData with the affixes that were used in the
1803 		 * dictionary. Replace textual flag-field of Conf->Spell entries with
1804 		 * indexes into Conf->AffixData array.
1805 		 */
1806 		Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1807 
1808 		curaffix = -1;
1809 		for (i = 0; i < Conf->nspell; i++)
1810 		{
1811 			if (i == 0
1812 				|| strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]))
1813 			{
1814 				curaffix++;
1815 				Assert(curaffix < naffix);
1816 				Conf->AffixData[curaffix] = cpstrdup(Conf,
1817 													 Conf->Spell[i]->p.flag);
1818 			}
1819 
1820 			Conf->Spell[i]->p.d.affix = curaffix;
1821 			Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1822 		}
1823 
1824 		Conf->lenAffixData = Conf->nAffixData = naffix;
1825 	}
1826 
1827 	/* Start build a prefix tree */
1828 	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
1829 	Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
1830 }
1831 
1832 /*
1833  * Makes a prefix tree for the given level using the repl string of an affix
1834  * rule. Affixes with empty replace string do not include in the prefix tree.
1835  * This affixes are included by mkVoidAffix().
1836  *
1837  * Conf: current dictionary.
1838  * low: lower index of the Conf->Affix array.
1839  * high: upper index of the Conf->Affix array.
1840  * level: current prefix tree level.
1841  * type: FF_SUFFIX or FF_PREFIX.
1842  */
1843 static AffixNode *
mkANode(IspellDict * Conf,int low,int high,int level,int type)1844 mkANode(IspellDict *Conf, int low, int high, int level, int type)
1845 {
1846 	int			i;
1847 	int			nchar = 0;
1848 	uint8		lastchar = '\0';
1849 	AffixNode  *rs;
1850 	AffixNodeData *data;
1851 	int			lownew = low;
1852 	int			naff;
1853 	AFFIX	  **aff;
1854 
1855 	for (i = low; i < high; i++)
1856 		if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
1857 		{
1858 			nchar++;
1859 			lastchar = GETCHAR(Conf->Affix + i, level, type);
1860 		}
1861 
1862 	if (!nchar)
1863 		return NULL;
1864 
1865 	aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
1866 	naff = 0;
1867 
1868 	rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
1869 	rs->length = nchar;
1870 	data = rs->data;
1871 
1872 	lastchar = '\0';
1873 	for (i = low; i < high; i++)
1874 		if (Conf->Affix[i].replen > level)
1875 		{
1876 			if (lastchar != GETCHAR(Conf->Affix + i, level, type))
1877 			{
1878 				if (lastchar)
1879 				{
1880 					/* Next level of the prefix tree */
1881 					data->node = mkANode(Conf, lownew, i, level + 1, type);
1882 					if (naff)
1883 					{
1884 						data->naff = naff;
1885 						data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1886 						memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1887 						naff = 0;
1888 					}
1889 					data++;
1890 					lownew = i;
1891 				}
1892 				lastchar = GETCHAR(Conf->Affix + i, level, type);
1893 			}
1894 			data->val = GETCHAR(Conf->Affix + i, level, type);
1895 			if (Conf->Affix[i].replen == level + 1)
1896 			{					/* affix stopped */
1897 				aff[naff++] = Conf->Affix + i;
1898 			}
1899 		}
1900 
1901 	/* Next level of the prefix tree */
1902 	data->node = mkANode(Conf, lownew, high, level + 1, type);
1903 	if (naff)
1904 	{
1905 		data->naff = naff;
1906 		data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1907 		memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1908 		naff = 0;
1909 	}
1910 
1911 	pfree(aff);
1912 
1913 	return rs;
1914 }
1915 
1916 /*
1917  * Makes the root void node in the prefix tree. The root void node is created
1918  * for affixes which have empty replace string ("repl" field).
1919  */
1920 static void
mkVoidAffix(IspellDict * Conf,bool issuffix,int startsuffix)1921 mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
1922 {
1923 	int			i,
1924 				cnt = 0;
1925 	int			start = (issuffix) ? startsuffix : 0;
1926 	int			end = (issuffix) ? Conf->naffixes : startsuffix;
1927 	AffixNode  *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
1928 
1929 	Affix->length = 1;
1930 	Affix->isvoid = 1;
1931 
1932 	if (issuffix)
1933 	{
1934 		Affix->data->node = Conf->Suffix;
1935 		Conf->Suffix = Affix;
1936 	}
1937 	else
1938 	{
1939 		Affix->data->node = Conf->Prefix;
1940 		Conf->Prefix = Affix;
1941 	}
1942 
1943 	/* Count affixes with empty replace string */
1944 	for (i = start; i < end; i++)
1945 		if (Conf->Affix[i].replen == 0)
1946 			cnt++;
1947 
1948 	/* There is not affixes with empty replace string */
1949 	if (cnt == 0)
1950 		return;
1951 
1952 	Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
1953 	Affix->data->naff = (uint32) cnt;
1954 
1955 	cnt = 0;
1956 	for (i = start; i < end; i++)
1957 		if (Conf->Affix[i].replen == 0)
1958 		{
1959 			Affix->data->aff[cnt] = Conf->Affix + i;
1960 			cnt++;
1961 		}
1962 }
1963 
1964 /*
1965  * Checks if the affixflag is used by dictionary. Conf->AffixData does not
1966  * contain affixflag if this flag is not used actually by the .dict file.
1967  *
1968  * Conf: current dictionary.
1969  * affixflag: affix flag.
1970  *
1971  * Returns true if the Conf->AffixData array contains affixflag, otherwise
1972  * returns false.
1973  */
1974 static bool
isAffixInUse(IspellDict * Conf,char * affixflag)1975 isAffixInUse(IspellDict *Conf, char *affixflag)
1976 {
1977 	int			i;
1978 
1979 	for (i = 0; i < Conf->nAffixData; i++)
1980 		if (IsAffixFlagInUse(Conf, i, affixflag))
1981 			return true;
1982 
1983 	return false;
1984 }
1985 
1986 /*
1987  * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
1988  */
1989 void
NISortAffixes(IspellDict * Conf)1990 NISortAffixes(IspellDict *Conf)
1991 {
1992 	AFFIX	   *Affix;
1993 	size_t		i;
1994 	CMPDAffix  *ptr;
1995 	int			firstsuffix = Conf->naffixes;
1996 
1997 	if (Conf->naffixes == 0)
1998 		return;
1999 
2000 	/* Store compound affixes in the Conf->CompoundAffix array */
2001 	if (Conf->naffixes > 1)
2002 		qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
2003 	Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
2004 	ptr->affix = NULL;
2005 
2006 	for (i = 0; i < Conf->naffixes; i++)
2007 	{
2008 		Affix = &(((AFFIX *) Conf->Affix)[i]);
2009 		if (Affix->type == FF_SUFFIX && i < firstsuffix)
2010 			firstsuffix = i;
2011 
2012 		if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
2013 			isAffixInUse(Conf, Affix->flag))
2014 		{
2015 			bool		issuffix = (Affix->type == FF_SUFFIX);
2016 
2017 			if (ptr == Conf->CompoundAffix ||
2018 				issuffix != (ptr - 1)->issuffix ||
2019 				strbncmp((const unsigned char *) (ptr - 1)->affix,
2020 						 (const unsigned char *) Affix->repl,
2021 						 (ptr - 1)->len))
2022 			{
2023 				/* leave only unique and minimals suffixes */
2024 				ptr->affix = Affix->repl;
2025 				ptr->len = Affix->replen;
2026 				ptr->issuffix = issuffix;
2027 				ptr++;
2028 			}
2029 		}
2030 	}
2031 	ptr->affix = NULL;
2032 	Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
2033 
2034 	/* Start build a prefix tree */
2035 	Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
2036 	Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
2037 	mkVoidAffix(Conf, true, firstsuffix);
2038 	mkVoidAffix(Conf, false, firstsuffix);
2039 }
2040 
2041 static AffixNodeData *
FindAffixes(AffixNode * node,const char * word,int wrdlen,int * level,int type)2042 FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
2043 {
2044 	AffixNodeData *StopLow,
2045 			   *StopHigh,
2046 			   *StopMiddle;
2047 	uint8 symbol;
2048 
2049 	if (node->isvoid)
2050 	{							/* search void affixes */
2051 		if (node->data->naff)
2052 			return node->data;
2053 		node = node->data->node;
2054 	}
2055 
2056 	while (node && *level < wrdlen)
2057 	{
2058 		StopLow = node->data;
2059 		StopHigh = node->data + node->length;
2060 		while (StopLow < StopHigh)
2061 		{
2062 			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2063 			symbol = GETWCHAR(word, wrdlen, *level, type);
2064 
2065 			if (StopMiddle->val == symbol)
2066 			{
2067 				(*level)++;
2068 				if (StopMiddle->naff)
2069 					return StopMiddle;
2070 				node = StopMiddle->node;
2071 				break;
2072 			}
2073 			else if (StopMiddle->val < symbol)
2074 				StopLow = StopMiddle + 1;
2075 			else
2076 				StopHigh = StopMiddle;
2077 		}
2078 		if (StopLow >= StopHigh)
2079 			break;
2080 	}
2081 	return NULL;
2082 }
2083 
2084 static char *
CheckAffix(const char * word,size_t len,AFFIX * Affix,int flagflags,char * newword,int * baselen)2085 CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
2086 {
2087 	/*
2088 	 * Check compound allow flags
2089 	 */
2090 
2091 	if (flagflags == 0)
2092 	{
2093 		if (Affix->flagflags & FF_COMPOUNDONLY)
2094 			return NULL;
2095 	}
2096 	else if (flagflags & FF_COMPOUNDBEGIN)
2097 	{
2098 		if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2099 			return NULL;
2100 		if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
2101 			if (Affix->type == FF_SUFFIX)
2102 				return NULL;
2103 	}
2104 	else if (flagflags & FF_COMPOUNDMIDDLE)
2105 	{
2106 		if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
2107 			(Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
2108 			return NULL;
2109 	}
2110 	else if (flagflags & FF_COMPOUNDLAST)
2111 	{
2112 		if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2113 			return NULL;
2114 		if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
2115 			if (Affix->type == FF_PREFIX)
2116 				return NULL;
2117 	}
2118 
2119 	/*
2120 	 * make replace pattern of affix
2121 	 */
2122 	if (Affix->type == FF_SUFFIX)
2123 	{
2124 		strcpy(newword, word);
2125 		strcpy(newword + len - Affix->replen, Affix->find);
2126 		if (baselen)			/* store length of non-changed part of word */
2127 			*baselen = len - Affix->replen;
2128 	}
2129 	else
2130 	{
2131 		/*
2132 		 * if prefix is an all non-changed part's length then all word
2133 		 * contains only prefix and suffix, so out
2134 		 */
2135 		if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
2136 			return NULL;
2137 		strcpy(newword, Affix->find);
2138 		strcat(newword, word + Affix->replen);
2139 	}
2140 
2141 	/*
2142 	 * check resulting word
2143 	 */
2144 	if (Affix->issimple)
2145 		return newword;
2146 	else if (Affix->isregis)
2147 	{
2148 		if (RS_execute(&(Affix->reg.regis), newword))
2149 			return newword;
2150 	}
2151 	else
2152 	{
2153 		pg_wchar   *data;
2154 		size_t		data_len;
2155 		int			newword_len;
2156 
2157 		/* Convert data string to wide characters */
2158 		newword_len = strlen(newword);
2159 		data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
2160 		data_len = pg_mb2wchar_with_len(newword, data, newword_len);
2161 
2162 		if (pg_regexec(&(Affix->reg.pregex->regex), data, data_len,
2163 					   0, NULL, 0, NULL, 0) == REG_OKAY)
2164 		{
2165 			pfree(data);
2166 			return newword;
2167 		}
2168 		pfree(data);
2169 	}
2170 
2171 	return NULL;
2172 }
2173 
2174 static int
addToResult(char ** forms,char ** cur,char * word)2175 addToResult(char **forms, char **cur, char *word)
2176 {
2177 	if (cur - forms >= MAX_NORM - 1)
2178 		return 0;
2179 	if (forms == cur || strcmp(word, *(cur - 1)) != 0)
2180 	{
2181 		*cur = pstrdup(word);
2182 		*(cur + 1) = NULL;
2183 		return 1;
2184 	}
2185 
2186 	return 0;
2187 }
2188 
2189 static char **
NormalizeSubWord(IspellDict * Conf,char * word,int flag)2190 NormalizeSubWord(IspellDict *Conf, char *word, int flag)
2191 {
2192 	AffixNodeData *suffix = NULL,
2193 			   *prefix = NULL;
2194 	int			slevel = 0,
2195 				plevel = 0;
2196 	int			wrdlen = strlen(word),
2197 				swrdlen;
2198 	char	  **forms;
2199 	char	  **cur;
2200 	char		newword[2 * MAXNORMLEN] = "";
2201 	char		pnewword[2 * MAXNORMLEN] = "";
2202 	AffixNode  *snode = Conf->Suffix,
2203 			   *pnode;
2204 	int			i,
2205 				j;
2206 
2207 	if (wrdlen > MAXNORMLEN)
2208 		return NULL;
2209 	cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
2210 	*cur = NULL;
2211 
2212 
2213 	/* Check that the word itself is normal form */
2214 	if (FindWord(Conf, word, VoidString, flag))
2215 	{
2216 		*cur = pstrdup(word);
2217 		cur++;
2218 		*cur = NULL;
2219 	}
2220 
2221 	/* Find all other NORMAL forms of the 'word' (check only prefix) */
2222 	pnode = Conf->Prefix;
2223 	plevel = 0;
2224 	while (pnode)
2225 	{
2226 		prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
2227 		if (!prefix)
2228 			break;
2229 		for (j = 0; j < prefix->naff; j++)
2230 		{
2231 			if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
2232 			{
2233 				/* prefix success */
2234 				if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
2235 					cur += addToResult(forms, cur, newword);
2236 			}
2237 		}
2238 		pnode = prefix->node;
2239 	}
2240 
2241 	/*
2242 	 * Find all other NORMAL forms of the 'word' (check suffix and then
2243 	 * prefix)
2244 	 */
2245 	while (snode)
2246 	{
2247 		int			baselen = 0;
2248 
2249 		/* find possible suffix */
2250 		suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
2251 		if (!suffix)
2252 			break;
2253 		/* foreach suffix check affix */
2254 		for (i = 0; i < suffix->naff; i++)
2255 		{
2256 			if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
2257 			{
2258 				/* suffix success */
2259 				if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
2260 					cur += addToResult(forms, cur, newword);
2261 
2262 				/* now we will look changed word with prefixes */
2263 				pnode = Conf->Prefix;
2264 				plevel = 0;
2265 				swrdlen = strlen(newword);
2266 				while (pnode)
2267 				{
2268 					prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
2269 					if (!prefix)
2270 						break;
2271 					for (j = 0; j < prefix->naff; j++)
2272 					{
2273 						if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
2274 						{
2275 							/* prefix success */
2276 							char	   *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
2277 							VoidString : prefix->aff[j]->flag;
2278 
2279 							if (FindWord(Conf, pnewword, ff, flag))
2280 								cur += addToResult(forms, cur, pnewword);
2281 						}
2282 					}
2283 					pnode = prefix->node;
2284 				}
2285 			}
2286 		}
2287 
2288 		snode = suffix->node;
2289 	}
2290 
2291 	if (cur == forms)
2292 	{
2293 		pfree(forms);
2294 		return NULL;
2295 	}
2296 	return forms;
2297 }
2298 
/*
 * One way of splitting a compound word into parts ("stems"). Alternative
 * splittings are chained through 'next'.
 */
typedef struct SplitVar SplitVar;

struct SplitVar
{
	int			nstem;			/* number of entries used in stem[] */
	int			lenstem;		/* number of entries allocated in stem[] */
	char	  **stem;			/* the parts of this splitting */
	SplitVar   *next;			/* next alternative splitting, or NULL */
};
2306 
2307 static int
CheckCompoundAffixes(CMPDAffix ** ptr,char * word,int len,bool CheckInPlace)2308 CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
2309 {
2310 	bool		issuffix;
2311 
2312 	/* in case CompoundAffix is null: */
2313 	if (*ptr == NULL)
2314 		return -1;
2315 
2316 	if (CheckInPlace)
2317 	{
2318 		while ((*ptr)->affix)
2319 		{
2320 			if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
2321 			{
2322 				len = (*ptr)->len;
2323 				issuffix = (*ptr)->issuffix;
2324 				(*ptr)++;
2325 				return (issuffix) ? len : 0;
2326 			}
2327 			(*ptr)++;
2328 		}
2329 	}
2330 	else
2331 	{
2332 		char	   *affbegin;
2333 
2334 		while ((*ptr)->affix)
2335 		{
2336 			if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
2337 			{
2338 				len = (*ptr)->len + (affbegin - word);
2339 				issuffix = (*ptr)->issuffix;
2340 				(*ptr)++;
2341 				return (issuffix) ? len : 0;
2342 			}
2343 			(*ptr)++;
2344 		}
2345 	}
2346 	return -1;
2347 }
2348 
2349 static SplitVar *
CopyVar(SplitVar * s,int makedup)2350 CopyVar(SplitVar *s, int makedup)
2351 {
2352 	SplitVar   *v = (SplitVar *) palloc(sizeof(SplitVar));
2353 
2354 	v->next = NULL;
2355 	if (s)
2356 	{
2357 		int			i;
2358 
2359 		v->lenstem = s->lenstem;
2360 		v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2361 		v->nstem = s->nstem;
2362 		for (i = 0; i < s->nstem; i++)
2363 			v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
2364 	}
2365 	else
2366 	{
2367 		v->lenstem = 16;
2368 		v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2369 		v->nstem = 0;
2370 	}
2371 	return v;
2372 }
2373 
2374 static void
AddStem(SplitVar * v,char * word)2375 AddStem(SplitVar *v, char *word)
2376 {
2377 	if (v->nstem >= v->lenstem)
2378 	{
2379 		v->lenstem *= 2;
2380 		v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
2381 	}
2382 
2383 	v->stem[v->nstem] = word;
2384 	v->nstem++;
2385 }
2386 
2387 static SplitVar *
SplitToVariants(IspellDict * Conf,SPNode * snode,SplitVar * orig,char * word,int wordlen,int startpos,int minpos)2388 SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
2389 {
2390 	SplitVar   *var = NULL;
2391 	SPNodeData *StopLow,
2392 			   *StopHigh,
2393 			   *StopMiddle = NULL;
2394 	SPNode	   *node = (snode) ? snode : Conf->Dictionary;
2395 	int			level = (snode) ? minpos : startpos;	/* recursive
2396 														 * minpos==level */
2397 	int			lenaff;
2398 	CMPDAffix  *caff;
2399 	char	   *notprobed;
2400 	int			compoundflag = 0;
2401 
2402 	notprobed = (char *) palloc(wordlen);
2403 	memset(notprobed, 1, wordlen);
2404 	var = CopyVar(orig, 1);
2405 
2406 	while (level < wordlen)
2407 	{
2408 		/* find word with epenthetic or/and compound affix */
2409 		caff = Conf->CompoundAffix;
2410 		while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
2411 		{
2412 			/*
2413 			 * there is one of compound affixes, so check word for existings
2414 			 */
2415 			char		buf[MAXNORMLEN];
2416 			char	  **subres;
2417 
2418 			lenaff = level - startpos + lenaff;
2419 
2420 			if (!notprobed[startpos + lenaff - 1])
2421 				continue;
2422 
2423 			if (level + lenaff - 1 <= minpos)
2424 				continue;
2425 
2426 			if (lenaff >= MAXNORMLEN)
2427 				continue;		/* skip too big value */
2428 			if (lenaff > 0)
2429 				memcpy(buf, word + startpos, lenaff);
2430 			buf[lenaff] = '\0';
2431 
2432 			if (level == 0)
2433 				compoundflag = FF_COMPOUNDBEGIN;
2434 			else if (level == wordlen - 1)
2435 				compoundflag = FF_COMPOUNDLAST;
2436 			else
2437 				compoundflag = FF_COMPOUNDMIDDLE;
2438 			subres = NormalizeSubWord(Conf, buf, compoundflag);
2439 			if (subres)
2440 			{
2441 				/* Yes, it was a word from dictionary */
2442 				SplitVar   *new = CopyVar(var, 0);
2443 				SplitVar   *ptr = var;
2444 				char	  **sptr = subres;
2445 
2446 				notprobed[startpos + lenaff - 1] = 0;
2447 
2448 				while (*sptr)
2449 				{
2450 					AddStem(new, *sptr);
2451 					sptr++;
2452 				}
2453 				pfree(subres);
2454 
2455 				while (ptr->next)
2456 					ptr = ptr->next;
2457 				ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
2458 
2459 				pfree(new->stem);
2460 				pfree(new);
2461 			}
2462 		}
2463 
2464 		if (!node)
2465 			break;
2466 
2467 		StopLow = node->data;
2468 		StopHigh = node->data + node->length;
2469 		while (StopLow < StopHigh)
2470 		{
2471 			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2472 			if (StopMiddle->val == ((uint8 *) (word))[level])
2473 				break;
2474 			else if (StopMiddle->val < ((uint8 *) (word))[level])
2475 				StopLow = StopMiddle + 1;
2476 			else
2477 				StopHigh = StopMiddle;
2478 		}
2479 
2480 		if (StopLow < StopHigh)
2481 		{
2482 			if (startpos == 0)
2483 				compoundflag = FF_COMPOUNDBEGIN;
2484 			else if (level == wordlen - 1)
2485 				compoundflag = FF_COMPOUNDLAST;
2486 			else
2487 				compoundflag = FF_COMPOUNDMIDDLE;
2488 
2489 			/* find infinitive */
2490 			if (StopMiddle->isword &&
2491 				(StopMiddle->compoundflag & compoundflag) &&
2492 				notprobed[level])
2493 			{
2494 				/* ok, we found full compoundallowed word */
2495 				if (level > minpos)
2496 				{
2497 					/* and its length more than minimal */
2498 					if (wordlen == level + 1)
2499 					{
2500 						/* well, it was last word */
2501 						AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2502 						pfree(notprobed);
2503 						return var;
2504 					}
2505 					else
2506 					{
2507 						/* then we will search more big word at the same point */
2508 						SplitVar   *ptr = var;
2509 
2510 						while (ptr->next)
2511 							ptr = ptr->next;
2512 						ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
2513 						/* we can find next word */
2514 						level++;
2515 						AddStem(var, pnstrdup(word + startpos, level - startpos));
2516 						node = Conf->Dictionary;
2517 						startpos = level;
2518 						continue;
2519 					}
2520 				}
2521 			}
2522 			node = StopMiddle->node;
2523 		}
2524 		else
2525 			node = NULL;
2526 		level++;
2527 	}
2528 
2529 	AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2530 	pfree(notprobed);
2531 	return var;
2532 }
2533 
2534 static void
addNorm(TSLexeme ** lres,TSLexeme ** lcur,char * word,int flags,uint16 NVariant)2535 addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
2536 {
2537 	if (*lres == NULL)
2538 		*lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
2539 
2540 	if (*lcur - *lres < MAX_NORM - 1)
2541 	{
2542 		(*lcur)->lexeme = word;
2543 		(*lcur)->flags = flags;
2544 		(*lcur)->nvariant = NVariant;
2545 		(*lcur)++;
2546 		(*lcur)->lexeme = NULL;
2547 	}
2548 }
2549 
2550 TSLexeme *
NINormalizeWord(IspellDict * Conf,char * word)2551 NINormalizeWord(IspellDict *Conf, char *word)
2552 {
2553 	char	  **res;
2554 	TSLexeme   *lcur = NULL,
2555 			   *lres = NULL;
2556 	uint16		NVariant = 1;
2557 
2558 	res = NormalizeSubWord(Conf, word, 0);
2559 
2560 	if (res)
2561 	{
2562 		char	  **ptr = res;
2563 
2564 		while (*ptr && (lcur - lres) < MAX_NORM)
2565 		{
2566 			addNorm(&lres, &lcur, *ptr, 0, NVariant++);
2567 			ptr++;
2568 		}
2569 		pfree(res);
2570 	}
2571 
2572 	if (Conf->usecompound)
2573 	{
2574 		int			wordlen = strlen(word);
2575 		SplitVar   *ptr,
2576 				   *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
2577 		int			i;
2578 
2579 		while (var)
2580 		{
2581 			if (var->nstem > 1)
2582 			{
2583 				char	  **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
2584 
2585 				if (subres)
2586 				{
2587 					char	  **subptr = subres;
2588 
2589 					while (*subptr)
2590 					{
2591 						for (i = 0; i < var->nstem - 1; i++)
2592 						{
2593 							addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
2594 						}
2595 
2596 						addNorm(&lres, &lcur, *subptr, 0, NVariant);
2597 						subptr++;
2598 						NVariant++;
2599 					}
2600 
2601 					pfree(subres);
2602 					var->stem[0] = NULL;
2603 					pfree(var->stem[var->nstem - 1]);
2604 				}
2605 			}
2606 
2607 			for (i = 0; i < var->nstem && var->stem[i]; i++)
2608 				pfree(var->stem[i]);
2609 			ptr = var->next;
2610 			pfree(var->stem);
2611 			pfree(var);
2612 			var = ptr;
2613 		}
2614 	}
2615 
2616 	return lres;
2617 }
2618