1 /*-------------------------------------------------------------------------
2  *
3  * spell.c
4  *		Normalizing word with ISpell
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  *
8  * Ispell dictionary
9  * -----------------
10  *
11  * Rules of dictionaries are defined in two files with .affix and .dict
12  * extensions. They are used by spell checker programs Ispell and Hunspell.
13  *
14  * An .affix file declares morphological rules to get a basic form of words.
15  * The format of an .affix file has different structure for Ispell and Hunspell
16  * dictionaries. The Hunspell format is more complicated. But when an .affix
17  * file is imported and compiled, it is stored in the same structure AffixNode.
18  *
19  * A .dict file stores a list of basic forms of words with references to
20  * affix rules. The format of a .dict file has the same structure for Ispell
21  * and Hunspell dictionaries.
22  *
23  * Compilation of a dictionary
24  * ---------------------------
25  *
26  * A compiled dictionary is stored in the IspellDict structure. Compilation of
 * a dictionary is divided into several steps:
28  *	- NIImportDictionary() - stores each word of a .dict file in the
29  *	  temporary Spell field.
30  *	- NIImportAffixes() - stores affix rules of an .affix file in the
31  *	  Affix field (not temporary) if an .affix file has the Ispell format.
32  *	  -> NIImportOOAffixes() - stores affix rules if an .affix file has the
33  *		 Hunspell format. The AffixData field is initialized if AF parameter
34  *		 is defined.
35  *	- NISortDictionary() - builds a prefix tree (Trie) from the words list
36  *	  and stores it in the Dictionary field. The words list is got from the
37  *	  Spell field. The AffixData field is initialized if AF parameter is not
38  *	  defined.
39  *	- NISortAffixes():
40  *	  - builds a list of compound affixes from the affix list and stores it
41  *		in the CompoundAffix.
42  *	  - builds prefix trees (Trie) from the affix list for prefixes and suffixes
43  *		and stores them in Suffix and Prefix fields.
44  *	  The affix list is got from the Affix field.
45  *
46  * Memory management
47  * -----------------
48  *
49  * The IspellDict structure has the Spell field which is used only in compile
50  * time. The Spell field stores a words list. It can take a lot of memory.
51  * Therefore when a dictionary is compiled this field is cleared by
52  * NIFinishBuild().
53  *
 * All resources that should be released by NIFinishBuild() are allocated using
 * tmpalloc() and tmpalloc0().
56  *
57  * IDENTIFICATION
58  *	  src/backend/tsearch/spell.c
59  *
60  *-------------------------------------------------------------------------
61  */
62 
63 #include "postgres.h"
64 
65 #include "catalog/pg_collation.h"
66 #include "tsearch/dicts/spell.h"
67 #include "tsearch/ts_locale.h"
68 #include "utils/memutils.h"
69 
70 
71 /*
72  * Initialization requires a lot of memory that's not needed
73  * after the initialization is done.  During initialization,
74  * CurrentMemoryContext is the long-lived memory context associated
75  * with the dictionary cache entry.  We keep the short-lived stuff
76  * in the Conf->buildCxt context.
77  */
/*
 * Build-time allocators.  Both expand to a reference to a variable named
 * "Conf", which must be in scope at the point of use, and both allocate in
 * Conf->buildCxt.
 */
/* tmpalloc: allocate sz bytes via MemoryContextAlloc */
#define tmpalloc(sz)  MemoryContextAlloc(Conf->buildCxt, (sz))
/* tmpalloc0: allocate sz bytes via MemoryContextAllocZero */
#define tmpalloc0(sz)  MemoryContextAllocZero(Conf->buildCxt, (sz))
80 
81 /*
82  * Prepare for constructing an ISpell dictionary.
83  *
84  * The IspellDict struct is assumed to be zeroed when allocated.
85  */
86 void
NIStartBuild(IspellDict * Conf)87 NIStartBuild(IspellDict *Conf)
88 {
89 	/*
90 	 * The temp context is a child of CurTransactionContext, so that it will
91 	 * go away automatically on error.
92 	 */
93 	Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
94 										   "Ispell dictionary init context",
95 										   ALLOCSET_DEFAULT_SIZES);
96 }
97 
98 /*
99  * Clean up when dictionary construction is complete.
100  */
101 void
NIFinishBuild(IspellDict * Conf)102 NIFinishBuild(IspellDict *Conf)
103 {
104 	/* Release no-longer-needed temp memory */
105 	MemoryContextDelete(Conf->buildCxt);
106 	/* Just for cleanliness, zero the now-dangling pointers */
107 	Conf->buildCxt = NULL;
108 	Conf->Spell = NULL;
109 	Conf->firstfree = NULL;
110 	Conf->CompoundAffixFlags = NULL;
111 }
112 
113 
114 /*
115  * "Compact" palloc: allocate without extra palloc overhead.
116  *
117  * Since we have no need to free the ispell data items individually, there's
118  * not much value in the per-chunk overhead normally consumed by palloc.
119  * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
120  *
121  * We currently pre-zero all data allocated this way, even though some of it
122  * doesn't need that.  The cpalloc and cpalloc0 macros are just documentation
123  * to indicate which allocations actually require zeroing.
124  */
125 #define COMPACT_ALLOC_CHUNK 8192	/* amount to get from palloc at once */
126 #define COMPACT_MAX_REQ		1024	/* must be < COMPACT_ALLOC_CHUNK */
127 
128 static void *
compact_palloc0(IspellDict * Conf,size_t size)129 compact_palloc0(IspellDict *Conf, size_t size)
130 {
131 	void	   *result;
132 
133 	/* Should only be called during init */
134 	Assert(Conf->buildCxt != NULL);
135 
136 	/* No point in this for large chunks */
137 	if (size > COMPACT_MAX_REQ)
138 		return palloc0(size);
139 
140 	/* Keep everything maxaligned */
141 	size = MAXALIGN(size);
142 
143 	/* Need more space? */
144 	if (size > Conf->avail)
145 	{
146 		Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
147 		Conf->avail = COMPACT_ALLOC_CHUNK;
148 	}
149 
150 	result = (void *) Conf->firstfree;
151 	Conf->firstfree += size;
152 	Conf->avail -= size;
153 
154 	return result;
155 }
156 
157 #define cpalloc(size) compact_palloc0(Conf, size)
158 #define cpalloc0(size) compact_palloc0(Conf, size)
159 
160 static char *
cpstrdup(IspellDict * Conf,const char * str)161 cpstrdup(IspellDict *Conf, const char *str)
162 {
163 	char	   *res = cpalloc(strlen(str) + 1);
164 
165 	strcpy(res, str);
166 	return res;
167 }
168 
169 
170 /*
171  * Apply lowerstr(), producing a temporary result (in the buildCxt).
172  */
173 static char *
lowerstr_ctx(IspellDict * Conf,const char * src)174 lowerstr_ctx(IspellDict *Conf, const char *src)
175 {
176 	MemoryContext saveCtx;
177 	char	   *dst;
178 
179 	saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
180 	dst = lowerstr(src);
181 	MemoryContextSwitchTo(saveCtx);
182 
183 	return dst;
184 }
185 
/* Limits used elsewhere in the file (use sites not visible here) */
#define MAX_NORM 1024
#define MAXNORMLEN 256

/* Compare s with p over the first strlen(p) bytes (0 if s starts with p) */
#define STRNCMP(s,p)	strncmp( (s), (p), strlen(p) )

/*
 * Fetch byte N of the L-byte string W as uint8.  For T == FF_PREFIX the
 * index counts from the start of W, for any other T from its end.
 */
#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )

/* Same, applied to the repl/replen fields of the affix A */
#define GETCHAR(A,N,T)	  GETWCHAR( (A)->repl, (A)->replen, N, T )

/* Shared empty string, used instead of allocating empty copies */
static char *VoidString = "";
194 
195 static int
cmpspell(const void * s1,const void * s2)196 cmpspell(const void *s1, const void *s2)
197 {
198 	return (strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word));
199 }
200 
201 static int
cmpspellaffix(const void * s1,const void * s2)202 cmpspellaffix(const void *s1, const void *s2)
203 {
204 	return (strcmp((*(SPELL *const *) s1)->p.flag,
205 				   (*(SPELL *const *) s2)->p.flag));
206 }
207 
208 static int
cmpcmdflag(const void * f1,const void * f2)209 cmpcmdflag(const void *f1, const void *f2)
210 {
211 	CompoundAffixFlag *fv1 = (CompoundAffixFlag *) f1,
212 			   *fv2 = (CompoundAffixFlag *) f2;
213 
214 	Assert(fv1->flagMode == fv2->flagMode);
215 
216 	if (fv1->flagMode == FM_NUM)
217 	{
218 		if (fv1->flag.i == fv2->flag.i)
219 			return 0;
220 
221 		return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
222 	}
223 
224 	return strcmp(fv1->flag.s, fv2->flag.s);
225 }
226 
/*
 * Return a pointer to the first character of str that matches c, stepping
 * by pg_mblen() per character; NULL if there is no match.
 */
static char *
findchar(char *str, int c)
{
	char	   *cur;

	for (cur = str; *cur; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c))
			return cur;
	}

	return NULL;
}
239 
/*
 * Like findchar(), but stops at the first character that matches either c1
 * or c2; NULL if neither occurs.
 */
static char *
findchar2(char *str, int c1, int c2)
{
	char	   *cur;

	for (cur = str; *cur; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c1) || t_iseq(cur, c2))
			return cur;
	}

	return NULL;
}
252 
253 
254 /* backward string compare for suffix tree operations */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	/* Walk both strings from their last byte toward the first */
	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* One string is a tail of the other: the one exhausted first is smaller */
	if (i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
277 
/*
 * Backward comparison like strbcmp(), but looks at no more than "count"
 * trailing bytes; returns 0 once that many bytes have compared equal.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;
	int			remaining = count;

	for (; i1 >= 0 && i2 >= 0 && remaining > 0; i1--, i2--, remaining--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* Equal over the whole window, or both strings ended together */
	if (remaining == 0 || i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
303 
304 /*
305  * Compares affixes.
 * First compares the type of an affix. Prefixes should go before suffixes.
307  * If types are equal then compares replaceable string.
308  */
309 static int
cmpaffix(const void * s1,const void * s2)310 cmpaffix(const void *s1, const void *s2)
311 {
312 	const AFFIX *a1 = (const AFFIX *) s1;
313 	const AFFIX *a2 = (const AFFIX *) s2;
314 
315 	if (a1->type < a2->type)
316 		return -1;
317 	if (a1->type > a2->type)
318 		return 1;
319 	if (a1->type == FF_PREFIX)
320 		return strcmp(a1->repl, a2->repl);
321 	else
322 		return strbcmp((const unsigned char *) a1->repl,
323 					   (const unsigned char *) a2->repl);
324 }
325 
326 /*
327  * Gets an affix flag from the set of affix flags (sflagset).
328  *
329  * Several flags can be stored in a single string. Flags can be represented by:
330  * - 1 character (FM_CHAR). A character may be Unicode.
331  * - 2 characters (FM_LONG). A character may be Unicode.
332  * - numbers from 1 to 65000 (FM_NUM).
333  *
334  * Depending on the flagMode an affix string can have the following format:
335  * - FM_CHAR: ABCD
336  *	 Here we have 4 flags: A, B, C and D
337  * - FM_LONG: ABCDE*
338  *	 Here we have 3 flags: AB, CD and E*
339  * - FM_NUM: 200,205,50
340  *	 Here we have 3 flags: 200, 205 and 50
341  *
342  * Conf: current dictionary.
343  * sflagset: the set of affix flags. Returns a reference to the start of a next
344  *			 affix flag.
345  * sflag: returns an affix flag from sflagset.
346  */
347 static void
getNextFlagFromString(IspellDict * Conf,char ** sflagset,char * sflag)348 getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
349 {
350 	int32		s;
351 	char	   *next,
352 			   *sbuf = *sflagset;
353 	int			maxstep;
354 	bool		stop = false;
355 	bool		met_comma = false;
356 
357 	maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
358 
359 	while (**sflagset)
360 	{
361 		switch (Conf->flagMode)
362 		{
363 			case FM_LONG:
364 			case FM_CHAR:
365 				COPYCHAR(sflag, *sflagset);
366 				sflag += pg_mblen(*sflagset);
367 
368 				/* Go to start of the next flag */
369 				*sflagset += pg_mblen(*sflagset);
370 
371 				/* Check if we get all characters of flag */
372 				maxstep--;
373 				stop = (maxstep == 0);
374 				break;
375 			case FM_NUM:
376 				s = strtol(*sflagset, &next, 10);
377 				if (*sflagset == next || errno == ERANGE)
378 					ereport(ERROR,
379 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
380 							 errmsg("invalid affix flag \"%s\"", *sflagset)));
381 				if (s < 0 || s > FLAGNUM_MAXSIZE)
382 					ereport(ERROR,
383 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
384 							 errmsg("affix flag \"%s\" is out of range",
385 									*sflagset)));
386 				sflag += sprintf(sflag, "%0d", s);
387 
388 				/* Go to start of the next flag */
389 				*sflagset = next;
390 				while (**sflagset)
391 				{
392 					if (t_isdigit(*sflagset))
393 					{
394 						if (!met_comma)
395 							ereport(ERROR,
396 									(errcode(ERRCODE_CONFIG_FILE_ERROR),
397 									 errmsg("invalid affix flag \"%s\"",
398 											*sflagset)));
399 						break;
400 					}
401 					else if (t_iseq(*sflagset, ','))
402 					{
403 						if (met_comma)
404 							ereport(ERROR,
405 									(errcode(ERRCODE_CONFIG_FILE_ERROR),
406 									 errmsg("invalid affix flag \"%s\"",
407 											*sflagset)));
408 						met_comma = true;
409 					}
410 					else if (!t_isspace(*sflagset))
411 					{
412 						ereport(ERROR,
413 								(errcode(ERRCODE_CONFIG_FILE_ERROR),
414 							 errmsg("invalid character in affix flag \"%s\"",
415 									*sflagset)));
416 					}
417 
418 					*sflagset += pg_mblen(*sflagset);
419 				}
420 				stop = true;
421 				break;
422 			default:
423 				elog(ERROR, "unrecognized type of Conf->flagMode: %d",
424 					 Conf->flagMode);
425 		}
426 
427 		if (stop)
428 			break;
429 	}
430 
431 	if (Conf->flagMode == FM_LONG && maxstep > 0)
432 		ereport(ERROR,
433 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
434 				 errmsg("invalid affix flag \"%s\" with \"long\" flag value",
435 						sbuf)));
436 
437 	*sflag = '\0';
438 }
439 
440 /*
441  * Checks if the affix set Conf->AffixData[affix] contains affixflag.
442  * Conf->AffixData[affix] does not contain affixflag if this flag is not used
443  * actually by the .dict file.
444  *
445  * Conf: current dictionary.
446  * affix: index of the Conf->AffixData array.
447  * affixflag: the affix flag.
448  *
449  * Returns true if the string Conf->AffixData[affix] contains affixflag,
450  * otherwise returns false.
451  */
452 static bool
IsAffixFlagInUse(IspellDict * Conf,int affix,char * affixflag)453 IsAffixFlagInUse(IspellDict *Conf, int affix, char *affixflag)
454 {
455 	char	   *flagcur;
456 	char		flag[BUFSIZ];
457 
458 	if (*affixflag == 0)
459 		return true;
460 
461 	Assert(affix < Conf->nAffixData);
462 
463 	flagcur = Conf->AffixData[affix];
464 
465 	while (*flagcur)
466 	{
467 		getNextFlagFromString(Conf, &flagcur, flag);
468 		/* Compare first affix flag in flagcur with affixflag */
469 		if (strcmp(flag, affixflag) == 0)
470 			return true;
471 	}
472 
473 	/* Could not find affixflag */
474 	return false;
475 }
476 
477 /*
478  * Adds the new word into the temporary array Spell.
479  *
480  * Conf: current dictionary.
481  * word: new word.
 * flag: set of affix flags. A single flag can be extracted with
 *		 getNextFlagFromString().
483  */
484 static void
NIAddSpell(IspellDict * Conf,const char * word,const char * flag)485 NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
486 {
487 	if (Conf->nspell >= Conf->mspell)
488 	{
489 		if (Conf->mspell)
490 		{
491 			Conf->mspell *= 2;
492 			Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
493 		}
494 		else
495 		{
496 			Conf->mspell = 1024 * 20;
497 			Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
498 		}
499 	}
500 	Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
501 	strcpy(Conf->Spell[Conf->nspell]->word, word);
502 	Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
503 		? cpstrdup(Conf, flag) : VoidString;
504 	Conf->nspell++;
505 }
506 
507 /*
508  * Imports dictionary into the temporary array Spell.
509  *
510  * Note caller must already have applied get_tsearch_config_filename.
511  *
512  * Conf: current dictionary.
513  * filename: path to the .dict file.
514  */
515 void
NIImportDictionary(IspellDict * Conf,const char * filename)516 NIImportDictionary(IspellDict *Conf, const char *filename)
517 {
518 	tsearch_readline_state trst;
519 	char	   *line;
520 
521 	if (!tsearch_readline_begin(&trst, filename))
522 		ereport(ERROR,
523 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
524 				 errmsg("could not open dictionary file \"%s\": %m",
525 						filename)));
526 
527 	while ((line = tsearch_readline(&trst)) != NULL)
528 	{
529 		char	   *s,
530 				   *pstr;
531 
532 		/* Set of affix flags */
533 		const char *flag;
534 
535 		/* Extract flag from the line */
536 		flag = NULL;
537 		if ((s = findchar(line, '/')))
538 		{
539 			*s++ = '\0';
540 			flag = s;
541 			while (*s)
542 			{
543 				/* we allow only single encoded flags for faster works */
544 				if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
545 					s++;
546 				else
547 				{
548 					*s = '\0';
549 					break;
550 				}
551 			}
552 		}
553 		else
554 			flag = "";
555 
556 		/* Remove trailing spaces */
557 		s = line;
558 		while (*s)
559 		{
560 			if (t_isspace(s))
561 			{
562 				*s = '\0';
563 				break;
564 			}
565 			s += pg_mblen(s);
566 		}
567 		pstr = lowerstr_ctx(Conf, line);
568 
569 		NIAddSpell(Conf, pstr, flag);
570 		pfree(pstr);
571 
572 		pfree(line);
573 	}
574 	tsearch_readline_end(&trst);
575 }
576 
577 /*
578  * Searches a basic form of word in the prefix tree. This word was generated
579  * using an affix rule. This rule may not be presented in an affix set of
580  * a basic form of word.
581  *
582  * For example, we have the entry in the .dict file:
583  * meter/GMD
584  *
585  * The affix rule with the flag S:
586  * SFX S   y	 ies		[^aeiou]y
587  * is not presented here.
588  *
589  * The affix rule with the flag M:
590  * SFX M   0	 's         .
591  * is presented here.
592  *
593  * Conf: current dictionary.
594  * word: basic form of word.
595  * affixflag: affix flag, by which a basic form of word was generated.
596  * flag: compound flag used to compare with StopMiddle->compoundflag.
597  *
598  * Returns 1 if the word was found in the prefix tree, else returns 0.
599  */
600 static int
FindWord(IspellDict * Conf,const char * word,char * affixflag,int flag)601 FindWord(IspellDict *Conf, const char *word, char *affixflag, int flag)
602 {
603 	SPNode	   *node = Conf->Dictionary;
604 	SPNodeData *StopLow,
605 			   *StopHigh,
606 			   *StopMiddle;
607 	const uint8 *ptr = (const uint8 *) word;
608 
609 	flag &= FF_COMPOUNDFLAGMASK;
610 
611 	while (node && *ptr)
612 	{
613 		StopLow = node->data;
614 		StopHigh = node->data + node->length;
615 		while (StopLow < StopHigh)
616 		{
617 			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
618 			if (StopMiddle->val == *ptr)
619 			{
620 				if (*(ptr + 1) == '\0' && StopMiddle->isword)
621 				{
622 					if (flag == 0)
623 					{
624 						/*
625 						 * The word can be formed only with another word. And
626 						 * in the flag parameter there is not a sign that we
627 						 * search compound words.
628 						 */
629 						if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
630 							return 0;
631 					}
632 					else if ((flag & StopMiddle->compoundflag) == 0)
633 						return 0;
634 
635 					/*
636 					 * Check if this affix rule is presented in the affix set
637 					 * with index StopMiddle->affix.
638 					 */
639 					if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
640 						return 1;
641 				}
642 				node = StopMiddle->node;
643 				ptr++;
644 				break;
645 			}
646 			else if (StopMiddle->val < *ptr)
647 				StopLow = StopMiddle + 1;
648 			else
649 				StopHigh = StopMiddle;
650 		}
651 		if (StopLow >= StopHigh)
652 			break;
653 	}
654 	return 0;
655 }
656 
657 /*
658  * Context reset/delete callback for a regular expression used in an affix
659  */
660 static void
regex_affix_deletion_callback(void * arg)661 regex_affix_deletion_callback(void *arg)
662 {
663 	aff_regex_struct *pregex = (aff_regex_struct *) arg;
664 
665 	pg_regfree(&(pregex->regex));
666 }
667 
668 /*
669  * Adds a new affix rule to the Affix field.
670  *
671  * Conf: current dictionary.
672  * flag: affix flag ('\' in the below example).
673  * flagflags: set of flags from the flagval field for this affix rule. This set
674  *			  is listed after '/' character in the added string (repl).
675  *
676  *			  For example L flag in the hunspell_sample.affix:
677  *			  SFX \   0 Y/L [^Y]
678  *
679  * mask: condition for search ('[^Y]' in the above example).
680  * find: stripping characters from beginning (at prefix) or end (at suffix)
681  *		 of the word ('0' in the above example, 0 means that there is not
682  *		 stripping character).
683  * repl: adding string after stripping ('Y' in the above example).
684  * type: FF_SUFFIX or FF_PREFIX.
685  */
686 static void
NIAddAffix(IspellDict * Conf,const char * flag,char flagflags,const char * mask,const char * find,const char * repl,int type)687 NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
688 		   const char *find, const char *repl, int type)
689 {
690 	AFFIX	   *Affix;
691 
692 	if (Conf->naffixes >= Conf->maffixes)
693 	{
694 		if (Conf->maffixes)
695 		{
696 			Conf->maffixes *= 2;
697 			Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
698 		}
699 		else
700 		{
701 			Conf->maffixes = 16;
702 			Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
703 		}
704 	}
705 
706 	Affix = Conf->Affix + Conf->naffixes;
707 
708 	/* This affix rule can be applied for words with any ending */
709 	if (strcmp(mask, ".") == 0 || *mask == '\0')
710 	{
711 		Affix->issimple = 1;
712 		Affix->isregis = 0;
713 	}
714 	/* This affix rule will use regis to search word ending */
715 	else if (RS_isRegis(mask))
716 	{
717 		Affix->issimple = 0;
718 		Affix->isregis = 1;
719 		RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
720 				   *mask ? mask : VoidString);
721 	}
722 	/* This affix rule will use regex_t to search word ending */
723 	else
724 	{
725 		int			masklen;
726 		int			wmasklen;
727 		int			err;
728 		pg_wchar   *wmask;
729 		char	   *tmask;
730 		aff_regex_struct *pregex;
731 
732 		Affix->issimple = 0;
733 		Affix->isregis = 0;
734 		tmask = (char *) tmpalloc(strlen(mask) + 3);
735 		if (type == FF_SUFFIX)
736 			sprintf(tmask, "%s$", mask);
737 		else
738 			sprintf(tmask, "^%s", mask);
739 
740 		masklen = strlen(tmask);
741 		wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
742 		wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
743 
744 		/*
745 		 * The regex engine stores its stuff using malloc not palloc, so we
746 		 * must arrange to explicitly clean up the regex when the dictionary's
747 		 * context is cleared.  That means the regex_t has to stay in a fixed
748 		 * location within the context; we can't keep it directly in the AFFIX
749 		 * struct, since we may sort and resize the array of AFFIXes.
750 		 */
751 		Affix->reg.pregex = pregex = palloc(sizeof(aff_regex_struct));
752 
753 		err = pg_regcomp(&(pregex->regex), wmask, wmasklen,
754 						 REG_ADVANCED | REG_NOSUB,
755 						 DEFAULT_COLLATION_OID);
756 		if (err)
757 		{
758 			char		errstr[100];
759 
760 			pg_regerror(err, &(pregex->regex), errstr, sizeof(errstr));
761 			ereport(ERROR,
762 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
763 					 errmsg("invalid regular expression: %s", errstr)));
764 		}
765 
766 		pregex->mcallback.func = regex_affix_deletion_callback;
767 		pregex->mcallback.arg = (void *) pregex;
768 		MemoryContextRegisterResetCallback(CurrentMemoryContext,
769 										   &pregex->mcallback);
770 	}
771 
772 	Affix->flagflags = flagflags;
773 	if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
774 	{
775 		if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
776 			Affix->flagflags |= FF_COMPOUNDFLAG;
777 	}
778 	Affix->flag = cpstrdup(Conf, flag);
779 	Affix->type = type;
780 
781 	Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
782 	if ((Affix->replen = strlen(repl)) > 0)
783 		Affix->repl = cpstrdup(Conf, repl);
784 	else
785 		Affix->repl = VoidString;
786 	Conf->naffixes++;
787 }
788 
/*
 * Parsing states for parse_affentry() and friends.
 *
 * The WAIT states skip input until the named item starts, the IN states
 * collect its characters.
 */
#define PAE_WAIT_MASK	0		/* before the mask */
#define PAE_INMASK		1		/* inside the mask */
#define PAE_WAIT_FIND	2		/* before the find string */
#define PAE_INFIND		3		/* inside the find string */
#define PAE_WAIT_REPL	4		/* before the replacement */
#define PAE_INREPL		5		/* inside the replacement */
#define PAE_WAIT_TYPE	6		/* before the entry type */
#define PAE_WAIT_FLAG	7		/* before the flag */
798 
799 /*
800  * Parse next space-separated field of an .affix file line.
801  *
802  * *str is the input pointer (will be advanced past field)
803  * next is where to copy the field value to, with null termination
804  *
805  * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
806  *
807  * Returns TRUE if we found a field, FALSE if not.
808  */
809 static bool
get_nextfield(char ** str,char * next)810 get_nextfield(char **str, char *next)
811 {
812 	int			state = PAE_WAIT_MASK;
813 	int			avail = BUFSIZ;
814 
815 	while (**str)
816 	{
817 		if (state == PAE_WAIT_MASK)
818 		{
819 			if (t_iseq(*str, '#'))
820 				return false;
821 			else if (!t_isspace(*str))
822 			{
823 				int			clen = pg_mblen(*str);
824 
825 				if (clen < avail)
826 				{
827 					COPYCHAR(next, *str);
828 					next += clen;
829 					avail -= clen;
830 				}
831 				state = PAE_INMASK;
832 			}
833 		}
834 		else	/* state == PAE_INMASK */
835 		{
836 			if (t_isspace(*str))
837 			{
838 				*next = '\0';
839 				return true;
840 			}
841 			else
842 			{
843 				int			clen = pg_mblen(*str);
844 
845 				if (clen < avail)
846 				{
847 					COPYCHAR(next, *str);
848 					next += clen;
849 					avail -= clen;
850 				}
851 			}
852 		}
853 		*str += pg_mblen(*str);
854 	}
855 
856 	*next = '\0';
857 
858 	return (state == PAE_INMASK);		/* OK if we got a nonempty field */
859 }
860 
861 /*
862  * Parses entry of an .affix file of MySpell or Hunspell format.
863  *
864  * An .affix file entry has the following format:
865  * - header
866  *	 <type>  <flag>  <cross_flag>  <flag_count>
867  * - fields after header:
868  *	 <type>  <flag>  <find>  <replace>	<mask>
869  *
870  * str is the input line
871  * field values are returned to type etc, which must be buffers of size BUFSIZ.
872  *
873  * Returns number of fields found; any omitted fields are set to empty strings.
874  */
static int
parse_ooaffentry(char *str, char *type, char *flag, char *find,
				 char *repl, char *mask)
{
	char	   *fields[5];
	int			fields_read = 0;

	/* Output buffers, in the order the fields appear on the line */
	fields[0] = type;
	fields[1] = flag;
	fields[2] = find;
	fields[3] = repl;
	fields[4] = mask;

	*type = *flag = *find = *repl = *mask = '\0';

	/* Stop at end of input, at the first missing field, or after all five */
	while (*str && fields_read < 5)
	{
		if (!get_nextfield(&str, fields[fields_read]))
			break;				/* early EOL */
		fields_read++;
	}

	return fields_read;
}
924 
925 /*
926  * Parses entry of an .affix file of Ispell format
927  *
928  * An .affix file entry has the following format:
929  * <mask>  >  [-<find>,]<replace>
930  */
931 static bool
parse_affentry(char * str,char * mask,char * find,char * repl)932 parse_affentry(char *str, char *mask, char *find, char *repl)
933 {
934 	int			state = PAE_WAIT_MASK;
935 	char	   *pmask = mask,
936 			   *pfind = find,
937 			   *prepl = repl;
938 
939 	*mask = *find = *repl = '\0';
940 
941 	while (*str)
942 	{
943 		if (state == PAE_WAIT_MASK)
944 		{
945 			if (t_iseq(str, '#'))
946 				return false;
947 			else if (!t_isspace(str))
948 			{
949 				COPYCHAR(pmask, str);
950 				pmask += pg_mblen(str);
951 				state = PAE_INMASK;
952 			}
953 		}
954 		else if (state == PAE_INMASK)
955 		{
956 			if (t_iseq(str, '>'))
957 			{
958 				*pmask = '\0';
959 				state = PAE_WAIT_FIND;
960 			}
961 			else if (!t_isspace(str))
962 			{
963 				COPYCHAR(pmask, str);
964 				pmask += pg_mblen(str);
965 			}
966 		}
967 		else if (state == PAE_WAIT_FIND)
968 		{
969 			if (t_iseq(str, '-'))
970 			{
971 				state = PAE_INFIND;
972 			}
973 			else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
974 			{
975 				COPYCHAR(prepl, str);
976 				prepl += pg_mblen(str);
977 				state = PAE_INREPL;
978 			}
979 			else if (!t_isspace(str))
980 				ereport(ERROR,
981 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
982 						 errmsg("syntax error")));
983 		}
984 		else if (state == PAE_INFIND)
985 		{
986 			if (t_iseq(str, ','))
987 			{
988 				*pfind = '\0';
989 				state = PAE_WAIT_REPL;
990 			}
991 			else if (t_isalpha(str))
992 			{
993 				COPYCHAR(pfind, str);
994 				pfind += pg_mblen(str);
995 			}
996 			else if (!t_isspace(str))
997 				ereport(ERROR,
998 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
999 						 errmsg("syntax error")));
1000 		}
1001 		else if (state == PAE_WAIT_REPL)
1002 		{
1003 			if (t_iseq(str, '-'))
1004 			{
1005 				break;			/* void repl */
1006 			}
1007 			else if (t_isalpha(str))
1008 			{
1009 				COPYCHAR(prepl, str);
1010 				prepl += pg_mblen(str);
1011 				state = PAE_INREPL;
1012 			}
1013 			else if (!t_isspace(str))
1014 				ereport(ERROR,
1015 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
1016 						 errmsg("syntax error")));
1017 		}
1018 		else if (state == PAE_INREPL)
1019 		{
1020 			if (t_iseq(str, '#'))
1021 			{
1022 				*prepl = '\0';
1023 				break;
1024 			}
1025 			else if (t_isalpha(str))
1026 			{
1027 				COPYCHAR(prepl, str);
1028 				prepl += pg_mblen(str);
1029 			}
1030 			else if (!t_isspace(str))
1031 				ereport(ERROR,
1032 						(errcode(ERRCODE_CONFIG_FILE_ERROR),
1033 						 errmsg("syntax error")));
1034 		}
1035 		else
1036 			elog(ERROR, "unrecognized state in parse_affentry: %d", state);
1037 
1038 		str += pg_mblen(str);
1039 	}
1040 
1041 	*pmask = *pfind = *prepl = '\0';
1042 
1043 	return (*mask && (*find || *repl));
1044 }
1045 
1046 /*
 * Sets a Hunspell option depending on the flag type.
1048  */
1049 static void
setCompoundAffixFlagValue(IspellDict * Conf,CompoundAffixFlag * entry,char * s,uint32 val)1050 setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry,
1051 						  char *s, uint32 val)
1052 {
1053 	if (Conf->flagMode == FM_NUM)
1054 	{
1055 		char	   *next;
1056 		int			i;
1057 
1058 		i = strtol(s, &next, 10);
1059 		if (s == next || errno == ERANGE)
1060 			ereport(ERROR,
1061 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
1062 					 errmsg("invalid affix flag \"%s\"", s)));
1063 		if (i < 0 || i > FLAGNUM_MAXSIZE)
1064 			ereport(ERROR,
1065 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
1066 					 errmsg("affix flag \"%s\" is out of range", s)));
1067 
1068 		entry->flag.i = i;
1069 	}
1070 	else
1071 		entry->flag.s = cpstrdup(Conf, s);
1072 
1073 	entry->flagMode = Conf->flagMode;
1074 	entry->value = val;
1075 }
1076 
1077 /*
1078  * Sets up a correspondence for the affix parameter with the affix flag.
1079  *
1080  * Conf: current dictionary.
1081  * s: affix flag in string.
1082  * val: affix parameter.
1083  */
1084 static void
addCompoundAffixFlagValue(IspellDict * Conf,char * s,uint32 val)1085 addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
1086 {
1087 	CompoundAffixFlag *newValue;
1088 	char		sbuf[BUFSIZ];
1089 	char	   *sflag;
1090 	int			clen;
1091 
1092 	while (*s && t_isspace(s))
1093 		s += pg_mblen(s);
1094 
1095 	if (!*s)
1096 		ereport(ERROR,
1097 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
1098 				 errmsg("syntax error")));
1099 
1100 	/* Get flag without \n */
1101 	sflag = sbuf;
1102 	while (*s && !t_isspace(s) && *s != '\n')
1103 	{
1104 		clen = pg_mblen(s);
1105 		COPYCHAR(sflag, s);
1106 		sflag += clen;
1107 		s += clen;
1108 	}
1109 	*sflag = '\0';
1110 
1111 	/* Resize array or allocate memory for array CompoundAffixFlag */
1112 	if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
1113 	{
1114 		if (Conf->mCompoundAffixFlag)
1115 		{
1116 			Conf->mCompoundAffixFlag *= 2;
1117 			Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1118 				repalloc((void *) Conf->CompoundAffixFlags,
1119 					   Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1120 		}
1121 		else
1122 		{
1123 			Conf->mCompoundAffixFlag = 10;
1124 			Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1125 				tmpalloc(Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1126 		}
1127 	}
1128 
1129 	newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
1130 
1131 	setCompoundAffixFlagValue(Conf, newValue, sbuf, val);
1132 
1133 	Conf->usecompound = true;
1134 	Conf->nCompoundAffixFlag++;
1135 }
1136 
1137 /*
1138  * Returns a set of affix parameters which correspondence to the set of affix
1139  * flags s.
1140  */
1141 static int
getCompoundAffixFlagValue(IspellDict * Conf,char * s)1142 getCompoundAffixFlagValue(IspellDict *Conf, char *s)
1143 {
1144 	uint32		flag = 0;
1145 	CompoundAffixFlag *found,
1146 				key;
1147 	char		sflag[BUFSIZ];
1148 	char	   *flagcur;
1149 
1150 	if (Conf->nCompoundAffixFlag == 0)
1151 		return 0;
1152 
1153 	flagcur = s;
1154 	while (*flagcur)
1155 	{
1156 		getNextFlagFromString(Conf, &flagcur, sflag);
1157 		setCompoundAffixFlagValue(Conf, &key, sflag, 0);
1158 
1159 		found = (CompoundAffixFlag *)
1160 			bsearch(&key, (void *) Conf->CompoundAffixFlags,
1161 					Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag),
1162 					cmpcmdflag);
1163 		if (found != NULL)
1164 			flag |= found->value;
1165 	}
1166 
1167 	return flag;
1168 }
1169 
1170 /*
1171  * Returns a flag set using the s parameter.
1172  *
1173  * If Conf->useFlagAliases is true then the s parameter is index of the
1174  * Conf->AffixData array and function returns its entry.
1175  * Else function returns the s parameter.
1176  */
1177 static char *
getAffixFlagSet(IspellDict * Conf,char * s)1178 getAffixFlagSet(IspellDict *Conf, char *s)
1179 {
1180 	if (Conf->useFlagAliases && *s != '\0')
1181 	{
1182 		int			curaffix;
1183 		char	   *end;
1184 
1185 		curaffix = strtol(s, &end, 10);
1186 		if (s == end || errno == ERANGE)
1187 			ereport(ERROR,
1188 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
1189 					 errmsg("invalid affix alias \"%s\"", s)));
1190 
1191 		if (curaffix > 0 && curaffix < Conf->nAffixData)
1192 
1193 			/*
1194 			 * Do not subtract 1 from curaffix because empty string was added
1195 			 * in NIImportOOAffixes
1196 			 */
1197 			return Conf->AffixData[curaffix];
1198 		else if (curaffix > Conf->nAffixData)
1199 			ereport(ERROR,
1200 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
1201 					 errmsg("invalid affix alias \"%s\"", s)));
1202 		return VoidString;
1203 	}
1204 	else
1205 		return s;
1206 }
1207 
1208 /*
1209  * Import an affix file that follows MySpell or Hunspell format.
1210  *
1211  * Conf: current dictionary.
1212  * filename: path to the .affix file.
1213  */
1214 static void
NIImportOOAffixes(IspellDict * Conf,const char * filename)1215 NIImportOOAffixes(IspellDict *Conf, const char *filename)
1216 {
1217 	char		type[BUFSIZ],
1218 			   *ptype = NULL;
1219 	char		sflag[BUFSIZ];
1220 	char		mask[BUFSIZ],
1221 			   *pmask;
1222 	char		find[BUFSIZ],
1223 			   *pfind;
1224 	char		repl[BUFSIZ],
1225 			   *prepl;
1226 	bool		isSuffix = false;
1227 	int			naffix = 0,
1228 				curaffix = 0;
1229 	int			sflaglen = 0;
1230 	char		flagflags = 0;
1231 	tsearch_readline_state trst;
1232 	char	   *recoded;
1233 
1234 	/* read file to find any flag */
1235 	Conf->usecompound = false;
1236 	Conf->useFlagAliases = false;
1237 	Conf->flagMode = FM_CHAR;
1238 
1239 	if (!tsearch_readline_begin(&trst, filename))
1240 		ereport(ERROR,
1241 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
1242 				 errmsg("could not open affix file \"%s\": %m",
1243 						filename)));
1244 
1245 	while ((recoded = tsearch_readline(&trst)) != NULL)
1246 	{
1247 		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
1248 		{
1249 			pfree(recoded);
1250 			continue;
1251 		}
1252 
1253 		if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
1254 			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
1255 									  FF_COMPOUNDFLAG);
1256 		else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
1257 			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
1258 									  FF_COMPOUNDBEGIN);
1259 		else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
1260 			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
1261 									  FF_COMPOUNDLAST);
1262 		/* COMPOUNDLAST and COMPOUNDEND are synonyms */
1263 		else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
1264 			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
1265 									  FF_COMPOUNDLAST);
1266 		else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
1267 			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
1268 									  FF_COMPOUNDMIDDLE);
1269 		else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
1270 			addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
1271 									  FF_COMPOUNDONLY);
1272 		else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
1273 			addCompoundAffixFlagValue(Conf,
1274 									  recoded + strlen("COMPOUNDPERMITFLAG"),
1275 									  FF_COMPOUNDPERMITFLAG);
1276 		else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
1277 			addCompoundAffixFlagValue(Conf,
1278 									  recoded + strlen("COMPOUNDFORBIDFLAG"),
1279 									  FF_COMPOUNDFORBIDFLAG);
1280 		else if (STRNCMP(recoded, "FLAG") == 0)
1281 		{
1282 			char	   *s = recoded + strlen("FLAG");
1283 
1284 			while (*s && t_isspace(s))
1285 				s += pg_mblen(s);
1286 
1287 			if (*s)
1288 			{
1289 				if (STRNCMP(s, "long") == 0)
1290 					Conf->flagMode = FM_LONG;
1291 				else if (STRNCMP(s, "num") == 0)
1292 					Conf->flagMode = FM_NUM;
1293 				else if (STRNCMP(s, "default") != 0)
1294 					ereport(ERROR,
1295 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
1296 							 errmsg("Ispell dictionary supports only "
1297 									"\"default\", \"long\", "
1298 									"and \"num\" flag values")));
1299 			}
1300 		}
1301 
1302 		pfree(recoded);
1303 	}
1304 	tsearch_readline_end(&trst);
1305 
1306 	if (Conf->nCompoundAffixFlag > 1)
1307 		qsort((void *) Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
1308 			  sizeof(CompoundAffixFlag), cmpcmdflag);
1309 
1310 	if (!tsearch_readline_begin(&trst, filename))
1311 		ereport(ERROR,
1312 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
1313 				 errmsg("could not open affix file \"%s\": %m",
1314 						filename)));
1315 
1316 	while ((recoded = tsearch_readline(&trst)) != NULL)
1317 	{
1318 		int			fields_read;
1319 
1320 		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
1321 			goto nextline;
1322 
1323 		fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
1324 
1325 		if (ptype)
1326 			pfree(ptype);
1327 		ptype = lowerstr_ctx(Conf, type);
1328 
1329 		/* First try to parse AF parameter (alias compression) */
1330 		if (STRNCMP(ptype, "af") == 0)
1331 		{
1332 			/* First line is the number of aliases */
1333 			if (!Conf->useFlagAliases)
1334 			{
1335 				Conf->useFlagAliases = true;
1336 				naffix = atoi(sflag);
1337 				if (naffix == 0)
1338 					ereport(ERROR,
1339 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
1340 						   errmsg("invalid number of flag vector aliases")));
1341 
1342 				/* Also reserve place for empty flag set */
1343 				naffix++;
1344 
1345 				Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1346 				Conf->lenAffixData = Conf->nAffixData = naffix;
1347 
1348 				/* Add empty flag set into AffixData */
1349 				Conf->AffixData[curaffix] = VoidString;
1350 				curaffix++;
1351 			}
1352 			/* Other lines is aliases */
1353 			else
1354 			{
1355 				if (curaffix < naffix)
1356 				{
1357 					Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
1358 					curaffix++;
1359 				}
1360 			}
1361 			goto nextline;
1362 		}
1363 		/* Else try to parse prefixes and suffixes */
1364 		if (fields_read < 4 ||
1365 			(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
1366 			goto nextline;
1367 
1368 		sflaglen = strlen(sflag);
1369 		if (sflaglen == 0
1370 			|| (sflaglen > 1 && Conf->flagMode == FM_CHAR)
1371 			|| (sflaglen > 2 && Conf->flagMode == FM_LONG))
1372 			goto nextline;
1373 
1374 		/*--------
1375 		 * Affix header. For example:
1376 		 * SFX \ N 1
1377 		 *--------
1378 		 */
1379 		if (fields_read == 4)
1380 		{
1381 			isSuffix = (STRNCMP(ptype, "sfx") == 0);
1382 			if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
1383 				flagflags = FF_CROSSPRODUCT;
1384 			else
1385 				flagflags = 0;
1386 		}
1387 		/*--------
1388 		 * Affix fields. For example:
1389 		 * SFX \   0	Y/L [^Y]
1390 		 *--------
1391 		 */
1392 		else
1393 		{
1394 			char	   *ptr;
1395 			int			aflg = 0;
1396 
1397 			/* Get flags after '/' (flags are case sensitive) */
1398 			if ((ptr = strchr(repl, '/')) != NULL)
1399 				aflg |= getCompoundAffixFlagValue(Conf,
1400 												  getAffixFlagSet(Conf,
1401 																  ptr + 1));
1402 			/* Get lowercased version of string before '/' */
1403 			prepl = lowerstr_ctx(Conf, repl);
1404 			if ((ptr = strchr(prepl, '/')) != NULL)
1405 				*ptr = '\0';
1406 			pfind = lowerstr_ctx(Conf, find);
1407 			pmask = lowerstr_ctx(Conf, mask);
1408 			if (t_iseq(find, '0'))
1409 				*pfind = '\0';
1410 			if (t_iseq(repl, '0'))
1411 				*prepl = '\0';
1412 
1413 			NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
1414 					   isSuffix ? FF_SUFFIX : FF_PREFIX);
1415 			pfree(prepl);
1416 			pfree(pfind);
1417 			pfree(pmask);
1418 		}
1419 
1420 nextline:
1421 		pfree(recoded);
1422 	}
1423 
1424 	tsearch_readline_end(&trst);
1425 	if (ptype)
1426 		pfree(ptype);
1427 }
1428 
1429 /*
1430  * import affixes
1431  *
1432  * Note caller must already have applied get_tsearch_config_filename
1433  *
1434  * This function is responsible for parsing ispell ("old format") affix files.
1435  * If we realize that the file contains new-format commands, we pass off the
1436  * work to NIImportOOAffixes(), which will re-read the whole file.
1437  */
1438 void
NIImportAffixes(IspellDict * Conf,const char * filename)1439 NIImportAffixes(IspellDict *Conf, const char *filename)
1440 {
1441 	char	   *pstr = NULL;
1442 	char		flag[BUFSIZ];
1443 	char		mask[BUFSIZ];
1444 	char		find[BUFSIZ];
1445 	char		repl[BUFSIZ];
1446 	char	   *s;
1447 	bool		suffixes = false;
1448 	bool		prefixes = false;
1449 	char		flagflags = 0;
1450 	tsearch_readline_state trst;
1451 	bool		oldformat = false;
1452 	char	   *recoded = NULL;
1453 
1454 	if (!tsearch_readline_begin(&trst, filename))
1455 		ereport(ERROR,
1456 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
1457 				 errmsg("could not open affix file \"%s\": %m",
1458 						filename)));
1459 
1460 	Conf->usecompound = false;
1461 	Conf->useFlagAliases = false;
1462 	Conf->flagMode = FM_CHAR;
1463 
1464 	while ((recoded = tsearch_readline(&trst)) != NULL)
1465 	{
1466 		pstr = lowerstr(recoded);
1467 
1468 		/* Skip comments and empty lines */
1469 		if (*pstr == '#' || *pstr == '\n')
1470 			goto nextline;
1471 
1472 		if (STRNCMP(pstr, "compoundwords") == 0)
1473 		{
1474 			/* Find case-insensitive L flag in non-lowercased string */
1475 			s = findchar2(recoded, 'l', 'L');
1476 			if (s)
1477 			{
1478 				while (*s && !t_isspace(s))
1479 					s += pg_mblen(s);
1480 				while (*s && t_isspace(s))
1481 					s += pg_mblen(s);
1482 
1483 				if (*s && pg_mblen(s) == 1)
1484 				{
1485 					addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
1486 					Conf->usecompound = true;
1487 				}
1488 				oldformat = true;
1489 				goto nextline;
1490 			}
1491 		}
1492 		if (STRNCMP(pstr, "suffixes") == 0)
1493 		{
1494 			suffixes = true;
1495 			prefixes = false;
1496 			oldformat = true;
1497 			goto nextline;
1498 		}
1499 		if (STRNCMP(pstr, "prefixes") == 0)
1500 		{
1501 			suffixes = false;
1502 			prefixes = true;
1503 			oldformat = true;
1504 			goto nextline;
1505 		}
1506 		if (STRNCMP(pstr, "flag") == 0)
1507 		{
1508 			s = recoded + 4;	/* we need non-lowercased string */
1509 			flagflags = 0;
1510 
1511 			while (*s && t_isspace(s))
1512 				s += pg_mblen(s);
1513 
1514 			if (*s == '*')
1515 			{
1516 				flagflags |= FF_CROSSPRODUCT;
1517 				s++;
1518 			}
1519 			else if (*s == '~')
1520 			{
1521 				flagflags |= FF_COMPOUNDONLY;
1522 				s++;
1523 			}
1524 
1525 			if (*s == '\\')
1526 				s++;
1527 
1528 			/*
1529 			 * An old-format flag is a single ASCII character; we expect it to
1530 			 * be followed by EOL, whitespace, or ':'.  Otherwise this is a
1531 			 * new-format flag command.
1532 			 */
1533 			if (*s && pg_mblen(s) == 1)
1534 			{
1535 				COPYCHAR(flag, s);
1536 				flag[1] = '\0';
1537 
1538 				s++;
1539 				if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
1540 					t_isspace(s))
1541 				{
1542 					oldformat = true;
1543 					goto nextline;
1544 				}
1545 			}
1546 			goto isnewformat;
1547 		}
1548 		if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
1549 			STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
1550 			STRNCMP(recoded, "PFX") == 0 ||
1551 			STRNCMP(recoded, "SFX") == 0)
1552 			goto isnewformat;
1553 
1554 		if ((!suffixes) && (!prefixes))
1555 			goto nextline;
1556 
1557 		if (!parse_affentry(pstr, mask, find, repl))
1558 			goto nextline;
1559 
1560 		NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
1561 
1562 nextline:
1563 		pfree(recoded);
1564 		pfree(pstr);
1565 	}
1566 	tsearch_readline_end(&trst);
1567 	return;
1568 
1569 isnewformat:
1570 	if (oldformat)
1571 		ereport(ERROR,
1572 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
1573 		errmsg("affix file contains both old-style and new-style commands")));
1574 	tsearch_readline_end(&trst);
1575 
1576 	NIImportOOAffixes(Conf, filename);
1577 }
1578 
1579 /*
1580  * Merges two affix flag sets and stores a new affix flag set into
1581  * Conf->AffixData.
1582  *
1583  * Returns index of a new affix flag set.
1584  */
1585 static int
MergeAffix(IspellDict * Conf,int a1,int a2)1586 MergeAffix(IspellDict *Conf, int a1, int a2)
1587 {
1588 	char	  **ptr;
1589 
1590 	Assert(a1 < Conf->nAffixData && a2 < Conf->nAffixData);
1591 
1592 	/* Do not merge affix flags if one of affix flags is empty */
1593 	if (*Conf->AffixData[a1] == '\0')
1594 		return a2;
1595 	else if (*Conf->AffixData[a2] == '\0')
1596 		return a1;
1597 
1598 	while (Conf->nAffixData + 1 >= Conf->lenAffixData)
1599 	{
1600 		Conf->lenAffixData *= 2;
1601 		Conf->AffixData = (char **) repalloc(Conf->AffixData,
1602 										sizeof(char *) * Conf->lenAffixData);
1603 	}
1604 
1605 	ptr = Conf->AffixData + Conf->nAffixData;
1606 	if (Conf->flagMode == FM_NUM)
1607 	{
1608 		*ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1609 					   strlen(Conf->AffixData[a2]) +
1610 					   1 /* comma */ + 1 /* \0 */ );
1611 		sprintf(*ptr, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1612 	}
1613 	else
1614 	{
1615 		*ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1616 					   strlen(Conf->AffixData[a2]) +
1617 					   1 /* \0 */ );
1618 		sprintf(*ptr, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1619 	}
1620 	ptr++;
1621 	*ptr = NULL;
1622 	Conf->nAffixData++;
1623 
1624 	return Conf->nAffixData - 1;
1625 }
1626 
1627 /*
1628  * Returns a set of affix parameters which correspondence to the set of affix
1629  * flags with the given index.
1630  */
1631 static uint32
makeCompoundFlags(IspellDict * Conf,int affix)1632 makeCompoundFlags(IspellDict *Conf, int affix)
1633 {
1634 	Assert(affix < Conf->nAffixData);
1635 
1636 	return (getCompoundAffixFlagValue(Conf, Conf->AffixData[affix]) &
1637 			FF_COMPOUNDFLAGMASK);
1638 }
1639 
1640 /*
1641  * Makes a prefix tree for the given level.
1642  *
1643  * Conf: current dictionary.
1644  * low: lower index of the Conf->Spell array.
1645  * high: upper index of the Conf->Spell array.
1646  * level: current prefix tree level.
1647  */
1648 static SPNode *
mkSPNode(IspellDict * Conf,int low,int high,int level)1649 mkSPNode(IspellDict *Conf, int low, int high, int level)
1650 {
1651 	int			i;
1652 	int			nchar = 0;
1653 	char		lastchar = '\0';
1654 	SPNode	   *rs;
1655 	SPNodeData *data;
1656 	int			lownew = low;
1657 
1658 	for (i = low; i < high; i++)
1659 		if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
1660 		{
1661 			nchar++;
1662 			lastchar = Conf->Spell[i]->word[level];
1663 		}
1664 
1665 	if (!nchar)
1666 		return NULL;
1667 
1668 	rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
1669 	rs->length = nchar;
1670 	data = rs->data;
1671 
1672 	lastchar = '\0';
1673 	for (i = low; i < high; i++)
1674 		if (Conf->Spell[i]->p.d.len > level)
1675 		{
1676 			if (lastchar != Conf->Spell[i]->word[level])
1677 			{
1678 				if (lastchar)
1679 				{
1680 					/* Next level of the prefix tree */
1681 					data->node = mkSPNode(Conf, lownew, i, level + 1);
1682 					lownew = i;
1683 					data++;
1684 				}
1685 				lastchar = Conf->Spell[i]->word[level];
1686 			}
1687 			data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
1688 			if (Conf->Spell[i]->p.d.len == level + 1)
1689 			{
1690 				bool		clearCompoundOnly = false;
1691 
1692 				if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
1693 				{
1694 					/*
1695 					 * MergeAffix called a few times. If one of word is
1696 					 * allowed to be in compound word and another isn't, then
1697 					 * clear FF_COMPOUNDONLY flag.
1698 					 */
1699 
1700 					clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
1701 						& makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
1702 						? false : true;
1703 					data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
1704 				}
1705 				else
1706 					data->affix = Conf->Spell[i]->p.d.affix;
1707 				data->isword = 1;
1708 
1709 				data->compoundflag = makeCompoundFlags(Conf, data->affix);
1710 
1711 				if ((data->compoundflag & FF_COMPOUNDONLY) &&
1712 					(data->compoundflag & FF_COMPOUNDFLAG) == 0)
1713 					data->compoundflag |= FF_COMPOUNDFLAG;
1714 
1715 				if (clearCompoundOnly)
1716 					data->compoundflag &= ~FF_COMPOUNDONLY;
1717 			}
1718 		}
1719 
1720 	/* Next level of the prefix tree */
1721 	data->node = mkSPNode(Conf, lownew, high, level + 1);
1722 
1723 	return rs;
1724 }
1725 
1726 /*
1727  * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
1728  * and affixes.
1729  */
1730 void
NISortDictionary(IspellDict * Conf)1731 NISortDictionary(IspellDict *Conf)
1732 {
1733 	int			i;
1734 	int			naffix = 0;
1735 	int			curaffix;
1736 
1737 	/* compress affixes */
1738 
1739 	/*
1740 	 * If we use flag aliases then we need to use Conf->AffixData filled in
1741 	 * the NIImportOOAffixes().
1742 	 */
1743 	if (Conf->useFlagAliases)
1744 	{
1745 		for (i = 0; i < Conf->nspell; i++)
1746 		{
1747 			char	   *end;
1748 
1749 			if (*Conf->Spell[i]->p.flag != '\0')
1750 			{
1751 				curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
1752 				if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
1753 					ereport(ERROR,
1754 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
1755 							 errmsg("invalid affix alias \"%s\"",
1756 									Conf->Spell[i]->p.flag)));
1757 				if (curaffix < 0 || curaffix >= Conf->nAffixData)
1758 					ereport(ERROR,
1759 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
1760 							 errmsg("invalid affix alias \"%s\"",
1761 									Conf->Spell[i]->p.flag)));
1762 				if (*end != '\0' && !t_isdigit(end) && !t_isspace(end))
1763 					ereport(ERROR,
1764 							(errcode(ERRCODE_CONFIG_FILE_ERROR),
1765 							 errmsg("invalid affix alias \"%s\"",
1766 									Conf->Spell[i]->p.flag)));
1767 			}
1768 			else
1769 			{
1770 				/*
1771 				 * If Conf->Spell[i]->p.flag is empty, then get empty value of
1772 				 * Conf->AffixData (0 index).
1773 				 */
1774 				curaffix = 0;
1775 			}
1776 
1777 			Conf->Spell[i]->p.d.affix = curaffix;
1778 			Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1779 		}
1780 	}
1781 	/* Otherwise fill Conf->AffixData here */
1782 	else
1783 	{
1784 		/* Count the number of different flags used in the dictionary */
1785 		qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
1786 			  cmpspellaffix);
1787 
1788 		naffix = 0;
1789 		for (i = 0; i < Conf->nspell; i++)
1790 		{
1791 			if (i == 0
1792 				|| strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
1793 				naffix++;
1794 		}
1795 
1796 		/*
1797 		 * Fill in Conf->AffixData with the affixes that were used in the
1798 		 * dictionary. Replace textual flag-field of Conf->Spell entries with
1799 		 * indexes into Conf->AffixData array.
1800 		 */
1801 		Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1802 
1803 		curaffix = -1;
1804 		for (i = 0; i < Conf->nspell; i++)
1805 		{
1806 			if (i == 0
1807 				|| strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]))
1808 			{
1809 				curaffix++;
1810 				Assert(curaffix < naffix);
1811 				Conf->AffixData[curaffix] = cpstrdup(Conf,
1812 													 Conf->Spell[i]->p.flag);
1813 			}
1814 
1815 			Conf->Spell[i]->p.d.affix = curaffix;
1816 			Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1817 		}
1818 
1819 		Conf->lenAffixData = Conf->nAffixData = naffix;
1820 	}
1821 
1822 	/* Start build a prefix tree */
1823 	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
1824 	Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
1825 }
1826 
1827 /*
1828  * Makes a prefix tree for the given level using the repl string of an affix
1829  * rule. Affixes with empty replace string do not include in the prefix tree.
1830  * This affixes are included by mkVoidAffix().
1831  *
1832  * Conf: current dictionary.
1833  * low: lower index of the Conf->Affix array.
1834  * high: upper index of the Conf->Affix array.
1835  * level: current prefix tree level.
1836  * type: FF_SUFFIX or FF_PREFIX.
1837  */
1838 static AffixNode *
mkANode(IspellDict * Conf,int low,int high,int level,int type)1839 mkANode(IspellDict *Conf, int low, int high, int level, int type)
1840 {
1841 	int			i;
1842 	int			nchar = 0;
1843 	uint8		lastchar = '\0';
1844 	AffixNode  *rs;
1845 	AffixNodeData *data;
1846 	int			lownew = low;
1847 	int			naff;
1848 	AFFIX	  **aff;
1849 
1850 	for (i = low; i < high; i++)
1851 		if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
1852 		{
1853 			nchar++;
1854 			lastchar = GETCHAR(Conf->Affix + i, level, type);
1855 		}
1856 
1857 	if (!nchar)
1858 		return NULL;
1859 
1860 	aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
1861 	naff = 0;
1862 
1863 	rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
1864 	rs->length = nchar;
1865 	data = rs->data;
1866 
1867 	lastchar = '\0';
1868 	for (i = low; i < high; i++)
1869 		if (Conf->Affix[i].replen > level)
1870 		{
1871 			if (lastchar != GETCHAR(Conf->Affix + i, level, type))
1872 			{
1873 				if (lastchar)
1874 				{
1875 					/* Next level of the prefix tree */
1876 					data->node = mkANode(Conf, lownew, i, level + 1, type);
1877 					if (naff)
1878 					{
1879 						data->naff = naff;
1880 						data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1881 						memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1882 						naff = 0;
1883 					}
1884 					data++;
1885 					lownew = i;
1886 				}
1887 				lastchar = GETCHAR(Conf->Affix + i, level, type);
1888 			}
1889 			data->val = GETCHAR(Conf->Affix + i, level, type);
1890 			if (Conf->Affix[i].replen == level + 1)
1891 			{					/* affix stopped */
1892 				aff[naff++] = Conf->Affix + i;
1893 			}
1894 		}
1895 
1896 	/* Next level of the prefix tree */
1897 	data->node = mkANode(Conf, lownew, high, level + 1, type);
1898 	if (naff)
1899 	{
1900 		data->naff = naff;
1901 		data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1902 		memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1903 		naff = 0;
1904 	}
1905 
1906 	pfree(aff);
1907 
1908 	return rs;
1909 }
1910 
1911 /*
1912  * Makes the root void node in the prefix tree. The root void node is created
1913  * for affixes which have empty replace string ("repl" field).
1914  */
1915 static void
mkVoidAffix(IspellDict * Conf,bool issuffix,int startsuffix)1916 mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
1917 {
1918 	int			i,
1919 				cnt = 0;
1920 	int			start = (issuffix) ? startsuffix : 0;
1921 	int			end = (issuffix) ? Conf->naffixes : startsuffix;
1922 	AffixNode  *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
1923 
1924 	Affix->length = 1;
1925 	Affix->isvoid = 1;
1926 
1927 	if (issuffix)
1928 	{
1929 		Affix->data->node = Conf->Suffix;
1930 		Conf->Suffix = Affix;
1931 	}
1932 	else
1933 	{
1934 		Affix->data->node = Conf->Prefix;
1935 		Conf->Prefix = Affix;
1936 	}
1937 
1938 	/* Count affixes with empty replace string */
1939 	for (i = start; i < end; i++)
1940 		if (Conf->Affix[i].replen == 0)
1941 			cnt++;
1942 
1943 	/* There is not affixes with empty replace string */
1944 	if (cnt == 0)
1945 		return;
1946 
1947 	Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
1948 	Affix->data->naff = (uint32) cnt;
1949 
1950 	cnt = 0;
1951 	for (i = start; i < end; i++)
1952 		if (Conf->Affix[i].replen == 0)
1953 		{
1954 			Affix->data->aff[cnt] = Conf->Affix + i;
1955 			cnt++;
1956 		}
1957 }
1958 
1959 /*
1960  * Checks if the affixflag is used by dictionary. Conf->AffixData does not
1961  * contain affixflag if this flag is not used actually by the .dict file.
1962  *
1963  * Conf: current dictionary.
1964  * affixflag: affix flag.
1965  *
1966  * Returns true if the Conf->AffixData array contains affixflag, otherwise
1967  * returns false.
1968  */
1969 static bool
isAffixInUse(IspellDict * Conf,char * affixflag)1970 isAffixInUse(IspellDict *Conf, char *affixflag)
1971 {
1972 	int			i;
1973 
1974 	for (i = 0; i < Conf->nAffixData; i++)
1975 		if (IsAffixFlagInUse(Conf, i, affixflag))
1976 			return true;
1977 
1978 	return false;
1979 }
1980 
1981 /*
1982  * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
1983  */
1984 void
NISortAffixes(IspellDict * Conf)1985 NISortAffixes(IspellDict *Conf)
1986 {
1987 	AFFIX	   *Affix;
1988 	size_t		i;
1989 	CMPDAffix  *ptr;
1990 	int			firstsuffix = Conf->naffixes;
1991 
1992 	if (Conf->naffixes == 0)
1993 		return;
1994 
1995 	/* Store compound affixes in the Conf->CompoundAffix array */
1996 	if (Conf->naffixes > 1)
1997 		qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
1998 	Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
1999 	ptr->affix = NULL;
2000 
2001 	for (i = 0; i < Conf->naffixes; i++)
2002 	{
2003 		Affix = &(((AFFIX *) Conf->Affix)[i]);
2004 		if (Affix->type == FF_SUFFIX && i < firstsuffix)
2005 			firstsuffix = i;
2006 
2007 		if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
2008 			isAffixInUse(Conf, Affix->flag))
2009 		{
2010 			bool		issuffix = (Affix->type == FF_SUFFIX);
2011 
2012 			if (ptr == Conf->CompoundAffix ||
2013 				issuffix != (ptr - 1)->issuffix ||
2014 				strbncmp((const unsigned char *) (ptr - 1)->affix,
2015 						 (const unsigned char *) Affix->repl,
2016 						 (ptr - 1)->len))
2017 			{
2018 				/* leave only unique and minimals suffixes */
2019 				ptr->affix = Affix->repl;
2020 				ptr->len = Affix->replen;
2021 				ptr->issuffix = issuffix;
2022 				ptr++;
2023 			}
2024 		}
2025 	}
2026 	ptr->affix = NULL;
2027 	Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
2028 
2029 	/* Start build a prefix tree */
2030 	Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
2031 	Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
2032 	mkVoidAffix(Conf, true, firstsuffix);
2033 	mkVoidAffix(Conf, false, firstsuffix);
2034 }
2035 
2036 static AffixNodeData *
FindAffixes(AffixNode * node,const char * word,int wrdlen,int * level,int type)2037 FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
2038 {
2039 	AffixNodeData *StopLow,
2040 			   *StopHigh,
2041 			   *StopMiddle;
2042 	uint8 symbol;
2043 
2044 	if (node->isvoid)
2045 	{							/* search void affixes */
2046 		if (node->data->naff)
2047 			return node->data;
2048 		node = node->data->node;
2049 	}
2050 
2051 	while (node && *level < wrdlen)
2052 	{
2053 		StopLow = node->data;
2054 		StopHigh = node->data + node->length;
2055 		while (StopLow < StopHigh)
2056 		{
2057 			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2058 			symbol = GETWCHAR(word, wrdlen, *level, type);
2059 
2060 			if (StopMiddle->val == symbol)
2061 			{
2062 				(*level)++;
2063 				if (StopMiddle->naff)
2064 					return StopMiddle;
2065 				node = StopMiddle->node;
2066 				break;
2067 			}
2068 			else if (StopMiddle->val < symbol)
2069 				StopLow = StopMiddle + 1;
2070 			else
2071 				StopHigh = StopMiddle;
2072 		}
2073 		if (StopLow >= StopHigh)
2074 			break;
2075 	}
2076 	return NULL;
2077 }
2078 
2079 static char *
CheckAffix(const char * word,size_t len,AFFIX * Affix,int flagflags,char * newword,int * baselen)2080 CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
2081 {
2082 	/*
2083 	 * Check compound allow flags
2084 	 */
2085 
2086 	if (flagflags == 0)
2087 	{
2088 		if (Affix->flagflags & FF_COMPOUNDONLY)
2089 			return NULL;
2090 	}
2091 	else if (flagflags & FF_COMPOUNDBEGIN)
2092 	{
2093 		if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2094 			return NULL;
2095 		if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
2096 			if (Affix->type == FF_SUFFIX)
2097 				return NULL;
2098 	}
2099 	else if (flagflags & FF_COMPOUNDMIDDLE)
2100 	{
2101 		if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
2102 			(Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
2103 			return NULL;
2104 	}
2105 	else if (flagflags & FF_COMPOUNDLAST)
2106 	{
2107 		if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2108 			return NULL;
2109 		if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
2110 			if (Affix->type == FF_PREFIX)
2111 				return NULL;
2112 	}
2113 
2114 	/*
2115 	 * make replace pattern of affix
2116 	 */
2117 	if (Affix->type == FF_SUFFIX)
2118 	{
2119 		strcpy(newword, word);
2120 		strcpy(newword + len - Affix->replen, Affix->find);
2121 		if (baselen)			/* store length of non-changed part of word */
2122 			*baselen = len - Affix->replen;
2123 	}
2124 	else
2125 	{
2126 		/*
2127 		 * if prefix is an all non-changed part's length then all word
2128 		 * contains only prefix and suffix, so out
2129 		 */
2130 		if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
2131 			return NULL;
2132 		strcpy(newword, Affix->find);
2133 		strcat(newword, word + Affix->replen);
2134 	}
2135 
2136 	/*
2137 	 * check resulting word
2138 	 */
2139 	if (Affix->issimple)
2140 		return newword;
2141 	else if (Affix->isregis)
2142 	{
2143 		if (RS_execute(&(Affix->reg.regis), newword))
2144 			return newword;
2145 	}
2146 	else
2147 	{
2148 		pg_wchar   *data;
2149 		size_t		data_len;
2150 		int			newword_len;
2151 
2152 		/* Convert data string to wide characters */
2153 		newword_len = strlen(newword);
2154 		data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
2155 		data_len = pg_mb2wchar_with_len(newword, data, newword_len);
2156 
2157 		if (pg_regexec(&(Affix->reg.pregex->regex), data, data_len,
2158 					   0, NULL, 0, NULL, 0) == REG_OKAY)
2159 		{
2160 			pfree(data);
2161 			return newword;
2162 		}
2163 		pfree(data);
2164 	}
2165 
2166 	return NULL;
2167 }
2168 
2169 static int
addToResult(char ** forms,char ** cur,char * word)2170 addToResult(char **forms, char **cur, char *word)
2171 {
2172 	if (cur - forms >= MAX_NORM - 1)
2173 		return 0;
2174 	if (forms == cur || strcmp(word, *(cur - 1)) != 0)
2175 	{
2176 		*cur = pstrdup(word);
2177 		*(cur + 1) = NULL;
2178 		return 1;
2179 	}
2180 
2181 	return 0;
2182 }
2183 
2184 static char **
NormalizeSubWord(IspellDict * Conf,char * word,int flag)2185 NormalizeSubWord(IspellDict *Conf, char *word, int flag)
2186 {
2187 	AffixNodeData *suffix = NULL,
2188 			   *prefix = NULL;
2189 	int			slevel = 0,
2190 				plevel = 0;
2191 	int			wrdlen = strlen(word),
2192 				swrdlen;
2193 	char	  **forms;
2194 	char	  **cur;
2195 	char		newword[2 * MAXNORMLEN] = "";
2196 	char		pnewword[2 * MAXNORMLEN] = "";
2197 	AffixNode  *snode = Conf->Suffix,
2198 			   *pnode;
2199 	int			i,
2200 				j;
2201 
2202 	if (wrdlen > MAXNORMLEN)
2203 		return NULL;
2204 	cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
2205 	*cur = NULL;
2206 
2207 
2208 	/* Check that the word itself is normal form */
2209 	if (FindWord(Conf, word, VoidString, flag))
2210 	{
2211 		*cur = pstrdup(word);
2212 		cur++;
2213 		*cur = NULL;
2214 	}
2215 
2216 	/* Find all other NORMAL forms of the 'word' (check only prefix) */
2217 	pnode = Conf->Prefix;
2218 	plevel = 0;
2219 	while (pnode)
2220 	{
2221 		prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
2222 		if (!prefix)
2223 			break;
2224 		for (j = 0; j < prefix->naff; j++)
2225 		{
2226 			if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
2227 			{
2228 				/* prefix success */
2229 				if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
2230 					cur += addToResult(forms, cur, newword);
2231 			}
2232 		}
2233 		pnode = prefix->node;
2234 	}
2235 
2236 	/*
2237 	 * Find all other NORMAL forms of the 'word' (check suffix and then
2238 	 * prefix)
2239 	 */
2240 	while (snode)
2241 	{
2242 		int			baselen = 0;
2243 
2244 		/* find possible suffix */
2245 		suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
2246 		if (!suffix)
2247 			break;
2248 		/* foreach suffix check affix */
2249 		for (i = 0; i < suffix->naff; i++)
2250 		{
2251 			if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
2252 			{
2253 				/* suffix success */
2254 				if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
2255 					cur += addToResult(forms, cur, newword);
2256 
2257 				/* now we will look changed word with prefixes */
2258 				pnode = Conf->Prefix;
2259 				plevel = 0;
2260 				swrdlen = strlen(newword);
2261 				while (pnode)
2262 				{
2263 					prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
2264 					if (!prefix)
2265 						break;
2266 					for (j = 0; j < prefix->naff; j++)
2267 					{
2268 						if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
2269 						{
2270 							/* prefix success */
2271 							char	   *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
2272 							VoidString : prefix->aff[j]->flag;
2273 
2274 							if (FindWord(Conf, pnewword, ff, flag))
2275 								cur += addToResult(forms, cur, pnewword);
2276 						}
2277 					}
2278 					pnode = prefix->node;
2279 				}
2280 			}
2281 		}
2282 
2283 		snode = suffix->node;
2284 	}
2285 
2286 	if (cur == forms)
2287 	{
2288 		pfree(forms);
2289 		return (NULL);
2290 	}
2291 	return (forms);
2292 }
2293 
/*
 * One candidate split of a compound word into stems.  Variants are chained
 * into a singly linked list through 'next'.
 */
typedef struct SplitVar SplitVar;

struct SplitVar
{
	int			nstem;			/* number of entries of stem[] in use */
	int			lenstem;		/* allocated number of entries in stem[] */
	char	  **stem;			/* array of stem strings */
	SplitVar   *next;			/* next variant in the list, or NULL */
};
2301 
2302 static int
CheckCompoundAffixes(CMPDAffix ** ptr,char * word,int len,bool CheckInPlace)2303 CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
2304 {
2305 	bool		issuffix;
2306 
2307 	/* in case CompoundAffix is null: */
2308 	if (*ptr == NULL)
2309 		return -1;
2310 
2311 	if (CheckInPlace)
2312 	{
2313 		while ((*ptr)->affix)
2314 		{
2315 			if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
2316 			{
2317 				len = (*ptr)->len;
2318 				issuffix = (*ptr)->issuffix;
2319 				(*ptr)++;
2320 				return (issuffix) ? len : 0;
2321 			}
2322 			(*ptr)++;
2323 		}
2324 	}
2325 	else
2326 	{
2327 		char	   *affbegin;
2328 
2329 		while ((*ptr)->affix)
2330 		{
2331 			if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
2332 			{
2333 				len = (*ptr)->len + (affbegin - word);
2334 				issuffix = (*ptr)->issuffix;
2335 				(*ptr)++;
2336 				return (issuffix) ? len : 0;
2337 			}
2338 			(*ptr)++;
2339 		}
2340 	}
2341 	return -1;
2342 }
2343 
2344 static SplitVar *
CopyVar(SplitVar * s,int makedup)2345 CopyVar(SplitVar *s, int makedup)
2346 {
2347 	SplitVar   *v = (SplitVar *) palloc(sizeof(SplitVar));
2348 
2349 	v->next = NULL;
2350 	if (s)
2351 	{
2352 		int			i;
2353 
2354 		v->lenstem = s->lenstem;
2355 		v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2356 		v->nstem = s->nstem;
2357 		for (i = 0; i < s->nstem; i++)
2358 			v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
2359 	}
2360 	else
2361 	{
2362 		v->lenstem = 16;
2363 		v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2364 		v->nstem = 0;
2365 	}
2366 	return v;
2367 }
2368 
2369 static void
AddStem(SplitVar * v,char * word)2370 AddStem(SplitVar *v, char *word)
2371 {
2372 	if (v->nstem >= v->lenstem)
2373 	{
2374 		v->lenstem *= 2;
2375 		v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
2376 	}
2377 
2378 	v->stem[v->nstem] = word;
2379 	v->nstem++;
2380 }
2381 
/*
 * Split 'word' into sequences of stems (possible compound parts).
 *
 * Conf     - compiled dictionary; its Dictionary trie and CompoundAffix
 *            list are consulted
 * snode    - trie node to resume the walk from, or NULL to start at the
 *            root (Conf->Dictionary)
 * orig     - variant to extend; it is deep-copied with CopyVar(orig, 1), so
 *            the caller keeps ownership of 'orig'.  May be NULL for an
 *            empty starting variant.
 * word     - the whole word being split
 * wordlen  - length of 'word' in bytes
 * startpos - offset in 'word' at which the stem currently being collected
 *            begins
 * minpos   - a stem is accepted only if it ends after this offset; when
 *            'snode' is given this is also the offset the walk resumes at
 *
 * Returns a list of SplitVar linked through 'next'.  The head is always the
 * copy made from 'orig', with the stems found in this call appended; further
 * variants found by recursive calls are chained after it.  Everything is
 * palloc'd.
 */
static SplitVar *
SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
{
	SplitVar   *var = NULL;
	SPNodeData *StopLow,
			   *StopHigh,
			   *StopMiddle = NULL;
	SPNode	   *node = (snode) ? snode : Conf->Dictionary;
	int			level = (snode) ? minpos : startpos;	/* recursive
														 * minpos==level */
	int			lenaff;
	CMPDAffix  *caff;
	char	   *notprobed;
	int			compoundflag = 0;

	/*
	 * notprobed[i] stays nonzero until a sub-word ending at byte i has been
	 * accepted through the compound-affix path below; it keeps the same end
	 * position from being expanded twice.
	 */
	notprobed = (char *) palloc(wordlen);
	memset(notprobed, 1, wordlen);
	var = CopyVar(orig, 1);

	while (level < wordlen)
	{
		/* find word with epenthetic or/and compound affix */
		caff = Conf->CompoundAffix;

		/*
		 * CheckCompoundAffixes() advances 'caff' past every entry it
		 * examines, so each evaluation of the loop condition (including the
		 * ones reached through 'continue') moves on to the next affix.  When
		 * the walk has left the trie (node == NULL) the affix may occur
		 * anywhere in the remainder, otherwise it must start at 'level'.
		 */
		while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
		{
			/*
			 * there is one of the compound affixes, so check whether the
			 * sub-word exists
			 */
			char		buf[MAXNORMLEN];
			char	  **subres;

			/* make lenaff the length of the sub-word counted from startpos */
			lenaff = level - startpos + lenaff;

			/* this end position has already been expanded */
			if (!notprobed[startpos + lenaff - 1])
				continue;

			if (level + lenaff - 1 <= minpos)
				continue;

			if (lenaff >= MAXNORMLEN)
				continue;		/* skip too big value */
			if (lenaff > 0)
				memcpy(buf, word + startpos, lenaff);
			buf[lenaff] = '\0';

			/*
			 * NOTE(review): this loop only runs while level > startpos, so
			 * 'level == 0' looks unreachable unless startpos can be
			 * negative; the trie branch below tests 'startpos == 0'
			 * instead.  Confirm which one is intended.
			 */
			if (level == 0)
				compoundflag = FF_COMPOUNDBEGIN;
			else if (level == wordlen - 1)
				compoundflag = FF_COMPOUNDLAST;
			else
				compoundflag = FF_COMPOUNDMIDDLE;
			subres = NormalizeSubWord(Conf, buf, compoundflag);
			if (subres)
			{
				/* Yes, it was a word from dictionary */

				/*
				 * 'new' shares the stem pointers of 'var' (makedup == 0), so
				 * only its array and header are freed below.
				 */
				SplitVar   *new = CopyVar(var, 0);
				SplitVar   *ptr = var;
				char	  **sptr = subres;

				notprobed[startpos + lenaff - 1] = 0;

				while (*sptr)
				{
					AddStem(new, *sptr);
					sptr++;
				}
				pfree(subres);

				/* append the variants of the remainder to the end of the list */
				while (ptr->next)
					ptr = ptr->next;
				ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);

				/*
				 * The recursive call made its own copies of all stems.
				 * NOTE(review): the strings that came from 'subres' are not
				 * freed here.
				 */
				pfree(new->stem);
				pfree(new);
			}
		}

		/* the walk has left the trie: no longer stem can start at startpos */
		if (!node)
			break;

		/*
		 * Binary search of the current trie node for the byte word[level]
		 * (presumably node->data is kept ordered by val).
		 */
		StopLow = node->data;
		StopHigh = node->data + node->length;
		while (StopLow < StopHigh)
		{
			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
			if (StopMiddle->val == ((uint8 *) (word))[level])
				break;
			else if (StopMiddle->val < ((uint8 *) (word))[level])
				StopLow = StopMiddle + 1;
			else
				StopHigh = StopMiddle;
		}

		/* StopLow < StopHigh means the search stopped on a match */
		if (StopLow < StopHigh)
		{
			if (startpos == 0)
				compoundflag = FF_COMPOUNDBEGIN;
			else if (level == wordlen - 1)
				compoundflag = FF_COMPOUNDLAST;
			else
				compoundflag = FF_COMPOUNDMIDDLE;

			/* find infinitive */
			if (StopMiddle->isword &&
				(StopMiddle->compoundflag & compoundflag) &&
				notprobed[level])
			{
				/* ok, we found full compoundallowed word */
				if (level > minpos)
				{
					/* and its length more than minimal */
					if (wordlen == level + 1)
					{
						/* well, it was last word */
						AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
						pfree(notprobed);
						return var;
					}
					else
					{
						/*
						 * first look for a longer word starting at the same
						 * point: recurse from the current trie node with
						 * minpos = level
						 */
						SplitVar   *ptr = var;

						while (ptr->next)
							ptr = ptr->next;
						ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
						/* we can find next word */

						/*
						 * then accept the stem ending here and restart at
						 * the trie root for the rest of the word
						 */
						level++;
						AddStem(var, pnstrdup(word + startpos, level - startpos));
						node = Conf->Dictionary;
						startpos = level;
						continue;
					}
				}
			}
			node = StopMiddle->node;
		}
		else
			node = NULL;
		level++;
	}

	/* whatever is left from startpos onwards becomes the final stem */
	AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
	pfree(notprobed);
	return var;
}
2528 
2529 static void
addNorm(TSLexeme ** lres,TSLexeme ** lcur,char * word,int flags,uint16 NVariant)2530 addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
2531 {
2532 	if (*lres == NULL)
2533 		*lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
2534 
2535 	if (*lcur - *lres < MAX_NORM - 1)
2536 	{
2537 		(*lcur)->lexeme = word;
2538 		(*lcur)->flags = flags;
2539 		(*lcur)->nvariant = NVariant;
2540 		(*lcur)++;
2541 		(*lcur)->lexeme = NULL;
2542 	}
2543 }
2544 
2545 TSLexeme *
NINormalizeWord(IspellDict * Conf,char * word)2546 NINormalizeWord(IspellDict *Conf, char *word)
2547 {
2548 	char	  **res;
2549 	TSLexeme   *lcur = NULL,
2550 			   *lres = NULL;
2551 	uint16		NVariant = 1;
2552 
2553 	res = NormalizeSubWord(Conf, word, 0);
2554 
2555 	if (res)
2556 	{
2557 		char	  **ptr = res;
2558 
2559 		while (*ptr && (lcur - lres) < MAX_NORM)
2560 		{
2561 			addNorm(&lres, &lcur, *ptr, 0, NVariant++);
2562 			ptr++;
2563 		}
2564 		pfree(res);
2565 	}
2566 
2567 	if (Conf->usecompound)
2568 	{
2569 		int			wordlen = strlen(word);
2570 		SplitVar   *ptr,
2571 				   *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
2572 		int			i;
2573 
2574 		while (var)
2575 		{
2576 			if (var->nstem > 1)
2577 			{
2578 				char	  **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
2579 
2580 				if (subres)
2581 				{
2582 					char	  **subptr = subres;
2583 
2584 					while (*subptr)
2585 					{
2586 						for (i = 0; i < var->nstem - 1; i++)
2587 						{
2588 							addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
2589 						}
2590 
2591 						addNorm(&lres, &lcur, *subptr, 0, NVariant);
2592 						subptr++;
2593 						NVariant++;
2594 					}
2595 
2596 					pfree(subres);
2597 					var->stem[0] = NULL;
2598 					pfree(var->stem[var->nstem - 1]);
2599 				}
2600 			}
2601 
2602 			for (i = 0; i < var->nstem && var->stem[i]; i++)
2603 				pfree(var->stem[i]);
2604 			ptr = var->next;
2605 			pfree(var->stem);
2606 			pfree(var);
2607 			var = ptr;
2608 		}
2609 	}
2610 
2611 	return lres;
2612 }
2613