1 /*-------------------------------------------------------------------------
2 *
3 * spell.c
4 * Normalizing word with ISpell
5 *
6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7 *
8 * Ispell dictionary
9 * -----------------
10 *
11 * Rules of dictionaries are defined in two files with .affix and .dict
12 * extensions. They are used by spell checker programs Ispell and Hunspell.
13 *
14 * An .affix file declares morphological rules to get a basic form of words.
15 * The format of an .affix file has different structure for Ispell and Hunspell
16 * dictionaries. The Hunspell format is more complicated. But when an .affix
17 * file is imported and compiled, it is stored in the same structure AffixNode.
18 *
19 * A .dict file stores a list of basic forms of words with references to
20 * affix rules. The format of a .dict file has the same structure for Ispell
21 * and Hunspell dictionaries.
22 *
23 * Compilation of a dictionary
24 * ---------------------------
25 *
26 * A compiled dictionary is stored in the IspellDict structure. Compilation of
27 * a dictionary is divided into several steps:
28 * - NIImportDictionary() - stores each word of a .dict file in the
29 * temporary Spell field.
30 * - NIImportAffixes() - stores affix rules of an .affix file in the
31 * Affix field (not temporary) if an .affix file has the Ispell format.
32 * -> NIImportOOAffixes() - stores affix rules if an .affix file has the
33 * Hunspell format. The AffixData field is initialized if AF parameter
34 * is defined.
35 * - NISortDictionary() - builds a prefix tree (Trie) from the words list
36 * and stores it in the Dictionary field. The words list is got from the
37 * Spell field. The AffixData field is initialized if AF parameter is not
38 * defined.
39 * - NISortAffixes():
40 * - builds a list of compound affixes from the affix list and stores it
41 * in the CompoundAffix.
42 * - builds prefix trees (Trie) from the affix list for prefixes and suffixes
43 * and stores them in Suffix and Prefix fields.
44 * The affix list is got from the Affix field.
45 *
46 * Memory management
47 * -----------------
48 *
49 * The IspellDict structure has the Spell field which is used only in compile
50 * time. The Spell field stores a words list. It can take a lot of memory.
51 * Therefore when a dictionary is compiled this field is cleared by
52 * NIFinishBuild().
53 *
54 * All resources which should be cleared by NIFinishBuild() are initialized
55 * using tmpalloc() and tmpalloc0().
56 *
57 * IDENTIFICATION
58 * src/backend/tsearch/spell.c
59 *
60 *-------------------------------------------------------------------------
61 */
62
63 #include "postgres.h"
64
65 #include "catalog/pg_collation.h"
66 #include "tsearch/dicts/spell.h"
67 #include "tsearch/ts_locale.h"
68 #include "utils/memutils.h"
69
70
71 /*
72 * Initialization requires a lot of memory that's not needed
73 * after the initialization is done. During initialization,
74 * CurrentMemoryContext is the long-lived memory context associated
75 * with the dictionary cache entry. We keep the short-lived stuff
76 * in the Conf->buildCxt context.
77 */
78 #define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
79 #define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
80
81 /*
82 * Prepare for constructing an ISpell dictionary.
83 *
84 * The IspellDict struct is assumed to be zeroed when allocated.
85 */
86 void
NIStartBuild(IspellDict * Conf)87 NIStartBuild(IspellDict *Conf)
88 {
89 /*
90 * The temp context is a child of CurTransactionContext, so that it will
91 * go away automatically on error.
92 */
93 Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
94 "Ispell dictionary init context",
95 ALLOCSET_DEFAULT_SIZES);
96 }
97
98 /*
99 * Clean up when dictionary construction is complete.
100 */
101 void
NIFinishBuild(IspellDict * Conf)102 NIFinishBuild(IspellDict *Conf)
103 {
104 /* Release no-longer-needed temp memory */
105 MemoryContextDelete(Conf->buildCxt);
106 /* Just for cleanliness, zero the now-dangling pointers */
107 Conf->buildCxt = NULL;
108 Conf->Spell = NULL;
109 Conf->firstfree = NULL;
110 Conf->CompoundAffixFlags = NULL;
111 }
112
113
114 /*
115 * "Compact" palloc: allocate without extra palloc overhead.
116 *
117 * Since we have no need to free the ispell data items individually, there's
118 * not much value in the per-chunk overhead normally consumed by palloc.
119 * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
120 *
121 * We currently pre-zero all data allocated this way, even though some of it
122 * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
123 * to indicate which allocations actually require zeroing.
124 */
125 #define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
126 #define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
127
128 static void *
compact_palloc0(IspellDict * Conf,size_t size)129 compact_palloc0(IspellDict *Conf, size_t size)
130 {
131 void *result;
132
133 /* Should only be called during init */
134 Assert(Conf->buildCxt != NULL);
135
136 /* No point in this for large chunks */
137 if (size > COMPACT_MAX_REQ)
138 return palloc0(size);
139
140 /* Keep everything maxaligned */
141 size = MAXALIGN(size);
142
143 /* Need more space? */
144 if (size > Conf->avail)
145 {
146 Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
147 Conf->avail = COMPACT_ALLOC_CHUNK;
148 }
149
150 result = (void *) Conf->firstfree;
151 Conf->firstfree += size;
152 Conf->avail -= size;
153
154 return result;
155 }
156
157 #define cpalloc(size) compact_palloc0(Conf, size)
158 #define cpalloc0(size) compact_palloc0(Conf, size)
159
160 static char *
cpstrdup(IspellDict * Conf,const char * str)161 cpstrdup(IspellDict *Conf, const char *str)
162 {
163 char *res = cpalloc(strlen(str) + 1);
164
165 strcpy(res, str);
166 return res;
167 }
168
169
170 /*
171 * Apply lowerstr(), producing a temporary result (in the buildCxt).
172 */
173 static char *
lowerstr_ctx(IspellDict * Conf,const char * src)174 lowerstr_ctx(IspellDict *Conf, const char *src)
175 {
176 MemoryContext saveCtx;
177 char *dst;
178
179 saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
180 dst = lowerstr(src);
181 MemoryContextSwitchTo(saveCtx);
182
183 return dst;
184 }
185
186 #define MAX_NORM 1024
187 #define MAXNORMLEN 256
188
189 #define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
190 #define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
191 #define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
192
193 static char *VoidString = "";
194
195 static int
cmpspell(const void * s1,const void * s2)196 cmpspell(const void *s1, const void *s2)
197 {
198 return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word);
199 }
200
201 static int
cmpspellaffix(const void * s1,const void * s2)202 cmpspellaffix(const void *s1, const void *s2)
203 {
204 return strcmp((*(SPELL *const *) s1)->p.flag,
205 (*(SPELL *const *) s2)->p.flag);
206 }
207
208 static int
cmpcmdflag(const void * f1,const void * f2)209 cmpcmdflag(const void *f1, const void *f2)
210 {
211 CompoundAffixFlag *fv1 = (CompoundAffixFlag *) f1,
212 *fv2 = (CompoundAffixFlag *) f2;
213
214 Assert(fv1->flagMode == fv2->flagMode);
215
216 if (fv1->flagMode == FM_NUM)
217 {
218 if (fv1->flag.i == fv2->flag.i)
219 return 0;
220
221 return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
222 }
223
224 return strcmp(fv1->flag.s, fv2->flag.s);
225 }
226
/*
 * Find the first occurrence of the character c in str, stepping through the
 * string one (possibly multibyte) character at a time.
 *
 * Returns a pointer to the match, or NULL if there is none.
 */
static char *
findchar(char *str, int c)
{
	char	   *cur;

	for (cur = str; *cur != '\0'; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c))
			return cur;
	}

	return NULL;
}
239
/*
 * Find the first occurrence of either c1 or c2 in str, stepping through the
 * string one (possibly multibyte) character at a time.
 *
 * Returns a pointer to the match, or NULL if there is none.
 */
static char *
findchar2(char *str, int c1, int c2)
{
	char	   *cur;

	for (cur = str; *cur != '\0'; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c1) || t_iseq(cur, c2))
			return cur;
	}

	return NULL;
}
252
253
/*
 * Backward string compare for suffix tree operations.
 *
 * The strings are compared byte by byte starting from their last bytes.  If
 * one string is exhausted first (it is a suffix of the other), the shorter
 * string sorts first.
 *
 * Returns -1, 0 or 1.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	/* Walk both strings from the end until they differ or one runs out */
	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* All compared bytes matched; whichever has bytes left is the greater */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}
277
/*
 * Backward string compare limited to at most "count" trailing bytes.
 *
 * Works like strbcmp(), except that the strings are considered equal as
 * soon as "count" bytes (counted from the end) have matched.
 *
 * Returns -1, 0 or 1.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;
	int			left = count;

	/* Walk from the end until a difference, exhaustion, or the limit */
	for (; i1 >= 0 && i2 >= 0 && left > 0; i1--, i2--, left--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* The requested number of bytes matched */
	if (left == 0)
		return 0;

	/* One string ran out before the limit; the shorter one sorts first */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}
303
304 /*
305 * Compares affixes.
306 * First compares the type of an affix. Prefixes should go before affixes.
307 * If types are equal then compares replaceable string.
308 */
309 static int
cmpaffix(const void * s1,const void * s2)310 cmpaffix(const void *s1, const void *s2)
311 {
312 const AFFIX *a1 = (const AFFIX *) s1;
313 const AFFIX *a2 = (const AFFIX *) s2;
314
315 if (a1->type < a2->type)
316 return -1;
317 if (a1->type > a2->type)
318 return 1;
319 if (a1->type == FF_PREFIX)
320 return strcmp(a1->repl, a2->repl);
321 else
322 return strbcmp((const unsigned char *) a1->repl,
323 (const unsigned char *) a2->repl);
324 }
325
326 /*
327 * Gets an affix flag from the set of affix flags (sflagset).
328 *
329 * Several flags can be stored in a single string. Flags can be represented by:
330 * - 1 character (FM_CHAR). A character may be Unicode.
331 * - 2 characters (FM_LONG). A character may be Unicode.
332 * - numbers from 1 to 65000 (FM_NUM).
333 *
334 * Depending on the flagMode an affix string can have the following format:
335 * - FM_CHAR: ABCD
336 * Here we have 4 flags: A, B, C and D
337 * - FM_LONG: ABCDE*
338 * Here we have 3 flags: AB, CD and E*
339 * - FM_NUM: 200,205,50
340 * Here we have 3 flags: 200, 205 and 50
341 *
342 * Conf: current dictionary.
343 * sflagset: the set of affix flags. Returns a reference to the start of a next
344 * affix flag.
345 * sflag: returns an affix flag from sflagset.
346 */
347 static void
getNextFlagFromString(IspellDict * Conf,char ** sflagset,char * sflag)348 getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
349 {
350 int32 s;
351 char *next,
352 *sbuf = *sflagset;
353 int maxstep;
354 bool stop = false;
355 bool met_comma = false;
356
357 maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
358
359 while (**sflagset)
360 {
361 switch (Conf->flagMode)
362 {
363 case FM_LONG:
364 case FM_CHAR:
365 COPYCHAR(sflag, *sflagset);
366 sflag += pg_mblen(*sflagset);
367
368 /* Go to start of the next flag */
369 *sflagset += pg_mblen(*sflagset);
370
371 /* Check if we get all characters of flag */
372 maxstep--;
373 stop = (maxstep == 0);
374 break;
375 case FM_NUM:
376 s = strtol(*sflagset, &next, 10);
377 if (*sflagset == next || errno == ERANGE)
378 ereport(ERROR,
379 (errcode(ERRCODE_CONFIG_FILE_ERROR),
380 errmsg("invalid affix flag \"%s\"", *sflagset)));
381 if (s < 0 || s > FLAGNUM_MAXSIZE)
382 ereport(ERROR,
383 (errcode(ERRCODE_CONFIG_FILE_ERROR),
384 errmsg("affix flag \"%s\" is out of range",
385 *sflagset)));
386 sflag += sprintf(sflag, "%0d", s);
387
388 /* Go to start of the next flag */
389 *sflagset = next;
390 while (**sflagset)
391 {
392 if (t_isdigit(*sflagset))
393 {
394 if (!met_comma)
395 ereport(ERROR,
396 (errcode(ERRCODE_CONFIG_FILE_ERROR),
397 errmsg("invalid affix flag \"%s\"",
398 *sflagset)));
399 break;
400 }
401 else if (t_iseq(*sflagset, ','))
402 {
403 if (met_comma)
404 ereport(ERROR,
405 (errcode(ERRCODE_CONFIG_FILE_ERROR),
406 errmsg("invalid affix flag \"%s\"",
407 *sflagset)));
408 met_comma = true;
409 }
410 else if (!t_isspace(*sflagset))
411 {
412 ereport(ERROR,
413 (errcode(ERRCODE_CONFIG_FILE_ERROR),
414 errmsg("invalid character in affix flag \"%s\"",
415 *sflagset)));
416 }
417
418 *sflagset += pg_mblen(*sflagset);
419 }
420 stop = true;
421 break;
422 default:
423 elog(ERROR, "unrecognized type of Conf->flagMode: %d",
424 Conf->flagMode);
425 }
426
427 if (stop)
428 break;
429 }
430
431 if (Conf->flagMode == FM_LONG && maxstep > 0)
432 ereport(ERROR,
433 (errcode(ERRCODE_CONFIG_FILE_ERROR),
434 errmsg("invalid affix flag \"%s\" with \"long\" flag value",
435 sbuf)));
436
437 *sflag = '\0';
438 }
439
440 /*
441 * Checks if the affix set Conf->AffixData[affix] contains affixflag.
442 * Conf->AffixData[affix] does not contain affixflag if this flag is not used
443 * actually by the .dict file.
444 *
445 * Conf: current dictionary.
446 * affix: index of the Conf->AffixData array.
447 * affixflag: the affix flag.
448 *
449 * Returns true if the string Conf->AffixData[affix] contains affixflag,
450 * otherwise returns false.
451 */
452 static bool
IsAffixFlagInUse(IspellDict * Conf,int affix,const char * affixflag)453 IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
454 {
455 char *flagcur;
456 char flag[BUFSIZ];
457
458 if (*affixflag == 0)
459 return true;
460
461 Assert(affix < Conf->nAffixData);
462
463 flagcur = Conf->AffixData[affix];
464
465 while (*flagcur)
466 {
467 getNextFlagFromString(Conf, &flagcur, flag);
468 /* Compare first affix flag in flagcur with affixflag */
469 if (strcmp(flag, affixflag) == 0)
470 return true;
471 }
472
473 /* Could not find affixflag */
474 return false;
475 }
476
477 /*
478 * Adds the new word into the temporary array Spell.
479 *
480 * Conf: current dictionary.
481 * word: new word.
482 * flag: set of affix flags. Single flag can be get by getNextFlagFromString().
483 */
484 static void
NIAddSpell(IspellDict * Conf,const char * word,const char * flag)485 NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
486 {
487 if (Conf->nspell >= Conf->mspell)
488 {
489 if (Conf->mspell)
490 {
491 Conf->mspell *= 2;
492 Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
493 }
494 else
495 {
496 Conf->mspell = 1024 * 20;
497 Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
498 }
499 }
500 Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
501 strcpy(Conf->Spell[Conf->nspell]->word, word);
502 Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
503 ? cpstrdup(Conf, flag) : VoidString;
504 Conf->nspell++;
505 }
506
507 /*
508 * Imports dictionary into the temporary array Spell.
509 *
510 * Note caller must already have applied get_tsearch_config_filename.
511 *
512 * Conf: current dictionary.
513 * filename: path to the .dict file.
514 */
515 void
NIImportDictionary(IspellDict * Conf,const char * filename)516 NIImportDictionary(IspellDict *Conf, const char *filename)
517 {
518 tsearch_readline_state trst;
519 char *line;
520
521 if (!tsearch_readline_begin(&trst, filename))
522 ereport(ERROR,
523 (errcode(ERRCODE_CONFIG_FILE_ERROR),
524 errmsg("could not open dictionary file \"%s\": %m",
525 filename)));
526
527 while ((line = tsearch_readline(&trst)) != NULL)
528 {
529 char *s,
530 *pstr;
531
532 /* Set of affix flags */
533 const char *flag;
534
535 /* Extract flag from the line */
536 flag = NULL;
537 if ((s = findchar(line, '/')))
538 {
539 *s++ = '\0';
540 flag = s;
541 while (*s)
542 {
543 /* we allow only single encoded flags for faster works */
544 if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
545 s++;
546 else
547 {
548 *s = '\0';
549 break;
550 }
551 }
552 }
553 else
554 flag = "";
555
556 /* Remove trailing spaces */
557 s = line;
558 while (*s)
559 {
560 if (t_isspace(s))
561 {
562 *s = '\0';
563 break;
564 }
565 s += pg_mblen(s);
566 }
567 pstr = lowerstr_ctx(Conf, line);
568
569 NIAddSpell(Conf, pstr, flag);
570 pfree(pstr);
571
572 pfree(line);
573 }
574 tsearch_readline_end(&trst);
575 }
576
577 /*
578 * Searches a basic form of word in the prefix tree. This word was generated
579 * using an affix rule. This rule may not be presented in an affix set of
580 * a basic form of word.
581 *
582 * For example, we have the entry in the .dict file:
583 * meter/GMD
584 *
585 * The affix rule with the flag S:
586 * SFX S y ies [^aeiou]y
587 * is not presented here.
588 *
589 * The affix rule with the flag M:
590 * SFX M 0 's .
591 * is presented here.
592 *
593 * Conf: current dictionary.
594 * word: basic form of word.
595 * affixflag: affix flag, by which a basic form of word was generated.
596 * flag: compound flag used to compare with StopMiddle->compoundflag.
597 *
598 * Returns 1 if the word was found in the prefix tree, else returns 0.
599 */
600 static int
FindWord(IspellDict * Conf,const char * word,const char * affixflag,int flag)601 FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
602 {
603 SPNode *node = Conf->Dictionary;
604 SPNodeData *StopLow,
605 *StopHigh,
606 *StopMiddle;
607 const uint8 *ptr = (const uint8 *) word;
608
609 flag &= FF_COMPOUNDFLAGMASK;
610
611 while (node && *ptr)
612 {
613 StopLow = node->data;
614 StopHigh = node->data + node->length;
615 while (StopLow < StopHigh)
616 {
617 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
618 if (StopMiddle->val == *ptr)
619 {
620 if (*(ptr + 1) == '\0' && StopMiddle->isword)
621 {
622 if (flag == 0)
623 {
624 /*
625 * The word can be formed only with another word. And
626 * in the flag parameter there is not a sign that we
627 * search compound words.
628 */
629 if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
630 return 0;
631 }
632 else if ((flag & StopMiddle->compoundflag) == 0)
633 return 0;
634
635 /*
636 * Check if this affix rule is presented in the affix set
637 * with index StopMiddle->affix.
638 */
639 if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
640 return 1;
641 }
642 node = StopMiddle->node;
643 ptr++;
644 break;
645 }
646 else if (StopMiddle->val < *ptr)
647 StopLow = StopMiddle + 1;
648 else
649 StopHigh = StopMiddle;
650 }
651 if (StopLow >= StopHigh)
652 break;
653 }
654 return 0;
655 }
656
657 /*
658 * Context reset/delete callback for a regular expression used in an affix
659 */
660 static void
regex_affix_deletion_callback(void * arg)661 regex_affix_deletion_callback(void *arg)
662 {
663 aff_regex_struct *pregex = (aff_regex_struct *) arg;
664
665 pg_regfree(&(pregex->regex));
666 }
667
668 /*
669 * Adds a new affix rule to the Affix field.
670 *
671 * Conf: current dictionary.
672 * flag: affix flag ('\' in the below example).
673 * flagflags: set of flags from the flagval field for this affix rule. This set
674 * is listed after '/' character in the added string (repl).
675 *
676 * For example L flag in the hunspell_sample.affix:
677 * SFX \ 0 Y/L [^Y]
678 *
679 * mask: condition for search ('[^Y]' in the above example).
680 * find: stripping characters from beginning (at prefix) or end (at suffix)
681 * of the word ('0' in the above example, 0 means that there is not
682 * stripping character).
683 * repl: adding string after stripping ('Y' in the above example).
684 * type: FF_SUFFIX or FF_PREFIX.
685 */
686 static void
NIAddAffix(IspellDict * Conf,const char * flag,char flagflags,const char * mask,const char * find,const char * repl,int type)687 NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
688 const char *find, const char *repl, int type)
689 {
690 AFFIX *Affix;
691
692 if (Conf->naffixes >= Conf->maffixes)
693 {
694 if (Conf->maffixes)
695 {
696 Conf->maffixes *= 2;
697 Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
698 }
699 else
700 {
701 Conf->maffixes = 16;
702 Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
703 }
704 }
705
706 Affix = Conf->Affix + Conf->naffixes;
707
708 /* This affix rule can be applied for words with any ending */
709 if (strcmp(mask, ".") == 0 || *mask == '\0')
710 {
711 Affix->issimple = 1;
712 Affix->isregis = 0;
713 }
714 /* This affix rule will use regis to search word ending */
715 else if (RS_isRegis(mask))
716 {
717 Affix->issimple = 0;
718 Affix->isregis = 1;
719 RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
720 *mask ? mask : VoidString);
721 }
722 /* This affix rule will use regex_t to search word ending */
723 else
724 {
725 int masklen;
726 int wmasklen;
727 int err;
728 pg_wchar *wmask;
729 char *tmask;
730 aff_regex_struct *pregex;
731
732 Affix->issimple = 0;
733 Affix->isregis = 0;
734 tmask = (char *) tmpalloc(strlen(mask) + 3);
735 if (type == FF_SUFFIX)
736 sprintf(tmask, "%s$", mask);
737 else
738 sprintf(tmask, "^%s", mask);
739
740 masklen = strlen(tmask);
741 wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
742 wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
743
744 /*
745 * The regex engine stores its stuff using malloc not palloc, so we
746 * must arrange to explicitly clean up the regex when the dictionary's
747 * context is cleared. That means the regex_t has to stay in a fixed
748 * location within the context; we can't keep it directly in the AFFIX
749 * struct, since we may sort and resize the array of AFFIXes.
750 */
751 Affix->reg.pregex = pregex = palloc(sizeof(aff_regex_struct));
752
753 err = pg_regcomp(&(pregex->regex), wmask, wmasklen,
754 REG_ADVANCED | REG_NOSUB,
755 DEFAULT_COLLATION_OID);
756 if (err)
757 {
758 char errstr[100];
759
760 pg_regerror(err, &(pregex->regex), errstr, sizeof(errstr));
761 ereport(ERROR,
762 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
763 errmsg("invalid regular expression: %s", errstr)));
764 }
765
766 pregex->mcallback.func = regex_affix_deletion_callback;
767 pregex->mcallback.arg = (void *) pregex;
768 MemoryContextRegisterResetCallback(CurrentMemoryContext,
769 &pregex->mcallback);
770 }
771
772 Affix->flagflags = flagflags;
773 if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
774 {
775 if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
776 Affix->flagflags |= FF_COMPOUNDFLAG;
777 }
778 Affix->flag = cpstrdup(Conf, flag);
779 Affix->type = type;
780
781 Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
782 if ((Affix->replen = strlen(repl)) > 0)
783 Affix->repl = cpstrdup(Conf, repl);
784 else
785 Affix->repl = VoidString;
786 Conf->naffixes++;
787 }
788
789 /* Parsing states for parse_affentry() and friends */
790 #define PAE_WAIT_MASK 0
791 #define PAE_INMASK 1
792 #define PAE_WAIT_FIND 2
793 #define PAE_INFIND 3
794 #define PAE_WAIT_REPL 4
795 #define PAE_INREPL 5
796 #define PAE_WAIT_TYPE 6
797 #define PAE_WAIT_FLAG 7
798
799 /*
800 * Parse next space-separated field of an .affix file line.
801 *
802 * *str is the input pointer (will be advanced past field)
803 * next is where to copy the field value to, with null termination
804 *
805 * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
806 *
807 * Returns true if we found a field, false if not.
808 */
809 static bool
get_nextfield(char ** str,char * next)810 get_nextfield(char **str, char *next)
811 {
812 int state = PAE_WAIT_MASK;
813 int avail = BUFSIZ;
814
815 while (**str)
816 {
817 if (state == PAE_WAIT_MASK)
818 {
819 if (t_iseq(*str, '#'))
820 return false;
821 else if (!t_isspace(*str))
822 {
823 int clen = pg_mblen(*str);
824
825 if (clen < avail)
826 {
827 COPYCHAR(next, *str);
828 next += clen;
829 avail -= clen;
830 }
831 state = PAE_INMASK;
832 }
833 }
834 else /* state == PAE_INMASK */
835 {
836 if (t_isspace(*str))
837 {
838 *next = '\0';
839 return true;
840 }
841 else
842 {
843 int clen = pg_mblen(*str);
844
845 if (clen < avail)
846 {
847 COPYCHAR(next, *str);
848 next += clen;
849 avail -= clen;
850 }
851 }
852 }
853 *str += pg_mblen(*str);
854 }
855
856 *next = '\0';
857
858 return (state == PAE_INMASK); /* OK if we got a nonempty field */
859 }
860
/*
 * Parses entry of an .affix file of MySpell or Hunspell format.
 *
 * An .affix file entry has the following format:
 * - header
 *	 <type>  <flag>  <cross_flag>  <flag_count>
 * - fields after header:
 *	 <type>  <flag>  <find>  <replace>  <mask>
 *
 * str is the input line
 * field values are returned to type etc, which must be buffers of size BUFSIZ.
 *
 * Returns number of fields found; any omitted fields are set to empty strings.
 */
static int
parse_ooaffentry(char *str, char *type, char *flag, char *find,
				 char *repl, char *mask)
{
	/* Output buffers, in the order the fields appear on the line */
	char	   *fields[5];
	int			fields_read = 0;

	fields[0] = type;
	fields[1] = flag;
	fields[2] = find;
	fields[3] = repl;
	fields[4] = mask;

	*type = *flag = *find = *repl = *mask = '\0';

	/* Stop at end of line, or once all five fields have been read */
	while (fields_read < 5 && *str != '\0')
	{
		if (!get_nextfield(&str, fields[fields_read]))
			break;				/* early EOL */
		fields_read++;
	}

	return fields_read;
}
924
925 /*
926 * Parses entry of an .affix file of Ispell format
927 *
928 * An .affix file entry has the following format:
929 * <mask> > [-<find>,]<replace>
930 */
931 static bool
parse_affentry(char * str,char * mask,char * find,char * repl)932 parse_affentry(char *str, char *mask, char *find, char *repl)
933 {
934 int state = PAE_WAIT_MASK;
935 char *pmask = mask,
936 *pfind = find,
937 *prepl = repl;
938
939 *mask = *find = *repl = '\0';
940
941 while (*str)
942 {
943 if (state == PAE_WAIT_MASK)
944 {
945 if (t_iseq(str, '#'))
946 return false;
947 else if (!t_isspace(str))
948 {
949 COPYCHAR(pmask, str);
950 pmask += pg_mblen(str);
951 state = PAE_INMASK;
952 }
953 }
954 else if (state == PAE_INMASK)
955 {
956 if (t_iseq(str, '>'))
957 {
958 *pmask = '\0';
959 state = PAE_WAIT_FIND;
960 }
961 else if (!t_isspace(str))
962 {
963 COPYCHAR(pmask, str);
964 pmask += pg_mblen(str);
965 }
966 }
967 else if (state == PAE_WAIT_FIND)
968 {
969 if (t_iseq(str, '-'))
970 {
971 state = PAE_INFIND;
972 }
973 else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
974 {
975 COPYCHAR(prepl, str);
976 prepl += pg_mblen(str);
977 state = PAE_INREPL;
978 }
979 else if (!t_isspace(str))
980 ereport(ERROR,
981 (errcode(ERRCODE_CONFIG_FILE_ERROR),
982 errmsg("syntax error")));
983 }
984 else if (state == PAE_INFIND)
985 {
986 if (t_iseq(str, ','))
987 {
988 *pfind = '\0';
989 state = PAE_WAIT_REPL;
990 }
991 else if (t_isalpha(str))
992 {
993 COPYCHAR(pfind, str);
994 pfind += pg_mblen(str);
995 }
996 else if (!t_isspace(str))
997 ereport(ERROR,
998 (errcode(ERRCODE_CONFIG_FILE_ERROR),
999 errmsg("syntax error")));
1000 }
1001 else if (state == PAE_WAIT_REPL)
1002 {
1003 if (t_iseq(str, '-'))
1004 {
1005 break; /* void repl */
1006 }
1007 else if (t_isalpha(str))
1008 {
1009 COPYCHAR(prepl, str);
1010 prepl += pg_mblen(str);
1011 state = PAE_INREPL;
1012 }
1013 else if (!t_isspace(str))
1014 ereport(ERROR,
1015 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1016 errmsg("syntax error")));
1017 }
1018 else if (state == PAE_INREPL)
1019 {
1020 if (t_iseq(str, '#'))
1021 {
1022 *prepl = '\0';
1023 break;
1024 }
1025 else if (t_isalpha(str))
1026 {
1027 COPYCHAR(prepl, str);
1028 prepl += pg_mblen(str);
1029 }
1030 else if (!t_isspace(str))
1031 ereport(ERROR,
1032 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1033 errmsg("syntax error")));
1034 }
1035 else
1036 elog(ERROR, "unrecognized state in parse_affentry: %d", state);
1037
1038 str += pg_mblen(str);
1039 }
1040
1041 *pmask = *pfind = *prepl = '\0';
1042
1043 return (*mask && (*find || *repl));
1044 }
1045
1046 /*
1047 * Sets a Hunspell options depending on flag type.
1048 */
1049 static void
setCompoundAffixFlagValue(IspellDict * Conf,CompoundAffixFlag * entry,char * s,uint32 val)1050 setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry,
1051 char *s, uint32 val)
1052 {
1053 if (Conf->flagMode == FM_NUM)
1054 {
1055 char *next;
1056 int i;
1057
1058 i = strtol(s, &next, 10);
1059 if (s == next || errno == ERANGE)
1060 ereport(ERROR,
1061 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1062 errmsg("invalid affix flag \"%s\"", s)));
1063 if (i < 0 || i > FLAGNUM_MAXSIZE)
1064 ereport(ERROR,
1065 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1066 errmsg("affix flag \"%s\" is out of range", s)));
1067
1068 entry->flag.i = i;
1069 }
1070 else
1071 entry->flag.s = cpstrdup(Conf, s);
1072
1073 entry->flagMode = Conf->flagMode;
1074 entry->value = val;
1075 }
1076
1077 /*
1078 * Sets up a correspondence for the affix parameter with the affix flag.
1079 *
1080 * Conf: current dictionary.
1081 * s: affix flag in string.
1082 * val: affix parameter.
1083 */
1084 static void
addCompoundAffixFlagValue(IspellDict * Conf,char * s,uint32 val)1085 addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
1086 {
1087 CompoundAffixFlag *newValue;
1088 char sbuf[BUFSIZ];
1089 char *sflag;
1090 int clen;
1091
1092 while (*s && t_isspace(s))
1093 s += pg_mblen(s);
1094
1095 if (!*s)
1096 ereport(ERROR,
1097 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1098 errmsg("syntax error")));
1099
1100 /* Get flag without \n */
1101 sflag = sbuf;
1102 while (*s && !t_isspace(s) && *s != '\n')
1103 {
1104 clen = pg_mblen(s);
1105 COPYCHAR(sflag, s);
1106 sflag += clen;
1107 s += clen;
1108 }
1109 *sflag = '\0';
1110
1111 /* Resize array or allocate memory for array CompoundAffixFlag */
1112 if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
1113 {
1114 if (Conf->mCompoundAffixFlag)
1115 {
1116 Conf->mCompoundAffixFlag *= 2;
1117 Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1118 repalloc((void *) Conf->CompoundAffixFlags,
1119 Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1120 }
1121 else
1122 {
1123 Conf->mCompoundAffixFlag = 10;
1124 Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1125 tmpalloc(Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1126 }
1127 }
1128
1129 newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
1130
1131 setCompoundAffixFlagValue(Conf, newValue, sbuf, val);
1132
1133 Conf->usecompound = true;
1134 Conf->nCompoundAffixFlag++;
1135 }
1136
1137 /*
1138 * Returns a set of affix parameters which correspondence to the set of affix
1139 * flags s.
1140 */
1141 static int
getCompoundAffixFlagValue(IspellDict * Conf,char * s)1142 getCompoundAffixFlagValue(IspellDict *Conf, char *s)
1143 {
1144 uint32 flag = 0;
1145 CompoundAffixFlag *found,
1146 key;
1147 char sflag[BUFSIZ];
1148 char *flagcur;
1149
1150 if (Conf->nCompoundAffixFlag == 0)
1151 return 0;
1152
1153 flagcur = s;
1154 while (*flagcur)
1155 {
1156 getNextFlagFromString(Conf, &flagcur, sflag);
1157 setCompoundAffixFlagValue(Conf, &key, sflag, 0);
1158
1159 found = (CompoundAffixFlag *)
1160 bsearch(&key, (void *) Conf->CompoundAffixFlags,
1161 Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag),
1162 cmpcmdflag);
1163 if (found != NULL)
1164 flag |= found->value;
1165 }
1166
1167 return flag;
1168 }
1169
1170 /*
1171 * Returns a flag set using the s parameter.
1172 *
1173 * If Conf->useFlagAliases is true then the s parameter is index of the
1174 * Conf->AffixData array and function returns its entry.
1175 * Else function returns the s parameter.
1176 */
1177 static char *
getAffixFlagSet(IspellDict * Conf,char * s)1178 getAffixFlagSet(IspellDict *Conf, char *s)
1179 {
1180 if (Conf->useFlagAliases && *s != '\0')
1181 {
1182 int curaffix;
1183 char *end;
1184
1185 curaffix = strtol(s, &end, 10);
1186 if (s == end || errno == ERANGE)
1187 ereport(ERROR,
1188 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1189 errmsg("invalid affix alias \"%s\"", s)));
1190
1191 if (curaffix > 0 && curaffix < Conf->nAffixData)
1192
1193 /*
1194 * Do not subtract 1 from curaffix because empty string was added
1195 * in NIImportOOAffixes
1196 */
1197 return Conf->AffixData[curaffix];
1198 else if (curaffix > Conf->nAffixData)
1199 ereport(ERROR,
1200 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1201 errmsg("invalid affix alias \"%s\"", s)));
1202 return VoidString;
1203 }
1204 else
1205 return s;
1206 }
1207
1208 /*
1209 * Import an affix file that follows MySpell or Hunspell format.
1210 *
1211 * Conf: current dictionary.
1212 * filename: path to the .affix file.
1213 */
1214 static void
NIImportOOAffixes(IspellDict * Conf,const char * filename)1215 NIImportOOAffixes(IspellDict *Conf, const char *filename)
1216 {
1217 char type[BUFSIZ],
1218 *ptype = NULL;
1219 char sflag[BUFSIZ];
1220 char mask[BUFSIZ],
1221 *pmask;
1222 char find[BUFSIZ],
1223 *pfind;
1224 char repl[BUFSIZ],
1225 *prepl;
1226 bool isSuffix = false;
1227 int naffix = 0,
1228 curaffix = 0;
1229 int sflaglen = 0;
1230 char flagflags = 0;
1231 tsearch_readline_state trst;
1232 char *recoded;
1233
1234 /* read file to find any flag */
1235 Conf->usecompound = false;
1236 Conf->useFlagAliases = false;
1237 Conf->flagMode = FM_CHAR;
1238
1239 if (!tsearch_readline_begin(&trst, filename))
1240 ereport(ERROR,
1241 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1242 errmsg("could not open affix file \"%s\": %m",
1243 filename)));
1244
1245 while ((recoded = tsearch_readline(&trst)) != NULL)
1246 {
1247 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
1248 {
1249 pfree(recoded);
1250 continue;
1251 }
1252
1253 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
1254 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
1255 FF_COMPOUNDFLAG);
1256 else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
1257 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
1258 FF_COMPOUNDBEGIN);
1259 else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
1260 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
1261 FF_COMPOUNDLAST);
1262 /* COMPOUNDLAST and COMPOUNDEND are synonyms */
1263 else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
1264 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
1265 FF_COMPOUNDLAST);
1266 else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
1267 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
1268 FF_COMPOUNDMIDDLE);
1269 else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
1270 addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
1271 FF_COMPOUNDONLY);
1272 else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
1273 addCompoundAffixFlagValue(Conf,
1274 recoded + strlen("COMPOUNDPERMITFLAG"),
1275 FF_COMPOUNDPERMITFLAG);
1276 else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
1277 addCompoundAffixFlagValue(Conf,
1278 recoded + strlen("COMPOUNDFORBIDFLAG"),
1279 FF_COMPOUNDFORBIDFLAG);
1280 else if (STRNCMP(recoded, "FLAG") == 0)
1281 {
1282 char *s = recoded + strlen("FLAG");
1283
1284 while (*s && t_isspace(s))
1285 s += pg_mblen(s);
1286
1287 if (*s)
1288 {
1289 if (STRNCMP(s, "long") == 0)
1290 Conf->flagMode = FM_LONG;
1291 else if (STRNCMP(s, "num") == 0)
1292 Conf->flagMode = FM_NUM;
1293 else if (STRNCMP(s, "default") != 0)
1294 ereport(ERROR,
1295 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1296 errmsg("Ispell dictionary supports only "
1297 "\"default\", \"long\", "
1298 "and \"num\" flag values")));
1299 }
1300 }
1301
1302 pfree(recoded);
1303 }
1304 tsearch_readline_end(&trst);
1305
1306 if (Conf->nCompoundAffixFlag > 1)
1307 qsort((void *) Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
1308 sizeof(CompoundAffixFlag), cmpcmdflag);
1309
1310 if (!tsearch_readline_begin(&trst, filename))
1311 ereport(ERROR,
1312 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1313 errmsg("could not open affix file \"%s\": %m",
1314 filename)));
1315
1316 while ((recoded = tsearch_readline(&trst)) != NULL)
1317 {
1318 int fields_read;
1319
1320 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
1321 goto nextline;
1322
1323 fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
1324
1325 if (ptype)
1326 pfree(ptype);
1327 ptype = lowerstr_ctx(Conf, type);
1328
1329 /* First try to parse AF parameter (alias compression) */
1330 if (STRNCMP(ptype, "af") == 0)
1331 {
1332 /* First line is the number of aliases */
1333 if (!Conf->useFlagAliases)
1334 {
1335 Conf->useFlagAliases = true;
1336 naffix = atoi(sflag);
1337 if (naffix <= 0)
1338 ereport(ERROR,
1339 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1340 errmsg("invalid number of flag vector aliases")));
1341
1342 /* Also reserve place for empty flag set */
1343 naffix++;
1344
1345 Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1346 Conf->lenAffixData = Conf->nAffixData = naffix;
1347
1348 /* Add empty flag set into AffixData */
1349 Conf->AffixData[curaffix] = VoidString;
1350 curaffix++;
1351 }
1352 /* Other lines are aliases */
1353 else
1354 {
1355 if (curaffix < naffix)
1356 {
1357 Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
1358 curaffix++;
1359 }
1360 else
1361 ereport(ERROR,
1362 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1363 errmsg("number of aliases exceeds specified number %d",
1364 naffix - 1)));
1365 }
1366 goto nextline;
1367 }
1368 /* Else try to parse prefixes and suffixes */
1369 if (fields_read < 4 ||
1370 (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
1371 goto nextline;
1372
1373 sflaglen = strlen(sflag);
1374 if (sflaglen == 0
1375 || (sflaglen > 1 && Conf->flagMode == FM_CHAR)
1376 || (sflaglen > 2 && Conf->flagMode == FM_LONG))
1377 goto nextline;
1378
1379 /*--------
1380 * Affix header. For example:
1381 * SFX \ N 1
1382 *--------
1383 */
1384 if (fields_read == 4)
1385 {
1386 isSuffix = (STRNCMP(ptype, "sfx") == 0);
1387 if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
1388 flagflags = FF_CROSSPRODUCT;
1389 else
1390 flagflags = 0;
1391 }
1392 /*--------
1393 * Affix fields. For example:
1394 * SFX \ 0 Y/L [^Y]
1395 *--------
1396 */
1397 else
1398 {
1399 char *ptr;
1400 int aflg = 0;
1401
1402 /* Get flags after '/' (flags are case sensitive) */
1403 if ((ptr = strchr(repl, '/')) != NULL)
1404 aflg |= getCompoundAffixFlagValue(Conf,
1405 getAffixFlagSet(Conf,
1406 ptr + 1));
1407 /* Get lowercased version of string before '/' */
1408 prepl = lowerstr_ctx(Conf, repl);
1409 if ((ptr = strchr(prepl, '/')) != NULL)
1410 *ptr = '\0';
1411 pfind = lowerstr_ctx(Conf, find);
1412 pmask = lowerstr_ctx(Conf, mask);
1413 if (t_iseq(find, '0'))
1414 *pfind = '\0';
1415 if (t_iseq(repl, '0'))
1416 *prepl = '\0';
1417
1418 NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
1419 isSuffix ? FF_SUFFIX : FF_PREFIX);
1420 pfree(prepl);
1421 pfree(pfind);
1422 pfree(pmask);
1423 }
1424
1425 nextline:
1426 pfree(recoded);
1427 }
1428
1429 tsearch_readline_end(&trst);
1430 if (ptype)
1431 pfree(ptype);
1432 }
1433
1434 /*
1435 * import affixes
1436 *
1437 * Note caller must already have applied get_tsearch_config_filename
1438 *
1439 * This function is responsible for parsing ispell ("old format") affix files.
1440 * If we realize that the file contains new-format commands, we pass off the
1441 * work to NIImportOOAffixes(), which will re-read the whole file.
1442 */
1443 void
NIImportAffixes(IspellDict * Conf,const char * filename)1444 NIImportAffixes(IspellDict *Conf, const char *filename)
1445 {
1446 char *pstr = NULL;
1447 char flag[BUFSIZ];
1448 char mask[BUFSIZ];
1449 char find[BUFSIZ];
1450 char repl[BUFSIZ];
1451 char *s;
1452 bool suffixes = false;
1453 bool prefixes = false;
1454 char flagflags = 0;
1455 tsearch_readline_state trst;
1456 bool oldformat = false;
1457 char *recoded = NULL;
1458
1459 if (!tsearch_readline_begin(&trst, filename))
1460 ereport(ERROR,
1461 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1462 errmsg("could not open affix file \"%s\": %m",
1463 filename)));
1464
1465 Conf->usecompound = false;
1466 Conf->useFlagAliases = false;
1467 Conf->flagMode = FM_CHAR;
1468
1469 while ((recoded = tsearch_readline(&trst)) != NULL)
1470 {
1471 pstr = lowerstr(recoded);
1472
1473 /* Skip comments and empty lines */
1474 if (*pstr == '#' || *pstr == '\n')
1475 goto nextline;
1476
1477 if (STRNCMP(pstr, "compoundwords") == 0)
1478 {
1479 /* Find case-insensitive L flag in non-lowercased string */
1480 s = findchar2(recoded, 'l', 'L');
1481 if (s)
1482 {
1483 while (*s && !t_isspace(s))
1484 s += pg_mblen(s);
1485 while (*s && t_isspace(s))
1486 s += pg_mblen(s);
1487
1488 if (*s && pg_mblen(s) == 1)
1489 {
1490 addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
1491 Conf->usecompound = true;
1492 }
1493 oldformat = true;
1494 goto nextline;
1495 }
1496 }
1497 if (STRNCMP(pstr, "suffixes") == 0)
1498 {
1499 suffixes = true;
1500 prefixes = false;
1501 oldformat = true;
1502 goto nextline;
1503 }
1504 if (STRNCMP(pstr, "prefixes") == 0)
1505 {
1506 suffixes = false;
1507 prefixes = true;
1508 oldformat = true;
1509 goto nextline;
1510 }
1511 if (STRNCMP(pstr, "flag") == 0)
1512 {
1513 s = recoded + 4; /* we need non-lowercased string */
1514 flagflags = 0;
1515
1516 while (*s && t_isspace(s))
1517 s += pg_mblen(s);
1518
1519 if (*s == '*')
1520 {
1521 flagflags |= FF_CROSSPRODUCT;
1522 s++;
1523 }
1524 else if (*s == '~')
1525 {
1526 flagflags |= FF_COMPOUNDONLY;
1527 s++;
1528 }
1529
1530 if (*s == '\\')
1531 s++;
1532
1533 /*
1534 * An old-format flag is a single ASCII character; we expect it to
1535 * be followed by EOL, whitespace, or ':'. Otherwise this is a
1536 * new-format flag command.
1537 */
1538 if (*s && pg_mblen(s) == 1)
1539 {
1540 COPYCHAR(flag, s);
1541 flag[1] = '\0';
1542
1543 s++;
1544 if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
1545 t_isspace(s))
1546 {
1547 oldformat = true;
1548 goto nextline;
1549 }
1550 }
1551 goto isnewformat;
1552 }
1553 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
1554 STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
1555 STRNCMP(recoded, "PFX") == 0 ||
1556 STRNCMP(recoded, "SFX") == 0)
1557 goto isnewformat;
1558
1559 if ((!suffixes) && (!prefixes))
1560 goto nextline;
1561
1562 if (!parse_affentry(pstr, mask, find, repl))
1563 goto nextline;
1564
1565 NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
1566
1567 nextline:
1568 pfree(recoded);
1569 pfree(pstr);
1570 }
1571 tsearch_readline_end(&trst);
1572 return;
1573
1574 isnewformat:
1575 if (oldformat)
1576 ereport(ERROR,
1577 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1578 errmsg("affix file contains both old-style and new-style commands")));
1579 tsearch_readline_end(&trst);
1580
1581 NIImportOOAffixes(Conf, filename);
1582 }
1583
1584 /*
1585 * Merges two affix flag sets and stores a new affix flag set into
1586 * Conf->AffixData.
1587 *
1588 * Returns index of a new affix flag set.
1589 */
1590 static int
MergeAffix(IspellDict * Conf,int a1,int a2)1591 MergeAffix(IspellDict *Conf, int a1, int a2)
1592 {
1593 char **ptr;
1594
1595 Assert(a1 < Conf->nAffixData && a2 < Conf->nAffixData);
1596
1597 /* Do not merge affix flags if one of affix flags is empty */
1598 if (*Conf->AffixData[a1] == '\0')
1599 return a2;
1600 else if (*Conf->AffixData[a2] == '\0')
1601 return a1;
1602
1603 while (Conf->nAffixData + 1 >= Conf->lenAffixData)
1604 {
1605 Conf->lenAffixData *= 2;
1606 Conf->AffixData = (char **) repalloc(Conf->AffixData,
1607 sizeof(char *) * Conf->lenAffixData);
1608 }
1609
1610 ptr = Conf->AffixData + Conf->nAffixData;
1611 if (Conf->flagMode == FM_NUM)
1612 {
1613 *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1614 strlen(Conf->AffixData[a2]) +
1615 1 /* comma */ + 1 /* \0 */ );
1616 sprintf(*ptr, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1617 }
1618 else
1619 {
1620 *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1621 strlen(Conf->AffixData[a2]) +
1622 1 /* \0 */ );
1623 sprintf(*ptr, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1624 }
1625 ptr++;
1626 *ptr = NULL;
1627 Conf->nAffixData++;
1628
1629 return Conf->nAffixData - 1;
1630 }
1631
1632 /*
1633 * Returns a set of affix parameters which correspondence to the set of affix
1634 * flags with the given index.
1635 */
1636 static uint32
makeCompoundFlags(IspellDict * Conf,int affix)1637 makeCompoundFlags(IspellDict *Conf, int affix)
1638 {
1639 Assert(affix < Conf->nAffixData);
1640
1641 return (getCompoundAffixFlagValue(Conf, Conf->AffixData[affix]) &
1642 FF_COMPOUNDFLAGMASK);
1643 }
1644
1645 /*
1646 * Makes a prefix tree for the given level.
1647 *
1648 * Conf: current dictionary.
1649 * low: lower index of the Conf->Spell array.
1650 * high: upper index of the Conf->Spell array.
1651 * level: current prefix tree level.
1652 */
1653 static SPNode *
mkSPNode(IspellDict * Conf,int low,int high,int level)1654 mkSPNode(IspellDict *Conf, int low, int high, int level)
1655 {
1656 int i;
1657 int nchar = 0;
1658 char lastchar = '\0';
1659 SPNode *rs;
1660 SPNodeData *data;
1661 int lownew = low;
1662
1663 for (i = low; i < high; i++)
1664 if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
1665 {
1666 nchar++;
1667 lastchar = Conf->Spell[i]->word[level];
1668 }
1669
1670 if (!nchar)
1671 return NULL;
1672
1673 rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
1674 rs->length = nchar;
1675 data = rs->data;
1676
1677 lastchar = '\0';
1678 for (i = low; i < high; i++)
1679 if (Conf->Spell[i]->p.d.len > level)
1680 {
1681 if (lastchar != Conf->Spell[i]->word[level])
1682 {
1683 if (lastchar)
1684 {
1685 /* Next level of the prefix tree */
1686 data->node = mkSPNode(Conf, lownew, i, level + 1);
1687 lownew = i;
1688 data++;
1689 }
1690 lastchar = Conf->Spell[i]->word[level];
1691 }
1692 data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
1693 if (Conf->Spell[i]->p.d.len == level + 1)
1694 {
1695 bool clearCompoundOnly = false;
1696
1697 if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
1698 {
1699 /*
1700 * MergeAffix called a few times. If one of word is
1701 * allowed to be in compound word and another isn't, then
1702 * clear FF_COMPOUNDONLY flag.
1703 */
1704
1705 clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
1706 & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
1707 ? false : true;
1708 data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
1709 }
1710 else
1711 data->affix = Conf->Spell[i]->p.d.affix;
1712 data->isword = 1;
1713
1714 data->compoundflag = makeCompoundFlags(Conf, data->affix);
1715
1716 if ((data->compoundflag & FF_COMPOUNDONLY) &&
1717 (data->compoundflag & FF_COMPOUNDFLAG) == 0)
1718 data->compoundflag |= FF_COMPOUNDFLAG;
1719
1720 if (clearCompoundOnly)
1721 data->compoundflag &= ~FF_COMPOUNDONLY;
1722 }
1723 }
1724
1725 /* Next level of the prefix tree */
1726 data->node = mkSPNode(Conf, lownew, high, level + 1);
1727
1728 return rs;
1729 }
1730
1731 /*
1732 * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
1733 * and affixes.
1734 */
1735 void
NISortDictionary(IspellDict * Conf)1736 NISortDictionary(IspellDict *Conf)
1737 {
1738 int i;
1739 int naffix = 0;
1740 int curaffix;
1741
1742 /* compress affixes */
1743
1744 /*
1745 * If we use flag aliases then we need to use Conf->AffixData filled in
1746 * the NIImportOOAffixes().
1747 */
1748 if (Conf->useFlagAliases)
1749 {
1750 for (i = 0; i < Conf->nspell; i++)
1751 {
1752 char *end;
1753
1754 if (*Conf->Spell[i]->p.flag != '\0')
1755 {
1756 curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
1757 if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
1758 ereport(ERROR,
1759 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1760 errmsg("invalid affix alias \"%s\"",
1761 Conf->Spell[i]->p.flag)));
1762 if (curaffix < 0 || curaffix >= Conf->nAffixData)
1763 ereport(ERROR,
1764 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1765 errmsg("invalid affix alias \"%s\"",
1766 Conf->Spell[i]->p.flag)));
1767 if (*end != '\0' && !t_isdigit(end) && !t_isspace(end))
1768 ereport(ERROR,
1769 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1770 errmsg("invalid affix alias \"%s\"",
1771 Conf->Spell[i]->p.flag)));
1772 }
1773 else
1774 {
1775 /*
1776 * If Conf->Spell[i]->p.flag is empty, then get empty value of
1777 * Conf->AffixData (0 index).
1778 */
1779 curaffix = 0;
1780 }
1781
1782 Conf->Spell[i]->p.d.affix = curaffix;
1783 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1784 }
1785 }
1786 /* Otherwise fill Conf->AffixData here */
1787 else
1788 {
1789 /* Count the number of different flags used in the dictionary */
1790 qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
1791 cmpspellaffix);
1792
1793 naffix = 0;
1794 for (i = 0; i < Conf->nspell; i++)
1795 {
1796 if (i == 0 ||
1797 strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag) != 0)
1798 naffix++;
1799 }
1800
1801 /*
1802 * Fill in Conf->AffixData with the affixes that were used in the
1803 * dictionary. Replace textual flag-field of Conf->Spell entries with
1804 * indexes into Conf->AffixData array.
1805 */
1806 Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1807
1808 curaffix = -1;
1809 for (i = 0; i < Conf->nspell; i++)
1810 {
1811 if (i == 0 ||
1812 strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]) != 0)
1813 {
1814 curaffix++;
1815 Assert(curaffix < naffix);
1816 Conf->AffixData[curaffix] = cpstrdup(Conf,
1817 Conf->Spell[i]->p.flag);
1818 }
1819
1820 Conf->Spell[i]->p.d.affix = curaffix;
1821 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1822 }
1823
1824 Conf->lenAffixData = Conf->nAffixData = naffix;
1825 }
1826
1827 /* Start build a prefix tree */
1828 qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
1829 Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
1830 }
1831
1832 /*
1833 * Makes a prefix tree for the given level using the repl string of an affix
1834 * rule. Affixes with empty replace string do not include in the prefix tree.
1835 * This affixes are included by mkVoidAffix().
1836 *
1837 * Conf: current dictionary.
1838 * low: lower index of the Conf->Affix array.
1839 * high: upper index of the Conf->Affix array.
1840 * level: current prefix tree level.
1841 * type: FF_SUFFIX or FF_PREFIX.
1842 */
1843 static AffixNode *
mkANode(IspellDict * Conf,int low,int high,int level,int type)1844 mkANode(IspellDict *Conf, int low, int high, int level, int type)
1845 {
1846 int i;
1847 int nchar = 0;
1848 uint8 lastchar = '\0';
1849 AffixNode *rs;
1850 AffixNodeData *data;
1851 int lownew = low;
1852 int naff;
1853 AFFIX **aff;
1854
1855 for (i = low; i < high; i++)
1856 if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
1857 {
1858 nchar++;
1859 lastchar = GETCHAR(Conf->Affix + i, level, type);
1860 }
1861
1862 if (!nchar)
1863 return NULL;
1864
1865 aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
1866 naff = 0;
1867
1868 rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
1869 rs->length = nchar;
1870 data = rs->data;
1871
1872 lastchar = '\0';
1873 for (i = low; i < high; i++)
1874 if (Conf->Affix[i].replen > level)
1875 {
1876 if (lastchar != GETCHAR(Conf->Affix + i, level, type))
1877 {
1878 if (lastchar)
1879 {
1880 /* Next level of the prefix tree */
1881 data->node = mkANode(Conf, lownew, i, level + 1, type);
1882 if (naff)
1883 {
1884 data->naff = naff;
1885 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1886 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1887 naff = 0;
1888 }
1889 data++;
1890 lownew = i;
1891 }
1892 lastchar = GETCHAR(Conf->Affix + i, level, type);
1893 }
1894 data->val = GETCHAR(Conf->Affix + i, level, type);
1895 if (Conf->Affix[i].replen == level + 1)
1896 { /* affix stopped */
1897 aff[naff++] = Conf->Affix + i;
1898 }
1899 }
1900
1901 /* Next level of the prefix tree */
1902 data->node = mkANode(Conf, lownew, high, level + 1, type);
1903 if (naff)
1904 {
1905 data->naff = naff;
1906 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1907 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1908 naff = 0;
1909 }
1910
1911 pfree(aff);
1912
1913 return rs;
1914 }
1915
1916 /*
1917 * Makes the root void node in the prefix tree. The root void node is created
1918 * for affixes which have empty replace string ("repl" field).
1919 */
1920 static void
mkVoidAffix(IspellDict * Conf,bool issuffix,int startsuffix)1921 mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
1922 {
1923 int i,
1924 cnt = 0;
1925 int start = (issuffix) ? startsuffix : 0;
1926 int end = (issuffix) ? Conf->naffixes : startsuffix;
1927 AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
1928
1929 Affix->length = 1;
1930 Affix->isvoid = 1;
1931
1932 if (issuffix)
1933 {
1934 Affix->data->node = Conf->Suffix;
1935 Conf->Suffix = Affix;
1936 }
1937 else
1938 {
1939 Affix->data->node = Conf->Prefix;
1940 Conf->Prefix = Affix;
1941 }
1942
1943 /* Count affixes with empty replace string */
1944 for (i = start; i < end; i++)
1945 if (Conf->Affix[i].replen == 0)
1946 cnt++;
1947
1948 /* There is not affixes with empty replace string */
1949 if (cnt == 0)
1950 return;
1951
1952 Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
1953 Affix->data->naff = (uint32) cnt;
1954
1955 cnt = 0;
1956 for (i = start; i < end; i++)
1957 if (Conf->Affix[i].replen == 0)
1958 {
1959 Affix->data->aff[cnt] = Conf->Affix + i;
1960 cnt++;
1961 }
1962 }
1963
1964 /*
1965 * Checks if the affixflag is used by dictionary. Conf->AffixData does not
1966 * contain affixflag if this flag is not used actually by the .dict file.
1967 *
1968 * Conf: current dictionary.
1969 * affixflag: affix flag.
1970 *
1971 * Returns true if the Conf->AffixData array contains affixflag, otherwise
1972 * returns false.
1973 */
1974 static bool
isAffixInUse(IspellDict * Conf,char * affixflag)1975 isAffixInUse(IspellDict *Conf, char *affixflag)
1976 {
1977 int i;
1978
1979 for (i = 0; i < Conf->nAffixData; i++)
1980 if (IsAffixFlagInUse(Conf, i, affixflag))
1981 return true;
1982
1983 return false;
1984 }
1985
1986 /*
1987 * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
1988 */
1989 void
NISortAffixes(IspellDict * Conf)1990 NISortAffixes(IspellDict *Conf)
1991 {
1992 AFFIX *Affix;
1993 size_t i;
1994 CMPDAffix *ptr;
1995 int firstsuffix = Conf->naffixes;
1996
1997 if (Conf->naffixes == 0)
1998 return;
1999
2000 /* Store compound affixes in the Conf->CompoundAffix array */
2001 if (Conf->naffixes > 1)
2002 qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
2003 Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
2004 ptr->affix = NULL;
2005
2006 for (i = 0; i < Conf->naffixes; i++)
2007 {
2008 Affix = &(((AFFIX *) Conf->Affix)[i]);
2009 if (Affix->type == FF_SUFFIX && i < firstsuffix)
2010 firstsuffix = i;
2011
2012 if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
2013 isAffixInUse(Conf, Affix->flag))
2014 {
2015 bool issuffix = (Affix->type == FF_SUFFIX);
2016
2017 if (ptr == Conf->CompoundAffix ||
2018 issuffix != (ptr - 1)->issuffix ||
2019 strbncmp((const unsigned char *) (ptr - 1)->affix,
2020 (const unsigned char *) Affix->repl,
2021 (ptr - 1)->len))
2022 {
2023 /* leave only unique and minimals suffixes */
2024 ptr->affix = Affix->repl;
2025 ptr->len = Affix->replen;
2026 ptr->issuffix = issuffix;
2027 ptr++;
2028 }
2029 }
2030 }
2031 ptr->affix = NULL;
2032 Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
2033
2034 /* Start build a prefix tree */
2035 Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
2036 Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
2037 mkVoidAffix(Conf, true, firstsuffix);
2038 mkVoidAffix(Conf, false, firstsuffix);
2039 }
2040
2041 static AffixNodeData *
FindAffixes(AffixNode * node,const char * word,int wrdlen,int * level,int type)2042 FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
2043 {
2044 AffixNodeData *StopLow,
2045 *StopHigh,
2046 *StopMiddle;
2047 uint8 symbol;
2048
2049 if (node->isvoid)
2050 { /* search void affixes */
2051 if (node->data->naff)
2052 return node->data;
2053 node = node->data->node;
2054 }
2055
2056 while (node && *level < wrdlen)
2057 {
2058 StopLow = node->data;
2059 StopHigh = node->data + node->length;
2060 while (StopLow < StopHigh)
2061 {
2062 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2063 symbol = GETWCHAR(word, wrdlen, *level, type);
2064
2065 if (StopMiddle->val == symbol)
2066 {
2067 (*level)++;
2068 if (StopMiddle->naff)
2069 return StopMiddle;
2070 node = StopMiddle->node;
2071 break;
2072 }
2073 else if (StopMiddle->val < symbol)
2074 StopLow = StopMiddle + 1;
2075 else
2076 StopHigh = StopMiddle;
2077 }
2078 if (StopLow >= StopHigh)
2079 break;
2080 }
2081 return NULL;
2082 }
2083
2084 static char *
CheckAffix(const char * word,size_t len,AFFIX * Affix,int flagflags,char * newword,int * baselen)2085 CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
2086 {
2087 /*
2088 * Check compound allow flags
2089 */
2090
2091 if (flagflags == 0)
2092 {
2093 if (Affix->flagflags & FF_COMPOUNDONLY)
2094 return NULL;
2095 }
2096 else if (flagflags & FF_COMPOUNDBEGIN)
2097 {
2098 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2099 return NULL;
2100 if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
2101 if (Affix->type == FF_SUFFIX)
2102 return NULL;
2103 }
2104 else if (flagflags & FF_COMPOUNDMIDDLE)
2105 {
2106 if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
2107 (Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
2108 return NULL;
2109 }
2110 else if (flagflags & FF_COMPOUNDLAST)
2111 {
2112 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2113 return NULL;
2114 if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
2115 if (Affix->type == FF_PREFIX)
2116 return NULL;
2117 }
2118
2119 /*
2120 * make replace pattern of affix
2121 */
2122 if (Affix->type == FF_SUFFIX)
2123 {
2124 strcpy(newword, word);
2125 strcpy(newword + len - Affix->replen, Affix->find);
2126 if (baselen) /* store length of non-changed part of word */
2127 *baselen = len - Affix->replen;
2128 }
2129 else
2130 {
2131 /*
2132 * if prefix is an all non-changed part's length then all word
2133 * contains only prefix and suffix, so out
2134 */
2135 if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
2136 return NULL;
2137 strcpy(newword, Affix->find);
2138 strcat(newword, word + Affix->replen);
2139 }
2140
2141 /*
2142 * check resulting word
2143 */
2144 if (Affix->issimple)
2145 return newword;
2146 else if (Affix->isregis)
2147 {
2148 if (RS_execute(&(Affix->reg.regis), newword))
2149 return newword;
2150 }
2151 else
2152 {
2153 pg_wchar *data;
2154 size_t data_len;
2155 int newword_len;
2156
2157 /* Convert data string to wide characters */
2158 newword_len = strlen(newword);
2159 data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
2160 data_len = pg_mb2wchar_with_len(newword, data, newword_len);
2161
2162 if (pg_regexec(&(Affix->reg.pregex->regex), data, data_len,
2163 0, NULL, 0, NULL, 0) == REG_OKAY)
2164 {
2165 pfree(data);
2166 return newword;
2167 }
2168 pfree(data);
2169 }
2170
2171 return NULL;
2172 }
2173
2174 static int
addToResult(char ** forms,char ** cur,char * word)2175 addToResult(char **forms, char **cur, char *word)
2176 {
2177 if (cur - forms >= MAX_NORM - 1)
2178 return 0;
2179 if (forms == cur || strcmp(word, *(cur - 1)) != 0)
2180 {
2181 *cur = pstrdup(word);
2182 *(cur + 1) = NULL;
2183 return 1;
2184 }
2185
2186 return 0;
2187 }
2188
2189 static char **
NormalizeSubWord(IspellDict * Conf,char * word,int flag)2190 NormalizeSubWord(IspellDict *Conf, char *word, int flag)
2191 {
2192 AffixNodeData *suffix = NULL,
2193 *prefix = NULL;
2194 int slevel = 0,
2195 plevel = 0;
2196 int wrdlen = strlen(word),
2197 swrdlen;
2198 char **forms;
2199 char **cur;
2200 char newword[2 * MAXNORMLEN] = "";
2201 char pnewword[2 * MAXNORMLEN] = "";
2202 AffixNode *snode = Conf->Suffix,
2203 *pnode;
2204 int i,
2205 j;
2206
2207 if (wrdlen > MAXNORMLEN)
2208 return NULL;
2209 cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
2210 *cur = NULL;
2211
2212
2213 /* Check that the word itself is normal form */
2214 if (FindWord(Conf, word, VoidString, flag))
2215 {
2216 *cur = pstrdup(word);
2217 cur++;
2218 *cur = NULL;
2219 }
2220
2221 /* Find all other NORMAL forms of the 'word' (check only prefix) */
2222 pnode = Conf->Prefix;
2223 plevel = 0;
2224 while (pnode)
2225 {
2226 prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
2227 if (!prefix)
2228 break;
2229 for (j = 0; j < prefix->naff; j++)
2230 {
2231 if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
2232 {
2233 /* prefix success */
2234 if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
2235 cur += addToResult(forms, cur, newword);
2236 }
2237 }
2238 pnode = prefix->node;
2239 }
2240
2241 /*
2242 * Find all other NORMAL forms of the 'word' (check suffix and then
2243 * prefix)
2244 */
2245 while (snode)
2246 {
2247 int baselen = 0;
2248
2249 /* find possible suffix */
2250 suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
2251 if (!suffix)
2252 break;
2253 /* foreach suffix check affix */
2254 for (i = 0; i < suffix->naff; i++)
2255 {
2256 if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
2257 {
2258 /* suffix success */
2259 if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
2260 cur += addToResult(forms, cur, newword);
2261
2262 /* now we will look changed word with prefixes */
2263 pnode = Conf->Prefix;
2264 plevel = 0;
2265 swrdlen = strlen(newword);
2266 while (pnode)
2267 {
2268 prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
2269 if (!prefix)
2270 break;
2271 for (j = 0; j < prefix->naff; j++)
2272 {
2273 if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
2274 {
2275 /* prefix success */
2276 char *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
2277 VoidString : prefix->aff[j]->flag;
2278
2279 if (FindWord(Conf, pnewword, ff, flag))
2280 cur += addToResult(forms, cur, pnewword);
2281 }
2282 }
2283 pnode = prefix->node;
2284 }
2285 }
2286 }
2287
2288 snode = suffix->node;
2289 }
2290
2291 if (cur == forms)
2292 {
2293 pfree(forms);
2294 return NULL;
2295 }
2296 return forms;
2297 }
2298
/*
 * A growable list of stems (see CopyVar() and AddStem()), with a link field
 * that lets several such lists be chained together.
 */
typedef struct SplitVar
{
	int			nstem;			/* number of entries used in stem[] */
	int			lenstem;		/* allocated number of entries of stem[] */
	char	  **stem;			/* array of stem strings */
	struct SplitVar *next;		/* next list in the chain, or NULL */
} SplitVar;
2306
2307 static int
CheckCompoundAffixes(CMPDAffix ** ptr,char * word,int len,bool CheckInPlace)2308 CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
2309 {
2310 bool issuffix;
2311
2312 /* in case CompoundAffix is null: */
2313 if (*ptr == NULL)
2314 return -1;
2315
2316 if (CheckInPlace)
2317 {
2318 while ((*ptr)->affix)
2319 {
2320 if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
2321 {
2322 len = (*ptr)->len;
2323 issuffix = (*ptr)->issuffix;
2324 (*ptr)++;
2325 return (issuffix) ? len : 0;
2326 }
2327 (*ptr)++;
2328 }
2329 }
2330 else
2331 {
2332 char *affbegin;
2333
2334 while ((*ptr)->affix)
2335 {
2336 if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
2337 {
2338 len = (*ptr)->len + (affbegin - word);
2339 issuffix = (*ptr)->issuffix;
2340 (*ptr)++;
2341 return (issuffix) ? len : 0;
2342 }
2343 (*ptr)++;
2344 }
2345 }
2346 return -1;
2347 }
2348
2349 static SplitVar *
CopyVar(SplitVar * s,int makedup)2350 CopyVar(SplitVar *s, int makedup)
2351 {
2352 SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar));
2353
2354 v->next = NULL;
2355 if (s)
2356 {
2357 int i;
2358
2359 v->lenstem = s->lenstem;
2360 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2361 v->nstem = s->nstem;
2362 for (i = 0; i < s->nstem; i++)
2363 v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
2364 }
2365 else
2366 {
2367 v->lenstem = 16;
2368 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2369 v->nstem = 0;
2370 }
2371 return v;
2372 }
2373
2374 static void
AddStem(SplitVar * v,char * word)2375 AddStem(SplitVar *v, char *word)
2376 {
2377 if (v->nstem >= v->lenstem)
2378 {
2379 v->lenstem *= 2;
2380 v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
2381 }
2382
2383 v->stem[v->nstem] = word;
2384 v->nstem++;
2385 }
2386
/*
 * Split a (possibly compound) word into candidate lists of stems.
 *
 * Conf			dictionary; its Dictionary trie and CompoundAffix list are used
 * snode		trie node to resume the walk in, or NULL to start from the
 *				root (Conf->Dictionary); when non-NULL the walk resumes at
 *				offset minpos rather than at startpos
 * orig			variant whose stems are inherited; it is copied with its stem
 *				strings duplicated (CopyVar(orig, 1)), NULL gives an empty one
 * word			the word being split
 * wordlen		length of word in bytes
 * startpos		offset in word where the stem currently being matched begins
 * minpos		a dictionary word found in the trie is accepted only if its
 *				last byte lies beyond this offset
 *
 * The result is the head of a list of SplitVar linked through "next".  The
 * head is the copy of orig extended by this invocation; variants produced by
 * recursive calls are appended behind it.  The text from the final startpos
 * to the end of word is always added as the last stem of the head.
 *
 * notprobed[] holds one flag per byte of word, initially 1.  A flag is
 * cleared once a sub-word ending at that offset has been recognized through
 * a compound affix, and a cleared flag suppresses further attempts to end a
 * word at that offset within this invocation.
 */
static SplitVar *
SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
{
	SplitVar   *var = NULL;
	SPNodeData *StopLow,
			   *StopHigh,
			   *StopMiddle = NULL;
	SPNode	   *node = (snode) ? snode : Conf->Dictionary;
	int			level = (snode) ? minpos : startpos;	/* a recursive call
														 * passing snode
														 * resumes at level ==
														 * minpos */
	int			lenaff;
	CMPDAffix  *caff;
	char	   *notprobed;
	int			compoundflag = 0;

	notprobed = (char *) palloc(wordlen);
	memset(notprobed, 1, wordlen);
	var = CopyVar(orig, 1);

	while (level < wordlen)
	{
		/*
		 * Look for a sub-word that ends with an epenthetic and/or compound
		 * affix.  caff is a cursor over the compound affix list; each call of
		 * CheckCompoundAffixes() advances it, so both a "continue" and a
		 * completed iteration proceed with the next list entry.  The match
		 * is anchored at word + level while we are still inside the trie
		 * (node != NULL), and searched anywhere in the rest otherwise.
		 */
		caff = Conf->CompoundAffix;
		while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
		{
			/*
			 * One of the compound affixes matched, so check whether the
			 * resulting sub-word exists in the dictionary.
			 */
			char		buf[MAXNORMLEN];
			char	  **subres;

			/* length of the candidate sub-word, counted from startpos */
			lenaff = level - startpos + lenaff;

			/* a sub-word ending at this offset was already accepted */
			if (!notprobed[startpos + lenaff - 1])
				continue;

			if (level + lenaff - 1 <= minpos)
				continue;

			if (lenaff >= MAXNORMLEN)
				continue;		/* skip sub-words that do not fit into buf */
			if (lenaff > 0)
				memcpy(buf, word + startpos, lenaff);
			buf[lenaff] = '\0';

			/*
			 * NOTE(review): this loop is entered only when level > startpos,
			 * so level == 0 would require a negative startpos, which no
			 * caller visible here passes.  The trie branch below tests
			 * startpos == 0 instead; confirm which one is intended.
			 */
			if (level == 0)
				compoundflag = FF_COMPOUNDBEGIN;
			else if (level == wordlen - 1)
				compoundflag = FF_COMPOUNDLAST;
			else
				compoundflag = FF_COMPOUNDMIDDLE;
			subres = NormalizeSubWord(Conf, buf, compoundflag);
			if (subres)
			{
				/* Yes, it is a word from the dictionary */
				SplitVar   *new = CopyVar(var, 0);
				SplitVar   *ptr = var;
				char	  **sptr = subres;

				notprobed[startpos + lenaff - 1] = 0;

				/* "new" shares the stems of var and adds the found forms */
				while (*sptr)
				{
					AddStem(new, *sptr);
					sptr++;
				}
				pfree(subres);

				/* append the variants for the rest of the word to the list */
				while (ptr->next)
					ptr = ptr->next;
				ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);

				/*
				 * The recursive call duplicated the stems, so only the
				 * temporary container is released; the stem strings that are
				 * shared with var must stay.
				 */
				pfree(new->stem);
				pfree(new);
			}
		}

		/* we fell out of the trie: no dictionary word continues here */
		if (!node)
			break;

		/* binary search for the byte word[level] among the node's entries */
		StopLow = node->data;
		StopHigh = node->data + node->length;
		while (StopLow < StopHigh)
		{
			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
			if (StopMiddle->val == ((uint8 *) (word))[level])
				break;
			else if (StopMiddle->val < ((uint8 *) (word))[level])
				StopLow = StopMiddle + 1;
			else
				StopHigh = StopMiddle;
		}

		/* StopLow < StopHigh means the search stopped on a matching entry */
		if (StopLow < StopHigh)
		{
			if (startpos == 0)
				compoundflag = FF_COMPOUNDBEGIN;
			else if (level == wordlen - 1)
				compoundflag = FF_COMPOUNDLAST;
			else
				compoundflag = FF_COMPOUNDMIDDLE;

			/* does a dictionary word usable at this position end here? */
			if (StopMiddle->isword &&
				(StopMiddle->compoundflag & compoundflag) &&
				notprobed[level])
			{
				/* ok, we found a complete word that is allowed in compounds */
				if (level > minpos)
				{
					/* and it extends beyond the minimal position */
					if (wordlen == level + 1)
					{
						/* it is the last part of the whole word */
						AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
						pfree(notprobed);
						return var;
					}
					else
					{
						/*
						 * First let a recursive call look for a longer word
						 * starting at the same position: it resumes in the
						 * same node with minpos == level, so the word found
						 * here is not accepted again.
						 */
						SplitVar   *ptr = var;

						while (ptr->next)
							ptr = ptr->next;
						ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);

						/*
						 * Then accept the word found here and go on matching
						 * the next word from the root of the trie.
						 */
						level++;
						AddStem(var, pnstrdup(word + startpos, level - startpos));
						node = Conf->Dictionary;
						startpos = level;
						continue;
					}
				}
			}
			/* descend to the child node for the next byte */
			node = StopMiddle->node;
		}
		else
			node = NULL;
		level++;
	}

	/* whatever remains after the last accepted word becomes the final stem */
	AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
	pfree(notprobed);
	return var;
}
2533
2534 static void
addNorm(TSLexeme ** lres,TSLexeme ** lcur,char * word,int flags,uint16 NVariant)2535 addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
2536 {
2537 if (*lres == NULL)
2538 *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
2539
2540 if (*lcur - *lres < MAX_NORM - 1)
2541 {
2542 (*lcur)->lexeme = word;
2543 (*lcur)->flags = flags;
2544 (*lcur)->nvariant = NVariant;
2545 (*lcur)++;
2546 (*lcur)->lexeme = NULL;
2547 }
2548 }
2549
2550 TSLexeme *
NINormalizeWord(IspellDict * Conf,char * word)2551 NINormalizeWord(IspellDict *Conf, char *word)
2552 {
2553 char **res;
2554 TSLexeme *lcur = NULL,
2555 *lres = NULL;
2556 uint16 NVariant = 1;
2557
2558 res = NormalizeSubWord(Conf, word, 0);
2559
2560 if (res)
2561 {
2562 char **ptr = res;
2563
2564 while (*ptr && (lcur - lres) < MAX_NORM)
2565 {
2566 addNorm(&lres, &lcur, *ptr, 0, NVariant++);
2567 ptr++;
2568 }
2569 pfree(res);
2570 }
2571
2572 if (Conf->usecompound)
2573 {
2574 int wordlen = strlen(word);
2575 SplitVar *ptr,
2576 *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
2577 int i;
2578
2579 while (var)
2580 {
2581 if (var->nstem > 1)
2582 {
2583 char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
2584
2585 if (subres)
2586 {
2587 char **subptr = subres;
2588
2589 while (*subptr)
2590 {
2591 for (i = 0; i < var->nstem - 1; i++)
2592 {
2593 addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
2594 }
2595
2596 addNorm(&lres, &lcur, *subptr, 0, NVariant);
2597 subptr++;
2598 NVariant++;
2599 }
2600
2601 pfree(subres);
2602 var->stem[0] = NULL;
2603 pfree(var->stem[var->nstem - 1]);
2604 }
2605 }
2606
2607 for (i = 0; i < var->nstem && var->stem[i]; i++)
2608 pfree(var->stem[i]);
2609 ptr = var->next;
2610 pfree(var->stem);
2611 pfree(var);
2612 var = ptr;
2613 }
2614 }
2615
2616 return lres;
2617 }
2618