1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 
38 /* Un-munch a root word list with affix tags
39  * to recreate the original word list
40  */
41 
42 #include <ctype.h>
43 #include <string.h>
44 #include <string>
45 #include <unistd.h>
46 #include <stdlib.h>
47 #include <stdint.h>
48 #include <stdio.h>
49 #include <stddef.h>
50 #include <sys/types.h>
51 #include <sys/stat.h>
52 #include <fcntl.h>
53 #include <limits>
54 
55 #include "unmunch.h"
56 
main(int argc,char ** argv)57 int main(int argc, char** argv) {
58   int i;
59   int al;
60 
61   FILE* wrdlst;
62   FILE* afflst;
63 
64   char *wf, *af;
65   char ts[MAX_LN_LEN];
66 
67   (void)argc;
68 
69   /* first parse the command line options */
70   /* arg1 - munched wordlist, arg2 - affix file */
71 
72   if (argv[1]) {
73     wf = mystrdup(argv[1]);
74   } else {
75     fprintf(stderr, "correct syntax is:\n");
76     fprintf(stderr, "unmunch dic_file affix_file\n");
77     exit(1);
78   }
79   if (argv[2]) {
80     af = mystrdup(argv[2]);
81   } else {
82     fprintf(stderr, "correct syntax is:\n");
83     fprintf(stderr, "unmunch dic_file affix_file\n");
84     exit(1);
85   }
86 
87   /* open the affix file */
88   afflst = fopen(af, "r");
89   if (!afflst) {
90     fprintf(stderr, "Error - could not open affix description file\n");
91     exit(1);
92   }
93 
94   /* step one is to parse the affix file building up the internal
95      affix data structures */
96 
97   numpfx = 0;
98   numsfx = 0;
99   fullstrip = 0;
100 
101   if (parse_aff_file(afflst)) {
102     fprintf(stderr, "Error - in affix file loading\n");
103     exit(1);
104   }
105 
106   fclose(afflst);
107 
108   fprintf(stderr, "parsed in %d prefixes and %d suffixes\n", numpfx, numsfx);
109 
110   /* affix file is now parsed so create hash table of wordlist on the fly */
111 
112   /* open the wordlist */
113   wrdlst = fopen(wf, "r");
114   if (!wrdlst) {
115     fprintf(stderr, "Error - could not open word list file\n");
116     exit(1);
117   }
118 
119   /* skip over the hash table size */
120   if (!fgets(ts, MAX_LN_LEN - 1, wrdlst)) {
121     fclose(wrdlst);
122     return 2;
123   }
124   mychomp(ts);
125 
126   while (fgets(ts, MAX_LN_LEN - 1, wrdlst)) {
127     mychomp(ts);
128     /* split each line into word and affix char strings */
129     char* ap = strchr(ts, '/');
130     if (ap) {
131       *ap = '\0';
132       ap++;
133       al = strlen(ap);
134     } else {
135       al = 0;
136       ap = NULL;
137     }
138 
139     int wl = strlen(ts);
140 
141     numwords = 0;
142     wlist[numwords].word = mystrdup(ts);
143     wlist[numwords].pallow = 0;
144     numwords++;
145 
146     if (al)
147       expand_rootword(ts, wl, ap);
148 
149     for (i = 0; i < numwords; i++) {
150       fprintf(stdout, "%s\n", wlist[i].word);
151       free(wlist[i].word);
152       wlist[i].word = NULL;
153       wlist[i].pallow = 0;
154     }
155   }
156 
157   fclose(wrdlst);
158   return 0;
159 }
160 
parse_aff_file(FILE * afflst)161 int parse_aff_file(FILE* afflst) {
162   int i, j;
163   int numents = 0;
164   char achar = '\0';
165   short ff = 0;
166   struct affent* ptr = NULL;
167   struct affent* nptr = NULL;
168   char* line = (char*)malloc(MAX_LN_LEN);
169 
170   while (fgets(line, MAX_LN_LEN, afflst)) {
171     mychomp(line);
172     char ft = ' ';
173     fprintf(stderr, "parsing line: %s\n", line);
174     if (strncmp(line, "FULLSTRIP", 9) == 0)
175       fullstrip = 1;
176     if (strncmp(line, "PFX", 3) == 0)
177       ft = 'P';
178     if (strncmp(line, "SFX", 3) == 0)
179       ft = 'S';
180     if (ft != ' ') {
181       char* tp = line;
182       char* piece;
183       ff = 0;
184       i = 0;
185       while ((piece = mystrsep(&tp, ' '))) {
186         if (*piece != '\0') {
187           switch (i) {
188             case 0:
189               break;
190             case 1: {
191               achar = *piece;
192               break;
193             }
194             case 2: {
195               if (*piece == 'Y')
196                 ff = XPRODUCT;
197               break;
198             }
199             case 3: {
200               numents = atoi(piece);
201               if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
202                                       sizeof(struct affent)) < static_cast<size_t>(numents))) {
203                 fprintf(stderr, "Error: too many entries: %d\n", numents);
204                 numents = 0;
205               } else {
206                 ptr = (struct affent*)malloc(numents * sizeof(struct affent));
207                 ptr->achar = achar;
208                 ptr->xpflg = ff;
209                 fprintf(stderr, "parsing %c entries %d\n", achar, numents);
210               }
211               break;
212             }
213             default:
214               break;
215           }
216           i++;
217         }
218         free(piece);
219       }
220       /* now parse all of the sub entries*/
221       nptr = ptr;
222       for (j = 0; j < numents; j++) {
223         if (!fgets(line, MAX_LN_LEN, afflst))
224           return 1;
225         mychomp(line);
226         tp = line;
227         i = 0;
228         while ((piece = mystrsep(&tp, ' '))) {
229           if (*piece != '\0') {
230             switch (i) {
231               case 0: {
232                 if (nptr != ptr) {
233                   nptr->achar = ptr->achar;
234                   nptr->xpflg = ptr->xpflg;
235                 }
236                 break;
237               }
238               case 1:
239                 break;
240               case 2: {
241                 nptr->strip = mystrdup(piece);
242                 nptr->stripl = strlen(nptr->strip);
243                 if (strcmp(nptr->strip, "0") == 0) {
244                   free(nptr->strip);
245                   nptr->strip = mystrdup("");
246                   nptr->stripl = 0;
247                 }
248                 break;
249               }
250               case 3: {
251                 nptr->appnd = mystrdup(piece);
252                 nptr->appndl = strlen(nptr->appnd);
253                 if (strcmp(nptr->appnd, "0") == 0) {
254                   free(nptr->appnd);
255                   nptr->appnd = mystrdup("");
256                   nptr->appndl = 0;
257                 }
258                 if (strchr(nptr->appnd, '/')) {
259                   char* addseparator =
260                       (char*)realloc(nptr->appnd, nptr->appndl + 2);
261                   if (addseparator) {
262                     nptr->appndl++;
263                     addseparator[nptr->appndl - 1] = '|';
264                     addseparator[nptr->appndl] = '\0';
265                     nptr->appnd = addseparator;
266                   }
267                 }
268                 break;
269               }
270               case 4: {
271                 encodeit(nptr, piece);
272               }
273                 fprintf(stderr, "   affix: %s %d, strip: %s %d\n", nptr->appnd,
274                         nptr->appndl, nptr->strip, nptr->stripl);
275                 // no break
276               default:
277                 break;
278             }
279             i++;
280           }
281           free(piece);
282         }
283         nptr++;
284       }
285       if (ptr) {
286         if (ft == 'P') {
287           ptable[numpfx].aep = ptr;
288           ptable[numpfx].num = numents;
289           fprintf(stderr, "ptable %d num is %d flag %c\n", numpfx,
290                   ptable[numpfx].num, ptr->achar);
291           numpfx++;
292         } else if (ft == 'S') {
293           stable[numsfx].aep = ptr;
294           stable[numsfx].num = numents;
295           fprintf(stderr, "stable %d num is %d flag %c\n", numsfx,
296                   stable[numsfx].num, ptr->achar);
297           numsfx++;
298         }
299         ptr = NULL;
300       }
301       nptr = NULL;
302       numents = 0;
303       achar = '\0';
304     }
305   }
306   free(line);
307   return 0;
308 }
309 
encodeit(struct affent * ptr,char * cs)310 void encodeit(struct affent* ptr, char* cs) {
311   int nc;
312   int neg;
313   int grp;
314   int n;
315   int ec;
316   int nm;
317   int i, j, k;
318   unsigned char mbr[MAX_WD_LEN];
319 
320   /* now clear the conditions array */
321   for (i = 0; i < SET_SIZE; i++)
322     ptr->conds[i] = (unsigned char)0;
323 
324   /* now parse the string to create the conds array */
325   nc = strlen(cs);
326   neg = 0; /* complement indicator */
327   grp = 0; /* group indicator */
328   n = 0;   /* number of conditions */
329   ec = 0;  /* end condition indicator */
330   nm = 0;  /* number of member in group */
331   i = 0;
332   if (strcmp(cs, ".") == 0) {
333     ptr->numconds = 0;
334     return;
335   }
336   while (i < nc) {
337     unsigned char c = *((unsigned char*)(cs + i));
338     if (c == '[') {
339       grp = 1;
340       c = 0;
341     }
342     if ((grp == 1) && (c == '^')) {
343       neg = 1;
344       c = 0;
345     }
346     if (c == ']') {
347       ec = 1;
348       c = 0;
349     }
350     if ((grp == 1) && (c != 0)) {
351       *(mbr + nm) = c;
352       nm++;
353       c = 0;
354     }
355     if (c != 0) {
356       ec = 1;
357     }
358     if (ec) {
359       if (grp == 1) {
360         if (neg == 0) {
361           for (j = 0; j < nm; j++) {
362             k = (unsigned int)mbr[j];
363             ptr->conds[k] = ptr->conds[k] | (1 << n);
364           }
365         } else {
366           for (j = 0; j < SET_SIZE; j++)
367             ptr->conds[j] = ptr->conds[j] | (1 << n);
368           for (j = 0; j < nm; j++) {
369             k = (unsigned int)mbr[j];
370             ptr->conds[k] = ptr->conds[k] & ~(1 << n);
371           }
372         }
373         neg = 0;
374         grp = 0;
375         nm = 0;
376       } else {
377         /* not a group so just set the proper bit for this char */
378         /* but first handle special case of . inside condition */
379         if (c == '.') {
380           /* wild card character so set them all */
381           for (j = 0; j < SET_SIZE; j++)
382             ptr->conds[j] = ptr->conds[j] | (1 << n);
383         } else {
384           ptr->conds[(unsigned int)c] = ptr->conds[(unsigned int)c] | (1 << n);
385         }
386       }
387       n++;
388       ec = 0;
389     }
390     i++;
391   }
392   ptr->numconds = n;
393   return;
394 }
395 
396 /* add a prefix to word */
pfx_add(const char * word,int len,struct affent * ep,int num)397 void pfx_add(const char* word, int len, struct affent* ep, int num) {
398   struct affent* aent;
399   int cond;
400   unsigned char* cp;
401   int i;
402 
403   for (aent = ep, i = num; i > 0; aent++, i--) {
404     /* now make sure all conditions match */
405     if ((len + fullstrip > aent->stripl) && (len >= aent->numconds) &&
406         ((aent->stripl == 0) ||
407          (strncmp(aent->strip, word, aent->stripl) == 0))) {
408       cp = (unsigned char*)word;
409       for (cond = 0; cond < aent->numconds; cond++) {
410         if ((aent->conds[*cp++] & (1 << cond)) == 0)
411           break;
412       }
413       if (cond >= aent->numconds) {
414         std::string tword;
415         /* we have a match so add prefix */
416         if (aent->appndl) {
417           tword.append(aent->appnd);
418         }
419         tword.append(word + aent->stripl);
420 
421         if (numwords < MAX_WORDS) {
422           wlist[numwords].word = mystrdup(tword.c_str());
423           wlist[numwords].pallow = 0;
424           numwords++;
425         }
426       }
427     }
428   }
429 }
430 
431 /* add a suffix to a word */
suf_add(const char * word,int len,struct affent * ep,int num)432 void suf_add(const char* word, int len, struct affent* ep, int num) {
433   struct affent* aent;
434   int cond;
435   unsigned char* cp;
436   int i;
437 
438   for (aent = ep, i = num; i > 0; aent++, i--) {
439     /* if conditions hold on root word
440      * then strip off strip string and add suffix
441      */
442 
443     if ((len + fullstrip > aent->stripl) && (len >= aent->numconds) &&
444         ((aent->stripl == 0) ||
445          (strcmp(aent->strip, word + len - aent->stripl) == 0))) {
446       cp = (unsigned char*)(word + len);
447       for (cond = aent->numconds; --cond >= 0;) {
448         if ((aent->conds[*--cp] & (1 << cond)) == 0)
449           break;
450       }
451       if (cond < 0) {
452         /* we have a matching condition */
453         std::string tword(word);
454         tword.resize(len - aent->stripl);
455         tword.append(aent->appnd);
456 
457         if (numwords < MAX_WORDS) {
458           wlist[numwords].word = mystrdup(tword.c_str());
459           wlist[numwords].pallow = (aent->xpflg & XPRODUCT);
460           numwords++;
461         }
462       }
463     }
464   }
465 }
466 
expand_rootword(const char * ts,int wl,const char * ap)467 int expand_rootword(const char* ts, int wl, const char* ap) {
468   int i;
469   int nh = 0;
470 
471   for (i = 0; i < numsfx; i++) {
472     if (strchr(ap, (stable[i].aep)->achar)) {
473       suf_add(ts, wl, stable[i].aep, stable[i].num);
474     }
475   }
476 
477   nh = numwords;
478 
479   if (nh > 1) {
480     for (int j = 1; j < nh; j++) {
481       if (wlist[j].pallow) {
482         for (i = 0; i < numpfx; i++) {
483           if (strchr(ap, (ptable[i].aep)->achar)) {
484             if ((ptable[i].aep)->xpflg & XPRODUCT) {
485               int nwl = strlen(wlist[j].word);
486               pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num);
487             }
488           }
489         }
490       }
491     }
492   }
493 
494   for (i = 0; i < numpfx; i++) {
495     if (strchr(ap, (ptable[i].aep)->achar)) {
496       pfx_add(ts, wl, ptable[i].aep, ptable[i].num);
497     }
498   }
499   return 0;
500 }
501 
/* Split off the next token of a string at a single delimiter character.
 * Acts like strsep() but takes one delimiter char instead of a delimiter
 * string, and returns a freshly allocated copy instead of modifying the
 * input.
 *
 * stringp - in/out cursor; on success it is advanced past the token and, if
 *           a delimiter followed the token, past that delimiter as well
 * delim   - the delimiter character
 *
 * Returns a malloc()ed, NUL-terminated copy of the token (possibly the empty
 * string when two delimiters are adjacent), which the caller must free().
 * Returns NULL when stringp or *stringp is NULL, when no input is left, or
 * when the allocation fails.
 */
char* mystrsep(char** stringp, const char delim) {
  if (!stringp || !*stringp)
    return NULL;

  char* start = *stringp;
  size_t remaining = strlen(start);
  if (remaining == 0)
    return NULL;

  char* hit = (char*)memchr(start, (int)((unsigned char)delim), remaining);
  size_t toklen = hit ? (size_t)(hit - start) : remaining;

  /* with a delimiter present the cursor moves on even if the copy below
     cannot be allocated */
  if (hit)
    *stringp = hit + 1;

  char* token = (char*)malloc(toklen + 1);
  if (!token)
    return NULL;

  memcpy(token, start, toklen);
  token[toklen] = '\0';

  /* last token: leave the cursor on the terminating NUL */
  if (!hit)
    *stringp = start + remaining;

  return token;
}
532 
/* Duplicate a NUL-terminated string.
 *
 * Returns a malloc()ed copy of s that the caller must free(), or NULL when
 * s is NULL or the allocation fails.
 */
char* mystrdup(const char* s) {
  if (!s)
    return NULL;

  /* size_t keeps the byte count exact for any string length */
  size_t bytes = strlen(s) + 1;
  char* copy = (char*)malloc(bytes);
  if (copy)
    memcpy(copy, s, bytes);
  return copy;
}
543 
/* Remove one line ending ("\n" or "\r\n") from the end of s, in place.
 *
 * A '\r' is removed only when it directly precedes a removed '\n'; a string
 * that does not end in '\n' is left untouched.  A NULL pointer is ignored.
 */
void mychomp(char* s) {
  if (!s)
    return;

  size_t k = strlen(s);
  if ((k == 0) || (s[k - 1] != '\n'))
    return;

  s[--k] = '\0';
  if ((k > 0) && (s[k - 1] == '\r'))
    s[k - 1] = '\0';
}
551