1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37
38 /* Un-munch a root word list with affix tags
39 * to recreate the original word list
40 */
41
42 #include <ctype.h>
43 #include <string.h>
44 #include <string>
45 #include <unistd.h>
46 #include <stdlib.h>
47 #include <stdint.h>
48 #include <stdio.h>
49 #include <stddef.h>
50 #include <sys/types.h>
51 #include <sys/stat.h>
52 #include <fcntl.h>
53 #include <limits>
54
55 #include "unmunch.h"
56
main(int argc,char ** argv)57 int main(int argc, char** argv) {
58 int i;
59 int al;
60
61 FILE* wrdlst;
62 FILE* afflst;
63
64 char *wf, *af;
65 char ts[MAX_LN_LEN];
66
67 (void)argc;
68
69 /* first parse the command line options */
70 /* arg1 - munched wordlist, arg2 - affix file */
71
72 if (argv[1]) {
73 wf = mystrdup(argv[1]);
74 } else {
75 fprintf(stderr, "correct syntax is:\n");
76 fprintf(stderr, "unmunch dic_file affix_file\n");
77 exit(1);
78 }
79 if (argv[2]) {
80 af = mystrdup(argv[2]);
81 } else {
82 fprintf(stderr, "correct syntax is:\n");
83 fprintf(stderr, "unmunch dic_file affix_file\n");
84 exit(1);
85 }
86
87 /* open the affix file */
88 afflst = fopen(af, "r");
89 if (!afflst) {
90 fprintf(stderr, "Error - could not open affix description file\n");
91 exit(1);
92 }
93
94 /* step one is to parse the affix file building up the internal
95 affix data structures */
96
97 numpfx = 0;
98 numsfx = 0;
99 fullstrip = 0;
100
101 if (parse_aff_file(afflst)) {
102 fprintf(stderr, "Error - in affix file loading\n");
103 exit(1);
104 }
105
106 fclose(afflst);
107
108 fprintf(stderr, "parsed in %d prefixes and %d suffixes\n", numpfx, numsfx);
109
110 /* affix file is now parsed so create hash table of wordlist on the fly */
111
112 /* open the wordlist */
113 wrdlst = fopen(wf, "r");
114 if (!wrdlst) {
115 fprintf(stderr, "Error - could not open word list file\n");
116 exit(1);
117 }
118
119 /* skip over the hash table size */
120 if (!fgets(ts, MAX_LN_LEN - 1, wrdlst)) {
121 fclose(wrdlst);
122 return 2;
123 }
124 mychomp(ts);
125
126 while (fgets(ts, MAX_LN_LEN - 1, wrdlst)) {
127 mychomp(ts);
128 /* split each line into word and affix char strings */
129 char* ap = strchr(ts, '/');
130 if (ap) {
131 *ap = '\0';
132 ap++;
133 al = strlen(ap);
134 } else {
135 al = 0;
136 ap = NULL;
137 }
138
139 int wl = strlen(ts);
140
141 numwords = 0;
142 wlist[numwords].word = mystrdup(ts);
143 wlist[numwords].pallow = 0;
144 numwords++;
145
146 if (al)
147 expand_rootword(ts, wl, ap);
148
149 for (i = 0; i < numwords; i++) {
150 fprintf(stdout, "%s\n", wlist[i].word);
151 free(wlist[i].word);
152 wlist[i].word = NULL;
153 wlist[i].pallow = 0;
154 }
155 }
156
157 fclose(wrdlst);
158 return 0;
159 }
160
parse_aff_file(FILE * afflst)161 int parse_aff_file(FILE* afflst) {
162 int i, j;
163 int numents = 0;
164 char achar = '\0';
165 short ff = 0;
166 struct affent* ptr = NULL;
167 struct affent* nptr = NULL;
168 char* line = (char*)malloc(MAX_LN_LEN);
169
170 while (fgets(line, MAX_LN_LEN, afflst)) {
171 mychomp(line);
172 char ft = ' ';
173 fprintf(stderr, "parsing line: %s\n", line);
174 if (strncmp(line, "FULLSTRIP", 9) == 0)
175 fullstrip = 1;
176 if (strncmp(line, "PFX", 3) == 0)
177 ft = 'P';
178 if (strncmp(line, "SFX", 3) == 0)
179 ft = 'S';
180 if (ft != ' ') {
181 char* tp = line;
182 char* piece;
183 ff = 0;
184 i = 0;
185 while ((piece = mystrsep(&tp, ' '))) {
186 if (*piece != '\0') {
187 switch (i) {
188 case 0:
189 break;
190 case 1: {
191 achar = *piece;
192 break;
193 }
194 case 2: {
195 if (*piece == 'Y')
196 ff = XPRODUCT;
197 break;
198 }
199 case 3: {
200 numents = atoi(piece);
201 if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
202 sizeof(struct affent)) < static_cast<size_t>(numents))) {
203 fprintf(stderr, "Error: too many entries: %d\n", numents);
204 numents = 0;
205 } else {
206 ptr = (struct affent*)malloc(numents * sizeof(struct affent));
207 ptr->achar = achar;
208 ptr->xpflg = ff;
209 fprintf(stderr, "parsing %c entries %d\n", achar, numents);
210 }
211 break;
212 }
213 default:
214 break;
215 }
216 i++;
217 }
218 free(piece);
219 }
220 /* now parse all of the sub entries*/
221 nptr = ptr;
222 for (j = 0; j < numents; j++) {
223 if (!fgets(line, MAX_LN_LEN, afflst))
224 return 1;
225 mychomp(line);
226 tp = line;
227 i = 0;
228 while ((piece = mystrsep(&tp, ' '))) {
229 if (*piece != '\0') {
230 switch (i) {
231 case 0: {
232 if (nptr != ptr) {
233 nptr->achar = ptr->achar;
234 nptr->xpflg = ptr->xpflg;
235 }
236 break;
237 }
238 case 1:
239 break;
240 case 2: {
241 nptr->strip = mystrdup(piece);
242 nptr->stripl = strlen(nptr->strip);
243 if (strcmp(nptr->strip, "0") == 0) {
244 free(nptr->strip);
245 nptr->strip = mystrdup("");
246 nptr->stripl = 0;
247 }
248 break;
249 }
250 case 3: {
251 nptr->appnd = mystrdup(piece);
252 nptr->appndl = strlen(nptr->appnd);
253 if (strcmp(nptr->appnd, "0") == 0) {
254 free(nptr->appnd);
255 nptr->appnd = mystrdup("");
256 nptr->appndl = 0;
257 }
258 if (strchr(nptr->appnd, '/')) {
259 char* addseparator =
260 (char*)realloc(nptr->appnd, nptr->appndl + 2);
261 if (addseparator) {
262 nptr->appndl++;
263 addseparator[nptr->appndl - 1] = '|';
264 addseparator[nptr->appndl] = '\0';
265 nptr->appnd = addseparator;
266 }
267 }
268 break;
269 }
270 case 4: {
271 encodeit(nptr, piece);
272 }
273 fprintf(stderr, " affix: %s %d, strip: %s %d\n", nptr->appnd,
274 nptr->appndl, nptr->strip, nptr->stripl);
275 // no break
276 default:
277 break;
278 }
279 i++;
280 }
281 free(piece);
282 }
283 nptr++;
284 }
285 if (ptr) {
286 if (ft == 'P') {
287 ptable[numpfx].aep = ptr;
288 ptable[numpfx].num = numents;
289 fprintf(stderr, "ptable %d num is %d flag %c\n", numpfx,
290 ptable[numpfx].num, ptr->achar);
291 numpfx++;
292 } else if (ft == 'S') {
293 stable[numsfx].aep = ptr;
294 stable[numsfx].num = numents;
295 fprintf(stderr, "stable %d num is %d flag %c\n", numsfx,
296 stable[numsfx].num, ptr->achar);
297 numsfx++;
298 }
299 ptr = NULL;
300 }
301 nptr = NULL;
302 numents = 0;
303 achar = '\0';
304 }
305 }
306 free(line);
307 return 0;
308 }
309
encodeit(struct affent * ptr,char * cs)310 void encodeit(struct affent* ptr, char* cs) {
311 int nc;
312 int neg;
313 int grp;
314 int n;
315 int ec;
316 int nm;
317 int i, j, k;
318 unsigned char mbr[MAX_WD_LEN];
319
320 /* now clear the conditions array */
321 for (i = 0; i < SET_SIZE; i++)
322 ptr->conds[i] = (unsigned char)0;
323
324 /* now parse the string to create the conds array */
325 nc = strlen(cs);
326 neg = 0; /* complement indicator */
327 grp = 0; /* group indicator */
328 n = 0; /* number of conditions */
329 ec = 0; /* end condition indicator */
330 nm = 0; /* number of member in group */
331 i = 0;
332 if (strcmp(cs, ".") == 0) {
333 ptr->numconds = 0;
334 return;
335 }
336 while (i < nc) {
337 unsigned char c = *((unsigned char*)(cs + i));
338 if (c == '[') {
339 grp = 1;
340 c = 0;
341 }
342 if ((grp == 1) && (c == '^')) {
343 neg = 1;
344 c = 0;
345 }
346 if (c == ']') {
347 ec = 1;
348 c = 0;
349 }
350 if ((grp == 1) && (c != 0)) {
351 *(mbr + nm) = c;
352 nm++;
353 c = 0;
354 }
355 if (c != 0) {
356 ec = 1;
357 }
358 if (ec) {
359 if (grp == 1) {
360 if (neg == 0) {
361 for (j = 0; j < nm; j++) {
362 k = (unsigned int)mbr[j];
363 ptr->conds[k] = ptr->conds[k] | (1 << n);
364 }
365 } else {
366 for (j = 0; j < SET_SIZE; j++)
367 ptr->conds[j] = ptr->conds[j] | (1 << n);
368 for (j = 0; j < nm; j++) {
369 k = (unsigned int)mbr[j];
370 ptr->conds[k] = ptr->conds[k] & ~(1 << n);
371 }
372 }
373 neg = 0;
374 grp = 0;
375 nm = 0;
376 } else {
377 /* not a group so just set the proper bit for this char */
378 /* but first handle special case of . inside condition */
379 if (c == '.') {
380 /* wild card character so set them all */
381 for (j = 0; j < SET_SIZE; j++)
382 ptr->conds[j] = ptr->conds[j] | (1 << n);
383 } else {
384 ptr->conds[(unsigned int)c] = ptr->conds[(unsigned int)c] | (1 << n);
385 }
386 }
387 n++;
388 ec = 0;
389 }
390 i++;
391 }
392 ptr->numconds = n;
393 return;
394 }
395
396 /* add a prefix to word */
pfx_add(const char * word,int len,struct affent * ep,int num)397 void pfx_add(const char* word, int len, struct affent* ep, int num) {
398 struct affent* aent;
399 int cond;
400 unsigned char* cp;
401 int i;
402
403 for (aent = ep, i = num; i > 0; aent++, i--) {
404 /* now make sure all conditions match */
405 if ((len + fullstrip > aent->stripl) && (len >= aent->numconds) &&
406 ((aent->stripl == 0) ||
407 (strncmp(aent->strip, word, aent->stripl) == 0))) {
408 cp = (unsigned char*)word;
409 for (cond = 0; cond < aent->numconds; cond++) {
410 if ((aent->conds[*cp++] & (1 << cond)) == 0)
411 break;
412 }
413 if (cond >= aent->numconds) {
414 std::string tword;
415 /* we have a match so add prefix */
416 if (aent->appndl) {
417 tword.append(aent->appnd);
418 }
419 tword.append(word + aent->stripl);
420
421 if (numwords < MAX_WORDS) {
422 wlist[numwords].word = mystrdup(tword.c_str());
423 wlist[numwords].pallow = 0;
424 numwords++;
425 }
426 }
427 }
428 }
429 }
430
431 /* add a suffix to a word */
suf_add(const char * word,int len,struct affent * ep,int num)432 void suf_add(const char* word, int len, struct affent* ep, int num) {
433 struct affent* aent;
434 int cond;
435 unsigned char* cp;
436 int i;
437
438 for (aent = ep, i = num; i > 0; aent++, i--) {
439 /* if conditions hold on root word
440 * then strip off strip string and add suffix
441 */
442
443 if ((len + fullstrip > aent->stripl) && (len >= aent->numconds) &&
444 ((aent->stripl == 0) ||
445 (strcmp(aent->strip, word + len - aent->stripl) == 0))) {
446 cp = (unsigned char*)(word + len);
447 for (cond = aent->numconds; --cond >= 0;) {
448 if ((aent->conds[*--cp] & (1 << cond)) == 0)
449 break;
450 }
451 if (cond < 0) {
452 /* we have a matching condition */
453 std::string tword(word);
454 tword.resize(len - aent->stripl);
455 tword.append(aent->appnd);
456
457 if (numwords < MAX_WORDS) {
458 wlist[numwords].word = mystrdup(tword.c_str());
459 wlist[numwords].pallow = (aent->xpflg & XPRODUCT);
460 numwords++;
461 }
462 }
463 }
464 }
465 }
466
expand_rootword(const char * ts,int wl,const char * ap)467 int expand_rootword(const char* ts, int wl, const char* ap) {
468 int i;
469 int nh = 0;
470
471 for (i = 0; i < numsfx; i++) {
472 if (strchr(ap, (stable[i].aep)->achar)) {
473 suf_add(ts, wl, stable[i].aep, stable[i].num);
474 }
475 }
476
477 nh = numwords;
478
479 if (nh > 1) {
480 for (int j = 1; j < nh; j++) {
481 if (wlist[j].pallow) {
482 for (i = 0; i < numpfx; i++) {
483 if (strchr(ap, (ptable[i].aep)->achar)) {
484 if ((ptable[i].aep)->xpflg & XPRODUCT) {
485 int nwl = strlen(wlist[j].word);
486 pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num);
487 }
488 }
489 }
490 }
491 }
492 }
493
494 for (i = 0; i < numpfx; i++) {
495 if (strchr(ap, (ptable[i].aep)->achar)) {
496 pfx_add(ts, wl, ptable[i].aep, ptable[i].num);
497 }
498 }
499 return 0;
500 }
501
/* Split a string into tokens on a single delimiter character.
 *
 * Acts like strsep() but takes one delimiter char instead of a delimiter
 * string, and returns a freshly allocated copy of the token rather than a
 * pointer into the input (which is left unmodified).
 *
 * stringp: address of the read cursor.  On a successful call it is advanced
 *          past the token and its delimiter, or to the terminating NUL when
 *          no delimiter is left.
 * delim:   the delimiter character.
 *
 * Returns the next token (possibly the empty string when two delimiters are
 * adjacent), which the caller must free(); returns NULL when stringp or
 * *stringp is NULL, when nothing is left to read, or when memory cannot be
 * allocated.
 */
char* mystrsep(char** stringp, const char delim) {
  if (!stringp || !*stringp)
    return NULL;

  char* mp = *stringp;
  const size_t n = strlen(mp);
  if (n == 0)
    return NULL;

  char* rv = NULL;
  char* dp = (char*)memchr(mp, (int)((unsigned char)delim), n);
  if (dp) {
    const size_t nc = (size_t)(dp - mp);
    *stringp = dp + 1;
    rv = (char*)malloc(nc + 1);
    if (rv) {
      memcpy(rv, mp, nc);
      rv[nc] = '\0';
    }
  } else {
    /* last token: everything up to the end of the string */
    rv = (char*)malloc(n + 1);
    if (rv) {
      memcpy(rv, mp, n);
      rv[n] = '\0';
      *stringp = mp + n;
    }
  }
  return rv;
}
532
/* Duplicate a string into freshly allocated memory.
 *
 * s: string to copy, may be NULL.
 *
 * Returns a copy the caller must free(), or NULL when s is NULL or the
 * allocation fails.  The length is kept in size_t so that it is never
 * narrowed before being handed to malloc()/memcpy().
 */
char* mystrdup(const char* s) {
  if (!s)
    return NULL;

  const size_t sl = strlen(s) + 1; /* includes the terminating NUL */
  char* d = (char*)malloc(sl);
  if (d)
    memcpy(d, s, sl);
  return d;
}
543
/* Remove one trailing line ending from a string, in place.
 *
 * s: string to trim; NULL is ignored.
 *
 * Strips a final "\n" and, when it directly precedes that newline, a "\r".
 * A '\r' elsewhere in the string is left alone, so a string that does not
 * end in '\n' is never shortened.
 */
void mychomp(char* s) {
  if (!s)
    return;

  size_t k = strlen(s);
  if ((k > 0) && (s[k - 1] == '\n')) {
    s[--k] = '\0';
    /* only a carriage return that belonged to the line ending is dropped */
    if ((k > 0) && (s[k - 1] == '\r'))
      s[k - 1] = '\0';
  }
}
551