/*

  morph.c - WordNet search code morphology functions

*/
6
7 #include <stdio.h>
8 #include <ctype.h>
9 #include <string.h>
10 #include <stdlib.h>
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14 #include "wn.h"
15
16 #ifdef _WINDOWS
17 #include <windows.h>
18 #include <windowsx.h>
19 #define EXCFILE "%s\\%s.exc"
20 #else
21 #define EXCFILE "%s/%s.exc"
22 #endif
23
24 __FBSDID("$Id: morph.c,v 1.67 2006/11/14 21:00:23 wn Exp $");
25
/* Detachment rules.  sufx[i] is a word ending and addr[i] is the text
   that wordbase() puts in its place, so the two tables must stay the
   same length and in the same order.  Both the strings and the pointer
   arrays are read-only. */

static const char *const sufx[] = {
    /* Noun suffixes */
    "s", "ses", "xes", "zes", "ches", "shes", "men", "ies",
    /* Verb suffixes */
    "s", "ies", "es", "es", "ed", "ed", "ing", "ing",
    /* Adjective suffixes */
    "er", "est", "er", "est"
};

static const char *const addr[] = {
    /* Noun endings */
    "", "s", "x", "z", "ch", "sh", "man", "y",
    /* Verb endings */
    "", "y", "e", "", "e", "", "e", "",
    /* Adjective endings */
    "", "", "e", "e"
};
43
44 static int offsets[NUMPARTS] = { 0, 0, 8, 16 };
45 static int cnts[NUMPARTS] = { 0, 8, 8, 4 };
46 static char msgbuf[256];
47
48
/* Prepositions recognised by hasprep(), each with its length cached so
   the lookup loop does not have to call strlen() repeatedly. */
static const struct {
    const char *str;
    int strlen;
} prepositions[] = {
    { "to", 2 },
    { "at", 2 },
    { "of", 2 },
    { "on", 2 },
    { "off", 3 },
    { "in", 2 },
    { "out", 3 },
    { "up", 2 },
    { "down", 4 },
    { "from", 4 },
    { "with", 4 },
    { "into", 4 },
    { "for", 3 },
    { "about", 5 },
    { "between", 7 }
};

/* Number of entries in prepositions[].  Parenthesized so the macro
   expands correctly inside larger expressions such as `x / NUMPREPS`. */
#define NUMPREPS (sizeof(prepositions) / sizeof(prepositions[0]))
71
72 static FILE *exc_fps[NUMPARTS + 1];
73
74 static int do_init(void);
75 static int strend(const char *, const char *);
76 static const char *wordbase(const char *, int);
77 static int hasprep(const char *, unsigned int);
78 static const char *exc_lookup(const char *, int);
79 static const char *morphprep(const char *);
80
81 /* Open exception list files */
82
morphinit(void)83 int morphinit(void)
84 {
85 static int done = 0;
86 static int openerr = 0;
87
88 if (!done) {
89 if (OpenDB) { /* make sure WN database files are open */
90 if (!(openerr = do_init()))
91 done = 1;
92 } else
93 openerr = -1;
94 }
95
96 return(openerr);
97 }
98
99 /* Close exception list files and reopen */
re_morphinit(void)100 int re_morphinit(void)
101 {
102 int i;
103
104 for (i = 1; i <= NUMPARTS; i++) {
105 if (exc_fps[i] != NULL) {
106 fclose(exc_fps[i]); exc_fps[i] = NULL;
107 }
108 }
109
110 return(OpenDB ? do_init() : -1);
111 }
112
do_init(void)113 static int do_init(void)
114 {
115 int i, openerr;
116 #ifdef _WINDOWS
117 HKEY hkey;
118 DWORD dwType, dwSize;
119 #else
120 char *env;
121 #endif
122 char searchdir[256], fname[256];
123
124 openerr = 0;
125
126 /* Find base directory for database. If set, use WNSEARCHDIR.
127 If not set, check for WNHOME/dict, otherwise use DEFAULTPATH. */
128
129 #ifdef _WINDOWS
130 if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, TEXT("Software\\WordNet\\3.0"),
131 0, KEY_READ, &hkey) == ERROR_SUCCESS) {
132 dwSize = sizeof(searchdir);
133 RegQueryValueEx(hkey, TEXT("WNHome"),
134 NULL, &dwType, searchdir, &dwSize);
135 RegCloseKey(hkey);
136 strcat(searchdir, DICTDIR);
137 }
138 else if (RegOpenKeyEx(HKEY_CURRENT_USER, TEXT("Software\\WordNet\\3.0"),
139 0, KEY_READ, &hkey) == ERROR_SUCCESS) {
140 dwSize = sizeof(searchdir);
141 RegQueryValueEx(hkey, TEXT("WNHome"),
142 NULL, &dwType, searchdir, &dwSize);
143 RegCloseKey(hkey);
144 strcat(searchdir, DICTDIR);
145 } else
146 sprintf(searchdir, DEFAULTPATH);
147 #else
148 if ((env = getenv("WNSEARCHDIR")) != NULL)
149 strcpy(searchdir, env);
150 else if ((env = getenv("WNHOME")) != NULL)
151 sprintf(searchdir, "%s%s", env, DICTDIR);
152 else
153 strcpy(searchdir, DEFAULTPATH);
154 #endif
155
156 for (i = 1; i <= NUMPARTS; i++) {
157 sprintf(fname, EXCFILE, searchdir, partnames[i]);
158 if ((exc_fps[i] = fopen(fname, "r")) == NULL) {
159 sprintf(msgbuf,
160 "WordNet library error: Can't open exception file(%s)\n\n",
161 fname);
162 display_message(msgbuf);
163 openerr = -1;
164 }
165 }
166 return(openerr);
167 }
168
169 /* Try to find baseform (lemma) of word or collocation in POS.
170 Works like strtok() - first call is with string, subsequent calls
171 with NULL argument return additional baseforms for original string. */
172
173 const char *
morphstr(const char * origstr,int pos)174 morphstr(const char *origstr, int pos)
175 {
176 static char searchstr[WORDBUF], str[WORDBUF];
177 static int svcnt, svprep;
178 char word[WORDBUF];
179 const char *tmp;
180 int cnt, st_idx = 0, end_idx;
181 int prep;
182 const char *end_idx1, *end_idx2;
183 const char *append;
184
185 if (pos == SATELLITE)
186 pos = ADJ;
187
188 /* First time through for this string */
189
190 if (origstr != NULL) {
191 /* Assume string hasn't had spaces substitued with '_' */
192 strtolower(strsubst(strcpy(str, origstr), ' ', '_'));
193 searchstr[0] = '\0';
194 cnt = cntwords(str, '_');
195 svprep = 0;
196
197 /* first try exception list */
198
199 if ((tmp = exc_lookup(str, pos)) && strcmp(tmp, str)) {
200 svcnt = 1; /* force next time to pass NULL */
201 return(tmp);
202 }
203
204 /* Then try simply morph on original string */
205
206 if (pos != VERB && (tmp = morphword(str, pos)) && strcmp(tmp, str))
207 return(tmp);
208
209 if (pos == VERB && cnt > 1 && (prep = hasprep(str, cnt))) {
210 /* assume we have a verb followed by a preposition */
211 svprep = prep;
212 return(morphprep(str));
213 } else {
214 svcnt = cnt = cntwords(str, '-');
215 while (origstr && --cnt) {
216 end_idx1 = strchr(str + st_idx, '_');
217 end_idx2 = strchr(str + st_idx, '-');
218 if (end_idx1 && end_idx2) {
219 if (end_idx1 < end_idx2) {
220 end_idx = (int)(end_idx1 - str);
221 append = "_";
222 } else {
223 end_idx = (int)(end_idx2 - str);
224 append = "-";
225 }
226 } else {
227 if (end_idx1) {
228 end_idx = (int)(end_idx1 - str);
229 append = "_";
230 } else {
231 end_idx = (int)(end_idx2 - str);
232 append = "-";
233 }
234 }
235 if (end_idx < 0) return(NULL); /* shouldn't do this */
236 strncpy(word, str + st_idx, end_idx - st_idx);
237 word[end_idx - st_idx] = '\0';
238 tmp = morphword(word, pos);
239 if(tmp)
240 strcat(searchstr,tmp);
241 else
242 strcat(searchstr,word);
243 strcat(searchstr, append);
244 st_idx = end_idx + 1;
245 }
246
247 tmp = morphword(strcpy(word, str + st_idx), pos);
248 if(tmp)
249 strcat(searchstr,tmp);
250 else
251 strcat(searchstr,word);
252 if(strcmp(searchstr, str) && is_defined(searchstr,pos))
253 return(searchstr);
254 else
255 return(NULL);
256 }
257 } else { /* subsequent call on string */
258 if (svprep) { /* if verb has preposition, no more morphs */
259 svprep = 0;
260 return(NULL);
261 } else if (svcnt == 1)
262 return(exc_lookup(NULL, pos));
263 else {
264 svcnt = 1;
265 if ((tmp = exc_lookup(str, pos)) && strcmp(tmp, str))
266 return(tmp);
267 else
268 return(NULL);
269 }
270 }
271 }
272
273 /* Try to find baseform (lemma) of individual word in POS */
274 const char *
morphword(const char * word,int pos)275 morphword(const char *word, int pos)
276 {
277 int offset, cnt;
278 int i;
279 static char retval[WORDBUF];
280 char tmpbuf[WORDBUF];
281 const char *tmp, *end;
282
283 retval[0] = tmpbuf[0] = '\0';
284 end = "";
285
286 if(word == NULL)
287 return(NULL);
288
289 /* first look for word on exception list */
290
291 if((tmp = exc_lookup(word, pos)) != NULL)
292 return(tmp); /* found it in exception list */
293
294 if (pos == ADV) { /* only use exception list for adverbs */
295 return(NULL);
296 }
297 if (pos == NOUN) {
298 if (strend(word, "ful")) {
299 cnt = strrchr(word, 'f') - word;
300 strncat(tmpbuf, word, cnt);
301 end = "ful";
302 } else
303 /* check for noun ending with 'ss' or short words */
304 if (strend(word, "ss") || (strlen(word) <= 2))
305 return(NULL);
306 }
307
308 /* If not in exception list, try applying rules from tables */
309
310 if (tmpbuf[0] == '\0')
311 strcpy(tmpbuf, word);
312
313 offset = offsets[pos];
314 cnt = cnts[pos];
315
316 for(i = 0; i < cnt; i++){
317 strcpy(retval, wordbase(tmpbuf, (i + offset)));
318 if(strcmp(retval, tmpbuf) && is_defined(retval, pos)) {
319 strcat(retval, end);
320 return(retval);
321 }
322 }
323 return(NULL);
324 }
325
/* Return 1 if STR1 ends with STR2 and is strictly longer than it,
   otherwise 0.  Equal-length strings (including two empty strings)
   never match.  A NULL argument yields 0. */
static int strend(const char *str1, const char *str2)
{
    size_t len1, len2;

    if (str1 == NULL || str2 == NULL)
        return(0);

    len1 = strlen(str1);
    len2 = strlen(str2);
    if (len2 >= len1)
        return(0);

    return(memcmp(str1 + (len1 - len2), str2, len2) == 0);
}
339
340 static const char *
wordbase(const char * word,int ender)341 wordbase(const char *word, int ender)
342 {
343 char *pt1;
344 static char copy[WORDBUF];
345
346 strcpy(copy, word);
347 if(strend(copy,sufx[ender])) {
348 pt1=strchr(copy,'\0');
349 pt1 -= strlen(sufx[ender]);
350 *pt1='\0';
351 strcat(copy,addr[ender]);
352 }
353 return(copy);
354 }
355
hasprep(const char * s,unsigned int wdcnt)356 static int hasprep(const char *s, unsigned int wdcnt)
357 {
358 /* Find a preposition in the verb string and return its
359 corresponding word number. */
360
361 unsigned int i, wdnum;
362
363 for (wdnum = 2; wdnum <= wdcnt; wdnum++) {
364 s = strchr(s, '_');
365 for (s++, i = 0; i < NUMPREPS; i++)
366 if (!strncmp(s, prepositions[i].str, prepositions[i].strlen) &&
367 (s[prepositions[i].strlen] == '_' ||
368 s[prepositions[i].strlen] == '\0'))
369 return(wdnum);
370 }
371 return(0);
372 }
373
374 static const char *
exc_lookup(const char * word,int pos)375 exc_lookup(const char *word, int pos)
376 {
377 static char line[WORDBUF], *beglp, *endlp;
378 const char *excline;
379
380 if (exc_fps[pos] == NULL)
381 return(NULL);
382
383 /* first time through load line from exception file */
384 if(word != NULL){
385 if ((excline = bin_search(word, exc_fps[pos])) != NULL) {
386 strcpy(line, excline);
387 endlp = strchr(line,' ');
388 } else
389 endlp = NULL;
390 }
391 if(endlp && *(endlp + 1) != ' '){
392 beglp = endlp + 1;
393 while(*beglp && *beglp == ' ') beglp++;
394 endlp = beglp;
395 while(*endlp && *endlp != ' ' && *endlp != '\n') endlp++;
396 if(endlp != beglp){
397 *endlp='\0';
398 return(beglp);
399 }
400 }
401 beglp = NULL;
402 endlp = NULL;
403 return(NULL);
404 }
405
406 static const char *
morphprep(const char * s)407 morphprep(const char *s)
408 {
409 const char *rest, *exc_word, *lastwd = NULL, *last;
410 int i, offset, cnt;
411 char word[WORDBUF], end[WORDBUF];
412 static char retval[WORDBUF];
413
414 /* Assume that the verb is the first word in the phrase. Strip it
415 off, check for validity, then try various morphs with the
416 rest of the phrase tacked on, trying to find a match. */
417
418 rest = strchr(s, '_');
419 last = strrchr(s, '_');
420 if (rest != last) { /* more than 2 words */
421 lastwd = morphword(last + 1, NOUN);
422 if (lastwd) {
423 strncpy(end, rest, last - rest + 1);
424 end[last-rest+1] = '\0';
425 strcat(end, lastwd);
426 }
427 }
428
429 strncpy(word, s, rest - s);
430 word[rest - s] = '\0';
431 for (i = 0, cnt = strlen(word); i < cnt; i++)
432 if (!isalnum((unsigned char)(word[i]))) return(NULL);
433
434 offset = offsets[VERB];
435 cnt = cnts[VERB];
436
437 /* First try to find the verb in the exception list */
438
439 if ((exc_word = exc_lookup(word, VERB)) &&
440 strcmp(exc_word, word)) {
441
442 sprintf(retval, "%s%s", exc_word, rest);
443 if(is_defined(retval, VERB))
444 return(retval);
445 else if (lastwd) {
446 sprintf(retval, "%s%s", exc_word, end);
447 if(is_defined(retval, VERB))
448 return(retval);
449 }
450 }
451
452 for (i = 0; i < cnt; i++) {
453 if ((exc_word = wordbase(word, (i + offset))) &&
454 strcmp(word, exc_word)) { /* ending is different */
455
456 sprintf(retval, "%s%s", exc_word, rest);
457 if(is_defined(retval, VERB))
458 return(retval);
459 else if (lastwd) {
460 sprintf(retval, "%s%s", exc_word, end);
461 if(is_defined(retval, VERB))
462 return(retval);
463 }
464 }
465 }
466 sprintf(retval, "%s%s", word, rest);
467 if (strcmp(s, retval))
468 return(retval);
469 if (lastwd) {
470 sprintf(retval, "%s%s", word, end);
471 if (strcmp(s, retval))
472 return(retval);
473 }
474 return(NULL);
475 }
476
/*
 * Revision 1.1  91/09/25  15:39:47  wn
 * Initial revision
 *
 */