/*

  morph.c - WordNet search code morphology functions

*/
6
7 #include <stdio.h>
8 #include <ctype.h>
9 #include <string.h>
10 #include <stdlib.h>
11 #include "wn.h"
12
/* printf-style template for an exception list file name.  The two %s
   arguments are the search directory and the part-of-speech name; only
   the path separator differs between platforms. */
#ifdef UNIX
#define EXCFILE	"%s/%s.exc"
#endif
#ifdef PC
#define EXCFILE	"%s\\%s.exc"
#endif
#ifdef MAC
#define EXCFILE	"%s:%s.exc"
#endif
22
static char *Id = "$Id: morph.c,v 1.61 2003/06/23 16:15:39 wn Exp $";

/* Detachment rules.  sufx[] and addr[] are parallel tables: when a word
   ends in sufx[k], that suffix is removed and addr[k] is appended in its
   place (see wordbase()).  The rules are grouped by part of speech; the
   group boundaries are recorded in offsets[] and cnts[]. */

static char *sufx[] = {
    /* Noun suffixes */
    "s", "ses", "xes", "zes", "ches", "shes", "men", "ies",
    /* Verb suffixes */
    "s", "ies", "es", "es", "ed", "ed", "ing", "ing",
    /* Adjective suffixes */
    "er", "est", "er", "est"
};

static char *addr[] = {
    /* Noun endings */
    "", "s", "x", "z", "ch", "sh", "man", "y",
    /* Verb endings */
    "", "y", "e", "", "e", "", "e", "",
    /* Adjective endings */
    "", "", "e", "e"
};
42
43 static int offsets[NUMPARTS] = { 0, 0, 8, 16 };
44 static int cnts[NUMPARTS] = { 0, 8, 8, 4 };
45 static char msgbuf[256];
46
#define NUMPREPS	15

/* Prepositions recognised after a verb by hasprep().  Each entry pairs
   the preposition with its length so that the comparison does not have
   to recompute it. */
static struct {
    char *str;
    int strlen;
} prepositions[NUMPREPS] = {
    { "to", 2 },
    { "at", 2 },
    { "of", 2 },
    { "on", 2 },
    { "off", 3 },
    { "in", 2 },
    { "out", 3 },
    { "up", 2 },
    { "down", 4 },
    { "from", 4 },
    { "with", 4 },
    { "into", 4 },
    { "for", 3 },
    { "about", 5 },
    { "between", 7 },
};
69
70 static FILE *exc_fps[NUMPARTS + 1];
71
72 static int do_init();
73 static int strend(char *, char *);
74 static char *wordbase(char *, int);
75 static int hasprep(char *, int);
76 static char *exc_lookup(char *, int);
77 static char *morphprep(char *);
78
79 /* Open exception list files */
80
morphinit(void)81 int morphinit(void)
82 {
83 static int done = 0;
84 static int openerr = 0;
85
86 if (!done) {
87 if (OpenDB) { /* make sure WN database files are open */
88 if (!(openerr = do_init()))
89 done = 1;
90 } else
91 openerr = -1;
92 }
93
94 return(openerr);
95 }
96
97 /* Close exception list files and reopen */
re_morphinit(void)98 int re_morphinit(void)
99 {
100 int i;
101
102 for (i = 1; i <= NUMPARTS; i++) {
103 if (exc_fps[i] != NULL) {
104 fclose(exc_fps[i]); exc_fps[i] = NULL;
105 }
106 }
107
108 return(OpenDB ? do_init() : -1);
109 }
110
do_init(void)111 static int do_init(void)
112 {
113 int i, openerr;
114 char *env;
115 char searchdir[256], fname[256];
116
117 openerr = 0;
118
119 /* Find base directory for database. If set, use WNSEARCHDIR.
120 If not set, check for WNHOME/dict, otherwise use DEFAULTPATH. */
121
122 if ((env = getenv("WNSEARCHDIR")) != NULL)
123 strcpy(searchdir, env);
124 else if ((env = getenv("WNHOME")) != NULL)
125 sprintf(searchdir, "%s%s", env, DICTDIR);
126 else
127 strcpy(searchdir, DEFAULTPATH);
128
129 for (i = 1; i <= NUMPARTS; i++) {
130 sprintf(fname, EXCFILE, searchdir, partnames[i]);
131 if ((exc_fps[i] = fopen(fname, "r")) == NULL) {
132 sprintf(msgbuf,
133 "WordNet library error: Can't open exception file(%s)\n\n",
134 fname);
135 display_message(msgbuf);
136 openerr = -1;
137 }
138 }
139 return(openerr);
140 }
141
142 /* Try to find baseform (lemma) of word or collocation in POS.
143 Works like strtok() - first call is with string, subsequent calls
144 with NULL argument return additional baseforms for original string. */
145
morphstr(char * origstr,int pos)146 char *morphstr(char *origstr, int pos)
147 {
148 static char searchstr[WORDBUF], str[WORDBUF];
149 static int svcnt, svprep;
150 char word[WORDBUF], *tmp;
151 int cnt, st_idx = 0, end_idx;
152 int prep;
153 char *end_idx1, *end_idx2;
154 char *append;
155
156 if (pos == SATELLITE)
157 pos = ADJ;
158
159 /* First time through for this string */
160
161 if (origstr != NULL) {
162 /* Assume string hasn't had spaces substitued with '_' */
163 strtolower(strsubst(strcpy(str, origstr), ' ', '_'));
164 searchstr[0] = '\0';
165 cnt = cntwords(str, '_');
166 svprep = 0;
167
168 /* first try exception list */
169
170 if ((tmp = exc_lookup(str, pos)) && strcmp(tmp, str)) {
171 svcnt = 1; /* force next time to pass NULL */
172 return(tmp);
173 }
174
175 /* Then try simply morph on original string */
176
177 if (pos != VERB && (tmp = morphword(str, pos)) && strcmp(tmp, str))
178 return(tmp);
179
180 if (pos == VERB && cnt > 1 && (prep = hasprep(str, cnt))) {
181 /* assume we have a verb followed by a preposition */
182 svprep = prep;
183 return(morphprep(str));
184 } else {
185 svcnt = cnt = cntwords(str, '-');
186 while (origstr && --cnt) {
187 end_idx1 = strchr(str + st_idx, '_');
188 end_idx2 = strchr(str + st_idx, '-');
189 if (end_idx1 && end_idx2) {
190 if (end_idx1 < end_idx2) {
191 end_idx = (int)(end_idx1 - str);
192 append = "_";
193 } else {
194 end_idx = (int)(end_idx2 - str);
195 append = "-";
196 }
197 } else {
198 if (end_idx1) {
199 end_idx = (int)(end_idx1 - str);
200 append = "_";
201 } else {
202 end_idx = (int)(end_idx2 - str);
203 append = "-";
204 }
205 }
206 if (end_idx < 0) return(NULL); /* shouldn't do this */
207 strncpy(word, str + st_idx, end_idx - st_idx);
208 word[end_idx - st_idx] = '\0';
209 if(tmp = morphword(word, pos))
210 strcat(searchstr,tmp);
211 else
212 strcat(searchstr,word);
213 strcat(searchstr, append);
214 st_idx = end_idx + 1;
215 }
216
217 if(tmp = morphword(strcpy(word, str + st_idx), pos))
218 strcat(searchstr,tmp);
219 else
220 strcat(searchstr,word);
221 if(strcmp(searchstr, str) && is_defined(searchstr,pos))
222 return(searchstr);
223 else
224 return(NULL);
225 }
226 } else { /* subsequent call on string */
227 if (svprep) { /* if verb has preposition, no more morphs */
228 svprep = 0;
229 return(NULL);
230 } else if (svcnt == 1)
231 return(exc_lookup(NULL, pos));
232 else {
233 svcnt = 1;
234 if ((tmp = exc_lookup(str, pos)) && strcmp(tmp, str))
235 return(tmp);
236 else
237 return(NULL);
238 }
239 }
240 }
241
242 /* Try to find baseform (lemma) of individual word in POS */
morphword(char * word,int pos)243 char *morphword(char *word, int pos)
244 {
245 int offset, cnt;
246 int i;
247 static char retval[WORDBUF];
248 char *tmp, tmpbuf[WORDBUF], *end;
249
250 sprintf(retval,"");
251 sprintf(tmpbuf, "");
252 end = "";
253
254 if(word == NULL)
255 return(NULL);
256
257 /* first look for word on exception list */
258
259 if((tmp = exc_lookup(word, pos)) != NULL)
260 return(tmp); /* found it in exception list */
261
262 if (pos == ADV) { /* only use exception list for adverbs */
263 return(NULL);
264 }
265 if (pos == NOUN) {
266 if (strend(word, "ful")) {
267 cnt = strrchr(word, 'f') - word;
268 strncat(tmpbuf, word, cnt);
269 end = "ful";
270 } else
271 /* check for noun ending with 'ss' or short words */
272 if (strend(word, "ss") || (strlen(word) <= 2))
273 return(NULL);
274 }
275
276 /* If not in exception list, try applying rules from tables */
277
278 if (tmpbuf[0] == '\0')
279 strcpy(tmpbuf, word);
280
281 offset = offsets[pos];
282 cnt = cnts[pos];
283
284 for(i = 0; i < cnt; i++){
285 strcpy(retval, wordbase(tmpbuf, (i + offset)));
286 if(strcmp(retval, tmpbuf) && is_defined(retval, pos)) {
287 strcat(retval, end);
288 return(retval);
289 }
290 }
291 return(NULL);
292 }
293
/* Return 1 if str1 ends with str2 and is strictly longer than it,
   otherwise 0.  A string is therefore never reported as ending in
   itself. */
static int strend(char *str1, char *str2)
{
    size_t len1 = strlen(str1);
    size_t len2 = strlen(str2);

    if (len2 >= len1)
	return(0);

    return(strcmp(str1 + (len1 - len2), str2) == 0);
}
307
wordbase(char * word,int ender)308 static char *wordbase(char *word, int ender)
309 {
310 char *pt1;
311 static char copy[WORDBUF];
312
313 strcpy(copy, word);
314 if(strend(copy,sufx[ender])) {
315 pt1=strchr(copy,'\0');
316 pt1 -= strlen(sufx[ender]);
317 *pt1='\0';
318 strcat(copy,addr[ender]);
319 }
320 return(copy);
321 }
322
hasprep(char * s,int wdcnt)323 static int hasprep(char *s, int wdcnt)
324 {
325 /* Find a preposition in the verb string and return its
326 corresponding word number. */
327
328 int i, wdnum;
329
330 for (wdnum = 2; wdnum <= wdcnt; wdnum++) {
331 s = strchr(s, '_');
332 for (s++, i = 0; i < NUMPREPS; i++)
333 if (!strncmp(s, prepositions[i].str, prepositions[i].strlen) &&
334 (s[prepositions[i].strlen] == '_' ||
335 s[prepositions[i].strlen] == '\0'))
336 return(wdnum);
337 }
338 return(0);
339 }
340
exc_lookup(char * word,int pos)341 static char *exc_lookup(char *word, int pos)
342 {
343 static char line[WORDBUF], *beglp, *endlp;
344 char *excline;
345 int found = 0;
346
347 if (exc_fps[pos] == NULL)
348 return(NULL);
349
350 /* first time through load line from exception file */
351 if(word != NULL){
352 if ((excline = bin_search(word, exc_fps[pos])) != NULL) {
353 strcpy(line, excline);
354 endlp = strchr(line,' ');
355 } else
356 endlp = NULL;
357 }
358 if(endlp && *(endlp + 1) != ' '){
359 beglp = endlp + 1;
360 while(*beglp && *beglp == ' ') beglp++;
361 endlp = beglp;
362 while(*endlp && *endlp != ' ' && *endlp != '\n') endlp++;
363 if(endlp != beglp){
364 *endlp='\0';
365 return(beglp);
366 }
367 }
368 beglp = NULL;
369 endlp = NULL;
370 return(NULL);
371 }
372
morphprep(char * s)373 static char *morphprep(char *s)
374 {
375 char *rest, *exc_word, *lastwd = NULL, *last;
376 int i, offset, cnt;
377 char word[WORDBUF], end[WORDBUF];
378 static char retval[WORDBUF];
379
380 /* Assume that the verb is the first word in the phrase. Strip it
381 off, check for validity, then try various morphs with the
382 rest of the phrase tacked on, trying to find a match. */
383
384 rest = strchr(s, '_');
385 last = strrchr(s, '_');
386 if (rest != last) { /* more than 2 words */
387 if (lastwd = morphword(last + 1, NOUN)) {
388 strncpy(end, rest, last - rest + 1);
389 end[last-rest+1] = '\0';
390 strcat(end, lastwd);
391 }
392 }
393
394 strncpy(word, s, rest - s);
395 word[rest - s] = '\0';
396 for (i = 0, cnt = strlen(word); i < cnt; i++)
397 if (!isalnum((unsigned char)(word[i]))) return(NULL);
398
399 offset = offsets[VERB];
400 cnt = cnts[VERB];
401
402 /* First try to find the verb in the exception list */
403
404 if ((exc_word = exc_lookup(word, VERB)) &&
405 strcmp(exc_word, word)) {
406
407 sprintf(retval, "%s%s", exc_word, rest);
408 if(is_defined(retval, VERB))
409 return(retval);
410 else if (lastwd) {
411 sprintf(retval, "%s%s", exc_word, end);
412 if(is_defined(retval, VERB))
413 return(retval);
414 }
415 }
416
417 for (i = 0; i < cnt; i++) {
418 if ((exc_word = wordbase(word, (i + offset))) &&
419 strcmp(word, exc_word)) { /* ending is different */
420
421 sprintf(retval, "%s%s", exc_word, rest);
422 if(is_defined(retval, VERB))
423 return(retval);
424 else if (lastwd) {
425 sprintf(retval, "%s%s", exc_word, end);
426 if(is_defined(retval, VERB))
427 return(retval);
428 }
429 }
430 }
431 sprintf(retval, "%s%s", word, rest);
432 if (strcmp(s, retval))
433 return(retval);
434 if (lastwd) {
435 sprintf(retval, "%s%s", word, end);
436 if (strcmp(s, retval))
437 return(retval);
438 }
439 return(NULL);
440 }
441
/*
 * Revision 1.1  91/09/25  15:39:47  wn
 * Initial revision
 *
 */
447