1 /*
2 
3   wnutil.c - utility functions used by WordNet code
4 
5 */
6 
7 #ifdef _WINDOWS
8 #include <windows.h>
9 #include <windowsx.h>
10 #endif
11 
12 #include <stdio.h>
13 #include <ctype.h>
14 
15 #ifdef __unix__
16 #ifndef __MACH__
17 #include <stdlib.h>
18 #include <stdint.h>
19 #endif
20 #endif
21 
22 #include <assert.h>
23 #include <string.h>
24 #include <stdlib.h>
25 
26 #ifdef HAVE_CONFIG_H
27 #include "config.h"
28 #endif
29 #include "wn.h"
30 
/* Forward declarations for file-local helpers defined further down. */
static int do_init(void);
static void closefps(void);

/* Scratch buffer in which error messages are built before being handed
   to display_message(). */
static char msgbuf[256];

/* State shared by strstr_init() and strstr_getnext(). */
static char *strstr_word;
static char *strstr_stringstart;
static char *strstr_stringcurrent;


/* Initialization functions */
44 
wninit(void)45 int wninit(void)
46 {
47     static int done = 0;
48     static int openerr = 0;
49     char *env;
50 
51     if (!done) {
52 	if ((env = getenv("WNDBVERSION"))) {
53 	    wnrelease = env;	/* set release */
54 	}
55 	openerr = do_init();
56 	if (!openerr) {
57 	    done = 1;
58 	    OpenDB = 1;
59 	    openerr = morphinit();
60 	}
61     }
62 
63     return(openerr);
64 }
65 
re_wninit(void)66 int re_wninit(void)
67 {
68     int openerr;
69     char *env;
70 
71     closefps();
72 
73     if ((env = getenv("WNDBVERSION"))) {
74 	wnrelease = env;	/* set release */
75     }
76     openerr = do_init();
77     if (!openerr) {
78 	OpenDB = 1;
79 	openerr = re_morphinit();
80     }
81 
82     return(openerr);
83 }
84 
closefps(void)85 static void closefps(void)
86 {
87     int i;
88 
89     if (OpenDB) {
90 	for (i = 1; i < NUMPARTS + 1; i++) {
91 	    if (datafps[i] != NULL) {
92 		fclose(datafps[i]);
93 		datafps[i] = NULL;
94 	    }
95 	    if (indexfps[i] != NULL) {
96 		fclose(indexfps[i]);
97 		indexfps[i] = NULL;
98 	    }
99 	}
100 	if (sensefp != NULL) {
101 	    fclose(sensefp); sensefp = NULL;
102 	}
103 	if (cntlistfp != NULL) {
104 	    fclose(cntlistfp); cntlistfp = NULL;
105 	}
106 	if (keyindexfp != NULL) {
107 	    fclose(keyindexfp); keyindexfp = NULL;
108 	}
109 	if (vsentfilefp != NULL) {
110 	    fclose(vsentfilefp); vsentfilefp = NULL;
111 	}
112 	if (vidxfilefp != NULL) {
113 	    fclose(vidxfilefp); vidxfilefp = NULL;
114 	}
115 	OpenDB = 0;
116     }
117 }
118 
do_init(void)119 static int do_init(void)
120 {
121     int i, openerr;
122     char searchdir[256], tmpbuf[256];
123 
124 #ifdef _WINDOWS
125     HKEY hkey;
126     DWORD dwType, dwSize;
127 #else
128     char *env;
129 #endif
130 
131     openerr = 0;
132 
133     /* Find base directory for database.  If set, use WNSEARCHDIR.
134        If not set, check for WNHOME/dict, otherwise use DEFAULTPATH. */
135 
136 #ifdef _WINDOWS
137     if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, TEXT("Software\\WordNet\\3.0"),
138 		     0, KEY_READ, &hkey) == ERROR_SUCCESS) {
139 	dwSize = sizeof(searchdir);
140 	RegQueryValueEx(hkey, TEXT("WNHome"),
141 			NULL, &dwType, searchdir, &dwSize);
142 	RegCloseKey(hkey);
143 	strcat(searchdir, DICTDIR);
144     } else if (RegOpenKeyEx(HKEY_CURRENT_USER, TEXT("Software\\WordNet\\3.0"),
145 		     0, KEY_READ, &hkey) == ERROR_SUCCESS) {
146 	dwSize = sizeof(searchdir);
147 	RegQueryValueEx(hkey, TEXT("WNHome"),
148 			NULL, &dwType, searchdir, &dwSize);
149 	RegCloseKey(hkey);
150 	strcat(searchdir, DICTDIR);
151     } else
152 	sprintf(searchdir, DEFAULTPATH);
153 #else
154     if ((env = getenv("WNSEARCHDIR")) != NULL)
155 	strcpy(searchdir, env);
156     else if ((env = getenv("WNHOME")) != NULL)
157 	sprintf(searchdir, "%s%s", env, DICTDIR);
158     else
159 	strcpy(searchdir, DEFAULTPATH);
160 #endif
161 
162     for (i = 1; i < NUMPARTS + 1; i++) {
163 	sprintf(tmpbuf, DATAFILE, searchdir, partnames[i]);
164 	if((datafps[i] = fopen(tmpbuf, "r")) == NULL) {
165 	    sprintf(msgbuf,
166 		    "WordNet library error: Can't open datafile(%s)\n",
167 		    tmpbuf);
168 	    display_message(msgbuf);
169 	    openerr = -1;
170 	}
171 	sprintf(tmpbuf, INDEXFILE, searchdir, partnames[i]);
172 	if((indexfps[i] = fopen(tmpbuf, "r")) == NULL) {
173 	    sprintf(msgbuf,
174 		    "WordNet library error: Can't open indexfile(%s)\n",
175 		    tmpbuf);
176 	    display_message(msgbuf);
177 	    openerr = -1;
178 	}
179     }
180 
181     /* This file isn't used by the library and doesn't have to
182        be present.  No error is reported if the open fails. */
183 
184     sprintf(tmpbuf, SENSEIDXFILE, searchdir);
185     sensefp = fopen(tmpbuf, "r");
186 
187     /* If this file isn't present, the runtime code will skip printint out
188        the number of times each sense was tagged. */
189 
190     sprintf(tmpbuf, CNTLISTFILE, searchdir);
191     cntlistfp = fopen(tmpbuf, "r");
192 
193     /* This file doesn't have to be present.  No error is reported if the
194        open fails. */
195 
196     sprintf(tmpbuf, KEYIDXFILE, searchdir);
197     keyindexfp = fopen(tmpbuf, "r");
198 
199     sprintf(tmpbuf, REVKEYIDXFILE, searchdir);
200     revkeyindexfp = fopen(tmpbuf, "r");
201 
202     sprintf(tmpbuf, VRBSENTFILE, searchdir);
203     if ((vsentfilefp = fopen(tmpbuf, "r")) == NULL) {
204 	sprintf(msgbuf,
205 "WordNet library warning: Can't open verb example sentence file(%s)\n",
206 		tmpbuf);
207 	display_message(msgbuf);
208     }
209 
210     sprintf(tmpbuf, VRBIDXFILE, searchdir);
211     if ((vidxfilefp = fopen(tmpbuf, "r")) == NULL) {
212 	sprintf(msgbuf,
213 "WordNet library warning: Can't open verb example sentence index file(%s)\n",
214 		tmpbuf);
215 	display_message(msgbuf);
216     }
217 
218     return(openerr);
219 }
220 
221 /* Count the number of underscore or space separated words in a string. */
222 
/*
 * Count the words in 's'.  A word boundary is any run of 'separator',
 * space or underscore characters; the result is the number of such runs
 * plus one, so leading and trailing runs are counted too and an empty
 * string yields 1.
 */
int cntwords(char *s, char separator)
{
    const char *cur;
    int runs = 0;
    int in_run = 0;

    for (cur = s; *cur != '\0'; cur++) {
        int is_sep = (*cur == separator || *cur == ' ' || *cur == '_');

        /* Only the first character of a separator run is counted. */
        if (is_sep && !in_run)
            runs++;
        in_run = is_sep;
    }
    return runs + 1;
}
237 
/* Convert string to lower case, and remove the trailing adjective marker if found */
239 
/*
 * Lower-case 'str' in place (ASCII 'A'..'Z' only) and cut the string at
 * the first '(' if there is one.  Returns 'str'.
 */
char *strtolower(char *str)
{
    char *cur;

    for (cur = str; *cur != '\0'; cur++) {
        if (*cur == '(') {
            /* Everything from the opening parenthesis on is dropped. */
            *cur = '\0';
            break;
        }
        if (*cur >= 'A' && *cur <= 'Z')
            *cur += 32;
    }
    return str;
}
255 
/*
 * Copy 'from' into 'to', lower-casing ASCII 'A'..'Z' and stopping at the
 * first '(' or at the end of the string.  'to' is always NUL-terminated
 * and must be large enough for the copy.  Returns 'to'.
 */
char *strtolower2(const char *from, char *to)
{
    char *out = to;

    for (;;) {
        char ch = *from;

        if (ch == '(') {
            /* The parenthesis ends the copy. */
            ch = '\0';
        } else {
            from++;
            if (ch >= 'A' && ch <= 'Z')
                ch = (char)(ch + 32);
        }

        *out++ = ch;
        if (ch == '\0')
            break;
    }
    return to;
}
271 
272 /* Convert string passed to lower case */
273 
/*
 * Lower-case 'str' in place.  Only ASCII 'A'..'Z' are changed; every
 * other character, including '(', is left alone.  Returns 'str'.
 */
char *ToLowerCase(char *str)
{
    size_t i;

    for (i = 0; str[i] != '\0'; i++) {
        char ch = str[i];

        if ('A' <= ch && ch <= 'Z')
            str[i] = (char)(ch + 32);
    }
    return str;
}
285 
/* Replace all occurrences of 'from' with 'to' in 'str' */
287 
/*
 * Replace every character equal to 'from' in 'str' with 'to', in place.
 * The terminating NUL is never examined, so from == '\0' changes nothing.
 * Returns 'str'.
 */
char *strsubst(char *str, char from, char to)
{
    char *cur = str;

    while (*cur != '\0') {
        if (*cur == from)
            *cur = to;
        cur++;
    }
    return str;
}
297 
/* Return pointer code for pointer type character passed. */
299 
300 unsigned short
getptrtype(const char * ptrstr,char ** end)301 getptrtype(const char *ptrstr, char **end)
302 {
303     unsigned short i;
304     const char *ptype, *pstr;
305     for(i = 1; i <= MAXPTR; i++) {
306 	ptype = ptrtyp[i];
307 	pstr = ptrstr;
308 	while (*pstr == *ptype && *ptype) {
309 	    pstr++;
310 	    ptype++;
311 	}
312 	if (*ptype == '\0' &&
313 	    (*pstr == '\n' || *pstr == ' ' || *pstr == '\0')) {
314 	    if (end)
315 		*end = __DECONST(char *, pstr);
316 	    return(i);
317 	}
318     }
319     fprintf(stderr, "Could not find the type of %s\n", ptrstr);
320     return(0);
321 }
322 
323 /* Return part of speech code for string passed */
324 
325 int
getpos(const char * s)326 getpos(const char *s)
327 {
328     switch (*s) {
329     case 'n':
330 	return(NOUN);
331     case 'a':
332     case 's':
333 	return(ADJ);
334     case 'v':
335 	return(VERB);
336     case 'r':
337 	return(ADV);
338     default:
339 	sprintf(msgbuf,
340 		"WordNet library error: unknown part of speech %s\n", s);
341 	display_message(msgbuf);
342 	exit(-1);
343     }
344 }
345 
346 /* Return synset type code for string passed. */
347 
348 int
getsstype(const char * s)349 getsstype(const char *s)
350 {
351     switch (*s) {
352     case 'n':
353 	return(NOUN);
354     case 'a':
355 	return(ADJ);
356     case 'v':
357 	return(VERB);
358     case 's':
359 	return(SATELLITE);
360     case 'r':
361 	return(ADV);
362     default:
363 	sprintf(msgbuf, "WordNet library error: Unknown synset type %s\n", s);
364 	display_message(msgbuf);
365 	exit(-1);
366     }
367 }
368 
369 /* Pass in string for POS, return corresponding integer value */
370 
371 int
StrToPos(const char * str)372 StrToPos(const char *str)
373 {
374     if (!strcmp(str, "noun"))
375 	return(NOUN);
376     else if (!strcmp(str, "verb"))
377 	return(VERB);
378     else if (!strcmp(str, "adj"))
379 	return(ADJ);
380     else if (!strcmp(str, "adv"))
381 	return(ADV);
382     else {
383 	return(-1);
384     }
385 }
386 
387 #define MAX_TRIES	5
388 
389 /* Find string for 'searchstr' as it is in index file */
390 
GetWNStr(char * searchstr,int dbase)391 char *GetWNStr(char *searchstr, int dbase)
392 {
393     register int i, j, k, offset = 0;
394     register char c;
395     char *underscore = NULL, *hyphen = NULL, *period = NULL;
396     static char strings[MAX_TRIES][WORDBUF];
397 
398     ToLowerCase(searchstr);
399 
400     if (!(underscore = strchr(searchstr, '_')) &&
401 	!(hyphen = strchr(searchstr, '-')) &&
402 	!(period = strchr(searchstr, '.')))
403 	return (strcpy(strings[0],searchstr));
404 
405     for(i = 0; i < 3; i++)
406 	strcpy(strings[i], searchstr);
407     if (underscore != NULL) strsubst(strings[1], '_', '-');
408     if (hyphen != NULL) strsubst(strings[2], '-', '_');
409     for(i = j = k = 0; (c = searchstr[i]) != '\0'; i++){
410 	if(c != '_' && c != '-') strings[3][j++] = c;
411 	if(c != '.') strings[4][k++] = c;
412     }
413     strings[3][j] = '\0';
414     strings[4][k] = '\0';
415 
416     for(i = 1; i < MAX_TRIES; i++)
417 	if(strcmp(strings[0], strings[i]) == 0) strings[i][0] = '\0';
418 
419     for (i = (MAX_TRIES - 1); i >= 0; i--)
420 	if (strings[i][0] != '\0')
421 	    if (bin_search(strings[i], indexfps[dbase]) != NULL)
422 		offset = i;
423 
424     return(strings[offset]);
425 }
426 
427 /* Return synset for sense key passed. */
428 
429 SynsetPtr
GetSynsetForSense(const char * sensekey)430 GetSynsetForSense(const char *sensekey)
431 {
432     long offset;
433 
434     /* Pass in sense key and return parsed sysnet structure */
435 
436     if ((offset = GetDataOffset(sensekey)))
437 	return(read_synset(GetPOS(sensekey),
438 			   offset,
439 			   GetWORD(sensekey)));
440     else
441 	return(NULL);
442 }
443 
444 /* Find offset of sense key in data file */
445 
446 long
GetDataOffset(const char * sensekey)447 GetDataOffset(const char *sensekey)
448 {
449     const char *line;
450 
451     /* Pass in encoded sense string, return byte offset of corresponding
452        synset in data file. */
453 
454     if (sensefp == NULL) {
455 	display_message("WordNet library error: Sense index file not open\n");
456 	return(0L);
457     }
458     line = bin_search(sensekey, sensefp);
459     if (line) {
460 	while (*line++ != ' ');
461 	return(atol(line));
462     } else
463 	return(0L);
464 }
465 
466 /* Find polysemy count for sense key passed. */
467 
468 int
GetPolyCount(const char * sensekey)469 GetPolyCount(const char *sensekey)
470 {
471     IndexPtr idx;
472     int sense_cnt = 0;
473 
474     /* Pass in encoded sense string and return polysemy count
475        for word in corresponding POS */
476 
477     idx = index_lookup(GetWORD(sensekey), GetPOS(sensekey));
478     if (idx) {
479 	sense_cnt = idx->sense_cnt;
480 	free_index(idx);
481     }
482     return(sense_cnt);
483 }
484 
485 /* Return word part of sense key */
/*
 * Return the word part of an encoded sense key, i.e. everything before
 * the first '%'.
 *
 * The result is stored in a static buffer that the next call overwrites.
 * Copying stops at the '%', at the end of the string, or when the buffer
 * is full (99 characters), whichever comes first; a key without '%' or
 * with an over-long word can therefore no longer run off either buffer.
 */
const char *
GetWORD(const char *sensekey)
{
    static char word[100];
    size_t i = 0;

    while (i < sizeof(word) - 1 &&
           sensekey[i] != '\0' && sensekey[i] != '%') {
        word[i] = sensekey[i];
        i++;
    }
    word[i] = '\0';
    return(word);
}
498 
499 /* Return POS code for sense key passed. */
500 
501 int
GetPOS(const char * sensekey)502 GetPOS(const char *sensekey)
503 {
504     int pos;
505 
506     /* Pass in encoded sense string and return POS */
507 
508     while (*sensekey++ != '%');	/* skip over WORD */
509     sscanf(sensekey, "%1d", &pos);
510     return(pos == SATELLITE ? ADJ : pos);
511 }
512 
513 /* Reconstruct synset from synset pointer and return ptr to buffer */
514 
515 const char *
FmtSynset(SynsetPtr synptr,int defn)516 FmtSynset(SynsetPtr synptr, int defn)
517 {
518     int i;
519     static char synset[SMLINEBUF];
520 
521     synset[0] = '\0';
522 
523     if (fileinfoflag)
524 	sprintf(synset, "<%s> ", lexfiles[synptr->fnum]);
525 
526     strcat(synset, "{ ");
527     for (i = 0; i < (synptr->wcount - 1); i++)
528 	sprintf(synset + strlen(synset), "%s, ", synptr->words[i]);
529 
530     strcat(synset, synptr->words[i]);
531 
532     if (defn && synptr->defn)
533 	sprintf(synset + strlen(synset), " (%s) ", synptr->defn);
534 
535     strcat(synset, " }");
536     return(synset);
537 }
538 
539 /* Convert WordNet sense number passed of IndexPtr entry to sense key. */
/*
 * Build the sense key for sense number 'sense' (1-based) of index entry
 * 'idx'.
 *
 * The synset for that sense is read, and for a satellite synset the head
 * word and head sense are taken from the first SIMPTR pointer.  The key
 * has the form
 *     word%ss_type:lex_filenum:lex_id:head_word:head_id
 * with the last two fields empty for non-satellites.
 *
 * Returns a newly allocated string that the caller must free(), or NULL
 * when the synset cannot be read, when idx->wd is not one of its words,
 * or when a satellite's head word cannot be determined.  The unreadable
 * synset and missing head word cases used to dereference NULL.
 * NOTE(review): the head word check assumes read_synset() leaves
 * 'headword' NULL when it has not set one -- confirm.
 */
char *WNSnsToStr(IndexPtr idx, int sense)
{
    SynsetPtr sptr, adjss;
    char sensekey[512], lowerword[256];
    int j, sstype, pos;

    pos = getpos(idx->pos);
    sptr = read_synset(pos, idx->offset[sense - 1], "");
    if (sptr == NULL)
        return(NULL);

    if ((sstype = getsstype(sptr->pos)) == SATELLITE) {
        for (j = 0; j < sptr->ptrcount; j++) {
            if (sptr->ptrtyp[j] == SIMPTR) {
                adjss = read_synset(sptr->ppos[j], sptr->ptroff[j], "");
                if (adjss == NULL)
                    break;
                sptr->headword = malloc(strlen(adjss->words[0]) + 1);
                assert(sptr->headword);
                strcpy(sptr->headword, adjss->words[0]);
                strtolower(sptr->headword);
                sptr->headsense = adjss->lexid[0];
                free_synset(adjss);
                break;
            }
        }
        if (sptr->headword == NULL) {
            free_synset(sptr);
            return(NULL);
        }
    }

    /* Find the position of the index word among the synset's words. */
    for (j = 0; j < sptr->wcount; j++) {
        snprintf(lowerword, sizeof(lowerword), "%s", sptr->words[j]);
        strtolower(lowerword);
        if (!strcmp(lowerword, idx->wd))
            break;
    }

    if (j == sptr->wcount) {
        free_synset(sptr);
        return(NULL);
    }

    if (sstype == SATELLITE)
        snprintf(sensekey, sizeof(sensekey),
                 "%s%%%-1.1d:%-2.2d:%-2.2d:%s:%-2.2d",
                 idx->wd, SATELLITE, sptr->fnum,
                 sptr->lexid[j], sptr->headword, sptr->headsense);
    else
        snprintf(sensekey, sizeof(sensekey),
                 "%s%%%-1.1d:%-2.2d:%-2.2d::",
                 idx->wd, pos, sptr->fnum, sptr->lexid[j]);

    free_synset(sptr);
    return(strdup(sensekey));
}
587 
588 /* Return sense number in database for word and lexsn passed. */
589 
590 int
GetWNSense(const char * word,const char * lexsn)591 GetWNSense(const char *word, const char *lexsn)
592 {
593     SnsIndexPtr snsidx;
594     char buf[256];
595 
596     sprintf(buf, "%s%%%s", word, lexsn); /* create sensekey */
597     if ((snsidx = GetSenseIndex(buf)) != NULL)
598 	return(snsidx->wnsense);
599     else
600 	return(0);
601 }
602 
603 /* Return parsed sense index entry for sense key passed. */
604 
605 SnsIndexPtr
GetSenseIndex(const char * sensekey)606 GetSenseIndex(const char *sensekey)
607 {
608     const char *line;
609     char buf[256], loc[9];
610     SnsIndexPtr snsidx = NULL;
611 
612     if ((line = bin_search(sensekey, sensefp)) != NULL) {
613 	snsidx = (SnsIndexPtr)malloc(sizeof(SnsIndex));
614 	assert(snsidx);
615 	sscanf(line, "%s %s %d %d\n",
616 	       buf,
617 	       loc,
618 	       &snsidx->wnsense,
619 	       &snsidx->tag_cnt);
620 	snsidx->sensekey = malloc(strlen(buf + 1));
621 	assert(snsidx->sensekey);
622 	strcpy(snsidx->sensekey, buf);
623 	snsidx->loc = atol(loc);
624 	/* Parse out word from sensekey to make things easier for caller */
625 	snsidx->word = strdup(GetWORD(snsidx->sensekey));
626 	assert(snsidx->word);
627 	snsidx->nextsi = NULL;
628     }
629     return(snsidx);
630 }
631 
632 /* Return number of times sense is tagged */
633 
/*
 * Return the number of times sense 'sense' of index entry 'idx' was
 * tagged, as recorded in the count list file.
 *
 * Returns 0 when the count list file is not open, when no sense key can
 * be built for the sense, or when the key has no entry in the file.
 * A NULL key from WNSnsToStr() used to be passed on to bin_search().
 */
int GetTagcnt(IndexPtr idx, int sense)
{
    char *sensekey;
    const char *line;
    int snum, cnt = 0;

    if (!cntlistfp)
        return(0);

    sensekey = WNSnsToStr(idx, sense);
    if (sensekey == NULL)
        return(0);

    /* Line layout: key, sense number, count.  The key itself is not
       needed, so it is skipped rather than copied into a buffer. */
    if ((line = bin_search(sensekey, cntlistfp)) != NULL)
        (void)sscanf(line, "%*s %d %d", &snum, &cnt);

    free(sensekey);
    return(cnt);
}
652 
/*
 * Release a sense index record and the strings it owns.  NULL is accepted
 * and ignored.
 *
 * The key string is freed as well as the word: GetSenseIndex() allocates
 * both, and the key used to be leaked.
 * NOTE(review): this assumes every record handed to this function has a
 * heap-allocated (or NULL) 'sensekey', as GetSenseIndex() produces --
 * confirm for records built elsewhere.
 */
void FreeSenseIndex(SnsIndexPtr snsidx)
{
    if (snsidx) {
        free(snsidx->sensekey);
        free(snsidx->word);
        free(snsidx);
    }
}
660 
661 const char *
GetOffsetForKey(unsigned int key)662 GetOffsetForKey(unsigned int key)
663 {
664     unsigned int rkey;
665     char ckey[7];
666     static char loc[11] = "";
667     const char *line;
668     char searchdir[256], tmpbuf[256];
669 
670     /* Try to open file in case wn_init wasn't called */
671 
672     if (!keyindexfp) {
673 	strcpy(searchdir, SetSearchdir());
674 	sprintf(tmpbuf, KEYIDXFILE, searchdir);
675 	keyindexfp = fopen(tmpbuf, "r");
676     }
677     if (keyindexfp) {
678 	sprintf(ckey, "%6.6d", key);
679 	if ((line = bin_search(ckey, keyindexfp)) != NULL) {
680 	    sscanf(line, "%d %s", &rkey, loc);
681 	    return(loc);
682 	}
683     }
684     return(NULL);
685 }
686 
687 unsigned int
GetKeyForOffset(const char * loc)688 GetKeyForOffset(const char *loc)
689 {
690     unsigned int key;
691     char rloc[11] = "";
692     const char *line;
693     char searchdir[256], tmpbuf[256];
694 
695     /* Try to open file in case wn_init wasn't called */
696 
697     if (!revkeyindexfp) {
698 	strcpy(searchdir, SetSearchdir());
699 	sprintf(tmpbuf, REVKEYIDXFILE, searchdir);
700 	revkeyindexfp = fopen(tmpbuf, "r");
701     }
702     if (revkeyindexfp) {
703 	if ((line = bin_search(loc, revkeyindexfp)) != NULL) {
704 	    sscanf(line, "%s %d", rloc, &key );
705 	    return(key);
706 	}
707     }
708     return(0);
709 }
710 
711 const char *
SetSearchdir()712 SetSearchdir()
713 {
714     char *searchdir;
715     const char *env;
716 
717     /* Find base directory for database.  If set, use WNSEARCHDIR.
718        If not set, check for WNHOME/dict, otherwise use DEFAULTPATH. */
719 
720     if ((env = getenv("WNSEARCHDIR")) != NULL)
721 	return(env);
722     else if ((env = getenv("WNHOME")) != NULL) {
723 	searchdir = malloc(strlen(env) + sizeof(DICTDIR));
724 	sprintf(searchdir, "%s%s", env, DICTDIR);
725 	return(searchdir);
726     } else
727 	return(DEFAULTPATH);
728 }
729 
730 #ifndef __GNUC__
731 #	define __unused
732 #endif
/*
 * Message handler that discards 'msg' and reports failure by
 * returning -1.
 */
int default_display_message(const char *msg __unused)
{
    const int not_displayed = -1;

    return not_displayed;
}
737 
/*
** Wrapper functions for strstr that allow you to retrieve each
** occurrence of a word within a longer string, not just the first.
**
** strstr_init is called with the same arguments as normal strstr,
** but does not return any value.
**
** strstr_getnext returns the position offset (not a pointer, as does
** normal strstr) of the next occurrence, or -1 if none remain.
*/
748 
strstr_init(char * string,char * word)749 void strstr_init (char *string, char *word) {
750    strstr_word = word;
751    strstr_stringstart = string;
752    strstr_stringcurrent = string;
753 }
754 
strstr_getnext(void)755 int strstr_getnext (void) {
756    char *loc = strstr (strstr_stringcurrent, strstr_word);
757    if (loc == NULL) return -1;
758    strstr_stringcurrent = loc + 1;
759    return (loc - strstr_stringstart);
760 }
761