1 /* -- lexicon.c
2 
3 This file reads the lexicon definitions into a chained
4 hash table and handles the lookups of words in the hash table,
5 returning definitions in the form of an input symbol and a
6 standardized text.
7 
8 Prototype 7H08 (This file was written by Walter Sinclair).
9 
10 This file is part of pagc.
11 
12 Copyright (c) 2008 Walter Bruce Sinclair
13 
14 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 
20 */
21 /* For pagc-0.4.2 : last revised 2012-05-23 */
22 
23 #undef DEBUG
24 //#define DEBUG
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stddef.h>
30 #include <ctype.h>
31 #include "pagc_api.h"
32 
33 /* -- Hash table size should be a prime number -- */
34 /* 5581, 5953, 6337, 6733, 7561, 7993, 8893, 10333, 10837, 11353, 12421, 12973, 13537, 15913, 18481  */
35 #define LEXICON_HTABSIZE 7561
36 
37 #ifdef BUILD_API
38 #include "pagc_std_api.h"
39 #endif
40 
41 /* -- local prototypes -- */
42 static unsigned calc_hash( char * ) ;
43 static ENTRY **create_hash_table( ERR_PARAM * ) ;
44 static int add_dict_entry( ERR_PARAM *, ENTRY ** , char * , int , SYMB , char * ) ;
45 
46 #ifndef BUILD_API
47 static char *convert_field( char * , char * ) ;
48 static int read_lexicon( ERR_PARAM *, ENTRY ** , FILE * ) ;
49 #endif
50 
51 LEXICON *lex_init( ERR_PARAM *err_p ) ;
52 static int append_new_def( ERR_PARAM *, ENTRY * , SYMB , char * , int ) ;
53 static unsigned elf_hash( char * ) ;
54 void print_lexicon( ENTRY ** hash_table ) ;
55 
56 #ifdef BUILD_API
57 
58 /*
59 typedef struct LEXICON_s {
60     ENTRY **hash_table;
61     ERR_PARAM *err_p;
62 } LEXICON;
63 
64 */
65 
lex_init(ERR_PARAM * err_p)66 LEXICON *lex_init( ERR_PARAM *err_p )
67 {
68     LEXICON *lex;
69 
70     PAGC_CALLOC_STRUC(lex,LEXICON,1,err_p,NULL);
71 
72     lex->hash_table = create_hash_table( err_p );
73     if (lex->hash_table == NULL) {
74         lex_free(lex);
75         return NULL;
76     }
77 
78     lex->err_p = err_p;
79 
80     return lex;
81 }
82 
/* ----------------------------------------------------
lexicon.c (lex_add_entry)
add one definition (token, stdword) for word to the lexicon.
seq is lowered by one before it is stored as the definition
order, the same shift read_lexicon applies to the number it
reads from a lexicon file.
returns whatever add_dict_entry returns.
-------------------------------------------------------*/
int lex_add_entry(LEXICON *lex, int seq, char *word, char *stdword, SYMB token)
{
    int def_order = seq - 1;

    return add_dict_entry( lex->err_p,
                           lex->hash_table,
                           word,
                           def_order,
                           token,
                           stdword );
}
87 
/* ----------------------------------------------------
lexicon.c (lex_free)
release a LEXICON : first its hash table and everything chained
off it (destroy_lexicon), then the LEXICON itself.
a NULL lex is accepted and ignored.
lex is passed by value, so the caller's own pointer is left
dangling after this call and must not be used again.
-------------------------------------------------------*/
void lex_free(LEXICON *lex)
{
    if (lex == NULL) return;
    destroy_lexicon(lex->hash_table);
    free(lex);
}
95 
96 
97 
98 #else
99 
100 /* ---------------------------------------------------------------------
101 lexicon.c (create_lexicon) -
102 read the lexicon file into memory, chaining off a hash table
103 returns a pointer to the hash table, or NULL if error.
104 called by standard.l (init_stand_process)
105 calls util.c (open_aux_file) lexicon.c (read_lexicon, create_hash_table)
106 uses macro LOG_MESS
107 stdio.h (fclose)
108 -----------------------------------------------------------------------*/
create_lexicon(PAGC_GLOBAL * glo_p,const char * lex_name,const char * gaz_name)109 ENTRY **create_lexicon( PAGC_GLOBAL *glo_p ,
110                         const char *lex_name ,
111                         const char *gaz_name ) {
112    /* -- called by init_stand_process to read in the Lexicon and set up the
113       definitions in memory for hash table access -- */
114    FILE *gaz_file ,
115         *dict_file ;
116    ENTRY **hash_table ;
117 
118    if ( (hash_table = create_hash_table( glo_p -> process_errors ) ) == NULL ) {
119        return NULL ;
120    }
121    /* 2009-08-13 : support multiple lexicons */
122    if ( gaz_name != NULL ) {
123       if ( ( gaz_file = open_aux_file( glo_p ,
124                                        gaz_name ) ) == NULL )
125          return NULL ;
126       if ( !read_lexicon( glo_p -> process_errors ,
127                           hash_table ,
128                           gaz_file ) ) {
129           fclose( gaz_file ) ;
130           return NULL ;
131       }
132       fclose( gaz_file ) ;
133    }
134 
135    if ( ( dict_file = open_aux_file( glo_p ,
136                                      lex_name ) ) == NULL )
137       return NULL  ;
138    if ( !read_lexicon( glo_p -> process_errors ,
139                        hash_table ,
140                        dict_file ) ) {
141        fclose( dict_file ) ;
142        return NULL ;
143    }
144    fclose( dict_file ) ;
145    return hash_table ;
146 }
147 
148 /* ----------------------------------------------------
149 lexicon.c (read_lexicon) -
150 called by lexicon.c (create_lexicon) for each file
151 calls convert_field, add_dict_entry
152 returns FALSE if error encountered
153 stdio.h (fgets,feof,sscanf)
154 uses macro BLANK_STRING
155 -------------------------------------------------------*/
read_lexicon(ERR_PARAM * err_p,ENTRY ** hash_table,FILE * CFile)156 static int read_lexicon( ERR_PARAM *err_p ,
157                          ENTRY **hash_table ,
158                          FILE *CFile ) {
159    char record_buffer[ MAXSTRLEN ] ;
160    char lookup_str[ MAXTEXT ] ;
161    char num_str[ MAXTEXT ] ;
162    int cur_token ;
163    int num_def ;
164    char standard_str[ MAXTEXT ] ;
165    char *next_str ;
166 
167    while ( !feof( CFile ) ) {
168       /* -- read in each line of the csv file and add to hash table -- */
169       BLANK_STRING(record_buffer) ;
170       fgets( record_buffer ,
171              MAXSTRLEN ,
172              CFile ) ;
173 
174 #ifdef SEW_NOT_SURE_IF_WE_NEED_THIS
175       /* -- check for and skip over blank lines -- */
176       if (strspn(record_buffer, " \t\r\n") == strlen(record_buffer))
177          continue;
178 #endif
179 
180       /* -- comma-separated values are handled only as well as necessary
181          in the present context -- */
182       if ( ( next_str =
183                 convert_field( num_str ,
184                                record_buffer ) ) == NULL ) {
185          break ;
186       }
187       sscanf( num_str ,
188               "%d" ,
189               &num_def ) ;
190       next_str = convert_field( lookup_str ,
191                                 next_str ) ;
192       next_str = convert_field( num_str ,
193                                 next_str ) ;
194       sscanf( num_str ,
195               "%d" ,
196               &cur_token ) ;
197       next_str = convert_field( standard_str ,
198                                 next_str ) ;
199       if ( add_dict_entry( err_p ,
200                            hash_table ,
201                            lookup_str ,
202                            ( num_def - 1 ) ,
203                            cur_token ,
204                            standard_str ) == ERR_FAIL ) {
205          return FALSE ;
206       }
207    }
208    return TRUE ;
209 }
210 
211 /* ----------------------------------------------------
212 lexicon.c (convert_field)
213 called by lexicon.c (read_lexicon)
214 ctype.h (isspace)
215 uses macro BLANK_STRING
216 -------------------------------------------------------*/
convert_field(char * buf,char * inp)217 static char *convert_field( char *buf ,
218                             char *inp ) {
219    char c ;
220    char *d  = buf;
221    char *s = inp ;
222 
223    BLANK_STRING(d) ;
224    /* -- space at the beginning of a line will stop the read -- */
225    if ( isspace( *s ) )
226       return NULL ;
227    while ( ( c = *s++ ) != SENTINEL ) {
228       if ( c == '\"' ||
229            c == '\r' )
230          continue ; /* -- ignore quotes and carriage returns -- */
231       /* -- zero terminate field and record delimiters -- */
232       if ( c == '\n' ||
233            c == ',' ) {
234           BLANK_STRING(d) ;
235           return s ;
236       }
237       *d++ = c ; /* -- copy it -- */
238    }
239    return NULL ;
240 }
241 
242 #endif
243 
244 /* ----------------------------------------------------
245 lexicon.c (destroy_lexicon)
246 called by standard.l (close_stand_process)
247 calls lexicon.c (destroy_def_list)
248 uses macro FREE_AND_NULL
249 -------------------------------------------------------*/
destroy_lexicon(ENTRY ** hash_table)250 void destroy_lexicon(ENTRY ** hash_table)
251 {
252 	/* -- called by Clean-Up - */
253 	unsigned __i__ ;
254 	ENTRY *__E__,*__F__ ;
255 	if (hash_table == NULL)
256 	{
257 		return ;
258 	}
259 	for (__i__ = 0 ;__i__ < LEXICON_HTABSIZE ;__i__++ )
260 	{
261 		for (__E__ = hash_table[__i__] ;__E__ != NULL ;__E__ = __F__)
262 		{
263 			destroy_def_list(__E__->DefList) ;
264 			__F__ = __E__->Next ;
265 			FREE_AND_NULL(__E__->Lookup) ;
266 			FREE_AND_NULL(__E__) ;
267 		}
268 	}
269     DBG("destroy_lexicon: i=%d", __i__);
270 	/* <revision date='2012-05-23'>free hash table</revision> */
271 	FREE_AND_NULL(hash_table);
272     DBG("leaving destroy_lexicon");
273 }
274 
275 
276 /* ----------------------------------------------------------
277 lexicon.c (destroy_def_list)
278 called by destroy_lexicon and tokenize.c (remove_default_defs)
279 uses macro FREE_AND_NULL
280 ------------------------------------------------------------*/
destroy_def_list(DEF * start_def)281 void destroy_def_list( DEF *start_def ) {
282    DEF *cur_def ;
283    DEF *next_def = NULL ;
284 
285 
286 
287    for ( cur_def = start_def ;
288          cur_def != NULL ;
289          cur_def = next_def ) {
290       next_def = cur_def -> Next ;
291       /* -- Default definitions have no associated text -- */
292       if ( cur_def -> Protect == 0 ) {
293          FREE_AND_NULL( cur_def -> Standard ) ;
294       }
295       FREE_AND_NULL( cur_def ) ;
296    }
297 }
298 
299 /* ----------------------------------------------------
300 lexicon.c (find_entry)
301 called by lexicon.c (add_dict_entry)
302 calls lexicon.c (calc_hash)
303 string.h (strcmp)
304 -------------------------------------------------------*/
find_entry(ENTRY ** hash_table,char * lookup_str)305 ENTRY *find_entry(ENTRY **hash_table,char *lookup_str)
306 {
307 	/* -- called to create a lexeme -- */
308 	ENTRY *__E__ ;
309 	unsigned __hash_index__ ; /* -- 2006-11-20 : to return hash table pointer -- */
310 
311 	__hash_index__ = calc_hash(lookup_str) ;
312 	for (__E__ = hash_table[__hash_index__] ; __E__ != NULL ; __E__ = __E__->Next)
313 	{
314 		if (strcmp(lookup_str,__E__->Lookup) == 0)
315 		{
316 			return __E__ ;
317 		}
318 	}
319 	return __E__ ;
320 }
321 
#define US sizeof( unsigned )
/* ----------------------------------------------------
lexicon.c (elf_hash)
fold the characters of key_str into an unsigned value : shift the
running value left by US bits, add the character, and whenever
any of the top US bits become set, xor them back in ( US * 6 )
bits lower and clear them.
returns 0 for an empty string.
called by lexicon.c (calc_hash)
-------------------------------------------------------*/
static unsigned elf_hash( char *key_str ) {
  /* -- the top US bits of an unsigned -- */
  const unsigned top_bits = ~ ( ( unsigned )( ~0 ) >> US ) ;
  unsigned hash = 0 ;
  char *cursor ;

  for ( cursor = key_str ; *cursor != '\0' ; cursor++ ) {
     unsigned spill ;

     hash = ( hash << US ) + ( unsigned ) *cursor ;
     spill = hash & top_bits ;
     if ( spill != 0 ) {
        hash ^= spill >> ( US * 6 ) ;
        hash &= ~spill ;
     }
  }
  return hash ;
}
342 
343 
344 /* ----------------------------------------------------
345 lexicon.c (calc_hash)
346 called by lexicon.c (find_entry, add_dict_entry)
347 calls lexicon.c (elf_hash)
348 -------------------------------------------------------*/
349 
calc_hash(char * key_str)350 static unsigned calc_hash( char *key_str ) {
351   unsigned h ;
352 
353   h = elf_hash( key_str ) ;
354   return ( h  % LEXICON_HTABSIZE ) ;
355 }
356 
357 /* ----------------------------------------------------
358 lexicon.c (create_hash_table)
359 allocate and initialize hash table in memory
360 return NULL if error
361 called by create_lexicon
362 uses macro PAGC_CALLOC_STRUC
363 -------------------------------------------------------*/
create_hash_table(ERR_PARAM * err_p)364 static ENTRY **create_hash_table(ERR_PARAM *err_p)
365 {
366 	unsigned __i__ ;
367 	ENTRY **__hash_table__ ;
368 	PAGC_CALLOC_STRUC(__hash_table__,ENTRY *,LEXICON_HTABSIZE,err_p,NULL) ;
369 	for (__i__ = 0 ;__i__ < LEXICON_HTABSIZE ;__i__++ )
370 	{
371 		__hash_table__[__i__] = NULL ;
372 	}
373 	return __hash_table__ ;
374 }
375 
376 /* ----------------------------------------------------
377 lexicon.c (add_dict_entry)
378 called by lexicon.c (read_lexicon)
379 calls lexicon.c (calc_hash, create_def, append_new_def)
380 uses macro PAGC_ALLOC_STRUC , PAGC_STORE_STR, RET_ERR
381 return ERR_FAIL if error
382 -------------------------------------------------------*/
add_dict_entry(ERR_PARAM * err_p,ENTRY ** hash_table,char * lookup_str,int def_num,SYMB t,char * standard_str)383 static int add_dict_entry( ERR_PARAM *err_p ,
384                            ENTRY **hash_table ,
385                            char *lookup_str ,
386                            int def_num ,
387                            SYMB t ,
388                            char *standard_str ) {
389    ENTRY *E ;
390 
391    E = find_entry( hash_table ,
392                    lookup_str ) ;
393    if ( E == NULL ) {
394       unsigned hash_index ;
395 
396       PAGC_ALLOC_STRUC(E,ENTRY,err_p,ERR_FAIL);
397       /* -- add the Lookup string to the record -- */
398       PAGC_STORE_STR(E->Lookup,lookup_str,err_p,ERR_FAIL) ;
399       /* -- add new entry to beginning of table -- */
400       hash_index = calc_hash( lookup_str ) ;
401 
402       E -> Next = hash_table[ hash_index ] ; /* -- collision chain -- */
403       hash_table[ hash_index ] = E ;
404       if ( ( E -> DefList = create_def( t ,
405                                         standard_str ,
406                                         def_num ,
407                                         FALSE ,
408                                         err_p ) ) == NULL ) {
409           return ERR_FAIL ;
410       }
411   } else {
412       int err_stat ;
413       if ( E -> DefList == NULL ) {
414          RET_ERR("add_dict_entry: Lexical entry lacks definition" ,
415                  err_p ,
416                  ERR_FAIL ) ;
417       }
418       if ( ( err_stat = append_new_def( err_p ,
419                                         E ,
420                                         t ,
421                                         standard_str ,
422                                         def_num ) ) != TRUE ) {
423          return err_stat ;
424       }
425    }
426    return TRUE ;
427 }
428 
429 /* ----------------------------------------------------
430 lexicon.c (append_new_def)
431 called by lexicon.c (add_dict_entry)
432 calls lexicon.c (create_def)
433 returns FALSE if entry is already there
434 returns ERR_FAIL on allocation error
435 -------------------------------------------------------*/
append_new_def(ERR_PARAM * err_p,ENTRY * E,SYMB t,char * text,int def_num)436 static int append_new_def( ERR_PARAM *err_p ,
437                            ENTRY *E ,
438                            SYMB t ,
439                            char *text ,
440                            int def_num ) {
441 
442    DEF *D,
443        *pd,
444        *cd ;
445    for ( cd = E -> DefList , pd = NULL ;
446          cd != NULL ;
447          cd = cd -> Next ) {
448       pd = cd ;
449       /* -- avoid duplication except for local entries -- */
450       if ( cd -> Type == t ) {
451          return FALSE ;
452       }
453    }
454    if ( ( D = create_def( t ,
455                           text ,
456                           def_num ,
457                           FALSE ,
458                           err_p ) ) == NULL ) {
459        return ERR_FAIL ;
460    }
461    if ( pd == NULL ) {
462       E -> DefList = D ;
463    } else {
464       D -> Next = pd -> Next ;
465       pd -> Next = D ;
466    }
467    return TRUE ;
468 }
469 
470 /*--------------------------------------------------------------------
471 lexicon.c (create_def)
472 called by lexicon.c (append_new_def) tokenize.c (setup_default_defs)
473 allocate memory for lexicon entry.
474 Pflag is TRUE for default entries
475 returns NULL for allocation error
476 uses macro PAGC_ALLOC_STRUC, PAGC_STORE_STR
477 -------------------------------------------------------------------- */
create_def(SYMB s,char * standard_str,int def_num,int PFlag,ERR_PARAM * err_p)478 DEF *create_def ( SYMB s ,
479                   char *standard_str ,
480                   int def_num ,
481                   int PFlag ,
482                   ERR_PARAM *err_p ) {
483    /* -- allocate the memory and set up the definition structure with the
484       standard form -- */
485    DEF *cur_def ;
486 
487    /* -- initialization-time allocation -- */
488    PAGC_ALLOC_STRUC(cur_def,DEF,err_p,NULL) ;
489    cur_def -> Type = s ;
490    cur_def -> Protect = PFlag ; /* -- False for definitions from lexicon
491                                    true for default definitions -- */
492    if ( !PFlag ) {
493       /* -- initialization-time allocation -- */
494       PAGC_STORE_STR(cur_def->Standard,standard_str,err_p,NULL) ;
495    } else
496       cur_def -> Standard = NULL ;
497    cur_def -> Order = def_num ;
498    cur_def -> Next = NULL ;
499    return cur_def ;
500 }
501 
502 /*--------------------------------------------------------------------
503 lexicon.c (print_lexicon)
504 not called by useful for debugging. It will print out the lexicon.
505 --------------------------------------------------------------------*/
print_lexicon(ENTRY ** hash_table)506 void print_lexicon( ENTRY ** hash_table )
507 {
508     unsigned i;
509     ENTRY *E;
510 
511     if (!hash_table) return;
512 
513     for (i=0; i< LEXICON_HTABSIZE; i++)
514     {
515         E = hash_table[i];
516         while (E)
517         {
518             DEF *D = E->DefList;
519             printf("'%s'\n", E->Lookup);
520             while (D)
521             {
522                 printf("    %d, %d, %d, '%s'\n", D->Order, D->Type, D->Protect, D->Standard);
523                 D = D->Next;
524             }
525             E = E->Next;
526         }
527     }
528 }
529 
530