1 /* -- lexicon.c
2
3 This file reads the lexicon definitions into a chained
4 hash table and handles the lookups of words in the hash table,
5 returning definitions in the form of an input symbol and a
6 standardized text.
7
8 Prototype 7H08 (This file was written by Walter Sinclair).
9
10 This file is part of pagc.
11
12 Copyright (c) 2008 Walter Bruce Sinclair
13
14 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19
20 */
21 /* For pagc-0.4.2 : last revised 2012-05-23 */
22
23 #undef DEBUG
24 //#define DEBUG
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stddef.h>
30 #include <ctype.h>
31 #include "pagc_api.h"
32
33 /* -- Hash table size should be a prime number -- */
34 /* 5581, 5953, 6337, 6733, 7561, 7993, 8893, 10333, 10837, 11353, 12421, 12973, 13537, 15913, 18481 */
35 #define LEXICON_HTABSIZE 7561
36
37 #ifdef BUILD_API
38 #include "pagc_std_api.h"
39 #endif
40
41 /* -- local prototypes -- */
42 static unsigned calc_hash( char * ) ;
43 static ENTRY **create_hash_table( ERR_PARAM * ) ;
44 static int add_dict_entry( ERR_PARAM *, ENTRY ** , char * , int , SYMB , char * ) ;
45
46 #ifndef BUILD_API
47 static char *convert_field( char * , char * ) ;
48 static int read_lexicon( ERR_PARAM *, ENTRY ** , FILE * ) ;
49 #endif
50
51 LEXICON *lex_init( ERR_PARAM *err_p ) ;
52 static int append_new_def( ERR_PARAM *, ENTRY * , SYMB , char * , int ) ;
53 static unsigned elf_hash( char * ) ;
54 void print_lexicon( ENTRY ** hash_table ) ;
55
56 #ifdef BUILD_API
57
58 /*
59 typedef struct LEXICON_s {
60 ENTRY **hash_table;
61 ERR_PARAM *err_p;
62 } LEXICON;
63
64 */
65
lex_init(ERR_PARAM * err_p)66 LEXICON *lex_init( ERR_PARAM *err_p )
67 {
68 LEXICON *lex;
69
70 PAGC_CALLOC_STRUC(lex,LEXICON,1,err_p,NULL);
71
72 lex->hash_table = create_hash_table( err_p );
73 if (lex->hash_table == NULL) {
74 lex_free(lex);
75 return NULL;
76 }
77
78 lex->err_p = err_p;
79
80 return lex;
81 }
82
int lex_add_entry(LEXICON *lex, int seq, char *word, char *stdword, SYMB token)
{
    /* -- add one definition (token, stdword) for word to the lexicon's
          hash table. The definition number handed to add_dict_entry
          is seq - 1. The result of add_dict_entry is returned as is -- */
    int def_num = seq - 1 ;

    return add_dict_entry( lex -> err_p ,
                           lex -> hash_table ,
                           word ,
                           def_num ,
                           token ,
                           stdword ) ;
}
87
void lex_free(LEXICON *lex)
{
    /* -- release a lexicon built by lex_init : first the hash table with
          all of its entries, then the LEXICON structure itself.
          A NULL lex is ignored.
          lex is passed by value, so the caller's own pointer cannot be
          cleared from here ; callers must not use it after this call -- */
    if (lex == NULL) return;
    destroy_lexicon(lex->hash_table);
    free(lex);
}
95
96
97
98 #else
99
100 /* ---------------------------------------------------------------------
101 lexicon.c (create_lexicon) -
102 read the lexicon file into memory, chaining off a hash table
103 returns a pointer to the hash table, or NULL if error.
104 called by standard.l (init_stand_process)
105 calls util.c (open_aux_file) lexicon.c (read_lexicon, create_hash_table)
106 uses macro LOG_MESS
107 stdio.h (fclose)
108 -----------------------------------------------------------------------*/
create_lexicon(PAGC_GLOBAL * glo_p,const char * lex_name,const char * gaz_name)109 ENTRY **create_lexicon( PAGC_GLOBAL *glo_p ,
110 const char *lex_name ,
111 const char *gaz_name ) {
112 /* -- called by init_stand_process to read in the Lexicon and set up the
113 definitions in memory for hash table access -- */
114 FILE *gaz_file ,
115 *dict_file ;
116 ENTRY **hash_table ;
117
118 if ( (hash_table = create_hash_table( glo_p -> process_errors ) ) == NULL ) {
119 return NULL ;
120 }
121 /* 2009-08-13 : support multiple lexicons */
122 if ( gaz_name != NULL ) {
123 if ( ( gaz_file = open_aux_file( glo_p ,
124 gaz_name ) ) == NULL )
125 return NULL ;
126 if ( !read_lexicon( glo_p -> process_errors ,
127 hash_table ,
128 gaz_file ) ) {
129 fclose( gaz_file ) ;
130 return NULL ;
131 }
132 fclose( gaz_file ) ;
133 }
134
135 if ( ( dict_file = open_aux_file( glo_p ,
136 lex_name ) ) == NULL )
137 return NULL ;
138 if ( !read_lexicon( glo_p -> process_errors ,
139 hash_table ,
140 dict_file ) ) {
141 fclose( dict_file ) ;
142 return NULL ;
143 }
144 fclose( dict_file ) ;
145 return hash_table ;
146 }
147
148 /* ----------------------------------------------------
149 lexicon.c (read_lexicon) -
150 called by lexicon.c (create_lexicon) for each file
151 calls convert_field, add_dict_entry
152 returns FALSE if error encountered
153 stdio.h (fgets,feof,sscanf)
154 uses macro BLANK_STRING
155 -------------------------------------------------------*/
read_lexicon(ERR_PARAM * err_p,ENTRY ** hash_table,FILE * CFile)156 static int read_lexicon( ERR_PARAM *err_p ,
157 ENTRY **hash_table ,
158 FILE *CFile ) {
159 char record_buffer[ MAXSTRLEN ] ;
160 char lookup_str[ MAXTEXT ] ;
161 char num_str[ MAXTEXT ] ;
162 int cur_token ;
163 int num_def ;
164 char standard_str[ MAXTEXT ] ;
165 char *next_str ;
166
167 while ( !feof( CFile ) ) {
168 /* -- read in each line of the csv file and add to hash table -- */
169 BLANK_STRING(record_buffer) ;
170 fgets( record_buffer ,
171 MAXSTRLEN ,
172 CFile ) ;
173
174 #ifdef SEW_NOT_SURE_IF_WE_NEED_THIS
175 /* -- check for and skip over blank lines -- */
176 if (strspn(record_buffer, " \t\r\n") == strlen(record_buffer))
177 continue;
178 #endif
179
180 /* -- comma-separated values are handled only as well as necessary
181 in the present context -- */
182 if ( ( next_str =
183 convert_field( num_str ,
184 record_buffer ) ) == NULL ) {
185 break ;
186 }
187 sscanf( num_str ,
188 "%d" ,
189 &num_def ) ;
190 next_str = convert_field( lookup_str ,
191 next_str ) ;
192 next_str = convert_field( num_str ,
193 next_str ) ;
194 sscanf( num_str ,
195 "%d" ,
196 &cur_token ) ;
197 next_str = convert_field( standard_str ,
198 next_str ) ;
199 if ( add_dict_entry( err_p ,
200 hash_table ,
201 lookup_str ,
202 ( num_def - 1 ) ,
203 cur_token ,
204 standard_str ) == ERR_FAIL ) {
205 return FALSE ;
206 }
207 }
208 return TRUE ;
209 }
210
211 /* ----------------------------------------------------
212 lexicon.c (convert_field)
213 called by lexicon.c (read_lexicon)
214 ctype.h (isspace)
215 uses macro BLANK_STRING
216 -------------------------------------------------------*/
convert_field(char * buf,char * inp)217 static char *convert_field( char *buf ,
218 char *inp ) {
219 char c ;
220 char *d = buf;
221 char *s = inp ;
222
223 BLANK_STRING(d) ;
224 /* -- space at the beginning of a line will stop the read -- */
225 if ( isspace( *s ) )
226 return NULL ;
227 while ( ( c = *s++ ) != SENTINEL ) {
228 if ( c == '\"' ||
229 c == '\r' )
230 continue ; /* -- ignore quotes and carriage returns -- */
231 /* -- zero terminate field and record delimiters -- */
232 if ( c == '\n' ||
233 c == ',' ) {
234 BLANK_STRING(d) ;
235 return s ;
236 }
237 *d++ = c ; /* -- copy it -- */
238 }
239 return NULL ;
240 }
241
242 #endif
243
244 /* ----------------------------------------------------
245 lexicon.c (destroy_lexicon)
246 called by standard.l (close_stand_process)
247 calls lexicon.c (destroy_def_list)
248 uses macro FREE_AND_NULL
249 -------------------------------------------------------*/
void destroy_lexicon(ENTRY ** hash_table)
{
    /* -- called by Clean-Up : release every entry chained off the hash
          table (definition list, lookup string, then the entry), and
          finally the table itself. A NULL table is ignored -- */
    unsigned slot ;

    if (hash_table == NULL)
    {
        return ;
    }
    slot = 0 ;
    while (slot < LEXICON_HTABSIZE)
    {
        ENTRY *entry = hash_table[slot] ;
        while (entry != NULL)
        {
            /* -- remember the successor before the entry is released -- */
            ENTRY *following ;
            destroy_def_list(entry->DefList) ;
            following = entry->Next ;
            FREE_AND_NULL(entry->Lookup) ;
            FREE_AND_NULL(entry) ;
            entry = following ;
        }
        slot++ ;
    }
    DBG("destroy_lexicon: i=%d", slot);
    /* <revision date='2012-05-23'>free hash table</revision> */
    FREE_AND_NULL(hash_table);
    DBG("leaving destroy_lexicon");
}
274
275
276 /* ----------------------------------------------------------
277 lexicon.c (destroy_def_list)
278 called by destroy_lexicon and tokenize.c (remove_default_defs)
279 uses macro FREE_AND_NULL
280 ------------------------------------------------------------*/
void destroy_def_list( DEF *start_def ) {
    /* -- release a chain of definitions, starting at start_def.
          A NULL start_def is an empty chain and nothing is done -- */
    DEF *node = start_def ;

    while ( node != NULL ) {
        /* -- fetch the successor before this node is released -- */
        DEF *following = node -> Next ;

        /* -- Default definitions have no associated text -- */
        if ( node -> Protect == 0 ) {
            FREE_AND_NULL( node -> Standard ) ;
        }
        FREE_AND_NULL( node ) ;
        node = following ;
    }
}
298
299 /* ----------------------------------------------------
300 lexicon.c (find_entry)
301 called by lexicon.c (add_dict_entry)
302 calls lexicon.c (calc_hash)
303 string.h (strcmp)
304 -------------------------------------------------------*/
ENTRY *find_entry(ENTRY **hash_table,char *lookup_str)
{
    /* -- called to create a lexeme : hash lookup_str, then walk the
          collision chain of that slot until an entry with an identical
          Lookup string turns up. Returns that entry, or NULL when the
          chain is exhausted without a match -- */
    ENTRY *candidate = hash_table[calc_hash(lookup_str)] ;

    while (candidate != NULL && strcmp(lookup_str,candidate->Lookup) != 0)
    {
        candidate = candidate->Next ;
    }
    return candidate ;
}
321
#define US sizeof( unsigned )
/* ----------------------------------------------------
lexicon.c (elf_hash)
called by lexicon.c (calc_hash)
-------------------------------------------------------*/
static unsigned elf_hash( char *key_str ) {
    /* -- hash a zero-terminated string. For every character the running
          value is shifted left by US bits and the character added ; any
          bits that reach the top US bits are folded back in further
          down (shifted right by US * 6) and then cleared -- */
    const unsigned top_mask = ~ ( ( unsigned )( ~0 ) >> US ) ;
    unsigned hash = 0 ;
    char *p ;

    for ( p = key_str ; *p != '\0' ; p++ ) {
        unsigned top_bits ;

        hash = ( hash << US ) + ( unsigned ) *p ;
        top_bits = hash & top_mask ;
        if ( top_bits != 0 ) {
            hash ^= top_bits >> ( US * 6 ) ;
        }
        hash &= ~top_bits ;
    }
    return hash ;
}
342
343
344 /* ----------------------------------------------------
345 lexicon.c (calc_hash)
346 called by lexicon.c (find_entry, add_dict_entry)
347 calls lexicon.c (elf_hash)
348 -------------------------------------------------------*/
349
calc_hash(char * key_str)350 static unsigned calc_hash( char *key_str ) {
351 unsigned h ;
352
353 h = elf_hash( key_str ) ;
354 return ( h % LEXICON_HTABSIZE ) ;
355 }
356
357 /* ----------------------------------------------------
358 lexicon.c (create_hash_table)
359 allocate and initialize hash table in memory
360 return NULL if error
361 called by create_lexicon
362 uses macro PAGC_CALLOC_STRUC
363 -------------------------------------------------------*/
create_hash_table(ERR_PARAM * err_p)364 static ENTRY **create_hash_table(ERR_PARAM *err_p)
365 {
366 unsigned __i__ ;
367 ENTRY **__hash_table__ ;
368 PAGC_CALLOC_STRUC(__hash_table__,ENTRY *,LEXICON_HTABSIZE,err_p,NULL) ;
369 for (__i__ = 0 ;__i__ < LEXICON_HTABSIZE ;__i__++ )
370 {
371 __hash_table__[__i__] = NULL ;
372 }
373 return __hash_table__ ;
374 }
375
376 /* ----------------------------------------------------
377 lexicon.c (add_dict_entry)
378 called by lexicon.c (read_lexicon)
379 calls lexicon.c (calc_hash, create_def, append_new_def)
380 uses macro PAGC_ALLOC_STRUC , PAGC_STORE_STR, RET_ERR
381 return ERR_FAIL if error
382 -------------------------------------------------------*/
static int add_dict_entry( ERR_PARAM *err_p ,
                           ENTRY **hash_table ,
                           char *lookup_str ,
                           int def_num ,
                           SYMB t ,
                           char *standard_str ) {
    /* -- add the definition (t, standard_str, def_num) for lookup_str.
          If lookup_str is not yet in the table, a new entry holding this
          single definition is put at the head of its collision chain ;
          otherwise the definition is appended to the existing entry.
          Returns TRUE when a definition was added, FALSE when
          append_new_def found the entry already has a definition of
          type t, and ERR_FAIL on error -- */
    ENTRY *E ;
    unsigned hash_index ;

    E = find_entry( hash_table ,
                    lookup_str ) ;
    if ( E != NULL ) {
        if ( E -> DefList == NULL ) {
            RET_ERR("add_dict_entry: Lexical entry lacks definition" ,
                    err_p ,
                    ERR_FAIL ) ;
        }
        /* -- TRUE, FALSE or ERR_FAIL is passed on unchanged -- */
        return append_new_def( err_p ,
                               E ,
                               t ,
                               standard_str ,
                               def_num ) ;
    }

    PAGC_ALLOC_STRUC(E,ENTRY,err_p,ERR_FAIL);
    /* -- add the Lookup string to the record -- */
    /* NOTE(review): if PAGC_STORE_STR bails out with ERR_FAIL, E looks
       to be leaked here -- confirm against the macro definition */
    PAGC_STORE_STR(E->Lookup,lookup_str,err_p,ERR_FAIL) ;

    /* -- build the definition BEFORE linking the entry into the table :
          if it cannot be created, the table must not be left holding an
          entry without definitions, which would make every later add
          for this lookup string fail -- */
    E -> DefList = create_def( t ,
                               standard_str ,
                               def_num ,
                               FALSE ,
                               err_p ) ;
    if ( E -> DefList == NULL ) {
        FREE_AND_NULL( E -> Lookup ) ;
        FREE_AND_NULL( E ) ;
        return ERR_FAIL ;
    }

    /* -- add new entry to beginning of table -- */
    hash_index = calc_hash( lookup_str ) ;
    E -> Next = hash_table[ hash_index ] ; /* -- collision chain -- */
    hash_table[ hash_index ] = E ;
    return TRUE ;
}
428
429 /* ----------------------------------------------------
430 lexicon.c (append_new_def)
431 called by lexicon.c (add_dict_entry)
432 calls lexicon.c (create_def)
433 returns FALSE if entry is already there
434 returns ERR_FAIL on allocation error
435 -------------------------------------------------------*/
static int append_new_def( ERR_PARAM *err_p ,
                           ENTRY *E ,
                           SYMB t ,
                           char *text ,
                           int def_num ) {
    /* -- put a new definition (t, text, def_num) at the end of E's
          definition list, unless the list already holds a definition
          of type t. Returns TRUE when appended, FALSE when a definition
          of that type was found, ERR_FAIL when create_def fails -- */
    DEF *tail = NULL ;
    DEF *scan ;
    DEF *fresh ;

    /* -- one pass does both jobs : look for a definition of the same
          type and find the last node of the list -- */
    for ( scan = E -> DefList ;
          scan != NULL ;
          scan = scan -> Next ) {
        if ( scan -> Type == t ) {
            return FALSE ;
        }
        tail = scan ;
    }

    fresh = create_def( t ,
                        text ,
                        def_num ,
                        FALSE ,
                        err_p ) ;
    if ( fresh == NULL ) {
        return ERR_FAIL ;
    }

    if ( tail != NULL ) {
        fresh -> Next = tail -> Next ;
        tail -> Next = fresh ;
    } else {
        /* -- the list was empty : the new definition becomes its head -- */
        E -> DefList = fresh ;
    }
    return TRUE ;
}
469
470 /*--------------------------------------------------------------------
471 lexicon.c (create_def)
472 called by lexicon.c (append_new_def) tokenize.c (setup_default_defs)
473 allocate memory for lexicon entry.
474 Pflag is TRUE for default entries
475 returns NULL for allocation error
476 uses macro PAGC_ALLOC_STRUC, PAGC_STORE_STR
477 -------------------------------------------------------------------- */
create_def(SYMB s,char * standard_str,int def_num,int PFlag,ERR_PARAM * err_p)478 DEF *create_def ( SYMB s ,
479 char *standard_str ,
480 int def_num ,
481 int PFlag ,
482 ERR_PARAM *err_p ) {
483 /* -- allocate the memory and set up the definition structure with the
484 standard form -- */
485 DEF *cur_def ;
486
487 /* -- initialization-time allocation -- */
488 PAGC_ALLOC_STRUC(cur_def,DEF,err_p,NULL) ;
489 cur_def -> Type = s ;
490 cur_def -> Protect = PFlag ; /* -- False for definitions from lexicon
491 true for default definitions -- */
492 if ( !PFlag ) {
493 /* -- initialization-time allocation -- */
494 PAGC_STORE_STR(cur_def->Standard,standard_str,err_p,NULL) ;
495 } else
496 cur_def -> Standard = NULL ;
497 cur_def -> Order = def_num ;
498 cur_def -> Next = NULL ;
499 return cur_def ;
500 }
501
502 /*--------------------------------------------------------------------
503 lexicon.c (print_lexicon)
not called, but useful for debugging. It will print out the lexicon.
505 --------------------------------------------------------------------*/
void print_lexicon( ENTRY ** hash_table )
{
    /* -- debugging aid : write every entry of the table to stdout, the
          quoted lookup string on one line followed by one line per
          definition (Order, Type, Protect, quoted Standard).
          A NULL table prints nothing -- */
    unsigned slot;

    if (hash_table == NULL) return;

    for (slot = 0; slot < LEXICON_HTABSIZE; slot++)
    {
        ENTRY *entry;

        for (entry = hash_table[slot]; entry != NULL; entry = entry->Next)
        {
            DEF *def;

            printf("'%s'\n", entry->Lookup);
            for (def = entry->DefList; def != NULL; def = def->Next)
            {
                printf(" %d, %d, %d, '%s'\n", def->Order, def->Type, def->Protect, def->Standard);
            }
        }
    }
}
529
530