1 
2 #include "OVLexicon.h"
3 #include "OVOneToOne.h"
4 #include "OVHeapArray.h"
5 #include "OVreturns.h"
6 
7 
8 /* should only be accessed by special methods */
9 
10 typedef struct {
11   ov_word offset;
12   ov_word next;                 /* NOTE: 1-based, 0 is the sentinel */
13   ov_word ref_cnt;
14   ov_uword hash;
15   ov_size size;
16 } lex_entry;
17 
18 struct _OVLexicon {
19   OVHeap *heap;
20   OVOneToOne *up;               /* maps hash_key to index of first zero-based entry */
21   lex_entry *entry;             /* maintained as a 1-based index */
22   ov_uword n_entry, n_active;
23   ov_char8 *data;
24   ov_uword data_size;
25   ov_uword data_unused;
26   ov_word free_index;           /* NOTE: 1-based, 0 is the sentinel */
27 };
28 
OVLexicon_GetNActive(OVLexicon * uk)29 ov_uword OVLexicon_GetNActive(OVLexicon * uk)
30 {
31   return uk->n_active;
32 }
33 
OVLexicon_New(OVHeap * heap)34 OVLexicon *OVLexicon_New(OVHeap * heap)
35 {
36   OVLexicon *I = NULL;
37   if(heap) {
38     I = OVHeap_ALLOC(heap, OVLexicon);
39     if(I) {
40       I->heap = heap;
41       I->up = OVOneToOne_New(heap);
42       if(!(I->up)) {
43         OVLexicon_DEL_AUTO_NULL(I);
44       }
45     }
46   }
47   return I;
48 }
49 
OVLexicon_Del(OVLexicon * I)50 void OVLexicon_Del(OVLexicon * I)
51 {
52   if(I) {
53     OVOneToOne_DEL_AUTO_NULL(I->up);
54     if(I->entry) {
55       I->entry++;               /* allow for 1-based indexing */
56       OVHeapArray_FREE_AUTO_NULL(I->entry);
57     }
58     OVHeapArray_FREE_AUTO_NULL(I->data);
59     OVHeap_Free(I->heap, I);
60   }
61 }
62 
OVLexicon_CheckStorage(OVLexicon * uk,ov_word entry_size,ov_size data_size)63 static OVstatus OVLexicon_CheckStorage(OVLexicon * uk, ov_word entry_size,
64                                        ov_size data_size)
65 {
66   if(!uk->entry) {              /* auto zero to initialize ref_cnt fields */
67     uk->entry = OVHeapArray_CALLOC(uk->heap, lex_entry, entry_size);
68     if(!uk->entry) {
69       return_OVstatus_OUT_OF_MEMORY;
70     }
71     uk->entry--;                /* allow for 1-based indexing */
72 
73   } else {
74     uk->entry++;                /* allow for 1-based indexing */
75     if(!OVHeapArray_CHECK(uk->entry, lex_entry, (ov_size) (entry_size - 1))) {
76       return_OVstatus_OUT_OF_MEMORY;
77     }
78     uk->entry--;                /* allow for 1-based indexing */
79   }
80 
81   if(!uk->data) {
82     uk->data = OVHeapArray_MALLOC(uk->heap, ov_char8, data_size);
83     if(!uk->data) {
84       return_OVstatus_OUT_OF_MEMORY;
85     }
86   } else if(!OVHeapArray_CHECK(uk->data, ov_char8, data_size - 1)) {
87     return_OVstatus_OUT_OF_MEMORY;
88   }
89   return_OVstatus_SUCCESS;
90 }
91 
92 /*============================================================================
93  * _GetCStringHash -- returns a djb2 string hash key of the input
94  * PARAMS
95  *   string str
96  * RETURNS
97  *   string hash key
98  */
_GetCStringHash(ov_uchar8 * str)99 static ov_word _GetCStringHash(ov_uchar8 * str)
100 {
101   const ov_uchar8 *p = str;
102   ov_word x;
103   ov_size len = 0;
104   ov_uchar8 c;
105 
106   x = *p << 7;
107   while((c = *(p++))) {
108     x = (x << 5) + x + c;       /*  djb2 (G5 = 2.8, P3 = 18.7, P4=2.9) FASTEST OVERALL */
109     len++;
110   }
111   x ^= len;
112   return x;
113 }
114 
OVLexicon_Pack(OVLexicon * uk)115 OVstatus OVLexicon_Pack(OVLexicon * uk)
116 {
117 
118   if(uk->entry && uk->data && uk->n_entry && uk->data_unused) {
119     ov_size new_count = 0;
120     ov_size new_size = 0;
121 
122     {                           /* compute storage requirements */
123       ov_size a;
124       ov_size n_entry = uk->n_entry;
125       lex_entry *cur_entry = uk->entry + 1;     /* NOTE: 1-based array */
126       for(a = 0; a < n_entry; a++) {
127         if(cur_entry->ref_cnt > 0) {
128           new_size += cur_entry->size;
129           new_count++;
130         }
131         cur_entry++;
132       }
133     }
134     if((!new_count) && (!new_size)) {   /* if lexicon is completely empty, then purge both */
135       uk->entry++;              /* alloc for 1-based indexing */
136       OVHeapArray_FREE_AUTO_NULL(uk->entry);
137       OVHeapArray_FREE_AUTO_NULL(uk->data);
138       OVOneToOne_Reset(uk->up);
139       uk->n_entry = 0;
140       uk->n_active = 0;
141       uk->data_unused = 0;
142       uk->data_size = 0;
143       uk->free_index = 0;
144     } else {                    /* otherwise, pack the string fields, and track the free entries */
145       OVstatus status;
146       ov_char8 *old_data = uk->data;
147       uk->data = NULL;
148 
149       if(OVreturn_IS_ERROR(status = OVLexicon_CheckStorage(uk, uk->n_entry, new_size))) {
150         uk->data = old_data;
151         return status;
152       }
153 
154       {                         /* copy data */
155         ov_word a;
156         ov_size n_entry = uk->n_entry, new_size = 0;
157         lex_entry *cur_entry = uk->entry + 1;  /* NOTE: 1-based array */
158         ov_char8 *data = old_data;
159         ov_char8 *new_data = uk->data;
160         ov_word free_index = 0;
161         ov_size entry_size;
162 
163         for(a = 1; a <= (ov_word) n_entry; a++) {
164           if(cur_entry->ref_cnt > 0) {
165             entry_size = cur_entry->size;
166             memcpy(new_data, data + cur_entry->offset, entry_size);
167             cur_entry->offset = new_size;
168             new_size += entry_size;
169             new_data += entry_size;
170           } else {
171             /*  remember for later */
172             cur_entry->next = free_index;
173             cur_entry->ref_cnt = 0;
174             free_index = a;
175           }
176           cur_entry++;
177         }
178 
179         OVHeapArray_FREE(old_data);
180         uk->data_unused = 0;
181         uk->data_size = new_size;
182         uk->free_index = free_index;
183 
184       }
185     }
186   }
187   return_OVstatus_SUCCESS;
188 }
189 
OVLexicon_DecRef(OVLexicon * uk,ov_word id)190 OVstatus OVLexicon_DecRef(OVLexicon * uk, ov_word id)
191 {
192   if((!uk->entry) || (id < 1) || (id > (ov_word) uk->n_entry)) {        /* range checking */
193     if(id)
194       printf("OVLexicon_DecRef-Warning: key %zd not found, this might be a bug\n", id);
195     return_OVstatus_NOT_FOUND;
196   } else {
197     lex_entry *cur_entry = uk->entry + id;
198     ov_word ref_cnt = (--cur_entry->ref_cnt);
199     if(ref_cnt < 0) {
200       printf("OVLexicon_DecRef-Warning: key %zd with ref_cnt %zd, this might be a bug\n", id, ref_cnt);
201       return_OVstatus_INVALID_REF_CNT;
202     } else if(!ref_cnt) {
203       OVreturn_word result = OVOneToOne_GetForward(uk->up, cur_entry->hash);
204       if(OVreturn_IS_OK(result)) {
205         ov_word index = result.word;
206         if(index != id) {
207           lex_entry *entry = uk->entry;
208           ov_word next;
209           while(index && (next = entry[index].next) != id)
210             index = next;
211           if(index)
212             entry[index].next = entry[id].next; /* excise from list */
213         } else {
214           /* remove entry from OneToOne */
215           OVOneToOne_DelReverse(uk->up, id);
216           /* if non-terminal, then add next entry to OneToOne */
217           if(cur_entry->next) {
218             OVOneToOne_Set(uk->up, cur_entry->hash, cur_entry->next);   /* NOTE: no error checking performed! */
219           }
220         }
221       }
222       uk->data_unused += cur_entry->size;
223       uk->n_active--;
224       if(uk->data_unused >= (uk->data_size >> 1))       /* if 50% unutilized, then pack */
225         OVLexicon_Pack(uk);
226     }
227     return_OVstatus_SUCCESS;
228   }
229 }
230 
OVLexicon_IncRef(OVLexicon * uk,ov_word id)231 OVstatus OVLexicon_IncRef(OVLexicon * uk, ov_word id)
232 {
233   if((!uk->entry) || (id < 1) || (id > (ov_word) uk->n_entry)) {        /* range checking */
234     return_OVstatus_NOT_FOUND;
235   } else {
236     lex_entry *entry = uk->entry + id;
237     ov_word ref_cnt = (++entry->ref_cnt);
238     if(ref_cnt < 2) {           /* was reference count zero or less? */
239       /* safety precauations */
240       entry->ref_cnt = 0;
241       entry->size = 0;
242       entry->offset = 0;
243       return_OVstatus_INVALID_REF_CNT;
244     } else {
245       return_OVstatus_SUCCESS;
246     }
247   }
248 }
249 
OVLexicon_GetFromCString(OVLexicon * uk,const ov_char8 * str)250 OVreturn_word OVLexicon_GetFromCString(OVLexicon * uk, const ov_char8 * str)
251 {
252   ov_word hash = _GetCStringHash((ov_uchar8 *) str);
253   OVreturn_word search = OVOneToOne_GetForward(uk->up, hash);
254   ov_word index = 0;
255   ov_word cur_index = 0;
256   ov_char8 *c;
257 
258   if(OVreturn_IS_OK(search)) {
259     /* the hash key has been registered, so follow the chain
260      * looking for a match... */
261     ov_char8 *data = uk->data;
262     lex_entry *entry = uk->entry, *entry_ptr;
263     cur_index = (index = search.word);
264     while(index) {              /* found */
265       c = data + (entry_ptr = (entry + index))->offset;
266       if(strcmp(c, str) != 0) { /* verify match */
267         index = entry_ptr->next;        /* not a match? keep looking */
268       } else {
269         break;
270       }
271     }
272   }
273   if(!index) {
274     /* no match was found (or hash is new) so add this string to the lexicon */
275     ov_size st_size = strlen(str) + 1;
276     lex_entry *entry, *entry_ptr, *cur_entry_ptr;
277     OVstatus status;
278     {
279       ov_size new_size = uk->data_size + st_size;
280       ov_size new_n_entry = uk->n_entry;
281       if(!uk->free_index)
282         new_n_entry++;
283       /* allocate storage as necessary */
284       if(OVreturn_IS_ERROR(status = OVLexicon_CheckStorage(uk, new_n_entry, new_size))) {
285         OVreturn_word result;
286         result.status = status.status;
287         result.word = 0;
288         return result;
289       }
290     }
291     /* WARNING: uk->entry, uk->data_size, and uk->data may have changed! */
292 
293     if(!uk->free_index)
294       index = (++uk->n_entry);  /* NOTE: 1-based indices */
295     else {
296       index = uk->free_index;
297       uk->free_index = uk->entry[index].next;
298     }
299     uk->n_active++;
300 
301     if(!cur_index) {
302       /* if the hash key was new, add it to the OneToOne, and setup entry */
303       if(OVreturn_IS_ERROR(status = OVOneToOne_Set(uk->up, hash, index))) {
304         OVreturn_word result;
305         uk->entry[index].next = uk->free_index; /* record this as a free entry */
306         uk->free_index = index;
307         uk->n_active--;
308         result.status = status.status;
309         result.word = 0;
310         return result;
311       }
312       entry_ptr = uk->entry + index;
313       entry_ptr->next = 0;
314 
315     } else {
316       /* otherwise, simply add this entry on to the list in position 2 */
317 
318       cur_entry_ptr = (entry = uk->entry) + cur_index;
319       entry_ptr = entry + index;
320       entry_ptr->next = cur_entry_ptr->next;
321       cur_entry_ptr->next = index;
322     }
323     /* increase the counts and sizes to accomodate this new entry & copy info */
324     entry_ptr->hash = hash;
325     entry_ptr->size = st_size;
326     entry_ptr->offset = uk->data_size;
327     entry_ptr->ref_cnt++;       /* increase ref_cnt */
328     strcpy(uk->data + uk->data_size, str);
329     uk->data_size += st_size;
330   } else {
331     lex_entry *entry_ptr;
332     entry_ptr = uk->entry + index;
333     entry_ptr->ref_cnt++;       /* increase ref_cnt */
334   }
335   {
336     OVreturn_word result = { OVstatus_SUCCESS };
337     result.word = index;
338     return result;
339   }
340 }
341 
OVLexicon_BorrowFromCString(OVLexicon * uk,const ov_char8 * str)342 OVreturn_word OVLexicon_BorrowFromCString(OVLexicon * uk, const ov_char8 * str)
343 {
344   /* get hash token for this word */
345   ov_word hash = _GetCStringHash((ov_uchar8 *) str);
346   OVreturn_word search = OVOneToOne_GetForward(uk->up, hash);
347   ov_word index = 0;
348   ov_char8 *c;
349 
350   /* error is something like OVstatus_NULL_PTR, OVstatus_NOT_FOUND; see ov/src/OVreturns.h */
351   if(OVreturn_IS_ERROR(search)) {
352     return search;
353   } else {
354     /* the hash key has been registered, so follow the chain
355      * looking for a match... */
356     ov_char8 *data = uk->data;
357     lex_entry *entry = uk->entry, *entry_ptr;
358     index = search.word;
359     while(index) {              /* found */
360       c = data + (entry_ptr = (entry + index))->offset;
361       if(strcmp(c, str) != 0) { /* verify match */
362         index = entry_ptr->next;        /* not a match? keep looking */
363       } else {
364         break;
365       }
366     }
367   }
368 
369   if(!index) {
370     OVreturn_word result = { OVstatus_NOT_FOUND };
371     result.word = index;
372     return result;
373   } else {
374     OVreturn_word result = { OVstatus_SUCCESS };
375     result.word = index;
376     return result;
377   }
378 }
379 
OVLexicon_FetchCString(OVLexicon * uk,ov_word id)380 ov_char8 *OVLexicon_FetchCString(OVLexicon * uk, ov_word id)
381 {
382   if(id <= (ov_word) uk->n_entry)
383     return uk->data + uk->entry[id].offset;
384   return NULL;
385 }
386