1 /* Partial emulation of Perl hashes (associative arrays),
2 * mapping keys (ASCII char strings) to array indices.
3 *
4 * Contents:
5 * 1. The <ESL_KEYHASH> object.
6 * 2. Storing and retrieving keys.
7 * 3. Internal functions.
8 * 4. Benchmark drivers.
9 * 5. Unit tests.
10 * 6. Test driver.
11 * 7. Example.
12 */
13 #include "esl_config.h"
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <limits.h>
19
20 #include "easel.h"
21 #include "esl_mem.h"
22
23 #include "esl_keyhash.h"
24
25 static ESL_KEYHASH *keyhash_create(uint32_t hashsize, int init_key_alloc, int init_string_alloc);
26 static uint32_t jenkins_hash(const char *key, esl_pos_t n, uint32_t hashsize);
27 static int key_upsize(ESL_KEYHASH *kh);
28
29
30 /*****************************************************************
31 *# 1. The <ESL_KEYHASH> object
32 *****************************************************************/
33
34 /* Function: esl_keyhash_Create()
35 * Synopsis: Allocates a new keyhash.
36 *
37 * Purpose: Create a new hash table for key indexing, and returns
38 * a pointer to it.
39 *
40 * Throws: <NULL> on allocation failure.
41 *
42 * Note: 128*sizeof(int)*3 + 2048*sizeof(char) + sizeof(ESL_KEYHASH):
43 * about 2400 bytes for an initial KEYHASH.
44 */
45 ESL_KEYHASH *
esl_keyhash_Create(void)46 esl_keyhash_Create(void)
47 {
48 return keyhash_create(128, /* initial hash table size (power of 2) */
49 128, /* initial alloc for up to 128 keys */
50 2048); /* initial alloc for keys totalling up to 2048 chars */
51 }
52
53
54 /* Function: esl_keyhash_CreateCustom()
55 * Synopsis: Allocate a new keyhash with customized initial allocations.
56 *
57 * Purpose: Create a new hash table, initially allocating for
58 * a hash table of size <hashsize> entries, <kalloc>
59 * keys, and a total key string length of <salloc>.
60 * <hashsize> must be a power of 2, and all allocations
61 * must be $\geq 0$.
62 *
63 * The object will still expand as needed, so the reason to
64 * use a customized allocation is when you're trying to
65 * minimize memory footprint and you expect your keyhash to
66 * be smaller than the default (of up to 128 keys, of total
67 * length up to 2048).
68 *
69 * Throws: <NULL> on allocation failure.
70 */
71 ESL_KEYHASH *
esl_keyhash_CreateCustom(uint32_t hashsize,int kalloc,int salloc)72 esl_keyhash_CreateCustom(uint32_t hashsize, int kalloc, int salloc)
73 {
74 ESL_DASSERT1((hashsize && ((hashsize & (hashsize-1)) == 0))); /* hashsize is a power of 2 (bitshifting trickery) */
75 return keyhash_create(hashsize, kalloc, salloc);
76 }
77
78 /* Function: esl_keyhash_Clone()
79 * Synopsis: Duplicates a keyhash.
80 *
81 * Purpose: Allocates and duplicates a keyhash <kh>. Returns a
82 * pointer to the duplicate.
83 *
84 * Throws: <NULL> on allocation failure.
85 */
86 ESL_KEYHASH *
esl_keyhash_Clone(const ESL_KEYHASH * kh)87 esl_keyhash_Clone(const ESL_KEYHASH *kh)
88 {
89 ESL_KEYHASH *nw;
90 int h;
91
92 if ((nw = keyhash_create(kh->hashsize, kh->kalloc, kh->salloc)) == NULL) goto ERROR;
93
94 for (h = 0; h < kh->hashsize; h++)
95 nw->hashtable[h] = kh->hashtable[h];
96
97 for (h = 0; h < kh->nkeys; h++)
98 {
99 nw->nxt[h] = kh->nxt[h];
100 nw->key_offset[h] = kh->key_offset[h];
101 }
102 nw->nkeys = kh->nkeys;
103
104 memcpy(nw->smem, kh->smem, sizeof(char) * kh->sn);
105 nw->sn = kh->sn;
106 return nw;
107
108 ERROR:
109 esl_keyhash_Destroy(nw);
110 return NULL;
111 }
112
113
114 /* Function: esl_keyhash_Get()
115 * Synopsis: Returns a key name, given its index.
116 *
117 * Purpose: Returns a pointer to the key name associated
118 * with index <idx>. The key name is a <NUL>-terminated
119 * string whose memory is managed internally in
120 * the keyhash <kh>.
121 */
122 char *
esl_keyhash_Get(const ESL_KEYHASH * kh,int idx)123 esl_keyhash_Get(const ESL_KEYHASH *kh, int idx)
124 {
125 return kh->smem + kh->key_offset[idx];
126 }
127
128 /* Function: esl_keyhash_GetNumber()
129 * Synopsis: Returns the total number of keys stored.
130 *
131 * Purpose: Returns the total number of keys currently stored in the
132 * keyhash <kh>.
133 */
134 int
esl_keyhash_GetNumber(const ESL_KEYHASH * kh)135 esl_keyhash_GetNumber(const ESL_KEYHASH *kh)
136 {
137 return kh->nkeys;
138 }
139
140 /* Function: esl_keyhash_Sizeof()
141 * Synopsis: Returns the size of a keyhash object, in bytes.
142 */
143 size_t
esl_keyhash_Sizeof(const ESL_KEYHASH * kh)144 esl_keyhash_Sizeof(const ESL_KEYHASH *kh)
145 {
146 size_t n = 0;
147
148 if (kh)
149 {
150 n += sizeof(ESL_KEYHASH);
151 n += sizeof(int) * kh->hashsize;
152 n += sizeof(int) * kh->kalloc * 2;
153 n += sizeof(char) * kh->salloc;
154 }
155 return n;
156 }
157
158 /* Function: esl_keyhash_Reuse()
159 * Synopsis: Recycle a keyhash.
160 *
161 * Purpose: Empties keyhash <kh> so it can be reused without
162 * creating a new one.
163 *
164 * Returns: <eslOK> on success.
165 */
166 int
esl_keyhash_Reuse(ESL_KEYHASH * kh)167 esl_keyhash_Reuse(ESL_KEYHASH *kh)
168 {
169 int i;
170
171 for (i = 0; i < kh->hashsize; i++) kh->hashtable[i] = -1;
172 kh->nkeys = 0;
173 kh->sn = 0;
174 return eslOK;
175 }
176
177
178
179 /* Function: esl_keyhash_Destroy()
180 * Synopsis: Frees a keyhash.
181 *
182 * Purpose: Destroys <kh>.
183 *
184 * Returns: (void)
185 */
186 void
esl_keyhash_Destroy(ESL_KEYHASH * kh)187 esl_keyhash_Destroy(ESL_KEYHASH *kh)
188 {
189 if (kh == NULL) return;
190 if (kh->hashtable != NULL) free(kh->hashtable);
191 if (kh->key_offset != NULL) free(kh->key_offset);
192 if (kh->nxt != NULL) free(kh->nxt);
193 if (kh->smem != NULL) free(kh->smem);
194 free(kh);
195 }
196
197 /* Function: esl_keyhash_Dump()
198 * Synopsis: Dumps debugging information about a keyhash.
199 *
200 * Purpose: Mainly for debugging purposes. Dump
201 * some information about the hash table <kh>
202 * to the stream <fp>, which might be stderr
203 * or stdout.
204 */
205 void
esl_keyhash_Dump(FILE * fp,const ESL_KEYHASH * kh)206 esl_keyhash_Dump(FILE *fp, const ESL_KEYHASH *kh)
207 {
208 int idx;
209 int h;
210 int nkeys;
211 int nempty = 0;
212 int maxkeys = -1;
213 int minkeys = INT_MAX;
214
215 for (h = 0; h < kh->hashsize; h++)
216 {
217 for (nkeys = 0, idx = kh->hashtable[h]; idx != -1; idx = kh->nxt[idx]) nkeys++;
218
219 if (nkeys == 0) nempty++;
220 if (nkeys > maxkeys) maxkeys = nkeys;
221 if (nkeys < minkeys) minkeys = nkeys;
222 }
223
224 fprintf(fp, "Total keys: %d\n", kh->nkeys);
225 fprintf(fp, "Hash table size: %d\n", kh->hashsize);
226 fprintf(fp, "Average occupancy: %.2f\n", (float) kh->nkeys /(float) kh->hashsize);
227 fprintf(fp, "Unoccupied slots: %d\n", nempty);
228 fprintf(fp, "Most in one slot: %d\n", maxkeys);
229 fprintf(fp, "Least in one slot: %d\n", minkeys);
230 fprintf(fp, "Keys allocated for: %d\n", kh->kalloc);
231 fprintf(fp, "Key string space alloc: %d\n", kh->salloc);
232 fprintf(fp, "Key string space used: %d\n", kh->sn);
233 fprintf(fp, "Total obj size, bytes: %d\n", (int) esl_keyhash_Sizeof(kh));
234 }
235 /*--------------- end, <ESL_KEYHASH> object ---------------------*/
236
237
238
239
240 /*****************************************************************
241 *# 2. Storing and retrieving keys
242 *****************************************************************/
243
244 /* Function: esl_keyhash_Store()
245 * Synopsis: Store a key and get a key index for it.
246 *
247 * Purpose: Store a string (or mem) <key> of length <n> in the key
248 * index hash table <kh>. Associate it with a unique key
249 * index, counting from 0. This index maps hashed keys to
250 * integer-indexed C arrays, clumsily emulating hashes or
251 * associative arrays. Optionally returns the index through
252 * <opt_index>.
253 *
254 * <key>, <n> follow the standard idiom for strings and
255 * unterminated buffers. If <key> is raw memory, <n> must
256 * be provided; if <key> is a \0-terminated string, <n>
257 * may be -1.
258 *
259 * Returns: <eslOK> on success; stores <key> in <kh>; <opt_index> is
260 * returned, set to the next higher index value.
261 * Returns <eslEDUP> if <key> was already stored in the table;
262 * <opt_index> is set to the existing index for <key>.
263 *
264 * Throws: <eslEMEM> on allocation failure, and sets <opt_index> to -1.
265 */
266 int
esl_keyhash_Store(ESL_KEYHASH * kh,const char * key,esl_pos_t n,int * opt_index)267 esl_keyhash_Store(ESL_KEYHASH *kh, const char *key, esl_pos_t n, int *opt_index)
268 {
269 uint32_t val = jenkins_hash(key, n, kh->hashsize);
270 int idx;
271 int status;
272
273 if (n == -1) n = strlen(key);
274
275 /* Was this key already stored? */
276 for (idx = kh->hashtable[val]; idx != -1; idx = kh->nxt[idx])
277 if (esl_memstrcmp(key, n, kh->smem + kh->key_offset[idx]))
278 {
279 if (opt_index) *opt_index = idx;
280 return eslEDUP;
281 }
282
283 /* Reallocate key ptr/index memory if needed */
284 if (kh->nkeys == kh->kalloc)
285 {
286 ESL_REALLOC(kh->key_offset, sizeof(int)*kh->kalloc*2);
287 ESL_REALLOC(kh->nxt, sizeof(int)*kh->kalloc*2);
288 kh->kalloc *= 2;
289 }
290
291 /* Reallocate key string memory if needed */
292 while (kh->sn + n + 1 > kh->salloc)
293 {
294 ESL_REALLOC(kh->smem, sizeof(char) * kh->salloc * 2);
295 kh->salloc *= 2;
296 }
297
298 /* Copy the key, assign its index */
299 idx = kh->nkeys;
300 kh->key_offset[idx] = kh->sn;
301 kh->sn += n+1;
302 esl_memstrcpy(key, n, kh->smem + kh->key_offset[idx]);
303 kh->nkeys++;
304
305 /* Insert new element at head of the approp linked list in hashtable */
306 kh->nxt[idx] = kh->hashtable[val];
307 kh->hashtable[val] = idx;
308
309 /* Time to upsize? If we're 3x saturated, expand the hash table */
310 if (kh->nkeys > 3*kh->hashsize)
311 if ((status = key_upsize(kh)) != eslOK) goto ERROR;
312
313 if (opt_index != NULL) *opt_index = idx;
314 return eslOK;
315
316 ERROR:
317 if (opt_index != NULL) *opt_index = -1;
318 return status;
319 }
320
321 /* Function: esl_keyhash_Lookup()
322 * Synopsis: Look up a key's array index.
323 *
324 * Purpose: Look up string or mem <key> of length <n> in hash table <kh>.
325 * If <key> is found, return <eslOK>, and optionally set <*opt_index>
326 * to its array index (0..nkeys-1).
327 * If <key> is not found, return <eslENOTFOUND>, and
328 * optionally set <*opt_index> to -1.
329 *
330 * If <key> is a \0-terminated string, <n> may be -1.
331 */
332 int
esl_keyhash_Lookup(const ESL_KEYHASH * kh,const char * key,esl_pos_t n,int * opt_index)333 esl_keyhash_Lookup(const ESL_KEYHASH *kh, const char *key, esl_pos_t n, int *opt_index)
334 {
335 uint32_t val = jenkins_hash(key, n, kh->hashsize);
336 int idx;
337
338 if (n == -1)
339 {
340 for (idx = kh->hashtable[val]; idx != -1; idx = kh->nxt[idx])
341 if (strcmp(key, kh->smem + kh->key_offset[idx]) == 0)
342 {
343 if (opt_index) *opt_index = idx;
344 return eslOK;
345 }
346 }
347 else
348 {
349 for (idx = kh->hashtable[val]; idx != -1; idx = kh->nxt[idx])
350 if (esl_memstrcmp(key, n, kh->smem + kh->key_offset[idx]))
351 {
352 if (opt_index) *opt_index = idx;
353 return eslOK;
354 }
355 }
356
357 if (opt_index != NULL) *opt_index = -1;
358 return eslENOTFOUND;
359 }
360
361
362 /*---------- end, API for storing/retrieving keys ---------------*/
363
364
365
366
367 /*****************************************************************
368 * 3. Internal functions
369 *****************************************************************/
370
371 /* keyhash_create()
372 *
373 * The real creation function, which takes arguments for memory sizes.
374 * This is abstracted to a static function because it's used by both
375 * Create() and Clone() but slightly differently.
376 *
377 * Args: hashsize - size of hash table; this must be a power of two.
378 * init_key_alloc - initial allocation for # of keys.
379 * init_string_alloc - initial allocation for total size of key strings.
380 *
381 * Returns: An allocated hash table structure; or NULL on failure.
382 */
383 ESL_KEYHASH *
keyhash_create(uint32_t hashsize,int init_key_alloc,int init_string_alloc)384 keyhash_create(uint32_t hashsize, int init_key_alloc, int init_string_alloc)
385 {
386 ESL_KEYHASH *kh = NULL;
387 int i;
388 int status;
389
390 ESL_ALLOC(kh, sizeof(ESL_KEYHASH));
391 kh->hashtable = NULL;
392 kh->key_offset = NULL;
393 kh->nxt = NULL;
394 kh->smem = NULL;
395
396 kh->hashsize = hashsize;
397 kh->kalloc = init_key_alloc;
398 kh->salloc = init_string_alloc;
399
400 ESL_ALLOC(kh->hashtable, sizeof(int) * kh->hashsize);
401 for (i = 0; i < kh->hashsize; i++) kh->hashtable[i] = -1;
402
403 ESL_ALLOC(kh->key_offset, sizeof(int) * kh->kalloc);
404 ESL_ALLOC(kh->nxt, sizeof(int) * kh->kalloc);
405 for (i = 0; i < kh->kalloc; i++) kh->nxt[i] = -1;
406
407 ESL_ALLOC(kh->smem, sizeof(char) * kh->salloc);
408 kh->nkeys = 0;
409 kh->sn = 0;
410 return kh;
411
412 ERROR:
413 esl_keyhash_Destroy(kh);
414 return NULL;
415 }
416
417
418 /* jenkins_hash()
419 *
420 * The hash function.
421 * This is Bob Jenkins' "one at a time" hash.
422 * <key> is a NUL-terminated string of any length.
423 * <hashsize> must be a power of 2.
424 *
425 * References:
426 * [1] http://en.wikipedia.org/wiki/Hash_table
427 * [2] http://www.burtleburtle.net/bob/hash/doobs.html
428 */
429 static uint32_t
jenkins_hash(const char * key,esl_pos_t n,uint32_t hashsize)430 jenkins_hash(const char *key, esl_pos_t n, uint32_t hashsize)
431 {
432 esl_pos_t pos;
433 uint32_t val = 0;
434
435 if (n == -1)
436 { /* string version */
437 for (; *key != '\0'; key++)
438 {
439 val += *key;
440 val += (val << 10);
441 val ^= (val >> 6);
442 }
443 }
444 else
445 { /* buffer version */
446 for (pos = 0; pos < n; pos++)
447 {
448 val += key[pos];
449 val += (val << 10);
450 val ^= (val >> 6);
451 }
452 }
453 val += (val << 3);
454 val ^= (val >> 11);
455 val += (val << 15);
456
457 return (val & (hashsize - 1));
458 }
459
460 /* key_upsize()
461 *
462 * Grow the hash table to the next available size.
463 *
464 * Args: old - the KEY hash table to reallocate.
465 *
466 * Returns: <eslOK> on success. 'Success' includes the case
467 * where the hash table is already at its maximum size,
468 * and cannot be upsized any more.
469 *
470 * Throws: <eslEMEM> on allocation failure, and
471 * the hash table is left in its initial state.
472 */
473 static int
key_upsize(ESL_KEYHASH * kh)474 key_upsize(ESL_KEYHASH *kh)
475 {
476 void *p;
477 int i;
478 uint32_t val;
479 int status;
480
481 /* 28 below because we're going to upsize in steps of 8x (2^3); need to be < 2^{31-3} */
482 if (kh->hashsize >= (1<<28)) return eslOK; /* quasi-success (can't grow any more) */
483
484 /* The catch here is that when you upsize the table, all the hash functions
485 * change; so you have to go through all the keys, recompute their hash functions,
486 * and store them again in the new table.
487 */
488 /* Allocate a new, larger hash table. (Don't change <kh> until this succeeds) */
489 ESL_RALLOC(kh->hashtable, p, sizeof(int) * (kh->hashsize << 3));
490 kh->hashsize = kh->hashsize << 3; /* 8x */
491 for (i = 0; i < kh->hashsize; i++) kh->hashtable[i] = -1;
492
493 /* Store all the keys again. */
494 for (i = 0; i < kh->nkeys; i++)
495 {
496 val = jenkins_hash(kh->smem + kh->key_offset[i], -1, kh->hashsize);
497 kh->nxt[i] = kh->hashtable[val];
498 kh->hashtable[val] = i;
499 }
500 return eslOK;
501
502 ERROR:
503 return eslEMEM;
504 }
505 /*--------------- end, internal functions -----------------*/
506
507
508 /*****************************************************************
509 * 4. Benchmark driver
510 *****************************************************************/
511 #ifdef eslKEYHASH_BENCHMARK
512 /*
513 gcc -g -O2 -o keyhash_benchmark -I. -L. -DeslKEYHASH_BENCHMARK esl_keyhash.c -leasel -lm
514 time ./keyhash_benchmark /usr/share/dict/words /usr/share/dict/words
515 */
516 #include "esl_config.h"
517
518 #include <stdio.h>
519
520 #include "easel.h"
521 #include "esl_getopts.h"
522 #include "esl_keyhash.h"
523 #include "esl_stopwatch.h"
524
525 static ESL_OPTIONS options[] = {
526 /* name type default env range toggles reqs incomp help docgroup*/
527 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
528 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
529 };
530 static char usage[] = "[-options] <keyfile1> <keyfile2>";
531 static char banner[] = "benchmarking speed of keyhash module";
532
533 int
main(int argc,char ** argv)534 main(int argc, char **argv)
535 {
536 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
537 ESL_KEYHASH *kh = esl_keyhash_Create();
538 ESL_STOPWATCH *w = esl_stopwatch_Create();
539 char *file1 = esl_opt_GetArg(go, 1);
540 char *file2 = esl_opt_GetArg(go, 2);
541 FILE *fp;
542 char buf[256];
543 char *s, *tok;
544 int idx;
545 int nstored, nsearched, nshared;
546
547 /* Read/store keys from file 1.
548 */
549 esl_stopwatch_Start(w);
550 if ((fp = fopen(file1, "r")) == NULL)
551 { fprintf(stderr, "couldn't open %s\n", argv[1]); exit(1); }
552 nstored = 0;
553 while (fgets(buf, 256, fp) != NULL)
554 {
555 s = buf;
556 esl_strtok(&s, " \t\r\n", &tok);
557 esl_keyhash_Store(kh, tok, -1, &idx);
558 nstored++;
559 }
560 fclose(fp);
561 printf("Stored %d keys.\n", nstored);
562
563 /* Look up keys from file 2.
564 */
565 if ((fp = fopen(file2, "r")) == NULL)
566 { fprintf(stderr, "couldn't open %s\n", argv[2]); exit(1); }
567 nsearched = nshared = 0;
568 while (fgets(buf, 256, fp) != NULL)
569 {
570 s = buf;
571 esl_strtok(&s, " \t\r\n", &tok);
572
573 if (esl_keyhash_Lookup(kh, tok, -1, &idx) == eslOK) nshared++;
574 nsearched++;
575 }
576 fclose(fp);
577 esl_stopwatch_Stop(w);
578 printf("Looked up %d keys.\n", nsearched);
579 printf("In common: %d keys.\n", nshared);
580 esl_stopwatch_Display(stdout, w, "# CPU Time: ");
581
582 esl_stopwatch_Destroy(w);
583 esl_keyhash_Destroy(kh);
584 esl_getopts_Destroy(go);
585 return 0;
586 }
587 #endif /*eslKEYHASH_BENCHMARK*/
588
589
590
591 #ifdef eslKEYHASH_BENCHMARK2
592
593 /* Benchmark #2 is a benchmark just of the hash function.
594 * First we read in a bunch of keys from any file, one key per line.
595 * Then we start timing, and compute a hash for each key.
596 */
597
598 /* gcc -O2 -o keyhash_benchmark2 -I. -L. -DeslKEYHASH_BENCHMARK2 esl_keyhash.c -leasel -lm
599 * ./keyhash_benchmark2 <keyfile>
600 */
601 #include "esl_config.h"
602
603 #include <stdio.h>
604 #include <math.h>
605
606 #include "easel.h"
607 #include "esl_fileparser.h"
608 #include "esl_getopts.h"
609 #include "esl_keyhash.h"
610 #include "esl_stats.h"
611 #include "esl_stopwatch.h"
612 #include "esl_vectorops.h"
613
614 static ESL_OPTIONS options[] = {
615 /* name type default env range toggles reqs incomp help docgroup*/
616 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
617 { "-s", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show statistical test for hash uniformity", 0 },
618 { "-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be verbose: print hash values for keys", 0 },
619 { "-x", eslARG_INT, "32768", NULL, NULL, NULL, NULL, NULL, "set hash table size to <n>", 0 },
620 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
621 };
622 static char usage[] = "[-options] <keyfile>";
623 static char banner[] = "benchmarking speed of hash function in keyhash module";
624
625 int
main(int argc,char ** argv)626 main(int argc, char **argv)
627 {
628 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
629 ESL_FILEPARSER *efp = NULL;
630 ESL_STOPWATCH *w = esl_stopwatch_Create();
631 ESL_KEYHASH *kh = esl_keyhash_Create();
632 char *keyfile = esl_opt_GetArg(go, 1);
633 uint32_t hashsize = esl_opt_GetInteger(go, "-x");
634 char *key;
635 int keylen;
636 char **karr = NULL;
637 int kalloc;
638 int *ct = NULL;
639 int nkeys;
640 int i;
641 int status;
642 uint32_t (*hashfunc)(const char*,uint32_t) = jenkins_hash;
643
644 /* 1. Store the keys from the file, before starting the benchmark timer. */
645 kalloc = 256;
646 ESL_ALLOC(karr, sizeof(char *) * kalloc);
647
648 if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) esl_fatal("Failed to open key file %s\n", keyfile);
649
650 nkeys = 0;
651 while (esl_fileparser_NextLine(efp) == eslOK)
652 {
653 if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK) esl_fatal("Failure in parsing key file\n");
654
655 if (nkeys == kalloc) {
656 void *tmp;
657 ESL_RALLOC(karr, tmp, sizeof(char *) * kalloc * 2);
658 kalloc *= 2;
659 }
660
661 esl_strdup(key, keylen, &(karr[nkeys]));
662 nkeys++;
663 }
664 esl_fileparser_Close(efp);
665 /* and karr[0..nkeys-1] are now the keys. */
666
667
668 /* 2. benchmark hashing the keys. */
669 esl_stopwatch_Start(w);
670 for (i = 0; i < nkeys; i++) (*hashfunc)(karr[i], hashsize);
671 esl_stopwatch_Stop(w);
672 esl_stopwatch_Display(stdout, w, "# CPU Time: ");
673
674 /* If user wanted to see the hashes, do that
675 * separately, outside the timing loop.
676 */
677 if (esl_opt_GetBoolean(go, "-v"))
678 {
679 for (i = 0; i < nkeys; i++)
680 printf("%-20s %9d\n", karr[i], (*hashfunc)(karr[i], hashsize));
681 }
682
683 /* Likewise, if user wanted to see statistical uniformity test...
684 */
685 if (esl_opt_GetBoolean(go, "-s"))
686 {
687 double mean, var, X2, pval;
688
689 ESL_ALLOC(ct, sizeof(int) * hashsize);
690 esl_vec_ISet(ct, hashsize, 0);
691 for (i = 0; i < nkeys; i++) ct[(*hashfunc)(karr[i], hashsize)]++;
692
693 esl_stats_IMean(ct, hashsize, &mean, &var);
694 for (X2 = 0.0, i = 0; i < hashsize; i++)
695 X2 += (((double) ct[i] - mean) * ((double) ct[i] - mean)) / mean;
696
697 esl_stats_ChiSquaredTest(hashsize-1, X2, &pval);
698
699 printf("Number of keys: %d\n", nkeys);
700 printf("Hash table size: %d\n", hashsize);
701 printf("Mean hash occupancy: %.2f\n", mean);
702 printf("Minimum: %d\n", esl_vec_IMin(ct, hashsize));
703 printf("Maximum: %d\n", esl_vec_IMax(ct, hashsize));
704 printf("Variance: %.2f\n", var);
705 printf("Chi-squared: %.2f\n", X2);
706 printf("Chi-squared p-value: %.4f\n", pval);
707 }
708
709
710 /* 3. cleanup, exit. */
711 for (i = 0; i < nkeys; i++) free(karr[i]);
712 free(karr);
713 esl_stopwatch_Destroy(w);
714 esl_getopts_Destroy(go);
715 return 0;
716
717 ERROR:
718 return status;
719 }
720 #endif /*eslKEYHASH_BENCHMARK2*/
721
722
723 /*------------------- end, benchmark drivers --------------------*/
724
725
726 /*****************************************************************
727 * 5. Unit tests
728 *****************************************************************/
729 #ifdef eslKEYHASH_TESTDRIVE
730 #include "esl_matrixops.h"
731 #include "esl_random.h"
732
733 /* utest_stringkeys()
734 * store a bunch of keys, look up a bunch of keys.
735 */
736 static void
utest_stringkeys(void)737 utest_stringkeys(void)
738 {
739 char msg[] = "keyhash stringkeys test failed";
740 ESL_RANDOMNESS *rng = esl_randomness_Create(42); // fixed RNG seed is deliberate. this test uses identical sample every time.
741 ESL_KEYHASH *kh = esl_keyhash_Create();
742 int nstore = 1200;
743 int nlookup = 1200;
744 int keylen = 2;
745 char **keys = esl_mat_CCreate(nstore+nlookup, keylen+1);
746 int nfound = 0;
747 int nmissed = 0;
748 int nk;
749 int i,j,h,h42;
750 int status;
751
752 /* Generate 2400 random k=2 keys "aa".."zz", 26^2 = 676 possible.
753 * Store the first 1200, search on remaining 1200.
754 * At 1.775x saturation, expect Poisson P(0) = 17% miss rate,
755 * so we will exercise both hits and misses on lookup. With the
756 * fixed 42 seed, <nmissed> comes out to be 209 (17.4%).
757 */
758 for (i = 0; i < nstore+nlookup; i++)
759 {
760 for (j = 0; j < keylen; j++)
761 keys[i][j] = 'a' + esl_rnd_Roll(rng, 26);
762 keys[i][j] = '\0';
763 }
764 /* Spike a known one, "XX", at key 42. */
765 keys[42][0] = keys[42][1] = 'X';
766
767 /* Store the first 1200. */
768 nk = 0;
769 for (i = 0; i < nstore; i++)
770 {
771 status = esl_keyhash_Store(kh, keys[i], -1, &h);
772 if (status == eslOK ) { if (h != nk) esl_fatal(msg); nk++; }
773 else if (status == eslEDUP) { if (h >= nk) esl_fatal(msg); }
774 else esl_fatal(msg);
775
776 if (i == 42) { h42 = h; } // remember where key 42 "XX" went.
777 }
778
779 /* Lookups */
780 for (i = nstore; i < nstore+nlookup; i++)
781 {
782 status = esl_keyhash_Lookup(kh, keys[i], -1, &h);
783 if (status == eslOK) { nfound++; if (h < 0 || h >= nk) esl_fatal(msg); }
784 else if (status == eslENOTFOUND) { nmissed++; if (h != -1) esl_fatal(msg); }
785 }
786 if ( esl_keyhash_Lookup(kh, "XX", -1, &h) != eslOK || h != h42) esl_fatal(msg);
787
788 if (nmissed != 209) esl_fatal(msg); // 209 is what you get with seed=42.
789
790 esl_mat_CDestroy(keys);
791 esl_keyhash_Destroy(kh);
792 esl_randomness_Destroy(rng);
793 }
794
795
796 /* utest_memkeys()
797 * Ditto, but now with arrays as keys (without \0-termination)
798 */
799 static void
utest_memkeys(void)800 utest_memkeys(void)
801 {
802 char msg[] = "keyhash memkeys test failed";
803 ESL_RANDOMNESS *rng = esl_randomness_Create(42);
804 ESL_KEYHASH *kh = esl_keyhash_Create();
805 int nstore = 1200;
806 int nlookup = 1200;
807 int keylen = 2;
808 char **keys = esl_mat_CCreate(nstore+nlookup, keylen);
809 int nfound = 0;
810 int nmissed = 0;
811 int nk;
812 int i,h,h42;
813 int status;
814
815 for (i = 0; i < (nstore+nlookup)*keylen; i++)
816 keys[0][i] = 'a' + esl_rnd_Roll(rng, 26); // chumminess with easel's 1D layout of a 2D array
817 keys[42][0] = keys[42][1] = 'X';
818
819 nk = 0;
820 for (i = 0; i < nstore; i++)
821 {
822 status = esl_keyhash_Store(kh, keys[i], keylen, &h);
823 if (status == eslOK ) { if (h != nk) esl_fatal(msg); nk++; }
824 else if (status == eslEDUP) { if (h >= nk) esl_fatal(msg); }
825 else esl_fatal(msg);
826 if (i == 42) { h42 = h; }
827 }
828
829 for (i = nstore; i < nstore+nlookup; i++)
830 {
831 status = esl_keyhash_Lookup(kh, keys[i], keylen, &h);
832 if (status == eslOK) { nfound++; if (h < 0 || h >= nk) esl_fatal(msg); }
833 else if (status == eslENOTFOUND) { nmissed++; if (h != -1) esl_fatal(msg); }
834 }
835 if ( esl_keyhash_Lookup(kh, keys[42], keylen, &h) != eslOK || h != h42) esl_fatal(msg);
836 if (nmissed != 209) esl_fatal(msg);
837
838 esl_mat_CDestroy(keys);
839 esl_keyhash_Destroy(kh);
840 esl_randomness_Destroy(rng);
841 }
842 #endif /*esl_KEYHASH_TESTDRIVE*/
843
844 /*---------------------- end, unit tests ------------------------*/
845
846 /*****************************************************************
847 * 6. Test driver
848 *****************************************************************/
849 #ifdef eslKEYHASH_TESTDRIVE
850 #include "esl_config.h"
851
852 #include <stdlib.h>
853 #include <stdio.h>
854
855 #include "easel.h"
856 #include "esl_getopts.h"
857
858
859 static ESL_OPTIONS options[] = {
860 /* name type default env range togs reqs incomp help docgrp */
861 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
862 { 0,0,0,0,0,0,0,0,0,0},
863 };
864 static char usage[] = "[-options]";
865 static char banner[] = "test driver for keyhash module";
866
867
868 int
main(int argc,char ** argv)869 main(int argc, char **argv)
870 {
871 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
872
873 fprintf(stderr, "## %s\n", argv[0]);
874
875 utest_stringkeys();
876 utest_memkeys();
877
878 fprintf(stderr, "# status = ok\n");
879 esl_getopts_Destroy(go);
880 return eslOK;
881 }
882 #endif /*eslKEYHASH_TESTDRIVE*/
883
884 /*--------------------- end, test driver ------------------------*/
885
886
887
888 /*****************************************************************
889 * 7. Example
890 *****************************************************************/
891 #ifdef eslKEYHASH_EXAMPLE
892 /*::cexcerpt::keyhash_example::begin::*/
893 /* gcc -g -Wall -o keyhash_example -I. -DeslKEYHASH_EXAMPLE esl_keyhash.c easel.c
894 * ./example /usr/share/dict/words /usr/share/dict/words
895 */
896 #include <stdio.h>
897 #include "easel.h"
898 #include "esl_keyhash.h"
899
900 int
main(int argc,char ** argv)901 main(int argc, char **argv)
902 {
903 ESL_KEYHASH *h = esl_keyhash_Create();
904 FILE *fp;
905 char buf[256];
906 char *s, *tok;
907 int idx;
908 int nstored, nsearched, nshared;
909
910 /* Read/store keys from file 1. */
911 if ((fp = fopen(argv[1], "r")) == NULL) esl_fatal("couldn't open %s\n", argv[1]);
912 nstored = 0;
913 while (fgets(buf, 256, fp) != NULL)
914 {
915 s = buf;
916 esl_strtok(&s, " \t\r\n", &tok);
917 esl_keyhash_Store(h, tok, -1, &idx);
918 nstored++;
919 }
920 fclose(fp);
921 printf("Stored %d keys.\n", nstored);
922
923 /* Look up keys from file 2. */
924 if ((fp = fopen(argv[2], "r")) == NULL) esl_fatal("couldn't open %s\n", argv[1]);
925 nsearched = nshared = 0;
926 while (fgets(buf, 256, fp) != NULL)
927 {
928 s = buf;
929 esl_strtok(&s, " \t\r\n", &tok);
930 if (esl_keyhash_Lookup(h, tok, -1, &idx) == eslOK) nshared++;
931 nsearched++;
932 }
933 fclose(fp);
934 printf("Looked up %d keys.\n", nsearched);
935 printf("In common: %d keys.\n", nshared);
936 esl_keyhash_Destroy(h);
937 return 0;
938 }
939 /*::cexcerpt::keyhash_example::end::*/
940 #endif /*eslKEYHASH_EXAMPLE*/
941 /*----------------------- end, example --------------------------*/
942