1 /* Partial emulation of Perl hashes (associative arrays),
2  * mapping keys (ASCII char strings) to array indices.
3  *
4  * Contents:
5  *    1. The <ESL_KEYHASH> object.
6  *    2. Storing and retrieving keys.
7  *    3. Internal functions.
8  *    4. Benchmark drivers.
9  *    5. Unit tests.
10  *    6. Test driver.
11  *    7. Example.
12  */
13 #include "esl_config.h"
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <limits.h>
19 
20 #include "easel.h"
21 #include "esl_mem.h"
22 
23 #include "esl_keyhash.h"
24 
25 static ESL_KEYHASH *keyhash_create(uint32_t hashsize, int init_key_alloc, int init_string_alloc);
26 static uint32_t     jenkins_hash(const char *key, esl_pos_t n, uint32_t hashsize);
27 static int          key_upsize(ESL_KEYHASH *kh);
28 
29 
30 /*****************************************************************
31  *# 1. The <ESL_KEYHASH> object
32  *****************************************************************/
33 
34 /* Function:  esl_keyhash_Create()
35  * Synopsis:  Allocates a new keyhash.
36  *
37  * Purpose:   Create a new hash table for key indexing, and returns
38  *            a pointer to it.
39  *
40  * Throws:    <NULL> on allocation failure.
41  *
42  * Note:      128*sizeof(int)*3 + 2048*sizeof(char) + sizeof(ESL_KEYHASH):
43  *            about 2400 bytes for an initial KEYHASH.
44  */
45 ESL_KEYHASH *
esl_keyhash_Create(void)46 esl_keyhash_Create(void)
47 {
48   return keyhash_create(128,   /* initial hash table size (power of 2)              */
49 			128,   /* initial alloc for up to 128 keys                  */
50 			2048); /* initial alloc for keys totalling up to 2048 chars */
51 }
52 
53 
54 /* Function:  esl_keyhash_CreateCustom()
55  * Synopsis:  Allocate a new keyhash with customized initial allocations.
56  *
57  * Purpose:   Create a new hash table, initially allocating for
58  *            a hash table of size <hashsize> entries, <kalloc>
59  *            keys, and a total key string length of <salloc>.
60  *            <hashsize> must be a power of 2, and all allocations
61  *            must be $\geq 0$.
62  *
63  *            The object will still expand as needed, so the reason to
64  *            use a customized allocation is when you're trying to
65  *            minimize memory footprint and you expect your keyhash to
66  *            be smaller than the default (of up to 128 keys, of total
67  *            length up to 2048).
68  *
69  * Throws:    <NULL> on allocation failure.
70  */
71 ESL_KEYHASH *
esl_keyhash_CreateCustom(uint32_t hashsize,int kalloc,int salloc)72 esl_keyhash_CreateCustom(uint32_t hashsize, int kalloc, int salloc)
73 {
74   ESL_DASSERT1((hashsize && ((hashsize & (hashsize-1)) == 0))); /* hashsize is a power of 2 (bitshifting trickery) */
75   return keyhash_create(hashsize, kalloc, salloc);
76 }
77 
78 /* Function:  esl_keyhash_Clone()
79  * Synopsis:  Duplicates a keyhash.
80  *
81  * Purpose:   Allocates and duplicates a keyhash <kh>. Returns a
82  *            pointer to the duplicate.
83  *
84  * Throws:    <NULL> on allocation failure.
85  */
86 ESL_KEYHASH *
esl_keyhash_Clone(const ESL_KEYHASH * kh)87 esl_keyhash_Clone(const ESL_KEYHASH *kh)
88 {
89   ESL_KEYHASH *nw;
90   int          h;
91 
92   if ((nw = keyhash_create(kh->hashsize, kh->kalloc, kh->salloc)) == NULL) goto ERROR;
93 
94   for (h = 0; h < kh->hashsize; h++)
95     nw->hashtable[h] = kh->hashtable[h];
96 
97   for (h = 0; h < kh->nkeys; h++)
98     {
99       nw->nxt[h]        = kh->nxt[h];
100       nw->key_offset[h] = kh->key_offset[h];
101     }
102   nw->nkeys = kh->nkeys;
103 
104   memcpy(nw->smem, kh->smem, sizeof(char) * kh->sn);
105   nw->sn = kh->sn;
106   return nw;
107 
108  ERROR:
109   esl_keyhash_Destroy(nw);
110   return NULL;
111 }
112 
113 
114 /* Function:  esl_keyhash_Get()
115  * Synopsis:  Returns a key name, given its index.
116  *
117  * Purpose:   Returns a pointer to the key name associated
118  *            with index <idx>. The key name is a <NUL>-terminated
119  *            string whose memory is managed internally in
120  *            the keyhash <kh>.
121  */
122 char *
esl_keyhash_Get(const ESL_KEYHASH * kh,int idx)123 esl_keyhash_Get(const ESL_KEYHASH *kh, int idx)
124 {
125   return kh->smem + kh->key_offset[idx];
126 }
127 
128 /* Function:  esl_keyhash_GetNumber()
129  * Synopsis:  Returns the total number of keys stored.
130  *
131  * Purpose:   Returns the total number of keys currently stored in the
132  *            keyhash <kh>.
133  */
134 int
esl_keyhash_GetNumber(const ESL_KEYHASH * kh)135 esl_keyhash_GetNumber(const ESL_KEYHASH *kh)
136 {
137   return kh->nkeys;
138 }
139 
140 /* Function:  esl_keyhash_Sizeof()
141  * Synopsis:  Returns the size of a keyhash object, in bytes.
142  */
143 size_t
esl_keyhash_Sizeof(const ESL_KEYHASH * kh)144 esl_keyhash_Sizeof(const ESL_KEYHASH *kh)
145 {
146   size_t n = 0;
147 
148   if (kh)
149     {
150       n += sizeof(ESL_KEYHASH);
151       n += sizeof(int)  * kh->hashsize;
152       n += sizeof(int)  * kh->kalloc * 2;
153       n += sizeof(char) * kh->salloc;
154     }
155   return n;
156 }
157 
158 /* Function:  esl_keyhash_Reuse()
159  * Synopsis:  Recycle a keyhash.
160  *
161  * Purpose:   Empties keyhash <kh> so it can be reused without
162  *            creating a new one.
163  *
164  * Returns:   <eslOK> on success.
165  */
166 int
esl_keyhash_Reuse(ESL_KEYHASH * kh)167 esl_keyhash_Reuse(ESL_KEYHASH *kh)
168 {
169   int i;
170 
171   for (i = 0; i < kh->hashsize; i++) kh->hashtable[i] = -1;
172   kh->nkeys = 0;
173   kh->sn = 0;
174   return eslOK;
175 }
176 
177 
178 
179 /* Function:  esl_keyhash_Destroy()
180  * Synopsis:  Frees a keyhash.
181  *
182  * Purpose:   Destroys <kh>.
183  *
184  * Returns:   (void)
185  */
186 void
esl_keyhash_Destroy(ESL_KEYHASH * kh)187 esl_keyhash_Destroy(ESL_KEYHASH *kh)
188 {
189   if (kh == NULL) return;
190   if (kh->hashtable  != NULL) free(kh->hashtable);
191   if (kh->key_offset != NULL) free(kh->key_offset);
192   if (kh->nxt        != NULL) free(kh->nxt);
193   if (kh->smem       != NULL) free(kh->smem);
194   free(kh);
195 }
196 
197 /* Function:  esl_keyhash_Dump()
198  * Synopsis:  Dumps debugging information about a keyhash.
199  *
200  * Purpose:   Mainly for debugging purposes. Dump
201  *            some information about the hash table <kh>
202  *            to the stream <fp>, which might be stderr
203  *            or stdout.
204  */
205 void
esl_keyhash_Dump(FILE * fp,const ESL_KEYHASH * kh)206 esl_keyhash_Dump(FILE *fp, const ESL_KEYHASH *kh)
207 {
208   int idx;
209   int h;
210   int nkeys;
211   int nempty  = 0;
212   int maxkeys = -1;
213   int minkeys = INT_MAX;
214 
215   for (h = 0; h < kh->hashsize; h++)
216     {
217       for (nkeys = 0, idx = kh->hashtable[h]; idx != -1; idx = kh->nxt[idx]) nkeys++;
218 
219       if (nkeys == 0)      nempty++;
220       if (nkeys > maxkeys) maxkeys = nkeys;
221       if (nkeys < minkeys) minkeys = nkeys;
222     }
223 
224   fprintf(fp, "Total keys:             %d\n", kh->nkeys);
225   fprintf(fp, "Hash table size:        %d\n", kh->hashsize);
226   fprintf(fp, "Average occupancy:      %.2f\n", (float) kh->nkeys /(float) kh->hashsize);
227   fprintf(fp, "Unoccupied slots:       %d\n", nempty);
228   fprintf(fp, "Most in one slot:       %d\n", maxkeys);
229   fprintf(fp, "Least in one slot:      %d\n", minkeys);
230   fprintf(fp, "Keys allocated for:     %d\n", kh->kalloc);
231   fprintf(fp, "Key string space alloc: %d\n", kh->salloc);
232   fprintf(fp, "Key string space used:  %d\n", kh->sn);
233   fprintf(fp, "Total obj size, bytes:  %d\n", (int) esl_keyhash_Sizeof(kh));
234 }
235 /*--------------- end, <ESL_KEYHASH> object ---------------------*/
236 
237 
238 
239 
240 /*****************************************************************
241  *# 2. Storing and retrieving keys
242  *****************************************************************/
243 
244 /* Function: esl_keyhash_Store()
245  * Synopsis: Store a key and get a key index for it.
246  *
247  * Purpose:  Store a string (or mem) <key> of length <n> in the key
248  *           index hash table <kh>.  Associate it with a unique key
249  *           index, counting from 0. This index maps hashed keys to
250  *           integer-indexed C arrays, clumsily emulating hashes or
251  *           associative arrays. Optionally returns the index through
252  *           <opt_index>.
253  *
254  *           <key>, <n> follow the standard idiom for strings and
255  *           unterminated buffers. If <key> is raw memory, <n> must
256  *           be provided; if <key> is a \0-terminated string, <n>
257  *           may be -1.
258  *
259  * Returns:  <eslOK> on success; stores <key> in <kh>; <opt_index> is
260  *           returned, set to the next higher index value.
261  *           Returns <eslEDUP> if <key> was already stored in the table;
262  *           <opt_index> is set to the existing index for <key>.
263  *
264  * Throws:   <eslEMEM> on allocation failure, and sets <opt_index> to -1.
265  */
266 int
esl_keyhash_Store(ESL_KEYHASH * kh,const char * key,esl_pos_t n,int * opt_index)267 esl_keyhash_Store(ESL_KEYHASH *kh, const char *key, esl_pos_t n, int *opt_index)
268 {
269   uint32_t val = jenkins_hash(key, n, kh->hashsize);
270   int idx;
271   int status;
272 
273   if (n == -1) n = strlen(key);
274 
275   /* Was this key already stored?  */
276   for (idx = kh->hashtable[val]; idx != -1; idx = kh->nxt[idx])
277     if (esl_memstrcmp(key, n, kh->smem + kh->key_offset[idx]))
278       {
279 	if (opt_index) *opt_index = idx;
280 	return eslEDUP;
281       }
282 
283   /* Reallocate key ptr/index memory if needed */
284   if (kh->nkeys == kh->kalloc)
285     {
286       ESL_REALLOC(kh->key_offset, sizeof(int)*kh->kalloc*2);
287       ESL_REALLOC(kh->nxt,        sizeof(int)*kh->kalloc*2);
288       kh->kalloc *= 2;
289     }
290 
291   /* Reallocate key string memory if needed */
292   while (kh->sn + n + 1 > kh->salloc)
293     {
294       ESL_REALLOC(kh->smem, sizeof(char) * kh->salloc * 2);
295       kh->salloc *= 2;
296     }
297 
298   /* Copy the key, assign its index */
299   idx                 = kh->nkeys;
300   kh->key_offset[idx] = kh->sn;
301   kh->sn             += n+1;
302   esl_memstrcpy(key, n, kh->smem + kh->key_offset[idx]);
303   kh->nkeys++;
304 
305   /* Insert new element at head of the approp linked list in hashtable */
306   kh->nxt[idx]       = kh->hashtable[val];
307   kh->hashtable[val] = idx;
308 
309   /* Time to upsize? If we're 3x saturated, expand the hash table */
310   if (kh->nkeys > 3*kh->hashsize)
311     if ((status = key_upsize(kh)) != eslOK) goto ERROR;
312 
313   if (opt_index != NULL) *opt_index = idx;
314   return eslOK;
315 
316  ERROR:
317   if (opt_index != NULL) *opt_index = -1;
318   return status;
319 }
320 
321 /* Function:  esl_keyhash_Lookup()
322  * Synopsis:  Look up a key's array index.
323  *
324  * Purpose:   Look up string or mem <key> of length <n> in hash table <kh>.
325  *            If <key> is found, return <eslOK>, and optionally set <*opt_index>
326  *            to its array index (0..nkeys-1).
327  *            If <key> is not found, return <eslENOTFOUND>, and
328  *            optionally set <*opt_index> to -1.
329  *
330  *            If <key> is a \0-terminated string, <n> may be -1.
331  */
332 int
esl_keyhash_Lookup(const ESL_KEYHASH * kh,const char * key,esl_pos_t n,int * opt_index)333 esl_keyhash_Lookup(const ESL_KEYHASH *kh, const char *key, esl_pos_t n, int *opt_index)
334 {
335   uint32_t val  = jenkins_hash(key, n, kh->hashsize);
336   int      idx;
337 
338   if (n == -1)
339     {
340       for (idx = kh->hashtable[val]; idx != -1; idx = kh->nxt[idx])
341 	if (strcmp(key, kh->smem + kh->key_offset[idx]) == 0)
342 	  {
343 	    if (opt_index) *opt_index = idx;
344 	    return eslOK;
345 	  }
346     }
347   else
348     {
349       for (idx = kh->hashtable[val]; idx != -1; idx = kh->nxt[idx])
350 	if (esl_memstrcmp(key, n, kh->smem + kh->key_offset[idx]))
351 	  {
352 	    if (opt_index) *opt_index = idx;
353 	    return eslOK;
354 	  }
355     }
356 
357   if (opt_index != NULL) *opt_index = -1;
358   return eslENOTFOUND;
359 }
360 
361 
362 /*---------- end, API for storing/retrieving keys ---------------*/
363 
364 
365 
366 
367 /*****************************************************************
368  * 3. Internal functions
369  *****************************************************************/
370 
371 /* keyhash_create()
372  *
373  * The real creation function, which takes arguments for memory sizes.
374  * This is abstracted to a static function because it's used by both
375  * Create() and Clone() but slightly differently.
376  *
377  * Args:  hashsize          - size of hash table; this must be a power of two.
378  *        init_key_alloc    - initial allocation for # of keys.
379  *        init_string_alloc - initial allocation for total size of key strings.
380  *
381  * Returns:  An allocated hash table structure; or NULL on failure.
382  */
383 ESL_KEYHASH *
keyhash_create(uint32_t hashsize,int init_key_alloc,int init_string_alloc)384 keyhash_create(uint32_t hashsize, int init_key_alloc, int init_string_alloc)
385 {
386   ESL_KEYHASH *kh = NULL;
387   int  i;
388   int  status;
389 
390   ESL_ALLOC(kh, sizeof(ESL_KEYHASH));
391   kh->hashtable  = NULL;
392   kh->key_offset = NULL;
393   kh->nxt        = NULL;
394   kh->smem       = NULL;
395 
396   kh->hashsize  = hashsize;
397   kh->kalloc    = init_key_alloc;
398   kh->salloc    = init_string_alloc;
399 
400   ESL_ALLOC(kh->hashtable, sizeof(int) * kh->hashsize);
401   for (i = 0; i < kh->hashsize; i++)  kh->hashtable[i] = -1;
402 
403   ESL_ALLOC(kh->key_offset, sizeof(int) * kh->kalloc);
404   ESL_ALLOC(kh->nxt,        sizeof(int) * kh->kalloc);
405   for (i = 0; i < kh->kalloc; i++)  kh->nxt[i] = -1;
406 
407   ESL_ALLOC(kh->smem,   sizeof(char) * kh->salloc);
408   kh->nkeys = 0;
409   kh->sn    = 0;
410   return kh;
411 
412  ERROR:
413   esl_keyhash_Destroy(kh);
414   return NULL;
415 }
416 
417 
418 /* jenkins_hash()
419  *
420  * The hash function.
421  * This is Bob Jenkins' "one at a time" hash.
422  * <key> is a NUL-terminated string of any length.
423  * <hashsize> must be a power of 2.
424  *
425  * References:
426  * [1]  http://en.wikipedia.org/wiki/Hash_table
427  * [2]  http://www.burtleburtle.net/bob/hash/doobs.html
428  */
429 static uint32_t
jenkins_hash(const char * key,esl_pos_t n,uint32_t hashsize)430 jenkins_hash(const char *key, esl_pos_t n, uint32_t hashsize)
431 {
432   esl_pos_t pos;
433   uint32_t  val = 0;
434 
435   if (n == -1)
436     { /* string version */
437       for (; *key != '\0'; key++)
438 	{
439 	  val += *key;
440 	  val += (val << 10);
441 	  val ^= (val >>  6);
442 	}
443     }
444   else
445     { /* buffer version */
446       for (pos = 0; pos < n; pos++)
447       {
448 	val += key[pos];
449 	val += (val << 10);
450 	val ^= (val >>  6);
451       }
452     }
453   val += (val <<  3);
454   val ^= (val >> 11);
455   val += (val << 15);
456 
457   return (val & (hashsize - 1));
458 }
459 
460 /* key_upsize()
461  *
462  * Grow the hash table to the next available size.
463  *
464  * Args:     old - the KEY hash table to reallocate.
465  *
466  * Returns:  <eslOK> on success. 'Success' includes the case
467  *           where the hash table is already at its maximum size,
468  *           and cannot be upsized any more.
469  *
470  * Throws:   <eslEMEM> on allocation failure, and
471  *           the hash table is left in its initial state.
472  */
473 static int
key_upsize(ESL_KEYHASH * kh)474 key_upsize(ESL_KEYHASH *kh)
475 {
476   void     *p;
477   int       i;
478   uint32_t  val;
479   int       status;
480 
481   /* 28 below because we're going to upsize in steps of 8x (2^3); need to be < 2^{31-3} */
482   if (kh->hashsize >= (1<<28)) return eslOK; /* quasi-success (can't grow any more)    */
483 
484   /* The catch here is that when you upsize the table, all the hash functions
485    * change; so you have to go through all the keys, recompute their hash functions,
486    * and store them again in the new table.
487    */
488   /* Allocate a new, larger hash table. (Don't change <kh> until this succeeds) */
489   ESL_RALLOC(kh->hashtable, p, sizeof(int) * (kh->hashsize << 3));
490   kh->hashsize  = kh->hashsize << 3; /* 8x */
491   for (i = 0; i < kh->hashsize; i++) kh->hashtable[i] = -1;
492 
493   /* Store all the keys again. */
494   for (i = 0; i < kh->nkeys; i++)
495     {
496       val                = jenkins_hash(kh->smem + kh->key_offset[i], -1, kh->hashsize);
497       kh->nxt[i]         = kh->hashtable[val];
498       kh->hashtable[val] = i;
499     }
500   return eslOK;
501 
502  ERROR:
503   return eslEMEM;
504 }
505 /*--------------- end, internal functions -----------------*/
506 
507 
508 /*****************************************************************
509  * 4. Benchmark driver
510  *****************************************************************/
511 #ifdef eslKEYHASH_BENCHMARK
512 /*
513    gcc -g -O2 -o keyhash_benchmark -I. -L. -DeslKEYHASH_BENCHMARK esl_keyhash.c -leasel -lm
514    time ./keyhash_benchmark /usr/share/dict/words /usr/share/dict/words
515  */
516 #include "esl_config.h"
517 
518 #include <stdio.h>
519 
520 #include "easel.h"
521 #include "esl_getopts.h"
522 #include "esl_keyhash.h"
523 #include "esl_stopwatch.h"
524 
525 static ESL_OPTIONS options[] = {
526   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
527   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
528   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
529 };
530 static char usage[]  = "[-options] <keyfile1> <keyfile2>";
531 static char banner[] = "benchmarking speed of keyhash module";
532 
533 int
main(int argc,char ** argv)534 main(int argc, char **argv)
535 {
536   ESL_GETOPTS    *go      = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
537   ESL_KEYHASH    *kh      = esl_keyhash_Create();
538   ESL_STOPWATCH  *w       = esl_stopwatch_Create();
539   char           *file1   = esl_opt_GetArg(go, 1);
540   char           *file2   = esl_opt_GetArg(go, 2);
541   FILE           *fp;
542   char            buf[256];
543   char           *s, *tok;
544   int             idx;
545   int             nstored, nsearched, nshared;
546 
547   /* Read/store keys from file 1.
548    */
549   esl_stopwatch_Start(w);
550   if ((fp = fopen(file1, "r")) == NULL)
551     { fprintf(stderr, "couldn't open %s\n", argv[1]); exit(1); }
552   nstored = 0;
553   while (fgets(buf, 256, fp) != NULL)
554     {
555       s = buf;
556       esl_strtok(&s, " \t\r\n", &tok);
557       esl_keyhash_Store(kh, tok, -1, &idx);
558       nstored++;
559     }
560   fclose(fp);
561   printf("Stored %d keys.\n", nstored);
562 
563   /* Look up keys from file 2.
564    */
565   if ((fp = fopen(file2, "r")) == NULL)
566     { fprintf(stderr, "couldn't open %s\n", argv[2]); exit(1); }
567   nsearched = nshared = 0;
568   while (fgets(buf, 256, fp) != NULL)
569     {
570       s = buf;
571       esl_strtok(&s, " \t\r\n", &tok);
572 
573       if (esl_keyhash_Lookup(kh, tok, -1, &idx) == eslOK) nshared++;
574       nsearched++;
575     }
576   fclose(fp);
577   esl_stopwatch_Stop(w);
578   printf("Looked up %d keys.\n", nsearched);
579   printf("In common: %d keys.\n", nshared);
580   esl_stopwatch_Display(stdout, w, "# CPU Time: ");
581 
582   esl_stopwatch_Destroy(w);
583   esl_keyhash_Destroy(kh);
584   esl_getopts_Destroy(go);
585   return 0;
586 }
587 #endif /*eslKEYHASH_BENCHMARK*/
588 
589 
590 
591 #ifdef eslKEYHASH_BENCHMARK2
592 
593 /* Benchmark #2 is a benchmark just of the hash function.
594  * First we read in a bunch of keys from any file, one key per line.
595  * Then we start timing, and compute a hash for each key.
596  */
597 
598 /* gcc -O2 -o keyhash_benchmark2 -I. -L. -DeslKEYHASH_BENCHMARK2 esl_keyhash.c -leasel -lm
599  * ./keyhash_benchmark2 <keyfile>
600  */
601 #include "esl_config.h"
602 
603 #include <stdio.h>
604 #include <math.h>
605 
606 #include "easel.h"
607 #include "esl_fileparser.h"
608 #include "esl_getopts.h"
609 #include "esl_keyhash.h"
610 #include "esl_stats.h"
611 #include "esl_stopwatch.h"
612 #include "esl_vectorops.h"
613 
614 static ESL_OPTIONS options[] = {
615   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
616   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
617   { "-s",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show statistical test for hash uniformity",        0 },
618   { "-v",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "be verbose: print hash values for keys",           0 },
619   { "-x",        eslARG_INT,   "32768", NULL, NULL,  NULL,  NULL, NULL, "set hash table size to <n>",                       0 },
620   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
621 };
622 static char usage[]  = "[-options] <keyfile>";
623 static char banner[] = "benchmarking speed of hash function in keyhash module";
624 
625 int
main(int argc,char ** argv)626 main(int argc, char **argv)
627 {
628   ESL_GETOPTS    *go         = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
629   ESL_FILEPARSER *efp        = NULL;
630   ESL_STOPWATCH  *w          = esl_stopwatch_Create();
631   ESL_KEYHASH    *kh         = esl_keyhash_Create();
632   char           *keyfile    = esl_opt_GetArg(go, 1);
633   uint32_t        hashsize   = esl_opt_GetInteger(go, "-x");
634   char           *key;
635   int             keylen;
636   char          **karr       = NULL;
637   int             kalloc;
638   int            *ct         = NULL;
639   int             nkeys;
640   int             i;
641   int             status;
642   uint32_t (*hashfunc)(const char*,uint32_t) = jenkins_hash;
643 
644   /* 1. Store the keys from the file, before starting the benchmark timer. */
645   kalloc = 256;
646   ESL_ALLOC(karr, sizeof(char *) * kalloc);
647 
648   if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) esl_fatal("Failed to open key file %s\n", keyfile);
649 
650   nkeys = 0;
651   while (esl_fileparser_NextLine(efp) == eslOK)
652     {
653       if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK) esl_fatal("Failure in parsing key file\n");
654 
655       if (nkeys == kalloc) {
656 	void *tmp;
657 	ESL_RALLOC(karr, tmp, sizeof(char *) * kalloc * 2);
658 	kalloc *= 2;
659       }
660 
661       esl_strdup(key, keylen, &(karr[nkeys]));
662       nkeys++;
663     }
664   esl_fileparser_Close(efp);
665   /* and karr[0..nkeys-1] are now the keys. */
666 
667 
668   /* 2. benchmark hashing the keys. */
669   esl_stopwatch_Start(w);
670   for (i = 0; i < nkeys; i++) (*hashfunc)(karr[i], hashsize);
671   esl_stopwatch_Stop(w);
672   esl_stopwatch_Display(stdout, w, "# CPU Time: ");
673 
674   /* If user wanted to see the hashes, do that
675    * separately, outside the timing loop.
676    */
677   if (esl_opt_GetBoolean(go, "-v"))
678     {
679       for (i = 0; i < nkeys; i++)
680 	printf("%-20s %9d\n", karr[i], (*hashfunc)(karr[i], hashsize));
681     }
682 
683   /* Likewise, if user wanted to see statistical uniformity test...
684    */
685   if (esl_opt_GetBoolean(go, "-s"))
686     {
687       double mean, var, X2, pval;
688 
689       ESL_ALLOC(ct, sizeof(int) * hashsize);
690       esl_vec_ISet(ct, hashsize, 0);
691       for (i = 0; i < nkeys; i++) ct[(*hashfunc)(karr[i], hashsize)]++;
692 
693       esl_stats_IMean(ct, hashsize, &mean, &var);
694       for (X2 = 0.0, i = 0; i < hashsize; i++)
695 	X2 += (((double) ct[i] - mean) *  ((double) ct[i] - mean)) / mean;
696 
697       esl_stats_ChiSquaredTest(hashsize-1, X2, &pval);
698 
699       printf("Number of keys:      %d\n",   nkeys);
700       printf("Hash table size:     %d\n",   hashsize);
701       printf("Mean hash occupancy: %.2f\n", mean);
702       printf("Minimum:             %d\n",   esl_vec_IMin(ct, hashsize));
703       printf("Maximum:             %d\n",   esl_vec_IMax(ct, hashsize));
704       printf("Variance:            %.2f\n", var);
705       printf("Chi-squared:         %.2f\n", X2);
706       printf("Chi-squared p-value: %.4f\n", pval);
707     }
708 
709 
710   /* 3. cleanup, exit. */
711   for (i = 0; i < nkeys; i++) free(karr[i]);
712   free(karr);
713   esl_stopwatch_Destroy(w);
714   esl_getopts_Destroy(go);
715   return 0;
716 
717  ERROR:
718   return status;
719 }
720 #endif /*eslKEYHASH_BENCHMARK2*/
721 
722 
723 /*------------------- end, benchmark drivers --------------------*/
724 
725 
726 /*****************************************************************
727  * 5. Unit tests
728  *****************************************************************/
729 #ifdef eslKEYHASH_TESTDRIVE
730 #include "esl_matrixops.h"
731 #include "esl_random.h"
732 
733 /* utest_stringkeys()
734  * store a bunch of keys, look up a bunch of keys.
735  */
736 static void
utest_stringkeys(void)737 utest_stringkeys(void)
738 {
739   char            msg[]   = "keyhash stringkeys test failed";
740   ESL_RANDOMNESS *rng     = esl_randomness_Create(42);      // fixed RNG seed is deliberate. this test uses identical sample every time.
741   ESL_KEYHASH    *kh      = esl_keyhash_Create();
742   int             nstore  = 1200;
743   int             nlookup = 1200;
744   int             keylen  = 2;
745   char          **keys    = esl_mat_CCreate(nstore+nlookup, keylen+1);
746   int             nfound  = 0;
747   int             nmissed = 0;
748   int             nk;
749   int             i,j,h,h42;
750   int             status;
751 
752   /* Generate 2400 random k=2 keys "aa".."zz", 26^2 = 676 possible.
753    * Store the first 1200, search on remaining 1200.
754    * At 1.775x saturation, expect Poisson P(0) = 17% miss rate,
755    * so we will exercise both hits and misses on lookup. With the
756    * fixed 42 seed, <nmissed> comes out to be 209 (17.4%).
757    */
758   for (i = 0; i < nstore+nlookup; i++)
759     {
760       for (j = 0; j < keylen; j++)
761 	keys[i][j] = 'a' + esl_rnd_Roll(rng, 26);
762       keys[i][j] = '\0';
763     }
764   /* Spike a known one, "XX", at key 42. */
765   keys[42][0] = keys[42][1] = 'X';
766 
767   /* Store the first 1200. */
768   nk = 0;
769   for (i = 0; i < nstore; i++)
770     {
771       status = esl_keyhash_Store(kh, keys[i], -1, &h);
772       if      (status == eslOK )  { if (h != nk) esl_fatal(msg); nk++; }
773       else if (status == eslEDUP) { if (h >= nk) esl_fatal(msg);       }
774       else esl_fatal(msg);
775 
776       if (i == 42) { h42 = h; } // remember where key 42 "XX" went.
777     }
778 
779   /* Lookups */
780   for (i = nstore; i < nstore+nlookup; i++)
781     {
782       status = esl_keyhash_Lookup(kh, keys[i], -1, &h);
783       if      (status == eslOK)        { nfound++;  if (h < 0 || h >= nk) esl_fatal(msg); }
784       else if (status == eslENOTFOUND) { nmissed++; if (h != -1)          esl_fatal(msg); }
785     }
786   if ( esl_keyhash_Lookup(kh, "XX", -1, &h) != eslOK || h != h42) esl_fatal(msg);
787 
788   if (nmissed != 209) esl_fatal(msg);  // 209 is what you get with seed=42.
789 
790   esl_mat_CDestroy(keys);
791   esl_keyhash_Destroy(kh);
792   esl_randomness_Destroy(rng);
793 }
794 
795 
796 /* utest_memkeys()
797  * Ditto, but now with arrays as keys (without \0-termination)
798  */
799 static void
utest_memkeys(void)800 utest_memkeys(void)
801 {
802   char            msg[]   = "keyhash memkeys test failed";
803   ESL_RANDOMNESS *rng     = esl_randomness_Create(42);
804   ESL_KEYHASH    *kh      = esl_keyhash_Create();
805   int             nstore  = 1200;
806   int             nlookup = 1200;
807   int             keylen  = 2;
808   char          **keys    = esl_mat_CCreate(nstore+nlookup, keylen);
809   int             nfound  = 0;
810   int             nmissed = 0;
811   int             nk;
812   int             i,h,h42;
813   int             status;
814 
815   for (i = 0; i < (nstore+nlookup)*keylen; i++)
816     keys[0][i] = 'a' + esl_rnd_Roll(rng, 26);   // chumminess with easel's 1D layout of a 2D array
817   keys[42][0] = keys[42][1] = 'X';
818 
819   nk = 0;
820   for (i = 0; i < nstore; i++)
821     {
822       status = esl_keyhash_Store(kh, keys[i], keylen, &h);
823       if      (status == eslOK )  { if (h != nk) esl_fatal(msg); nk++; }
824       else if (status == eslEDUP) { if (h >= nk) esl_fatal(msg);       }
825       else esl_fatal(msg);
826       if (i == 42) { h42 = h; }
827     }
828 
829   for (i = nstore; i < nstore+nlookup; i++)
830     {
831       status = esl_keyhash_Lookup(kh, keys[i], keylen, &h);
832       if      (status == eslOK)        { nfound++;  if (h < 0 || h >= nk) esl_fatal(msg); }
833       else if (status == eslENOTFOUND) { nmissed++; if (h != -1)          esl_fatal(msg); }
834     }
835   if ( esl_keyhash_Lookup(kh, keys[42], keylen, &h) != eslOK || h != h42) esl_fatal(msg);
836   if (nmissed != 209) esl_fatal(msg);
837 
838   esl_mat_CDestroy(keys);
839   esl_keyhash_Destroy(kh);
840   esl_randomness_Destroy(rng);
841 }
842 #endif /*esl_KEYHASH_TESTDRIVE*/
843 
844 /*---------------------- end, unit tests ------------------------*/
845 
846 /*****************************************************************
847  * 6. Test driver
848  *****************************************************************/
849 #ifdef eslKEYHASH_TESTDRIVE
850 #include "esl_config.h"
851 
852 #include <stdlib.h>
853 #include <stdio.h>
854 
855 #include "easel.h"
856 #include "esl_getopts.h"
857 
858 
859 static ESL_OPTIONS options[] = {
860   /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
861   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",    0},
862   { 0,0,0,0,0,0,0,0,0,0},
863 };
864 static char usage[]  = "[-options]";
865 static char banner[] = "test driver for keyhash module";
866 
867 
868 int
main(int argc,char ** argv)869 main(int argc, char **argv)
870 {
871   ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
872 
873   fprintf(stderr, "## %s\n", argv[0]);
874 
875   utest_stringkeys();
876   utest_memkeys();
877 
878   fprintf(stderr, "#  status = ok\n");
879   esl_getopts_Destroy(go);
880   return eslOK;
881 }
882 #endif /*eslKEYHASH_TESTDRIVE*/
883 
884 /*--------------------- end, test driver ------------------------*/
885 
886 
887 
888 /*****************************************************************
889  * 7. Example
890  *****************************************************************/
891 #ifdef eslKEYHASH_EXAMPLE
892 /*::cexcerpt::keyhash_example::begin::*/
893 /* gcc -g -Wall -o keyhash_example -I. -DeslKEYHASH_EXAMPLE esl_keyhash.c easel.c
894  * ./example /usr/share/dict/words /usr/share/dict/words
895  */
896 #include <stdio.h>
897 #include "easel.h"
898 #include "esl_keyhash.h"
899 
900 int
main(int argc,char ** argv)901 main(int argc, char **argv)
902 {
903   ESL_KEYHASH *h   = esl_keyhash_Create();
904   FILE        *fp;
905   char         buf[256];
906   char        *s, *tok;
907   int          idx;
908   int          nstored, nsearched, nshared;
909 
910   /* Read/store keys from file 1. */
911   if ((fp = fopen(argv[1], "r")) == NULL) esl_fatal("couldn't open %s\n", argv[1]);
912   nstored = 0;
913   while (fgets(buf, 256, fp) != NULL)
914     {
915       s = buf;
916       esl_strtok(&s, " \t\r\n", &tok);
917       esl_keyhash_Store(h, tok, -1, &idx);
918       nstored++;
919     }
920   fclose(fp);
921   printf("Stored %d keys.\n", nstored);
922 
923   /* Look up keys from file 2. */
924   if ((fp = fopen(argv[2], "r")) == NULL) esl_fatal("couldn't open %s\n", argv[1]);
925   nsearched = nshared = 0;
926   while (fgets(buf, 256, fp) != NULL)
927     {
928       s = buf;
929       esl_strtok(&s, " \t\r\n", &tok);
930       if (esl_keyhash_Lookup(h, tok, -1, &idx) == eslOK) nshared++;
931       nsearched++;
932     }
933   fclose(fp);
934   printf("Looked up %d keys.\n", nsearched);
935   printf("In common: %d keys.\n", nshared);
936   esl_keyhash_Destroy(h);
937   return 0;
938 }
939 /*::cexcerpt::keyhash_example::end::*/
940 #endif /*eslKEYHASH_EXAMPLE*/
941 /*----------------------- end, example --------------------------*/
942