1 /*
2  * Copyright (c) 2005-2011, 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifdef HAVE_CONFIG_H
35 #include "io_lib_config.h"
36 #endif
37 
38 #include <stdio.h>
39 #include <string.h>
40 #include <stdlib.h>
41 #include "io_lib/os.h"
42 #include "io_lib/hash_table.h"
43 #include "io_lib/jenkins_lookup3.h"
44 
45 /* =========================================================================
46  * TCL's hash function. Basically hash*9 + char.
47  * =========================================================================
48  */
49 
HashTcl(uint8_t * data,int len)50 uint32_t HashTcl(uint8_t *data, int len) {
51     uint32_t hash = 0;
52     int i;
53 
54     for (i = 0; i < len; i++) {
55 	hash += (hash<<3) + data[i];
56     }
57 
58     return hash;
59 }
60 
61 /* =========================================================================
62  * Paul Hsieh's hash function
63  * http://www.azillionmonkeys.com/qed/hash.html
64  * =========================================================================
65  */
66 
67 #undef get16bits
68 #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
69   || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
70 #define get16bits(d) (*((const uint16_t *) (d)))
71 #endif
72 
73 #if !defined (get16bits)
74 #define get16bits(d) ((((const uint8_t *)(d))[1] << 8UL)\
75                       +((const uint8_t *)(d))[0])
76 #endif
77 
HashHsieh(uint8_t * data,int len)78 uint32_t HashHsieh(uint8_t *data, int len) {
79     uint32_t hash = 0, tmp;
80     int rem;
81 
82     if (len <= 0 || data == NULL) return 0;
83 
84     rem = len & 3;
85     len >>= 2;
86 
87     /* Main loop */
88     for (;len > 0; len--) {
89         hash  += get16bits (data);
90         tmp    = (get16bits (data+2) << 11) ^ hash;
91         hash   = (hash << 16) ^ tmp;
92         data  += 2*sizeof (uint16_t);
93         hash  += hash >> 11;
94     }
95 
96     /* Handle end cases */
97     switch (rem) {
98     case 3: hash += get16bits (data);
99 	hash ^= hash << 16;
100 	hash ^= data[sizeof (uint16_t)] << 18;
101 	hash += hash >> 11;
102 	break;
103     case 2: hash += get16bits (data);
104 	hash ^= hash << 11;
105 	hash += hash >> 17;
106 	break;
107     case 1: hash += *data;
108 	hash ^= hash << 10;
109 	hash += hash >> 1;
110     }
111 
112     /* Force "avalanching" of final 127 bits */
113     hash ^= hash << 3;
114     hash += hash >> 5;
115     hash ^= hash << 2;
116     hash += hash >> 15;
117     hash ^= hash << 10;
118 
119     return hash;
120 }
121 
122 /* =========================================================================
123  * Bob Jenkins' hash function
124  * http://burtleburtle.net/bob/hash/doobs.html
125  *
126  * See jenkins_lookup3.c for a new version of this that has good hash
127  * characteristics for a full 64-bit hash value.
128  * =========================================================================
129  */
130 
131 #define hashsize(n) ((uint32_t)1<<(n))
132 #define hashmask(n) (hashsize(n)-1)
133 
134 /*
135 --------------------------------------------------------------------
136 mix -- mix 3 32-bit values reversibly.
137 For every delta with one or two bits set, and the deltas of all three
138   high bits or all three low bits, whether the original value of a,b,c
139   is almost all zero or is uniformly distributed,
140 * If mix() is run forward or backward, at least 32 bits in a,b,c
141   have at least 1/4 probability of changing.
142 * If mix() is run forward, every bit of c will change between 1/3 and
143   2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
144 mix() was built out of 36 single-cycle latency instructions in a
145   structure that could supported 2x parallelism, like so:
146       a -= b;
147       a -= c; x = (c>>13);
148       b -= c; a ^= x;
149       b -= a; x = (a<<8);
150       c -= a; b ^= x;
151       c -= b; x = (b>>13);
152       ...
153   Unfortunately, superscalar Pentiums and Sparcs can't take advantage
154   of that parallelism.  They've also turned some of those single-cycle
155   latency instructions into multi-cycle latency instructions.  Still,
156   this is the fastest good hash I could find.  There were about 2^^68
157   to choose from.  I only looked at a billion or so.
158 --------------------------------------------------------------------
159 */
160 #define mix(a,b,c) \
161 { \
162   a -= b; a -= c; a ^= (c>>13); \
163   b -= c; b -= a; b ^= (a<<8); \
164   c -= a; c -= b; c ^= (b>>13); \
165   a -= b; a -= c; a ^= (c>>12);  \
166   b -= c; b -= a; b ^= (a<<16); \
167   c -= a; c -= b; c ^= (b>>5); \
168   a -= b; a -= c; a ^= (c>>3);  \
169   b -= c; b -= a; b ^= (a<<10); \
170   c -= a; c -= b; c ^= (b>>15); \
171 }
172 
173 /*
174 --------------------------------------------------------------------
175 hash() -- hash a variable-length key into a 32-bit value
176   k       : the key (the unaligned variable-length array of bytes)
177   len     : the length of the key, counting by bytes
178   initval : can be any 4-byte value
179 Returns a 32-bit value.  Every bit of the key affects every bit of
180 the return value.  Every 1-bit and 2-bit delta achieves avalanche.
181 About 6*len+35 instructions.
182 
183 The best hash table sizes are powers of 2.  There is no need to do
184 mod a prime (mod is sooo slow!).  If you need less than 32 bits,
185 use a bitmask.  For example, if you need only 10 bits, do
186   h = (h & hashmask(10));
187 In which case, the hash table should have hashsize(10) elements.
188 
189 If you are hashing n strings (uint8_t **)k, do it like this:
190   for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
191 
192 By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use this
193 code any way you wish, private, educational, or commercial.  It's free.
194 
195 See http://burtleburtle.net/bob/hash/evahash.html
196 Use for hash table lookup, or anything where one collision in 2^^32 is
197 acceptable.  Do NOT use for cryptographic purposes.
198 --------------------------------------------------------------------
199 */
200 
HashJenkins(uint8_t * k,int length)201 uint32_t HashJenkins(uint8_t *k, int length /*, uint32_t initval */)
202 {
203    register uint32_t a,b,c,len;
204 
205    /* Set up the internal state */
206    len = length;
207    a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
208    c = 0; /* initval; */        /* the previous hash value */
209 
210    /*---------------------------------------- handle most of the key */
211    while (len >= 12)
212    {
213       a += (k[0] +((uint32_t)k[1]<<8) +((uint32_t)k[2]<<16) +((uint32_t)k[3]<<24));
214       b += (k[4] +((uint32_t)k[5]<<8) +((uint32_t)k[6]<<16) +((uint32_t)k[7]<<24));
215       c += (k[8] +((uint32_t)k[9]<<8) +((uint32_t)k[10]<<16)+((uint32_t)k[11]<<24));
216       mix(a,b,c);
217       k += 12; len -= 12;
218    }
219 
220    /*------------------------------------- handle the last 11 bytes */
221    c += length;
222    switch(len)              /* all the case statements fall through */
223    {
224    case 11: c+=((uint32_t)k[10]<<24);
225    case 10: c+=((uint32_t)k[9]<<16);
226    case 9 : c+=((uint32_t)k[8]<<8);
227       /* the first byte of c is reserved for the length */
228    case 8 : b+=((uint32_t)k[7]<<24);
229    case 7 : b+=((uint32_t)k[6]<<16);
230    case 6 : b+=((uint32_t)k[5]<<8);
231    case 5 : b+=k[4];
232    case 4 : a+=((uint32_t)k[3]<<24);
233    case 3 : a+=((uint32_t)k[2]<<16);
234    case 2 : a+=((uint32_t)k[1]<<8);
235    case 1 : a+=k[0];
236      /* case 0: nothing left to add */
237    }
238    mix(a,b,c);
239    /*-------------------------------------------- report the result */
240    return c;
241 }
242 
243 /*
244  * An interface to the above hash functions.
245  * Returns:
246  *    A 32-bit hash key, suitable for masking down to smaller bit sizes
247  */
hash(int func,uint8_t * key,int key_len)248 uint32_t hash(int func, uint8_t *key, int key_len) {
249     switch (func) {
250     case HASH_FUNC_HSIEH:
251 	return HashHsieh(key, key_len);
252 
253     case HASH_FUNC_TCL:
254 	return HashTcl(key, key_len);
255 
256     case HASH_FUNC_JENKINS:
257 	return HashJenkins(key, key_len);
258 
259     case HASH_FUNC_JENKINS3:
260       {
261 	uint32_t pc = 0, pb = 0;
262 	HashJenkins3(key, key_len, &pc, &pb);
263 	return pc;
264       }
265     }
266 
267     return 0;
268 }
269 
270 /*
271  * As per hash() above but returns a 64-bit key. For 32-bit hash functions
272  * this is simply a duplication of the 32-bit value.
273  */
hash64(int func,uint8_t * key,int key_len)274 uint64_t hash64(int func, uint8_t *key, int key_len) {
275     uint32_t pc = 0, pb = 0;
276 
277     switch (func) {
278     case HASH_FUNC_HSIEH:
279 	pb = pc = HashHsieh(key, key_len);
280 	break;
281 
282     case HASH_FUNC_TCL:
283 	pb = pc = HashTcl(key, key_len);
284 	break;
285 
286     case HASH_FUNC_JENKINS:
287 	pb = pc = HashJenkins(key, key_len);
288 	break;
289 
290     case HASH_FUNC_JENKINS3:
291 	HashJenkins3(key, key_len, &pc, &pb);
292 	break;
293     }
294 
295     return pc + (((uint64_t)pb)<<32);
296 }
297 
298 /* =========================================================================
299  * Hash Table handling code
300  * =========================================================================
301  */
302 
303 /* Multiplicative factors indicating when to grow or shrink the hash table */
304 #define HASH_TABLE_RESIZE 3
305 
306 /*
307  * Creates a HashItem for use with HashTable h.
308  *
309  * Returns:
310  *    A pointer to new HashItem on success
311  *    NULL on failure.
312  */
HashItemCreate(HashTable * h)313 static HashItem *HashItemCreate(HashTable *h) {
314     HashItem *hi;
315 
316     hi = (h->options & HASH_POOL_ITEMS
317 	  ? pool_alloc(h->hi_pool) : malloc(sizeof(*hi)));
318     if (NULL == hi) return NULL;
319 
320     hi->data.p    = NULL;
321     hi->data.i    = 0;
322     hi->next      = NULL;
323     hi->key       = NULL;
324     hi->key_len   = 0;
325 
326     h->nused++;
327 
328     return hi;
329 }
330 
331 /*
332  * Deallocates a HashItem created via HashItemCreate.
333  *
334  * This function will not remove the item from the HashTable so be sure to
335  * call HashTableDel() first if appropriate.
336  */
HashItemDestroy(HashTable * h,HashItem * hi,int deallocate_data)337 static void HashItemDestroy(HashTable *h, HashItem *hi, int deallocate_data) {
338     if (!hi) return;
339 
340     if (!(h->options & HASH_NONVOLATILE_KEYS) || (h->options & HASH_OWN_KEYS))
341 	if (hi->key)
342 	    free(hi->key);
343 
344     if (deallocate_data && hi->data.p)
345 	free(hi->data.p);
346 
347     if (h->options & HASH_POOL_ITEMS) {
348         pool_free(h->hi_pool, hi);
349     } else {
350 	free(hi);
351     }
352 
353     h->nused--;
354 }
355 
356 /*
357  * Creates a new HashTable object. Size will be rounded up to the next
358  * power of 2. It is a starting point and hash tables may be grown or shrunk
359  * as needed (if HASH_DYNAMIC_SIZE is used).
360  *
361  * If HASH_POOL_ITEMS is used, HashItems will be allocated in blocks to reduce
362  * malloc overhead in the case where a large number of items is required.
363  * HashItems allocated this way will be put on a free list when destroyed; the
364  * memory will only be reclaimed when the entire hash table is destroyed.
365  *
366  * Options are as defined in the header file (see HASH_* macros).
367  *
368  * Returns:
369  *    A pointer to a HashTable on success
370  *    NULL on failure
371  */
HashTableCreate(int size,int options)372 HashTable *HashTableCreate(int size, int options) {
373     HashTable *h;
374     int i, bits;
375     uint32_t mask;
376 
377     if (!(h = (HashTable *)malloc(sizeof(*h))))
378 	return NULL;
379 
380     if (options & HASH_POOL_ITEMS) {
381         h->hi_pool = pool_create(sizeof(HashItem));
382 	if (NULL == h->hi_pool) {
383 	    free(h);
384 	    return NULL;
385 	}
386     } else {
387         h->hi_pool = NULL;
388     }
389 
390     if (size < 4)
391 	size = 4; /* an inconsequential minimum size */
392 
393     /* Round the requested size to the next power of 2 */
394     bits = 0;
395     size--;
396     while (size) {
397 	size /= 2;
398 	bits++;
399     }
400     size = 1<<bits;
401     mask = size-1;
402 
403     h->nbuckets = size;
404     h->mask = mask;
405     h->options = options;
406     h->nused = 0;
407     h->bucket = (HashItem **)malloc(sizeof(*h->bucket) * size);
408     if (NULL == h->bucket) {
409         HashTableDestroy(h, 0);
410         return NULL;
411     }
412 
413     for (i = 0; i < size; i++) {
414 	h->bucket[i] = NULL;
415     }
416 
417     return h;
418 }
419 
420 /*
421  * Deallocates a HashTable object (created by HashTableCreate).
422  *
423  * The deallocate_data parameter is a boolean to indicate whether the
424  * data attached to the hash table should also be free()d. DO NOT USE
425  * this if the HashData attached was not a pointer allocated using
426  * malloc().
427  */
HashTableDestroy(HashTable * h,int deallocate_data)428 void HashTableDestroy(HashTable *h, int deallocate_data) {
429     int i;
430 
431     if (!h)
432 	return;
433 
434     if (h->bucket) {
435         for (i = 0; i < h->nbuckets; i++) {
436 	    HashItem *hi = h->bucket[i], *next = NULL;
437 	    for (hi = h->bucket[i]; hi; hi = next) {
438 	        next = hi->next;
439 		HashItemDestroy(h, hi, deallocate_data);
440 	    }
441 	}
442 
443 	free(h->bucket);
444     }
445 
446     if (h->hi_pool) pool_destroy(h->hi_pool);
447 
448     free(h);
449 }
450 
451 /*
452  * Resizes a HashTable to have 'newsize' buckets.
453  * This is called automatically when adding or removing items so that the
454  * hash table keeps at a sensible scale.
455  *
456  * FIXME: Halving the size of the hash table is simply a matter of coaelescing
457  * every other bucket. Instead we currently rehash (which is slower).
458  * Doubling the size of the hash table currently requires rehashing, but this
459  * too could be optimised by storing the full 32-bit hash of the key along
460  * with the key itself. This then means that it's just a matter of seeing what
461  * the next significant bit is. It's a memory vs speed tradeoff though and
462  * re-hashing is pretty quick.
463  *
464  * Returns 0 for success
465  *        -1 for failure
466  */
HashTableResize(HashTable * h,int newsize)467 int HashTableResize(HashTable *h, int newsize) {
468     HashTable *h2;
469     int i;
470 
471     /* fprintf(stderr, "Resizing to %d\n", newsize); */
472 
473     /* Create a new hash table and rehash everything into it */
474     h2 = HashTableCreate(newsize, h->options);
475 
476     for (i = 0; i < h->nbuckets; i++) {
477 	HashItem *hi, *next;
478 	for (hi = h->bucket[i]; hi; hi = next) {
479 	    uint64_t hv = h2->options & HASH_INT_KEYS
480 		? hash64(h2->options & HASH_FUNC_MASK,
481 			 (uint8_t *)&hi->key, hi->key_len) & h2->mask
482 		: hash64(h2->options & HASH_FUNC_MASK,
483 			 (uint8_t *)hi->key, hi->key_len) & h2->mask;
484 	    next = hi->next;
485 	    hi->next = h2->bucket[hv];
486 	    h2->bucket[hv] = hi;
487 	}
488     }
489 
490     /* Swap the links over & free */
491     free(h->bucket);
492     h->bucket   = h2->bucket;
493     h->nbuckets = h2->nbuckets;
494     h->mask     = h2->mask;
495 
496     if (h2->hi_pool)
497 	pool_destroy(h2->hi_pool);
498     free(h2);
499 
500     return 0;
501 }
502 
503 /*
504  * Adds a HashData item to HashTable h with a specific key. Key can be binary
505  * data, but if key_len is passed as zero then strlen() will be used to
506  * determine the key length.
507  *
508  * The "new" pointer may be passed as NULL. When not NULL it is filled out
509  * as a boolean to indicate whether the key is already in this hash table.
510  *
511  * The HASH_ALLOW_DUP_KEYS option (specified when using HashTableCreate)
512  * will allow duplicate keys to be stored, and hence *new is also zero.
513  * By default duplicate keys are disallowed.
514  *
515  * Keys are considered to be volatile memory (ie temporary storage) and so the
516  * hash table takes separate copies of them. To avoid this use the
517  * HASH_NONVOLATILE_KEYS option.
518  *
519  * If the HASH_OWN_KEYS option was specified when creating the table then
520  * keys will be considered to be owned by the hash table. In this case
521  * the key will be freed when the table is destroyed regardless of
522  * whether the HASH_NONVOLATILE_KEYS option was used to allocate its
523  * own private copy.
524  *
525  * Returns:
526  *    The HashItem created (or matching if a duplicate) on success
527  *    NULL on failure
528  */
HashTableAdd(HashTable * h,char * key,int key_len,HashData data,int * new)529 HashItem *HashTableAdd(HashTable *h, char *key, int key_len, HashData data,
530 		       int *new) {
531     uint64_t hv;
532     HashItem *hi;
533 
534     if (!key_len)
535 	key_len = strlen(key);
536 
537     hv = h->options & HASH_INT_KEYS
538 	? hash64(h->options & HASH_FUNC_MASK, (uint8_t *)&key, key_len) & h->mask
539 	: hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) & h->mask;
540 
541     /* Already exists? */
542     if (!(h->options & HASH_ALLOW_DUP_KEYS)) {
543 	for (hi = h->bucket[hv]; hi; hi = hi->next) {
544 	    if (h->options & HASH_INT_KEYS) {
545 		if ((int)(size_t)hi->key == (int)(size_t)key) {
546 		    if (new) *new = 0;
547 		    return hi;
548 		}
549 	    } else {
550 		if (key_len == hi->key_len && key[0] == hi->key[0] &&
551 		    memcmp(key, hi->key, key_len) == 0) {
552 		    if (new) *new = 0;
553 		    return hi;
554 		}
555 	    }
556 	}
557     }
558 
559     /* No, so create a new one and link it in */
560     if (NULL == (hi = HashItemCreate(h)))
561 	return NULL;
562 
563     if (h->options & HASH_NONVOLATILE_KEYS)
564 	hi->key = key;
565     else {
566 	hi->key = (char *)malloc(key_len+1);
567 	memcpy(hi->key, key, key_len);
568 	hi->key[key_len] = 0; /* null terminate incase others print keys */
569     }
570     hi->key_len = key_len;
571     hi->data = data;
572     hi->next = h->bucket[hv];
573     h->bucket[hv] = hi;
574 
575     if ((h->options & HASH_DYNAMIC_SIZE) &&
576 	h->nused > HASH_TABLE_RESIZE * h->nbuckets)
577 	HashTableResize(h, h->nbuckets*4);
578 
579     if (new) *new = 1;
580 
581     return hi;
582 }
583 
584 
585 /*
586  * Removes a specified HashItem from the HashTable. (To perform this it needs
587  * to rehash based on the hash key as hash_item only has a next pointer and
588  * not a previous pointer.)
589  *
590  * The HashItem itself is also destroyed (by an internal call to
591  * HashItemDestroy). The deallocate_data parameter controls whether the data
592  * associated with the HashItem should also be free()d.
593  *
594  * See also the HashTableRemove() function to remove by key instead of
595  * HashItem.
596  *
597  * Returns 0 on success
598  *        -1 on failure (eg HashItem not in the HashTable);
599  */
HashTableDel(HashTable * h,HashItem * hi,int deallocate_data)600 int HashTableDel(HashTable *h, HashItem *hi, int deallocate_data) {
601     uint64_t hv;
602     HashItem *next, *last;
603 
604     hv = h->options & HASH_INT_KEYS
605 	? hash64(h->options & HASH_FUNC_MASK,
606 		 (uint8_t *)&hi->key, hi->key_len) & h->mask
607 	: hash64(h->options & HASH_FUNC_MASK,
608 		 (uint8_t *)hi->key, hi->key_len) & h->mask;
609 
610     for (last = NULL, next = h->bucket[hv]; next;
611 	 last = next, next = next->next) {
612 	if (next == hi) {
613 	    /* Link last to next->next */
614 	    if (last)
615 		last->next = next->next;
616 	    else
617 		h->bucket[hv] = next->next;
618 
619 	    HashItemDestroy(h, hi, deallocate_data);
620 
621 	    return 0;
622 	}
623     }
624 
625     return -1;
626 }
627 
628 
629 /*
630  * Searches the HashTable for the data registered with 'key' and removes
631  * these items from the HashTable. In essence this is a combination of
632  * HashTableSearch and HashTableDel functions.
633  *
634  * If HASH_ALLOW_DUP_KEYS is used this will remove all items matching 'key',
635  * otherwise just a single item will be removed.
636  *
637  * If 'deallocate_data' is true the data associated with the HashItem will
638  * be free()d.
639  *
640  * Returns
641  *    0 on success (at least one item found)
642  *   -1 on failure (no items found).
643  */
HashTableRemove(HashTable * h,char * key,int key_len,int deallocate_data)644 int HashTableRemove(HashTable *h, char *key, int key_len,
645 		    int deallocate_data) {
646     uint64_t hv;
647     HashItem *last, *next, *hi;
648     int retval = -1;
649 
650     if (!key_len)
651 	key_len = strlen(key);
652 
653     hv = h->options & HASH_INT_KEYS
654 	? hash64(h->options & HASH_FUNC_MASK, (uint8_t *)&key, key_len) & h->mask
655 	: hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) & h->mask;
656 
657     last = NULL;
658     next = h->bucket[hv];
659 
660     while (next) {
661 	hi = next;
662 	if (((h->options & HASH_INT_KEYS)
663 	     ? ((int)(size_t)key == (int)(size_t)hi->key)
664 	     : (key_len == hi->key_len && memcmp(key, hi->key, key_len) == 0))) {
665 	    /* An item to remove, adjust links and destroy */
666 	    if (last)
667 		last->next = hi->next;
668 	    else
669 		h->bucket[hv] = hi->next;
670 
671 	    next = hi->next;
672 	    HashItemDestroy(h, hi, deallocate_data);
673 
674 	    retval = 0;
675 	    if (!(h->options & HASH_ALLOW_DUP_KEYS))
676 		break;
677 
678 	} else {
679 	    /* We only update last when it's something we haven't destroyed */
680 	    last = hi;
681 	    next = hi->next;
682 	}
683     }
684 
685     return retval;
686 }
687 
688 /*
689  * Searches the HashTable for the data registered with 'key'.
690  * If HASH_ALLOW_DUP_KEYS is used this will just be the first one found.
691  * You will then need to use HashTableNext to iterate through the matches.
692  *
693  * Returns
694  *    HashItem if found
695  *    NULL if not found
696  */
HashTableSearch(HashTable * h,char * key,int key_len)697 HashItem *HashTableSearch(HashTable *h, char *key, int key_len) {
698     uint64_t hv;
699     HashItem *hi;
700 
701     if (!key_len)
702 	key_len = strlen(key);
703 
704     if (h->options & HASH_INT_KEYS) {
705 	hv = hash64(h->options & HASH_FUNC_MASK, (uint8_t *)&key, key_len)& h->mask;
706 
707 	for (hi = h->bucket[hv]; hi; hi = hi->next) {
708 	    if ((int)(size_t)key == (int)(size_t)hi->key)
709 		return hi;
710 	}
711     } else {
712 	hv = hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) & h->mask;
713 
714 	for (hi = h->bucket[hv]; hi; hi = hi->next) {
715 	    if (key_len == hi->key_len &&
716 		memcmp(key, hi->key, key_len) == 0)
717 		return hi;
718 	}
719     }
720 
721     return NULL;
722 }
723 
724 /*
725  * Find the next HashItem (starting from 'hi') to also match this key.
726  * This is only valid when the HASH_ALLOW_DUP_KEYS is in use and
727  * we're not using HASH_INT_KEYS.
728  *
729  * Returns
730  *    HashItem if found
731  *    NULL if not found
732  */
HashTableNext(HashItem * hi,char * key,int key_len)733 HashItem *HashTableNext(HashItem *hi, char *key, int key_len) {
734     if (!hi)
735 	return NULL;
736 
737     for (hi = hi->next; hi; hi = hi->next) {
738 	if (key_len == hi->key_len &&
739 	    memcmp(key, hi->key, key_len) == 0)
740 	    return hi;
741     }
742 
743     return NULL;
744 }
745 
HashTableNextInt(HashItem * hi,char * key,int key_len)746 HashItem *HashTableNextInt(HashItem *hi, char *key, int key_len) {
747     if (!hi)
748 	return NULL;
749 
750     for (hi = hi->next; hi; hi = hi->next) {
751 	if (key_len == hi->key_len &&
752 	    memcmp(&key, &hi->key, key_len) == 0)
753 	    return hi;
754     }
755 
756     return NULL;
757 }
758 
759 /*
760  * Dumps a textual represenation of the hash table to stdout.
761  */
HashTableDump(HashTable * h,FILE * fp,char * prefix)762 void HashTableDump(HashTable *h, FILE *fp, char *prefix) {
763     int i;
764     for (i = 0; i < h->nbuckets; i++) {
765 	HashItem *hi;
766 	for (hi = h->bucket[i]; hi; hi = hi->next) {
767 	    if (h->options & HASH_INT_KEYS) {
768 		fprintf(fp, "%s%d => %"PRId64" (0x%"PRIx64")\n",
769 			prefix ? prefix : "",
770 			(int)(size_t)hi->key,
771 			hi->data.i, hi->data.i);
772 	    } else {
773 		fprintf(fp, "%s%.*s => %"PRId64" (0x%"PRIx64")\n",
774 			prefix ? prefix : "",
775 			hi->key_len, hi->key,
776 			hi->data.i, hi->data.i);
777 	    }
778 	}
779     }
780 }
781 
782 /*
783  * Produces some simple statistics on the hash table population.
784  */
HashTableStats(HashTable * h,FILE * fp)785 void HashTableStats(HashTable *h, FILE *fp) {
786     int i;
787     double avg = (double)h->nused / h->nbuckets;
788     double var = 0;
789     int maxlen = 0;
790     int filled = 0;
791     int clen[51];
792 
793     for (i = 0; i <= 50; i++)
794 	clen[i] = 0;
795 
796     for (i = 0; i < h->nbuckets; i++) {
797 	int len = 0;
798 	HashItem *hi;
799 	for (hi = h->bucket[i]; hi; hi = hi->next) {
800 	    len++;
801 	}
802 	if (len > 0) {
803 	    filled++;
804 	    if (len > maxlen)
805 		maxlen = len;
806 	}
807 	clen[len <= 50 ? len : 50]++;
808 	var += (len-avg) * (len-avg);
809     }
810     var /= h->nbuckets;
811     /* sd = sqrt(var); */
812 
813     fprintf(fp, "Nbuckets  = %d\n", h->nbuckets);
814     fprintf(fp, "Nused     = %d\n", h->nused);
815     fprintf(fp, "Avg chain = %f\n", avg);
816     fprintf(fp, "Chain var.= %f\n", var);
817     fprintf(fp, "%%age full = %f\n", (100.0*filled)/h->nbuckets);
818     fprintf(fp, "max len   = %d\n", maxlen);
819     for (i = 0; i <= maxlen; i++) {
820 	fprintf(fp, "Chain %2d   = %d\n", i, clen[i]);
821     }
822 }
823 
824 /*
825  * --------------------------------------------------------------------
826  * Below we have a specialisation of the HashTable code where the data
827  * attached to the hash table is a position,size pair. This allows for the
828  * hash table to encode positions and sizes of items within a file archive.
829  * --------------------------------------------------------------------
830  */
831 
832 /*
833  * Writes the HashTable structures to 'fp'.
834  * This is a specialisation of the HashTable where the HashData is a
835  * position,size tuple.
836  *
837  * This consists of the following format:
838  * Header:
839  *    ".hsh" (magic numebr)
840  *    x4     (1-bytes of version code, eg "1.00")
841  *    x1     (HASH_FUNC_? function used)
842  *    x1     (number of file headers: FH. These count from 1 to FH inclusive)
843  *    x1     (number of file footers: FF. These count from 1 to FF inclusive)
844  *    x1     (number of archives indexed: NA)
845  *    x4     (4-bytes big-endian; number of hash buckets)
846  *    x8     (offset to add to item positions. eg size of this index)
847  *    x4     (4-bytes big-endian; number of bytes in hash file, inc. header)
848  * Archive name: (NH copies of, or just 1 zero byte if none)
849  *    x1     (length, zero => no name, eg when same file as hash index)
850  *    ?      (archive filename)
851  * File headers (FH copies of):
852  *    x1     (archive no.)
853  *    x7     (position)
854  *    x4     (size)
855  * File footers (FH copies of):
856  *    x1     (archive no.)
857  *    x7     (position)
858  *    x4     (size)
859  * Buckets (multiples of)
860  *    x4     (4-byte offset of linked list pos,  rel. to the start of the hdr)
861  * Items (per bucket chain, not written if Bucket[?]==0)
862  *    x1     (key length, zero => end of chain)
863  *    ?      (key)
864  *    x0.5   (File header to use. zero => none) top 4 bits
865  *    x0.5   (File footer to use. zero => none) bottom 4 bits
866  *    x8     (position)
867  *    x4     (size)
868  * ... arbitrary gap (but likely none)
869  * Index footer:
870  *    ".hsh" (magic number)
871  *    x8     (offset to Hash Header. +ve = absolute, -ve = relative to end)
872  *
873  * It is designed such that on-disk querying of the hash table can be done
874  * purely by forward seeks. (This is generally faster due to pre-fetching of
875  * the subsequent blocks by many disk controllers.)
876  *
877  * Returns: the number of bytes written on success
878  *         -1 for error
879  */
HashFileSave(HashFile * hf,FILE * fp,int64_t offset)880 uint64_t HashFileSave(HashFile *hf, FILE *fp, int64_t offset) {
881     int i;
882     HashItem *hi;
883     uint32_t *bucket_pos;
884     uint64_t hfsize = 0, be_hfsize;
885     HashTable *h = hf->h;
886     HashFileFooter foot;
887 
888     /* Compute the coordinates of the hash items */
889     hfsize = HHSIZE;				 /* header */
890     hfsize += h->nbuckets * 4; 			 /* buckets */
891     for (i = 0; i < hf->nheaders; i++)		 /* headers */
892 	hfsize += 12;
893     for (i = 0; i < hf->nfooters; i++)		 /* footers */
894 	hfsize += 12;
895     if (hf->narchives) {
896 	for (i = 0; i < hf->narchives; i++)
897 	    hfsize += strlen(hf->archives[i])+1; /* archive filename */
898     } else {
899 	hfsize++;
900     }
901     bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t));
902     for (i = 0; i < h->nbuckets; i++) {
903 	bucket_pos[i] = hfsize;
904 
905 	if (!(hi = h->bucket[i]))
906 	    continue;
907 	for (; hi; hi = hi->next) {
908 	    hfsize += 1 + 1 + hi->key_len + 8 + 4; /* keys, pos, size */
909 	}
910 	hfsize++;				/* list-end marker */
911     }
912     hfsize += sizeof(foot);
913 
914     /* Write the header: */
915     memcpy(hf->hh.magic, HASHFILE_MAGIC, 4);
916     if (hf->narchives > 1)
917 	memcpy(hf->hh.vers,  HASHFILE_VERSION, 4);
918     else
919 	memcpy(hf->hh.vers,  HASHFILE_VERSION100, 4);
920     hf->hh.hfunc     = h->options & HASH_FUNC_MASK;
921     hf->hh.nheaders  = hf->nheaders;
922     hf->hh.nfooters  = hf->nfooters;
923     hf->hh.narchives = hf->narchives == 1 ? 0 : hf->narchives;
924     hf->hh.nbuckets  = be_int4(h->nbuckets);
925     hf->hh.offset    = offset == HASHFILE_PREPEND
926 	? be_int8(hfsize) /* archive will be appended to this file */
927 	: be_int8(offset);
928     hf->hh.size     = be_int4(hfsize);
929     fwrite(&hf->hh, HHSIZE, 1, fp);
930 
931     /* Write the archive filename, if known */
932     if (hf->narchives) {
933 	for (i = 0; i < hf->narchives; i++) {
934 	    fputc(strlen(hf->archives[i]), fp);
935 	    fputs(hf->archives[i], fp);
936 	}
937     } else {
938 	/* Compatibility with v1.00 file format */
939 	fputc(0, fp);
940     }
941 
942     /* Write out the headers and footers */
943     for (i = 0; i < hf->nheaders; i++) {
944 	HashFileSection hs;
945 	hs.pos  = be_int8(hf->headers[i].pos);
946 	*(char *)&hs.pos = hf->headers[i].archive_no;
947 	fwrite(&hs.pos, 8, 1, fp);
948 	hs.size = be_int4(hf->headers[i].size);
949 	fwrite(&hs.size, 4, 1, fp);
950     }
951 
952     for (i = 0; i < hf->nfooters; i++) {
953 	HashFileSection hs;
954 	hs.pos  = be_int8(hf->footers[i].pos);
955 	*(char *)&hs.pos = hf->footers[i].archive_no;
956 	fwrite(&hs.pos, 8, 1, fp);
957 	hs.size = be_int4(hf->footers[i].size);
958 	fwrite(&hs.size, 4, 1, fp);
959     }
960 
961     /* Write out hash buckets */
962     for (i = 0; i < h->nbuckets; i++) {
963 	uint32_t zero = 0;
964 	uint32_t be32;
965 
966 	if (!(hi = h->bucket[i])) {
967 	    fwrite(&zero, 4, 1, fp);
968 	    continue;
969 	}
970 
971 	be32 = be_int4(bucket_pos[i]);
972 	fwrite(&be32, 4, 1, fp);
973     }
974     free(bucket_pos);
975 
976     /*
977      * Write the hash_item linked lists. The first item is the
978      * hash key length. We append a zero to the end of the list so we
979      * can check this key length to determine the end of this hash
980      * item list.
981      */
982     for (i = 0; i < h->nbuckets; i++) {
983 	if (!(hi = h->bucket[i]))
984 	    continue;
985 	for (; hi; hi = hi->next) {
986 	    uint64_t be64;
987 	    uint32_t be32;
988 	    HashFileItem *hfi = (HashFileItem *)hi->data.p;
989 	    unsigned char headfoot = 0;
990 
991 	    fprintf(fp, "%c%.*s", hi->key_len,
992 		    hi->key_len, hi->key);
993 	    headfoot = (((hfi->header) & 0xf) << 4) | ((hfi->footer) & 0xf);
994 	    fwrite(&headfoot, 1, 1, fp);
995 	    be64 = be_int8(hfi->pos);
996 	    *(char *)&be64 = hfi->archive;
997 	    fwrite(&be64, 8, 1, fp);
998 	    be32 = be_int4(hfi->size);
999 	    fwrite(&be32, 4, 1, fp);
1000 	}
1001         fputc(0, fp);
1002     }
1003 
1004     /* Finally write the footer referencing back to the header start */
1005     memcpy(foot.magic, HASHFILE_MAGIC, 4);
1006     be_hfsize = be_int8(-hfsize);
1007     memcpy(foot.offset, &be_hfsize, 8);
1008     fwrite(&foot, sizeof(foot), 1, fp);
1009 
1010     return hfsize;
1011 }
1012 
1013 #if 0
1014 /*
1015  * Reads an entire HashTable from fp.
1016  *
1017  * Returns:
1018  *    A filled out HashTable pointer on success
1019  *    NULL on failure
1020  */
1021 HashFile *HashFileLoad_old(FILE *fp) {
1022     int i;
1023     HashTable *h;
1024     HashItem *hi;
1025     HashFile *hf;
1026     uint32_t *bucket_pos;
1027     unsigned char *htable;
1028     int htable_pos;
1029     int fnamelen;
1030 
1031     if (NULL == (hf = (HashFile *)calloc(1, sizeof(HashFile))))
1032 	return NULL;
1033     if (NULL == (htable = (unsigned char *)malloc(HHSIZE)))
1034 	return NULL;
1035 
1036     /* Read and create the hash table header */
1037     if (HHSIZE != fread(htable, 1, HHSIZE, fp))
1038 	return NULL;
1039     memcpy(&hf->hh, htable, HHSIZE);
1040     hf->hh.nbuckets = be_int4(hf->hh.nbuckets);
1041     hf->hh.offset = be_int8(hf->hh.offset);
1042     hf->hh.size = be_int4(hf->hh.size);
1043     hf->h = h = HashTableCreate(hf->hh.nbuckets, hf->hh.hfunc);
1044     bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t));
1045 
1046     /* Load the archive filename */
1047     if (hf->narchives) {
1048 	hf->archives = (char **)malloc(hf->narchives * sizeof(char *));
1049     } else {
1050 	hf->archives = NULL;
1051     }
1052 
1053     if (hf->narchives) {
1054 	for (i = 0; i < hf->narchives; i++) {
1055 	    fnamelen = fgetc(fp);
1056 	    hf->archives[i] = malloc(fnamelen+1);
1057 	    fread(hf->archives[i], 1, fnamelen, fp);
1058 	    hf->archives[i][fnamelen] = 0;
1059 	}
1060     } else {
1061 	/* Consume 0 byte for v1.00 format */
1062 	fgetc(fp);
1063     }
1064 
1065     /* Load the rest of the hash table to memory */
1066     htable_pos = HHSIZE + fnamelen + 1;
1067     if (NULL == (htable = (unsigned char *)realloc(htable, hf->hh.size)))
1068 	return NULL;
1069     if (hf->hh.size-htable_pos !=
1070 	fread(&htable[htable_pos], 1, hf->hh.size-htable_pos, fp))
1071 	return NULL;
1072 
1073     /* Read the header / footer items */
1074     for (i = 0; i < hf->hh.nheaders; i++)
1075 	htable_pos += 8; /* skip them for now */
1076     for (i = 0; i < hf->hh.nfooters; i++)
1077 	htable_pos += 8; /* skip them for now */
1078 
1079     /* Identify the "bucket pos". Detemines which buckets have data */
1080     for (i = 0; i < h->nbuckets; i++) {
1081 	memcpy(&bucket_pos[i], &htable[htable_pos], 4);
1082 	bucket_pos[i] = be_int4(bucket_pos[i]);
1083 	htable_pos += 4;
1084     }
1085 
1086     /* Read the hash table items */
1087     for (i = 0; i < h->nbuckets; i++) {
1088 	if (!bucket_pos[i])
1089 	    continue;
1090 	for (;;) {
1091 	    int c;
1092 	    unsigned char uc;
1093 	    char key[256];
1094 	    uint64_t pos;
1095 	    uint32_t size;
1096 	    HashFileItem *hfi;
1097 
1098 	    c = htable[htable_pos++];
1099 	    if (c == EOF || !c)
1100 		break;
1101 
1102 	    /* key */
1103 	    memcpy(key, &htable[htable_pos], c);
1104 	    htable_pos += c;
1105 
1106 	    /* header/footer */
1107 	    uc = htable[htable_pos++];
1108 	    hfi = (HashFileItem *)malloc(sizeof(*hfi));
1109 	    hfi->header = (uc >> 4) & 0xf;
1110 	    hfi->footer = uc & 0xf;
1111 
1112 	    /* pos */
1113 	    memcpy(&pos, &htable[htable_pos], 8);
1114 	    htable_pos += 8;
1115 	    hfi->pos = be_int8(pos) + hf->hh.offset;
1116 
1117 	    /* size */
1118 	    memcpy(&size, &htable[htable_pos], 4);
1119 	    htable_pos += 4;
1120 	    hfi->size = be_int4(size);
1121 
1122 	    /* Add to the hash table */
1123 	    hi = HashItemCreate(h);
1124 	    hi->next = h->bucket[i];
1125 	    h->bucket[i] = hi;
1126 	    hi->key_len = c;
1127 	    hi->key = (char *)malloc(c+1);
1128 	    memcpy(hi->key, key, c);
1129 	    hi->key[c] = 0; /* For debugging convenience only */
1130 	    hi->data.p = hfi;
1131 	}
1132     }
1133 
1134     fprintf(stderr, "done\n");
1135     fflush(stderr);
1136     free(bucket_pos);
1137 
1138     return hf;
1139 }
1140 #endif
1141 
1142 /*
1143  * Opens a stored hash table file. It also internally keeps an open file to
1144  * hash and the archive files.
1145  *
1146  * Returns the HashFile pointer on success
1147  *         NULL on failure
1148  */
HashFileFopen(FILE * fp)1149 HashFile *HashFileFopen(FILE *fp) {
1150     HashFile *hf = HashFileCreate(0, 0);
1151     int archive_len;
1152     int i, fnamelen;
1153 
1154     /* Set the stdio buffer to be small to avoid massive I/O wastage */
1155 
1156     /* Read the header */
1157     hf->hfp = fp;
1158     hf->hf_start = ftello(hf->hfp);
1159 
1160     if (HHSIZE != fread(&hf->hh, 1, HHSIZE, hf->hfp)) {
1161 	HashFileDestroy(hf);
1162 	return NULL;
1163     }
1164     if (memcmp(HASHFILE_MAGIC, &hf->hh, 4) != 0) {
1165 	HashFileFooter foot;
1166 	int64_t offset;
1167 
1168 	/* Invalid magic number, try other end of file! */
1169 	fseeko(hf->hfp, -(off_t)sizeof(HashFileFooter), SEEK_END);
1170 	if (sizeof(foot) != fread(&foot, 1, sizeof(foot), hf->hfp)) {
1171 	    HashFileDestroy(hf);
1172 	    return NULL;
1173 	}
1174 	if (memcmp(HASHFILE_MAGIC, &foot.magic, 4) != 0) {
1175 	    HashFileDestroy(hf);
1176 	    return NULL;
1177 	}
1178 	memcpy(&offset, foot.offset, 8);
1179 	offset = be_int8(offset);
1180 	fseeko(hf->hfp, offset, SEEK_CUR);
1181 	hf->hf_start = ftello(hf->hfp);
1182 	if (HHSIZE != fread(&hf->hh, 1, HHSIZE, hf->hfp)) {
1183 	    HashFileDestroy(hf);
1184 	    return NULL;
1185 	}
1186     }
1187     if (memcmp(hf->hh.vers, HASHFILE_VERSION,    4) != 0 &&
1188 	memcmp(hf->hh.vers, HASHFILE_VERSION100, 4) != 0) {
1189 	/* incorrect version */
1190 	HashFileDestroy(hf);
1191 	return NULL;
1192     }
1193 
1194     hf->hh.nbuckets = be_int4(hf->hh.nbuckets);
1195     hf->hh.offset   = be_int8(hf->hh.offset);
1196     hf->hh.size     = be_int4(hf->hh.size);
1197 
1198     /* Load the archive filename(s) */
1199     hf->narchives = hf->hh.narchives;
1200 
1201     /* Old archives had narchives fixed as zero, so check again */
1202     if (!hf->narchives) {
1203 	int n = fgetc(fp);
1204 	if (!n) {
1205 	    archive_len = 1;
1206 	} else {
1207 	    ungetc(n, fp);
1208 	    hf->narchives = 1;
1209 	}
1210     }
1211 
1212     if (hf->narchives) {
1213 	hf->archives = (char **)malloc(hf->narchives * sizeof(char *));
1214 	hf->afp = calloc(hf->narchives, sizeof(FILE *));
1215     } else {
1216 	hf->archives = NULL;
1217 	hf->afp = &hf->hfp;
1218     }
1219 
1220     if (hf->narchives) {
1221 	archive_len = 0;
1222 	for (i = 0; i < hf->narchives; i++) {
1223 	    fnamelen = fgetc(fp);
1224 	    hf->archives[i] = malloc(fnamelen+1);
1225 	    if (fnamelen != fread(hf->archives[i], 1, fnamelen, fp))
1226 		return NULL;
1227 	    hf->archives[i][fnamelen] = 0;
1228 	    archive_len += fnamelen+1;
1229 	}
1230     }
1231 
1232     hf->header_size = HHSIZE + archive_len +
1233 	12 * (hf->hh.nheaders + hf->hh.nfooters);
1234     hf->nheaders = hf->hh.nheaders;
1235     hf->nfooters = hf->hh.nfooters;
1236 
1237     /* Load the header and footer locations */
1238     hf->headers = hf->nheaders
1239 	? (HashFileSection *)malloc(hf->nheaders * sizeof(HashFileSection))
1240 	: NULL;
1241     for (i = 0; i < hf->nheaders; i++) {
1242 	if (1 != fread(&hf->headers[i].pos,  8, 1, hf->hfp))
1243 	    return NULL;
1244 	if (1 != fread(&hf->headers[i].size, 4, 1, hf->hfp))
1245 	    return NULL;
1246 	hf->headers[i].archive_no = *(char *)&hf->headers[i].pos;
1247 	*(char *)&hf->headers[i].pos = 0;
1248 	hf->headers[i].pos  = be_int8(hf->headers[i].pos) + hf->hh.offset;
1249 	hf->headers[i].size = be_int4(hf->headers[i].size);
1250 	hf->headers[i].cached_data = NULL;
1251     }
1252 
1253     hf->footers = hf->nfooters
1254 	? (HashFileSection *)malloc(hf->nfooters * sizeof(HashFileSection))
1255 	: NULL;
1256     for (i = 0; i < hf->nfooters; i++) {
1257 	if (1 != fread(&hf->footers[i].pos,  8, 1, hf->hfp))
1258 	    return NULL;
1259 	if (1 != fread(&hf->footers[i].size, 4, 1, hf->hfp))
1260 	    return NULL;
1261 	hf->footers[i].archive_no = *(char *)&hf->footers[i].pos;
1262 	*(char *)&hf->footers[i].pos = 0;
1263 	hf->footers[i].pos  = be_int8(hf->footers[i].pos) + hf->hh.offset;
1264 	hf->footers[i].size = be_int4(hf->footers[i].size);
1265 	hf->footers[i].cached_data = NULL;
1266     }
1267 
1268     return hf;
1269 }
1270 
HashFileOpen(char * fname)1271 HashFile *HashFileOpen(char *fname) {
1272     FILE *fp;
1273     HashFile *hf;
1274 
1275     /* Open the hash and read the header */
1276     if (NULL == (fp = fopen(fname, "rb")))
1277 	return NULL;
1278 
1279     if (!(hf = HashFileFopen(fp)))
1280 	return NULL;
1281 
1282     /* Open the main archive too? Usually deferred */
1283     if (hf->narchives) {
1284 	int i;
1285 
1286 	hf->afp = malloc(hf->narchives * sizeof(FILE *));
1287 	if (hf->afp == NULL)
1288 	    return NULL;
1289 
1290 	/* Delay opening the main archive until required */
1291 	for (i = 0; i < hf->narchives; i++) {
1292 	    hf->afp[i] = NULL;
1293 	}
1294 
1295 #if 0
1296 	    if (NULL == (hf->afp[i] = fopen(hf->archives[i], "rb"))) {
1297 		/* Possibly done via a relative pathname (optimal infact) */
1298 		char *cp;
1299 		char aname[1024];
1300 		if (NULL == (cp = strrchr(fname, '/'))) {
1301 		    HashFileDestroy(hf);
1302 		    return NULL;
1303 		}
1304 		sprintf(aname, "%.*s%s", (int)(cp-fname+1), fname,
1305 			hf->archives[i]);
1306 		if (NULL == (hf->afp[i] = fopen(aname, "rb"))) {
1307 
1308 		    return NULL;
1309 		}
1310 	    }
1311 #endif
1312     } else {
1313 	hf->afp = &hf->hfp;
1314     }
1315 
1316     return hf;
1317 }
1318 
HashFileLoad(FILE * fp)1319 HashFile *HashFileLoad(FILE *fp) {
1320     HashFile *hf;
1321     char *htable;
1322     off_t htable_pos;
1323     int i;
1324     HashItem *hi;
1325     HashTable *h;
1326     uint32_t *bucket_pos;
1327     uint32_t hsize;
1328 
1329     /* Open the hash table */
1330     fseeko(fp, 0, SEEK_SET);
1331     if (NULL == (hf = HashFileFopen(fp)))
1332 	return NULL;
1333 
1334     HashTableDestroy(hf->h, 1);
1335     h = hf->h = HashTableCreate(hf->hh.nbuckets, hf->hh.hfunc);
1336     bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t));
1337 
1338     /* Also load in the entire thing to memory */
1339     htable = (char *)malloc(hf->hh.size);
1340     fseeko(fp, hf->hf_start, SEEK_SET);
1341     hsize = fread(htable, 1, hf->hh.size, fp);
1342     if (hf->hh.size != hsize) {
1343 	free(htable);
1344 	return NULL;
1345     }
1346 
1347     /*
1348      * HashFileOpen has already decoded the headers up to and including the
1349      * individual file header/footer sections, but not the buckets and item
1350      * lists, so we start from there.
1351      */
1352     htable_pos = hf->header_size;
1353 
1354     /* Identify the "bucket pos". Detemines which buckets have data */
1355     for (i = 0; i < h->nbuckets; i++) {
1356 	memcpy(&bucket_pos[i], &htable[htable_pos], 4);
1357 	bucket_pos[i] = be_int4(bucket_pos[i]);
1358 	htable_pos += 4;
1359     }
1360 
1361     /* Read the hash table items */
1362     for (i = 0; i < h->nbuckets; i++) {
1363 	if (!bucket_pos[i])
1364 	    continue;
1365 	for (;;) {
1366 	    int c;
1367 	    unsigned char uc;
1368 	    char key[256];
1369 	    uint64_t pos;
1370 	    uint32_t size;
1371 	    HashFileItem *hfi;
1372 
1373 	    c = htable[htable_pos++];
1374 	    if (c == EOF || !c)
1375 		break;
1376 
1377 	    /* key */
1378 	    memcpy(key, &htable[htable_pos], c);
1379 	    htable_pos += c;
1380 
1381 	    /* header/footer */
1382 	    uc = htable[htable_pos++];
1383 	    hfi = (HashFileItem *)malloc(sizeof(*hfi));
1384 	    hfi->header = (uc >> 4) & 0xf;
1385 	    hfi->footer = uc & 0xf;
1386 
1387 	    /* archive no. + pos */
1388 	    memcpy(&pos, &htable[htable_pos], 8);
1389 	    htable_pos += 8;
1390 	    hfi->archive = *(char *)&pos;
1391 	    *(char *)&pos = 0;
1392 	    hfi->pos = be_int8(pos) + hf->hh.offset;
1393 
1394 	    /* size */
1395 	    memcpy(&size, &htable[htable_pos], 4);
1396 	    htable_pos += 4;
1397 	    hfi->size = be_int4(size);
1398 
1399 	    /* Add to the hash table */
1400 	    hi = HashItemCreate(h);
1401 	    hi->next = h->bucket[i];
1402 	    h->bucket[i] = hi;
1403 	    hi->key_len = c;
1404 	    hi->key = (char *)malloc(c+1);
1405 	    memcpy(hi->key, key, c);
1406 	    hi->key[c] = 0; /* For debugging convenience only */
1407 	    hi->data.p = hfi;
1408 	}
1409     }
1410 
1411     fflush(stderr);
1412     free(bucket_pos);
1413     free(htable);
1414 
1415     return hf;
1416 }
1417 
1418 /*
1419  * Searches the named HashFile for a specific key.
1420  * When found it returns the position and size of the object in pos and size.
1421  *
1422  * Returns
1423  *    0 on success (pos & size updated)
1424  *   -1 on failure
1425  */
HashFileQuery(HashFile * hf,uint8_t * key,int key_len,HashFileItem * item)1426 int HashFileQuery(HashFile *hf, uint8_t *key, int key_len,
1427 		  HashFileItem *item) {
1428     uint64_t hval;
1429     uint32_t pos;
1430     int klen;
1431     int cur_offset = 0;
1432 
1433     /* Hash 'key' to compute the bucket number */
1434     hval = hash64(hf->hh.hfunc, key, key_len) & (hf->hh.nbuckets-1);
1435 
1436     /* Read the bucket to find the first linked list item location */
1437     if (-1 == fseeko(hf->hfp, hf->hf_start + 4*hval + hf->header_size,SEEK_SET))
1438 	return -1;
1439     if (4 != fread(&pos, 1, 4, hf->hfp))
1440 	return -1;
1441     pos = be_int4(pos);
1442     cur_offset = 4*hval + 4 + hf->header_size;
1443 
1444     if (0 == pos)
1445 	/* No bucket pos => key not present */
1446 	return -1;
1447 
1448     /* Jump to the HashItems list and look through for key */
1449     if (-1 == fseeko(hf->hfp, pos - cur_offset, SEEK_CUR))
1450 	return -1;
1451 
1452     for (klen = fgetc(hf->hfp); klen; klen = fgetc(hf->hfp)) {
1453 	char k[256];
1454 	unsigned char headfoot;
1455 	uint64_t pos;
1456 	uint32_t size;
1457 
1458 	if (1 != fread(k, klen, 1, hf->hfp))
1459 	    return -1;
1460 	if (1 != fread(&headfoot, 1, 1, hf->hfp))
1461 	    return -1;
1462 	item->header = (headfoot >> 4) & 0xf;
1463 	item->footer = headfoot & 0xf;
1464 	if (1 != fread(&pos, 8, 1, hf->hfp))
1465 	    return -1;
1466 	item->archive = *(char *)&pos;
1467 	*(char *)&pos = 0;
1468 	pos = be_int8(pos) + hf->hh.offset;
1469 	if (1 != fread(&size, 4, 1, hf->hfp))
1470 	    return -1;
1471 	size = be_int4(size);
1472 	if (klen == key_len && 0 == memcmp(key, k, key_len)) {
1473 	    item->pos = pos;
1474 	    item->size = size;
1475 	    return 0;
1476 	}
1477     }
1478 
1479     return -1;
1480 }
1481 
HashFileCreate(int size,int options)1482 HashFile *HashFileCreate(int size, int options) {
1483     HashFile *hf;
1484 
1485     if (NULL == (hf = (HashFile *)calloc(1, sizeof(*hf))))
1486 	return NULL;
1487     if (NULL == (hf->h = HashTableCreate(size, options)))
1488 	return NULL;
1489 
1490     return hf;
1491 }
1492 
HashFileDestroy(HashFile * hf)1493 void HashFileDestroy(HashFile *hf) {
1494     if (!hf)
1495 	return;
1496 
1497     if (hf->h)
1498 	HashTableDestroy(hf->h, 1);
1499 
1500     if (hf->narchives) {
1501 	int i;
1502 	for (i = 0; i < hf->narchives; i++)
1503 	    if (hf->archives[i])
1504 		free(hf->archives[i]);
1505 	free(hf->archives);
1506     }
1507 
1508     if (hf->headers) {
1509 	int i;
1510 	for (i = 0; i < hf->nheaders; i++) {
1511 	    if (hf->headers[i].cached_data)
1512 		free(hf->headers[i].cached_data);
1513 	}
1514 	free(hf->headers);
1515     }
1516 
1517     if (hf->footers) {
1518 	int i;
1519 	for (i = 0; i < hf->nfooters; i++) {
1520 	    if (hf->footers[i].cached_data)
1521 		free(hf->footers[i].cached_data);
1522 	}
1523 	free(hf->footers);
1524     }
1525 
1526     if (hf->afp) {
1527 	int i;
1528 
1529 	for (i = 0; i < hf->narchives; i++)
1530 	    if (hf->afp[i] && hf->afp[i] != hf->hfp)
1531 		fclose(hf->afp[i]);
1532 
1533 	if (hf->afp != &hf->hfp)
1534 	    free(hf->afp);
1535     }
1536 
1537     if (hf->hfp)
1538 	fclose(hf->hfp);
1539 
1540     free(hf);
1541 }
1542 
1543 
1544 /*
1545  * Opens a specific archive number.
1546  * Returns 0 on success,
1547  *        -1 on failure
1548  */
HashFileOpenArchive(HashFile * hf,int archive_no)1549 static int HashFileOpenArchive(HashFile *hf, int archive_no) {
1550     if (hf->narchives && archive_no > hf->narchives)
1551 	return -1;
1552 
1553     if (hf->afp[archive_no])
1554 	return 0;
1555 
1556     if (NULL == (hf->afp[archive_no] = fopen(hf->archives[archive_no], "rb")))
1557 	return -1;
1558 
1559     return 0;
1560 }
1561 
1562 
1563 /*
1564  * Extracts the contents for a file out of the HashFile.
1565  */
HashFileExtract(HashFile * hf,char * fname,size_t * len)1566 char *HashFileExtract(HashFile *hf, char *fname, size_t *len) {
1567     HashFileItem hfi;
1568     size_t sz, pos;
1569     char *data;
1570     HashFileSection *head = NULL, *foot = NULL;
1571 
1572     /* Find out if and where the item is in the archive */
1573     if (-1 == HashFileQuery(hf, (uint8_t *)fname, strlen(fname), &hfi))
1574 	return NULL;
1575 
1576     /* Work out the size including header/footer and allocate */
1577     sz = hfi.size;
1578     if (hfi.header) {
1579 	head = &hf->headers[hfi.header-1];
1580 	sz += head->size;
1581     }
1582     if (hfi.footer) {
1583 	foot = &hf->footers[hfi.footer-1];
1584 	sz += foot->size;
1585     }
1586     *len = sz;
1587 
1588     if (NULL == (data = (char *)malloc(sz+1)))
1589 	return NULL;
1590     data[sz] = 0;
1591 
1592     /* Header */
1593     pos = 0;
1594     if (head) {
1595 	HashFileOpenArchive(hf, head->archive_no);
1596 	if (!hf->afp[head->archive_no])
1597 	    return NULL;
1598 
1599 	fseeko(hf->afp[head->archive_no], head->pos, SEEK_SET);
1600 	if (1 != fread(&data[pos], head->size, 1, hf->afp[head->archive_no]))
1601 	    return NULL;
1602 	pos += head->size;
1603     }
1604 
1605     /* Main file */
1606     HashFileOpenArchive(hf, hfi.archive);
1607     if (!hf->afp[hfi.archive])
1608 	return NULL;
1609 
1610     fseeko(hf->afp[hfi.archive], hfi.pos, SEEK_SET);
1611     if (1 != fread(&data[pos], hfi.size, 1, hf->afp[hfi.archive]))
1612 	return NULL;
1613     pos += hfi.size;
1614 
1615     /* Footer */
1616     if (foot) {
1617 	HashFileOpenArchive(hf, foot->archive_no);
1618 	if (!hf->afp[foot->archive_no])
1619 	    return NULL;
1620 
1621 	fseeko(hf->afp[foot->archive_no], foot->pos, SEEK_SET);
1622 	if (1 != fread(&data[pos], foot->size, 1, hf->afp[foot->archive_no]))
1623 	    return NULL;
1624 	pos += foot->size;
1625     }
1626 
1627     return data;
1628 }
1629 
1630 /*
1631  * Iterates through members of a hash table returning items sequentially.
1632  *
1633  * Returns the next HashItem on success
1634  *         NULL on failure.
1635  */
HashTableIterNext(HashTable * h,HashIter * iter)1636 HashItem *HashTableIterNext(HashTable *h, HashIter *iter) {
1637     do {
1638 	if (iter->hi == NULL) {
1639 	    if (++iter->bnum >= h->nbuckets)
1640 		break;
1641 	    iter->hi = h->bucket[iter->bnum];
1642 	} else {
1643 	    iter->hi = iter->hi->next;
1644 	}
1645     } while (!iter->hi);
1646 
1647     return iter->hi;
1648 }
1649 
HashTableIterReset(HashIter * iter)1650 void HashTableIterReset(HashIter *iter) {
1651     if (iter) {
1652 	iter->bnum = -1;
1653 	iter->hi = NULL;
1654     }
1655 }
1656 
HashTableIterCreate(void)1657 HashIter *HashTableIterCreate(void) {
1658     HashIter *iter = (HashIter *)malloc(sizeof(*iter));
1659 
1660     HashTableIterReset(iter);
1661     return iter;
1662 }
1663 
HashTableIterDestroy(HashIter * iter)1664 void HashTableIterDestroy(HashIter *iter) {
1665     if (iter)
1666 	free(iter);
1667 }
1668