1 /*
2 * Copyright (c) 2005-2011, 2013 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifdef HAVE_CONFIG_H
35 #include "io_lib_config.h"
36 #endif
37
38 #include <stdio.h>
39 #include <string.h>
40 #include <stdlib.h>
41 #include "io_lib/os.h"
42 #include "io_lib/hash_table.h"
43 #include "io_lib/jenkins_lookup3.h"
44
45 /* =========================================================================
46 * TCL's hash function. Basically hash*9 + char.
47 * =========================================================================
48 */
49
HashTcl(uint8_t * data,int len)50 uint32_t HashTcl(uint8_t *data, int len) {
51 uint32_t hash = 0;
52 int i;
53
54 for (i = 0; i < len; i++) {
55 hash += (hash<<3) + data[i];
56 }
57
58 return hash;
59 }
60
61 /* =========================================================================
62 * Paul Hsieh's hash function
63 * http://www.azillionmonkeys.com/qed/hash.html
64 * =========================================================================
65 */
66
67 #undef get16bits
68 #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
69 || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
70 #define get16bits(d) (*((const uint16_t *) (d)))
71 #endif
72
73 #if !defined (get16bits)
74 #define get16bits(d) ((((const uint8_t *)(d))[1] << 8UL)\
75 +((const uint8_t *)(d))[0])
76 #endif
77
HashHsieh(uint8_t * data,int len)78 uint32_t HashHsieh(uint8_t *data, int len) {
79 uint32_t hash = 0, tmp;
80 int rem;
81
82 if (len <= 0 || data == NULL) return 0;
83
84 rem = len & 3;
85 len >>= 2;
86
87 /* Main loop */
88 for (;len > 0; len--) {
89 hash += get16bits (data);
90 tmp = (get16bits (data+2) << 11) ^ hash;
91 hash = (hash << 16) ^ tmp;
92 data += 2*sizeof (uint16_t);
93 hash += hash >> 11;
94 }
95
96 /* Handle end cases */
97 switch (rem) {
98 case 3: hash += get16bits (data);
99 hash ^= hash << 16;
100 hash ^= data[sizeof (uint16_t)] << 18;
101 hash += hash >> 11;
102 break;
103 case 2: hash += get16bits (data);
104 hash ^= hash << 11;
105 hash += hash >> 17;
106 break;
107 case 1: hash += *data;
108 hash ^= hash << 10;
109 hash += hash >> 1;
110 }
111
112 /* Force "avalanching" of final 127 bits */
113 hash ^= hash << 3;
114 hash += hash >> 5;
115 hash ^= hash << 2;
116 hash += hash >> 15;
117 hash ^= hash << 10;
118
119 return hash;
120 }
121
122 /* =========================================================================
123 * Bob Jenkins' hash function
124 * http://burtleburtle.net/bob/hash/doobs.html
125 *
126 * See jenkins_lookup3.c for a new version of this that has good hash
127 * characteristics for a full 64-bit hash value.
128 * =========================================================================
129 */
130
131 #define hashsize(n) ((uint32_t)1<<(n))
132 #define hashmask(n) (hashsize(n)-1)
133
134 /*
135 --------------------------------------------------------------------
136 mix -- mix 3 32-bit values reversibly.
137 For every delta with one or two bits set, and the deltas of all three
138 high bits or all three low bits, whether the original value of a,b,c
139 is almost all zero or is uniformly distributed,
140 * If mix() is run forward or backward, at least 32 bits in a,b,c
141 have at least 1/4 probability of changing.
142 * If mix() is run forward, every bit of c will change between 1/3 and
143 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
144 mix() was built out of 36 single-cycle latency instructions in a
145 structure that could supported 2x parallelism, like so:
146 a -= b;
147 a -= c; x = (c>>13);
148 b -= c; a ^= x;
149 b -= a; x = (a<<8);
150 c -= a; b ^= x;
151 c -= b; x = (b>>13);
152 ...
153 Unfortunately, superscalar Pentiums and Sparcs can't take advantage
154 of that parallelism. They've also turned some of those single-cycle
155 latency instructions into multi-cycle latency instructions. Still,
156 this is the fastest good hash I could find. There were about 2^^68
157 to choose from. I only looked at a billion or so.
158 --------------------------------------------------------------------
159 */
160 #define mix(a,b,c) \
161 { \
162 a -= b; a -= c; a ^= (c>>13); \
163 b -= c; b -= a; b ^= (a<<8); \
164 c -= a; c -= b; c ^= (b>>13); \
165 a -= b; a -= c; a ^= (c>>12); \
166 b -= c; b -= a; b ^= (a<<16); \
167 c -= a; c -= b; c ^= (b>>5); \
168 a -= b; a -= c; a ^= (c>>3); \
169 b -= c; b -= a; b ^= (a<<10); \
170 c -= a; c -= b; c ^= (b>>15); \
171 }
172
173 /*
174 --------------------------------------------------------------------
175 hash() -- hash a variable-length key into a 32-bit value
176 k : the key (the unaligned variable-length array of bytes)
177 len : the length of the key, counting by bytes
178 initval : can be any 4-byte value
179 Returns a 32-bit value. Every bit of the key affects every bit of
180 the return value. Every 1-bit and 2-bit delta achieves avalanche.
181 About 6*len+35 instructions.
182
183 The best hash table sizes are powers of 2. There is no need to do
184 mod a prime (mod is sooo slow!). If you need less than 32 bits,
185 use a bitmask. For example, if you need only 10 bits, do
186 h = (h & hashmask(10));
187 In which case, the hash table should have hashsize(10) elements.
188
189 If you are hashing n strings (uint8_t **)k, do it like this:
190 for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
191
192 By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this
193 code any way you wish, private, educational, or commercial. It's free.
194
195 See http://burtleburtle.net/bob/hash/evahash.html
196 Use for hash table lookup, or anything where one collision in 2^^32 is
197 acceptable. Do NOT use for cryptographic purposes.
198 --------------------------------------------------------------------
199 */
200
HashJenkins(uint8_t * k,int length)201 uint32_t HashJenkins(uint8_t *k, int length /*, uint32_t initval */)
202 {
203 register uint32_t a,b,c,len;
204
205 /* Set up the internal state */
206 len = length;
207 a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
208 c = 0; /* initval; */ /* the previous hash value */
209
210 /*---------------------------------------- handle most of the key */
211 while (len >= 12)
212 {
213 a += (k[0] +((uint32_t)k[1]<<8) +((uint32_t)k[2]<<16) +((uint32_t)k[3]<<24));
214 b += (k[4] +((uint32_t)k[5]<<8) +((uint32_t)k[6]<<16) +((uint32_t)k[7]<<24));
215 c += (k[8] +((uint32_t)k[9]<<8) +((uint32_t)k[10]<<16)+((uint32_t)k[11]<<24));
216 mix(a,b,c);
217 k += 12; len -= 12;
218 }
219
220 /*------------------------------------- handle the last 11 bytes */
221 c += length;
222 switch(len) /* all the case statements fall through */
223 {
224 case 11: c+=((uint32_t)k[10]<<24);
225 case 10: c+=((uint32_t)k[9]<<16);
226 case 9 : c+=((uint32_t)k[8]<<8);
227 /* the first byte of c is reserved for the length */
228 case 8 : b+=((uint32_t)k[7]<<24);
229 case 7 : b+=((uint32_t)k[6]<<16);
230 case 6 : b+=((uint32_t)k[5]<<8);
231 case 5 : b+=k[4];
232 case 4 : a+=((uint32_t)k[3]<<24);
233 case 3 : a+=((uint32_t)k[2]<<16);
234 case 2 : a+=((uint32_t)k[1]<<8);
235 case 1 : a+=k[0];
236 /* case 0: nothing left to add */
237 }
238 mix(a,b,c);
239 /*-------------------------------------------- report the result */
240 return c;
241 }
242
243 /*
244 * An interface to the above hash functions.
245 * Returns:
246 * A 32-bit hash key, suitable for masking down to smaller bit sizes
247 */
hash(int func,uint8_t * key,int key_len)248 uint32_t hash(int func, uint8_t *key, int key_len) {
249 switch (func) {
250 case HASH_FUNC_HSIEH:
251 return HashHsieh(key, key_len);
252
253 case HASH_FUNC_TCL:
254 return HashTcl(key, key_len);
255
256 case HASH_FUNC_JENKINS:
257 return HashJenkins(key, key_len);
258
259 case HASH_FUNC_JENKINS3:
260 {
261 uint32_t pc = 0, pb = 0;
262 HashJenkins3(key, key_len, &pc, &pb);
263 return pc;
264 }
265 }
266
267 return 0;
268 }
269
270 /*
271 * As per hash() above but returns a 64-bit key. For 32-bit hash functions
272 * this is simply a duplication of the 32-bit value.
273 */
hash64(int func,uint8_t * key,int key_len)274 uint64_t hash64(int func, uint8_t *key, int key_len) {
275 uint32_t pc = 0, pb = 0;
276
277 switch (func) {
278 case HASH_FUNC_HSIEH:
279 pb = pc = HashHsieh(key, key_len);
280 break;
281
282 case HASH_FUNC_TCL:
283 pb = pc = HashTcl(key, key_len);
284 break;
285
286 case HASH_FUNC_JENKINS:
287 pb = pc = HashJenkins(key, key_len);
288 break;
289
290 case HASH_FUNC_JENKINS3:
291 HashJenkins3(key, key_len, &pc, &pb);
292 break;
293 }
294
295 return pc + (((uint64_t)pb)<<32);
296 }
297
298 /* =========================================================================
299 * Hash Table handling code
300 * =========================================================================
301 */
302
303 /* Multiplicative factors indicating when to grow or shrink the hash table */
304 #define HASH_TABLE_RESIZE 3
305
306 /*
307 * Creates a HashItem for use with HashTable h.
308 *
309 * Returns:
310 * A pointer to new HashItem on success
311 * NULL on failure.
312 */
HashItemCreate(HashTable * h)313 static HashItem *HashItemCreate(HashTable *h) {
314 HashItem *hi;
315
316 hi = (h->options & HASH_POOL_ITEMS
317 ? pool_alloc(h->hi_pool) : malloc(sizeof(*hi)));
318 if (NULL == hi) return NULL;
319
320 hi->data.p = NULL;
321 hi->data.i = 0;
322 hi->next = NULL;
323 hi->key = NULL;
324 hi->key_len = 0;
325
326 h->nused++;
327
328 return hi;
329 }
330
331 /*
332 * Deallocates a HashItem created via HashItemCreate.
333 *
334 * This function will not remove the item from the HashTable so be sure to
335 * call HashTableDel() first if appropriate.
336 */
HashItemDestroy(HashTable * h,HashItem * hi,int deallocate_data)337 static void HashItemDestroy(HashTable *h, HashItem *hi, int deallocate_data) {
338 if (!hi) return;
339
340 if (!(h->options & HASH_NONVOLATILE_KEYS) || (h->options & HASH_OWN_KEYS))
341 if (hi->key)
342 free(hi->key);
343
344 if (deallocate_data && hi->data.p)
345 free(hi->data.p);
346
347 if (h->options & HASH_POOL_ITEMS) {
348 pool_free(h->hi_pool, hi);
349 } else {
350 free(hi);
351 }
352
353 h->nused--;
354 }
355
356 /*
357 * Creates a new HashTable object. Size will be rounded up to the next
358 * power of 2. It is a starting point and hash tables may be grown or shrunk
359 * as needed (if HASH_DYNAMIC_SIZE is used).
360 *
361 * If HASH_POOL_ITEMS is used, HashItems will be allocated in blocks to reduce
362 * malloc overhead in the case where a large number of items is required.
363 * HashItems allocated this way will be put on a free list when destroyed; the
364 * memory will only be reclaimed when the entire hash table is destroyed.
365 *
366 * Options are as defined in the header file (see HASH_* macros).
367 *
368 * Returns:
369 * A pointer to a HashTable on success
370 * NULL on failure
371 */
HashTableCreate(int size,int options)372 HashTable *HashTableCreate(int size, int options) {
373 HashTable *h;
374 int i, bits;
375 uint32_t mask;
376
377 if (!(h = (HashTable *)malloc(sizeof(*h))))
378 return NULL;
379
380 if (options & HASH_POOL_ITEMS) {
381 h->hi_pool = pool_create(sizeof(HashItem));
382 if (NULL == h->hi_pool) {
383 free(h);
384 return NULL;
385 }
386 } else {
387 h->hi_pool = NULL;
388 }
389
390 if (size < 4)
391 size = 4; /* an inconsequential minimum size */
392
393 /* Round the requested size to the next power of 2 */
394 bits = 0;
395 size--;
396 while (size) {
397 size /= 2;
398 bits++;
399 }
400 size = 1<<bits;
401 mask = size-1;
402
403 h->nbuckets = size;
404 h->mask = mask;
405 h->options = options;
406 h->nused = 0;
407 h->bucket = (HashItem **)malloc(sizeof(*h->bucket) * size);
408 if (NULL == h->bucket) {
409 HashTableDestroy(h, 0);
410 return NULL;
411 }
412
413 for (i = 0; i < size; i++) {
414 h->bucket[i] = NULL;
415 }
416
417 return h;
418 }
419
420 /*
421 * Deallocates a HashTable object (created by HashTableCreate).
422 *
423 * The deallocate_data parameter is a boolean to indicate whether the
424 * data attached to the hash table should also be free()d. DO NOT USE
425 * this if the HashData attached was not a pointer allocated using
426 * malloc().
427 */
HashTableDestroy(HashTable * h,int deallocate_data)428 void HashTableDestroy(HashTable *h, int deallocate_data) {
429 int i;
430
431 if (!h)
432 return;
433
434 if (h->bucket) {
435 for (i = 0; i < h->nbuckets; i++) {
436 HashItem *hi = h->bucket[i], *next = NULL;
437 for (hi = h->bucket[i]; hi; hi = next) {
438 next = hi->next;
439 HashItemDestroy(h, hi, deallocate_data);
440 }
441 }
442
443 free(h->bucket);
444 }
445
446 if (h->hi_pool) pool_destroy(h->hi_pool);
447
448 free(h);
449 }
450
451 /*
452 * Resizes a HashTable to have 'newsize' buckets.
453 * This is called automatically when adding or removing items so that the
454 * hash table keeps at a sensible scale.
455 *
456 * FIXME: Halving the size of the hash table is simply a matter of coaelescing
457 * every other bucket. Instead we currently rehash (which is slower).
458 * Doubling the size of the hash table currently requires rehashing, but this
459 * too could be optimised by storing the full 32-bit hash of the key along
460 * with the key itself. This then means that it's just a matter of seeing what
461 * the next significant bit is. It's a memory vs speed tradeoff though and
462 * re-hashing is pretty quick.
463 *
464 * Returns 0 for success
465 * -1 for failure
466 */
HashTableResize(HashTable * h,int newsize)467 int HashTableResize(HashTable *h, int newsize) {
468 HashTable *h2;
469 int i;
470
471 /* fprintf(stderr, "Resizing to %d\n", newsize); */
472
473 /* Create a new hash table and rehash everything into it */
474 h2 = HashTableCreate(newsize, h->options);
475
476 for (i = 0; i < h->nbuckets; i++) {
477 HashItem *hi, *next;
478 for (hi = h->bucket[i]; hi; hi = next) {
479 uint64_t hv = h2->options & HASH_INT_KEYS
480 ? hash64(h2->options & HASH_FUNC_MASK,
481 (uint8_t *)&hi->key, hi->key_len) & h2->mask
482 : hash64(h2->options & HASH_FUNC_MASK,
483 (uint8_t *)hi->key, hi->key_len) & h2->mask;
484 next = hi->next;
485 hi->next = h2->bucket[hv];
486 h2->bucket[hv] = hi;
487 }
488 }
489
490 /* Swap the links over & free */
491 free(h->bucket);
492 h->bucket = h2->bucket;
493 h->nbuckets = h2->nbuckets;
494 h->mask = h2->mask;
495
496 if (h2->hi_pool)
497 pool_destroy(h2->hi_pool);
498 free(h2);
499
500 return 0;
501 }
502
503 /*
504 * Adds a HashData item to HashTable h with a specific key. Key can be binary
505 * data, but if key_len is passed as zero then strlen() will be used to
506 * determine the key length.
507 *
508 * The "new" pointer may be passed as NULL. When not NULL it is filled out
509 * as a boolean to indicate whether the key is already in this hash table.
510 *
511 * The HASH_ALLOW_DUP_KEYS option (specified when using HashTableCreate)
512 * will allow duplicate keys to be stored, and hence *new is also zero.
513 * By default duplicate keys are disallowed.
514 *
515 * Keys are considered to be volatile memory (ie temporary storage) and so the
516 * hash table takes separate copies of them. To avoid this use the
517 * HASH_NONVOLATILE_KEYS option.
518 *
519 * If the HASH_OWN_KEYS option was specified when creating the table then
520 * keys will be considered to be owned by the hash table. In this case
521 * the key will be freed when the table is destroyed regardless of
522 * whether the HASH_NONVOLATILE_KEYS option was used to allocate its
523 * own private copy.
524 *
525 * Returns:
526 * The HashItem created (or matching if a duplicate) on success
527 * NULL on failure
528 */
HashTableAdd(HashTable * h,char * key,int key_len,HashData data,int * new)529 HashItem *HashTableAdd(HashTable *h, char *key, int key_len, HashData data,
530 int *new) {
531 uint64_t hv;
532 HashItem *hi;
533
534 if (!key_len)
535 key_len = strlen(key);
536
537 hv = h->options & HASH_INT_KEYS
538 ? hash64(h->options & HASH_FUNC_MASK, (uint8_t *)&key, key_len) & h->mask
539 : hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) & h->mask;
540
541 /* Already exists? */
542 if (!(h->options & HASH_ALLOW_DUP_KEYS)) {
543 for (hi = h->bucket[hv]; hi; hi = hi->next) {
544 if (h->options & HASH_INT_KEYS) {
545 if ((int)(size_t)hi->key == (int)(size_t)key) {
546 if (new) *new = 0;
547 return hi;
548 }
549 } else {
550 if (key_len == hi->key_len && key[0] == hi->key[0] &&
551 memcmp(key, hi->key, key_len) == 0) {
552 if (new) *new = 0;
553 return hi;
554 }
555 }
556 }
557 }
558
559 /* No, so create a new one and link it in */
560 if (NULL == (hi = HashItemCreate(h)))
561 return NULL;
562
563 if (h->options & HASH_NONVOLATILE_KEYS)
564 hi->key = key;
565 else {
566 hi->key = (char *)malloc(key_len+1);
567 memcpy(hi->key, key, key_len);
568 hi->key[key_len] = 0; /* null terminate incase others print keys */
569 }
570 hi->key_len = key_len;
571 hi->data = data;
572 hi->next = h->bucket[hv];
573 h->bucket[hv] = hi;
574
575 if ((h->options & HASH_DYNAMIC_SIZE) &&
576 h->nused > HASH_TABLE_RESIZE * h->nbuckets)
577 HashTableResize(h, h->nbuckets*4);
578
579 if (new) *new = 1;
580
581 return hi;
582 }
583
584
585 /*
586 * Removes a specified HashItem from the HashTable. (To perform this it needs
587 * to rehash based on the hash key as hash_item only has a next pointer and
588 * not a previous pointer.)
589 *
590 * The HashItem itself is also destroyed (by an internal call to
591 * HashItemDestroy). The deallocate_data parameter controls whether the data
592 * associated with the HashItem should also be free()d.
593 *
594 * See also the HashTableRemove() function to remove by key instead of
595 * HashItem.
596 *
597 * Returns 0 on success
598 * -1 on failure (eg HashItem not in the HashTable);
599 */
HashTableDel(HashTable * h,HashItem * hi,int deallocate_data)600 int HashTableDel(HashTable *h, HashItem *hi, int deallocate_data) {
601 uint64_t hv;
602 HashItem *next, *last;
603
604 hv = h->options & HASH_INT_KEYS
605 ? hash64(h->options & HASH_FUNC_MASK,
606 (uint8_t *)&hi->key, hi->key_len) & h->mask
607 : hash64(h->options & HASH_FUNC_MASK,
608 (uint8_t *)hi->key, hi->key_len) & h->mask;
609
610 for (last = NULL, next = h->bucket[hv]; next;
611 last = next, next = next->next) {
612 if (next == hi) {
613 /* Link last to next->next */
614 if (last)
615 last->next = next->next;
616 else
617 h->bucket[hv] = next->next;
618
619 HashItemDestroy(h, hi, deallocate_data);
620
621 return 0;
622 }
623 }
624
625 return -1;
626 }
627
628
629 /*
630 * Searches the HashTable for the data registered with 'key' and removes
631 * these items from the HashTable. In essence this is a combination of
632 * HashTableSearch and HashTableDel functions.
633 *
634 * If HASH_ALLOW_DUP_KEYS is used this will remove all items matching 'key',
635 * otherwise just a single item will be removed.
636 *
637 * If 'deallocate_data' is true the data associated with the HashItem will
638 * be free()d.
639 *
640 * Returns
641 * 0 on success (at least one item found)
642 * -1 on failure (no items found).
643 */
HashTableRemove(HashTable * h,char * key,int key_len,int deallocate_data)644 int HashTableRemove(HashTable *h, char *key, int key_len,
645 int deallocate_data) {
646 uint64_t hv;
647 HashItem *last, *next, *hi;
648 int retval = -1;
649
650 if (!key_len)
651 key_len = strlen(key);
652
653 hv = h->options & HASH_INT_KEYS
654 ? hash64(h->options & HASH_FUNC_MASK, (uint8_t *)&key, key_len) & h->mask
655 : hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) & h->mask;
656
657 last = NULL;
658 next = h->bucket[hv];
659
660 while (next) {
661 hi = next;
662 if (((h->options & HASH_INT_KEYS)
663 ? ((int)(size_t)key == (int)(size_t)hi->key)
664 : (key_len == hi->key_len && memcmp(key, hi->key, key_len) == 0))) {
665 /* An item to remove, adjust links and destroy */
666 if (last)
667 last->next = hi->next;
668 else
669 h->bucket[hv] = hi->next;
670
671 next = hi->next;
672 HashItemDestroy(h, hi, deallocate_data);
673
674 retval = 0;
675 if (!(h->options & HASH_ALLOW_DUP_KEYS))
676 break;
677
678 } else {
679 /* We only update last when it's something we haven't destroyed */
680 last = hi;
681 next = hi->next;
682 }
683 }
684
685 return retval;
686 }
687
688 /*
689 * Searches the HashTable for the data registered with 'key'.
690 * If HASH_ALLOW_DUP_KEYS is used this will just be the first one found.
691 * You will then need to use HashTableNext to iterate through the matches.
692 *
693 * Returns
694 * HashItem if found
695 * NULL if not found
696 */
HashTableSearch(HashTable * h,char * key,int key_len)697 HashItem *HashTableSearch(HashTable *h, char *key, int key_len) {
698 uint64_t hv;
699 HashItem *hi;
700
701 if (!key_len)
702 key_len = strlen(key);
703
704 if (h->options & HASH_INT_KEYS) {
705 hv = hash64(h->options & HASH_FUNC_MASK, (uint8_t *)&key, key_len)& h->mask;
706
707 for (hi = h->bucket[hv]; hi; hi = hi->next) {
708 if ((int)(size_t)key == (int)(size_t)hi->key)
709 return hi;
710 }
711 } else {
712 hv = hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) & h->mask;
713
714 for (hi = h->bucket[hv]; hi; hi = hi->next) {
715 if (key_len == hi->key_len &&
716 memcmp(key, hi->key, key_len) == 0)
717 return hi;
718 }
719 }
720
721 return NULL;
722 }
723
724 /*
725 * Find the next HashItem (starting from 'hi') to also match this key.
726 * This is only valid when the HASH_ALLOW_DUP_KEYS is in use and
727 * we're not using HASH_INT_KEYS.
728 *
729 * Returns
730 * HashItem if found
731 * NULL if not found
732 */
HashTableNext(HashItem * hi,char * key,int key_len)733 HashItem *HashTableNext(HashItem *hi, char *key, int key_len) {
734 if (!hi)
735 return NULL;
736
737 for (hi = hi->next; hi; hi = hi->next) {
738 if (key_len == hi->key_len &&
739 memcmp(key, hi->key, key_len) == 0)
740 return hi;
741 }
742
743 return NULL;
744 }
745
HashTableNextInt(HashItem * hi,char * key,int key_len)746 HashItem *HashTableNextInt(HashItem *hi, char *key, int key_len) {
747 if (!hi)
748 return NULL;
749
750 for (hi = hi->next; hi; hi = hi->next) {
751 if (key_len == hi->key_len &&
752 memcmp(&key, &hi->key, key_len) == 0)
753 return hi;
754 }
755
756 return NULL;
757 }
758
759 /*
760 * Dumps a textual represenation of the hash table to stdout.
761 */
HashTableDump(HashTable * h,FILE * fp,char * prefix)762 void HashTableDump(HashTable *h, FILE *fp, char *prefix) {
763 int i;
764 for (i = 0; i < h->nbuckets; i++) {
765 HashItem *hi;
766 for (hi = h->bucket[i]; hi; hi = hi->next) {
767 if (h->options & HASH_INT_KEYS) {
768 fprintf(fp, "%s%d => %"PRId64" (0x%"PRIx64")\n",
769 prefix ? prefix : "",
770 (int)(size_t)hi->key,
771 hi->data.i, hi->data.i);
772 } else {
773 fprintf(fp, "%s%.*s => %"PRId64" (0x%"PRIx64")\n",
774 prefix ? prefix : "",
775 hi->key_len, hi->key,
776 hi->data.i, hi->data.i);
777 }
778 }
779 }
780 }
781
782 /*
783 * Produces some simple statistics on the hash table population.
784 */
HashTableStats(HashTable * h,FILE * fp)785 void HashTableStats(HashTable *h, FILE *fp) {
786 int i;
787 double avg = (double)h->nused / h->nbuckets;
788 double var = 0;
789 int maxlen = 0;
790 int filled = 0;
791 int clen[51];
792
793 for (i = 0; i <= 50; i++)
794 clen[i] = 0;
795
796 for (i = 0; i < h->nbuckets; i++) {
797 int len = 0;
798 HashItem *hi;
799 for (hi = h->bucket[i]; hi; hi = hi->next) {
800 len++;
801 }
802 if (len > 0) {
803 filled++;
804 if (len > maxlen)
805 maxlen = len;
806 }
807 clen[len <= 50 ? len : 50]++;
808 var += (len-avg) * (len-avg);
809 }
810 var /= h->nbuckets;
811 /* sd = sqrt(var); */
812
813 fprintf(fp, "Nbuckets = %d\n", h->nbuckets);
814 fprintf(fp, "Nused = %d\n", h->nused);
815 fprintf(fp, "Avg chain = %f\n", avg);
816 fprintf(fp, "Chain var.= %f\n", var);
817 fprintf(fp, "%%age full = %f\n", (100.0*filled)/h->nbuckets);
818 fprintf(fp, "max len = %d\n", maxlen);
819 for (i = 0; i <= maxlen; i++) {
820 fprintf(fp, "Chain %2d = %d\n", i, clen[i]);
821 }
822 }
823
824 /*
825 * --------------------------------------------------------------------
826 * Below we have a specialisation of the HashTable code where the data
827 * attached to the hash table is a position,size pair. This allows for the
828 * hash table to encode positions and sizes of items within a file archive.
829 * --------------------------------------------------------------------
830 */
831
832 /*
833 * Writes the HashTable structures to 'fp'.
834 * This is a specialisation of the HashTable where the HashData is a
835 * position,size tuple.
836 *
837 * This consists of the following format:
838 * Header:
839 * ".hsh" (magic numebr)
840 * x4 (1-bytes of version code, eg "1.00")
841 * x1 (HASH_FUNC_? function used)
842 * x1 (number of file headers: FH. These count from 1 to FH inclusive)
843 * x1 (number of file footers: FF. These count from 1 to FF inclusive)
844 * x1 (number of archives indexed: NA)
845 * x4 (4-bytes big-endian; number of hash buckets)
846 * x8 (offset to add to item positions. eg size of this index)
847 * x4 (4-bytes big-endian; number of bytes in hash file, inc. header)
848 * Archive name: (NH copies of, or just 1 zero byte if none)
849 * x1 (length, zero => no name, eg when same file as hash index)
850 * ? (archive filename)
851 * File headers (FH copies of):
852 * x1 (archive no.)
853 * x7 (position)
854 * x4 (size)
855 * File footers (FH copies of):
856 * x1 (archive no.)
857 * x7 (position)
858 * x4 (size)
859 * Buckets (multiples of)
860 * x4 (4-byte offset of linked list pos, rel. to the start of the hdr)
861 * Items (per bucket chain, not written if Bucket[?]==0)
862 * x1 (key length, zero => end of chain)
863 * ? (key)
864 * x0.5 (File header to use. zero => none) top 4 bits
865 * x0.5 (File footer to use. zero => none) bottom 4 bits
866 * x8 (position)
867 * x4 (size)
868 * ... arbitrary gap (but likely none)
869 * Index footer:
870 * ".hsh" (magic number)
871 * x8 (offset to Hash Header. +ve = absolute, -ve = relative to end)
872 *
873 * It is designed such that on-disk querying of the hash table can be done
874 * purely by forward seeks. (This is generally faster due to pre-fetching of
875 * the subsequent blocks by many disk controllers.)
876 *
877 * Returns: the number of bytes written on success
878 * -1 for error
879 */
HashFileSave(HashFile * hf,FILE * fp,int64_t offset)880 uint64_t HashFileSave(HashFile *hf, FILE *fp, int64_t offset) {
881 int i;
882 HashItem *hi;
883 uint32_t *bucket_pos;
884 uint64_t hfsize = 0, be_hfsize;
885 HashTable *h = hf->h;
886 HashFileFooter foot;
887
888 /* Compute the coordinates of the hash items */
889 hfsize = HHSIZE; /* header */
890 hfsize += h->nbuckets * 4; /* buckets */
891 for (i = 0; i < hf->nheaders; i++) /* headers */
892 hfsize += 12;
893 for (i = 0; i < hf->nfooters; i++) /* footers */
894 hfsize += 12;
895 if (hf->narchives) {
896 for (i = 0; i < hf->narchives; i++)
897 hfsize += strlen(hf->archives[i])+1; /* archive filename */
898 } else {
899 hfsize++;
900 }
901 bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t));
902 for (i = 0; i < h->nbuckets; i++) {
903 bucket_pos[i] = hfsize;
904
905 if (!(hi = h->bucket[i]))
906 continue;
907 for (; hi; hi = hi->next) {
908 hfsize += 1 + 1 + hi->key_len + 8 + 4; /* keys, pos, size */
909 }
910 hfsize++; /* list-end marker */
911 }
912 hfsize += sizeof(foot);
913
914 /* Write the header: */
915 memcpy(hf->hh.magic, HASHFILE_MAGIC, 4);
916 if (hf->narchives > 1)
917 memcpy(hf->hh.vers, HASHFILE_VERSION, 4);
918 else
919 memcpy(hf->hh.vers, HASHFILE_VERSION100, 4);
920 hf->hh.hfunc = h->options & HASH_FUNC_MASK;
921 hf->hh.nheaders = hf->nheaders;
922 hf->hh.nfooters = hf->nfooters;
923 hf->hh.narchives = hf->narchives == 1 ? 0 : hf->narchives;
924 hf->hh.nbuckets = be_int4(h->nbuckets);
925 hf->hh.offset = offset == HASHFILE_PREPEND
926 ? be_int8(hfsize) /* archive will be appended to this file */
927 : be_int8(offset);
928 hf->hh.size = be_int4(hfsize);
929 fwrite(&hf->hh, HHSIZE, 1, fp);
930
931 /* Write the archive filename, if known */
932 if (hf->narchives) {
933 for (i = 0; i < hf->narchives; i++) {
934 fputc(strlen(hf->archives[i]), fp);
935 fputs(hf->archives[i], fp);
936 }
937 } else {
938 /* Compatibility with v1.00 file format */
939 fputc(0, fp);
940 }
941
942 /* Write out the headers and footers */
943 for (i = 0; i < hf->nheaders; i++) {
944 HashFileSection hs;
945 hs.pos = be_int8(hf->headers[i].pos);
946 *(char *)&hs.pos = hf->headers[i].archive_no;
947 fwrite(&hs.pos, 8, 1, fp);
948 hs.size = be_int4(hf->headers[i].size);
949 fwrite(&hs.size, 4, 1, fp);
950 }
951
952 for (i = 0; i < hf->nfooters; i++) {
953 HashFileSection hs;
954 hs.pos = be_int8(hf->footers[i].pos);
955 *(char *)&hs.pos = hf->footers[i].archive_no;
956 fwrite(&hs.pos, 8, 1, fp);
957 hs.size = be_int4(hf->footers[i].size);
958 fwrite(&hs.size, 4, 1, fp);
959 }
960
961 /* Write out hash buckets */
962 for (i = 0; i < h->nbuckets; i++) {
963 uint32_t zero = 0;
964 uint32_t be32;
965
966 if (!(hi = h->bucket[i])) {
967 fwrite(&zero, 4, 1, fp);
968 continue;
969 }
970
971 be32 = be_int4(bucket_pos[i]);
972 fwrite(&be32, 4, 1, fp);
973 }
974 free(bucket_pos);
975
976 /*
977 * Write the hash_item linked lists. The first item is the
978 * hash key length. We append a zero to the end of the list so we
979 * can check this key length to determine the end of this hash
980 * item list.
981 */
982 for (i = 0; i < h->nbuckets; i++) {
983 if (!(hi = h->bucket[i]))
984 continue;
985 for (; hi; hi = hi->next) {
986 uint64_t be64;
987 uint32_t be32;
988 HashFileItem *hfi = (HashFileItem *)hi->data.p;
989 unsigned char headfoot = 0;
990
991 fprintf(fp, "%c%.*s", hi->key_len,
992 hi->key_len, hi->key);
993 headfoot = (((hfi->header) & 0xf) << 4) | ((hfi->footer) & 0xf);
994 fwrite(&headfoot, 1, 1, fp);
995 be64 = be_int8(hfi->pos);
996 *(char *)&be64 = hfi->archive;
997 fwrite(&be64, 8, 1, fp);
998 be32 = be_int4(hfi->size);
999 fwrite(&be32, 4, 1, fp);
1000 }
1001 fputc(0, fp);
1002 }
1003
1004 /* Finally write the footer referencing back to the header start */
1005 memcpy(foot.magic, HASHFILE_MAGIC, 4);
1006 be_hfsize = be_int8(-hfsize);
1007 memcpy(foot.offset, &be_hfsize, 8);
1008 fwrite(&foot, sizeof(foot), 1, fp);
1009
1010 return hfsize;
1011 }
1012
1013 #if 0
1014 /*
1015 * Reads an entire HashTable from fp.
1016 *
1017 * Returns:
1018 * A filled out HashTable pointer on success
1019 * NULL on failure
1020 */
1021 HashFile *HashFileLoad_old(FILE *fp) {
1022 int i;
1023 HashTable *h;
1024 HashItem *hi;
1025 HashFile *hf;
1026 uint32_t *bucket_pos;
1027 unsigned char *htable;
1028 int htable_pos;
1029 int fnamelen;
1030
1031 if (NULL == (hf = (HashFile *)calloc(1, sizeof(HashFile))))
1032 return NULL;
1033 if (NULL == (htable = (unsigned char *)malloc(HHSIZE)))
1034 return NULL;
1035
1036 /* Read and create the hash table header */
1037 if (HHSIZE != fread(htable, 1, HHSIZE, fp))
1038 return NULL;
1039 memcpy(&hf->hh, htable, HHSIZE);
1040 hf->hh.nbuckets = be_int4(hf->hh.nbuckets);
1041 hf->hh.offset = be_int8(hf->hh.offset);
1042 hf->hh.size = be_int4(hf->hh.size);
1043 hf->h = h = HashTableCreate(hf->hh.nbuckets, hf->hh.hfunc);
1044 bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t));
1045
1046 /* Load the archive filename */
1047 if (hf->narchives) {
1048 hf->archives = (char **)malloc(hf->narchives * sizeof(char *));
1049 } else {
1050 hf->archives = NULL;
1051 }
1052
1053 if (hf->narchives) {
1054 for (i = 0; i < hf->narchives; i++) {
1055 fnamelen = fgetc(fp);
1056 hf->archives[i] = malloc(fnamelen+1);
1057 fread(hf->archives[i], 1, fnamelen, fp);
1058 hf->archives[i][fnamelen] = 0;
1059 }
1060 } else {
1061 /* Consume 0 byte for v1.00 format */
1062 fgetc(fp);
1063 }
1064
1065 /* Load the rest of the hash table to memory */
1066 htable_pos = HHSIZE + fnamelen + 1;
1067 if (NULL == (htable = (unsigned char *)realloc(htable, hf->hh.size)))
1068 return NULL;
1069 if (hf->hh.size-htable_pos !=
1070 fread(&htable[htable_pos], 1, hf->hh.size-htable_pos, fp))
1071 return NULL;
1072
1073 /* Read the header / footer items */
1074 for (i = 0; i < hf->hh.nheaders; i++)
1075 htable_pos += 8; /* skip them for now */
1076 for (i = 0; i < hf->hh.nfooters; i++)
1077 htable_pos += 8; /* skip them for now */
1078
1079 /* Identify the "bucket pos". Detemines which buckets have data */
1080 for (i = 0; i < h->nbuckets; i++) {
1081 memcpy(&bucket_pos[i], &htable[htable_pos], 4);
1082 bucket_pos[i] = be_int4(bucket_pos[i]);
1083 htable_pos += 4;
1084 }
1085
1086 /* Read the hash table items */
1087 for (i = 0; i < h->nbuckets; i++) {
1088 if (!bucket_pos[i])
1089 continue;
1090 for (;;) {
1091 int c;
1092 unsigned char uc;
1093 char key[256];
1094 uint64_t pos;
1095 uint32_t size;
1096 HashFileItem *hfi;
1097
1098 c = htable[htable_pos++];
1099 if (c == EOF || !c)
1100 break;
1101
1102 /* key */
1103 memcpy(key, &htable[htable_pos], c);
1104 htable_pos += c;
1105
1106 /* header/footer */
1107 uc = htable[htable_pos++];
1108 hfi = (HashFileItem *)malloc(sizeof(*hfi));
1109 hfi->header = (uc >> 4) & 0xf;
1110 hfi->footer = uc & 0xf;
1111
1112 /* pos */
1113 memcpy(&pos, &htable[htable_pos], 8);
1114 htable_pos += 8;
1115 hfi->pos = be_int8(pos) + hf->hh.offset;
1116
1117 /* size */
1118 memcpy(&size, &htable[htable_pos], 4);
1119 htable_pos += 4;
1120 hfi->size = be_int4(size);
1121
1122 /* Add to the hash table */
1123 hi = HashItemCreate(h);
1124 hi->next = h->bucket[i];
1125 h->bucket[i] = hi;
1126 hi->key_len = c;
1127 hi->key = (char *)malloc(c+1);
1128 memcpy(hi->key, key, c);
1129 hi->key[c] = 0; /* For debugging convenience only */
1130 hi->data.p = hfi;
1131 }
1132 }
1133
1134 fprintf(stderr, "done\n");
1135 fflush(stderr);
1136 free(bucket_pos);
1137
1138 return hf;
1139 }
1140 #endif
1141
1142 /*
1143 * Opens a stored hash table file. It also internally keeps an open file to
1144 * hash and the archive files.
1145 *
1146 * Returns the HashFile pointer on success
1147 * NULL on failure
1148 */
HashFileFopen(FILE * fp)1149 HashFile *HashFileFopen(FILE *fp) {
1150 HashFile *hf = HashFileCreate(0, 0);
1151 int archive_len;
1152 int i, fnamelen;
1153
1154 /* Set the stdio buffer to be small to avoid massive I/O wastage */
1155
1156 /* Read the header */
1157 hf->hfp = fp;
1158 hf->hf_start = ftello(hf->hfp);
1159
1160 if (HHSIZE != fread(&hf->hh, 1, HHSIZE, hf->hfp)) {
1161 HashFileDestroy(hf);
1162 return NULL;
1163 }
1164 if (memcmp(HASHFILE_MAGIC, &hf->hh, 4) != 0) {
1165 HashFileFooter foot;
1166 int64_t offset;
1167
1168 /* Invalid magic number, try other end of file! */
1169 fseeko(hf->hfp, -(off_t)sizeof(HashFileFooter), SEEK_END);
1170 if (sizeof(foot) != fread(&foot, 1, sizeof(foot), hf->hfp)) {
1171 HashFileDestroy(hf);
1172 return NULL;
1173 }
1174 if (memcmp(HASHFILE_MAGIC, &foot.magic, 4) != 0) {
1175 HashFileDestroy(hf);
1176 return NULL;
1177 }
1178 memcpy(&offset, foot.offset, 8);
1179 offset = be_int8(offset);
1180 fseeko(hf->hfp, offset, SEEK_CUR);
1181 hf->hf_start = ftello(hf->hfp);
1182 if (HHSIZE != fread(&hf->hh, 1, HHSIZE, hf->hfp)) {
1183 HashFileDestroy(hf);
1184 return NULL;
1185 }
1186 }
1187 if (memcmp(hf->hh.vers, HASHFILE_VERSION, 4) != 0 &&
1188 memcmp(hf->hh.vers, HASHFILE_VERSION100, 4) != 0) {
1189 /* incorrect version */
1190 HashFileDestroy(hf);
1191 return NULL;
1192 }
1193
1194 hf->hh.nbuckets = be_int4(hf->hh.nbuckets);
1195 hf->hh.offset = be_int8(hf->hh.offset);
1196 hf->hh.size = be_int4(hf->hh.size);
1197
1198 /* Load the archive filename(s) */
1199 hf->narchives = hf->hh.narchives;
1200
1201 /* Old archives had narchives fixed as zero, so check again */
1202 if (!hf->narchives) {
1203 int n = fgetc(fp);
1204 if (!n) {
1205 archive_len = 1;
1206 } else {
1207 ungetc(n, fp);
1208 hf->narchives = 1;
1209 }
1210 }
1211
1212 if (hf->narchives) {
1213 hf->archives = (char **)malloc(hf->narchives * sizeof(char *));
1214 hf->afp = calloc(hf->narchives, sizeof(FILE *));
1215 } else {
1216 hf->archives = NULL;
1217 hf->afp = &hf->hfp;
1218 }
1219
1220 if (hf->narchives) {
1221 archive_len = 0;
1222 for (i = 0; i < hf->narchives; i++) {
1223 fnamelen = fgetc(fp);
1224 hf->archives[i] = malloc(fnamelen+1);
1225 if (fnamelen != fread(hf->archives[i], 1, fnamelen, fp))
1226 return NULL;
1227 hf->archives[i][fnamelen] = 0;
1228 archive_len += fnamelen+1;
1229 }
1230 }
1231
1232 hf->header_size = HHSIZE + archive_len +
1233 12 * (hf->hh.nheaders + hf->hh.nfooters);
1234 hf->nheaders = hf->hh.nheaders;
1235 hf->nfooters = hf->hh.nfooters;
1236
1237 /* Load the header and footer locations */
1238 hf->headers = hf->nheaders
1239 ? (HashFileSection *)malloc(hf->nheaders * sizeof(HashFileSection))
1240 : NULL;
1241 for (i = 0; i < hf->nheaders; i++) {
1242 if (1 != fread(&hf->headers[i].pos, 8, 1, hf->hfp))
1243 return NULL;
1244 if (1 != fread(&hf->headers[i].size, 4, 1, hf->hfp))
1245 return NULL;
1246 hf->headers[i].archive_no = *(char *)&hf->headers[i].pos;
1247 *(char *)&hf->headers[i].pos = 0;
1248 hf->headers[i].pos = be_int8(hf->headers[i].pos) + hf->hh.offset;
1249 hf->headers[i].size = be_int4(hf->headers[i].size);
1250 hf->headers[i].cached_data = NULL;
1251 }
1252
1253 hf->footers = hf->nfooters
1254 ? (HashFileSection *)malloc(hf->nfooters * sizeof(HashFileSection))
1255 : NULL;
1256 for (i = 0; i < hf->nfooters; i++) {
1257 if (1 != fread(&hf->footers[i].pos, 8, 1, hf->hfp))
1258 return NULL;
1259 if (1 != fread(&hf->footers[i].size, 4, 1, hf->hfp))
1260 return NULL;
1261 hf->footers[i].archive_no = *(char *)&hf->footers[i].pos;
1262 *(char *)&hf->footers[i].pos = 0;
1263 hf->footers[i].pos = be_int8(hf->footers[i].pos) + hf->hh.offset;
1264 hf->footers[i].size = be_int4(hf->footers[i].size);
1265 hf->footers[i].cached_data = NULL;
1266 }
1267
1268 return hf;
1269 }
1270
HashFileOpen(char * fname)1271 HashFile *HashFileOpen(char *fname) {
1272 FILE *fp;
1273 HashFile *hf;
1274
1275 /* Open the hash and read the header */
1276 if (NULL == (fp = fopen(fname, "rb")))
1277 return NULL;
1278
1279 if (!(hf = HashFileFopen(fp)))
1280 return NULL;
1281
1282 /* Open the main archive too? Usually deferred */
1283 if (hf->narchives) {
1284 int i;
1285
1286 hf->afp = malloc(hf->narchives * sizeof(FILE *));
1287 if (hf->afp == NULL)
1288 return NULL;
1289
1290 /* Delay opening the main archive until required */
1291 for (i = 0; i < hf->narchives; i++) {
1292 hf->afp[i] = NULL;
1293 }
1294
1295 #if 0
1296 if (NULL == (hf->afp[i] = fopen(hf->archives[i], "rb"))) {
1297 /* Possibly done via a relative pathname (optimal infact) */
1298 char *cp;
1299 char aname[1024];
1300 if (NULL == (cp = strrchr(fname, '/'))) {
1301 HashFileDestroy(hf);
1302 return NULL;
1303 }
1304 sprintf(aname, "%.*s%s", (int)(cp-fname+1), fname,
1305 hf->archives[i]);
1306 if (NULL == (hf->afp[i] = fopen(aname, "rb"))) {
1307
1308 return NULL;
1309 }
1310 }
1311 #endif
1312 } else {
1313 hf->afp = &hf->hfp;
1314 }
1315
1316 return hf;
1317 }
1318
HashFileLoad(FILE * fp)1319 HashFile *HashFileLoad(FILE *fp) {
1320 HashFile *hf;
1321 char *htable;
1322 off_t htable_pos;
1323 int i;
1324 HashItem *hi;
1325 HashTable *h;
1326 uint32_t *bucket_pos;
1327 uint32_t hsize;
1328
1329 /* Open the hash table */
1330 fseeko(fp, 0, SEEK_SET);
1331 if (NULL == (hf = HashFileFopen(fp)))
1332 return NULL;
1333
1334 HashTableDestroy(hf->h, 1);
1335 h = hf->h = HashTableCreate(hf->hh.nbuckets, hf->hh.hfunc);
1336 bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t));
1337
1338 /* Also load in the entire thing to memory */
1339 htable = (char *)malloc(hf->hh.size);
1340 fseeko(fp, hf->hf_start, SEEK_SET);
1341 hsize = fread(htable, 1, hf->hh.size, fp);
1342 if (hf->hh.size != hsize) {
1343 free(htable);
1344 return NULL;
1345 }
1346
1347 /*
1348 * HashFileOpen has already decoded the headers up to and including the
1349 * individual file header/footer sections, but not the buckets and item
1350 * lists, so we start from there.
1351 */
1352 htable_pos = hf->header_size;
1353
1354 /* Identify the "bucket pos". Detemines which buckets have data */
1355 for (i = 0; i < h->nbuckets; i++) {
1356 memcpy(&bucket_pos[i], &htable[htable_pos], 4);
1357 bucket_pos[i] = be_int4(bucket_pos[i]);
1358 htable_pos += 4;
1359 }
1360
1361 /* Read the hash table items */
1362 for (i = 0; i < h->nbuckets; i++) {
1363 if (!bucket_pos[i])
1364 continue;
1365 for (;;) {
1366 int c;
1367 unsigned char uc;
1368 char key[256];
1369 uint64_t pos;
1370 uint32_t size;
1371 HashFileItem *hfi;
1372
1373 c = htable[htable_pos++];
1374 if (c == EOF || !c)
1375 break;
1376
1377 /* key */
1378 memcpy(key, &htable[htable_pos], c);
1379 htable_pos += c;
1380
1381 /* header/footer */
1382 uc = htable[htable_pos++];
1383 hfi = (HashFileItem *)malloc(sizeof(*hfi));
1384 hfi->header = (uc >> 4) & 0xf;
1385 hfi->footer = uc & 0xf;
1386
1387 /* archive no. + pos */
1388 memcpy(&pos, &htable[htable_pos], 8);
1389 htable_pos += 8;
1390 hfi->archive = *(char *)&pos;
1391 *(char *)&pos = 0;
1392 hfi->pos = be_int8(pos) + hf->hh.offset;
1393
1394 /* size */
1395 memcpy(&size, &htable[htable_pos], 4);
1396 htable_pos += 4;
1397 hfi->size = be_int4(size);
1398
1399 /* Add to the hash table */
1400 hi = HashItemCreate(h);
1401 hi->next = h->bucket[i];
1402 h->bucket[i] = hi;
1403 hi->key_len = c;
1404 hi->key = (char *)malloc(c+1);
1405 memcpy(hi->key, key, c);
1406 hi->key[c] = 0; /* For debugging convenience only */
1407 hi->data.p = hfi;
1408 }
1409 }
1410
1411 fflush(stderr);
1412 free(bucket_pos);
1413 free(htable);
1414
1415 return hf;
1416 }
1417
1418 /*
1419 * Searches the named HashFile for a specific key.
1420 * When found it returns the position and size of the object in pos and size.
1421 *
1422 * Returns
1423 * 0 on success (pos & size updated)
1424 * -1 on failure
1425 */
HashFileQuery(HashFile * hf,uint8_t * key,int key_len,HashFileItem * item)1426 int HashFileQuery(HashFile *hf, uint8_t *key, int key_len,
1427 HashFileItem *item) {
1428 uint64_t hval;
1429 uint32_t pos;
1430 int klen;
1431 int cur_offset = 0;
1432
1433 /* Hash 'key' to compute the bucket number */
1434 hval = hash64(hf->hh.hfunc, key, key_len) & (hf->hh.nbuckets-1);
1435
1436 /* Read the bucket to find the first linked list item location */
1437 if (-1 == fseeko(hf->hfp, hf->hf_start + 4*hval + hf->header_size,SEEK_SET))
1438 return -1;
1439 if (4 != fread(&pos, 1, 4, hf->hfp))
1440 return -1;
1441 pos = be_int4(pos);
1442 cur_offset = 4*hval + 4 + hf->header_size;
1443
1444 if (0 == pos)
1445 /* No bucket pos => key not present */
1446 return -1;
1447
1448 /* Jump to the HashItems list and look through for key */
1449 if (-1 == fseeko(hf->hfp, pos - cur_offset, SEEK_CUR))
1450 return -1;
1451
1452 for (klen = fgetc(hf->hfp); klen; klen = fgetc(hf->hfp)) {
1453 char k[256];
1454 unsigned char headfoot;
1455 uint64_t pos;
1456 uint32_t size;
1457
1458 if (1 != fread(k, klen, 1, hf->hfp))
1459 return -1;
1460 if (1 != fread(&headfoot, 1, 1, hf->hfp))
1461 return -1;
1462 item->header = (headfoot >> 4) & 0xf;
1463 item->footer = headfoot & 0xf;
1464 if (1 != fread(&pos, 8, 1, hf->hfp))
1465 return -1;
1466 item->archive = *(char *)&pos;
1467 *(char *)&pos = 0;
1468 pos = be_int8(pos) + hf->hh.offset;
1469 if (1 != fread(&size, 4, 1, hf->hfp))
1470 return -1;
1471 size = be_int4(size);
1472 if (klen == key_len && 0 == memcmp(key, k, key_len)) {
1473 item->pos = pos;
1474 item->size = size;
1475 return 0;
1476 }
1477 }
1478
1479 return -1;
1480 }
1481
HashFileCreate(int size,int options)1482 HashFile *HashFileCreate(int size, int options) {
1483 HashFile *hf;
1484
1485 if (NULL == (hf = (HashFile *)calloc(1, sizeof(*hf))))
1486 return NULL;
1487 if (NULL == (hf->h = HashTableCreate(size, options)))
1488 return NULL;
1489
1490 return hf;
1491 }
1492
HashFileDestroy(HashFile * hf)1493 void HashFileDestroy(HashFile *hf) {
1494 if (!hf)
1495 return;
1496
1497 if (hf->h)
1498 HashTableDestroy(hf->h, 1);
1499
1500 if (hf->narchives) {
1501 int i;
1502 for (i = 0; i < hf->narchives; i++)
1503 if (hf->archives[i])
1504 free(hf->archives[i]);
1505 free(hf->archives);
1506 }
1507
1508 if (hf->headers) {
1509 int i;
1510 for (i = 0; i < hf->nheaders; i++) {
1511 if (hf->headers[i].cached_data)
1512 free(hf->headers[i].cached_data);
1513 }
1514 free(hf->headers);
1515 }
1516
1517 if (hf->footers) {
1518 int i;
1519 for (i = 0; i < hf->nfooters; i++) {
1520 if (hf->footers[i].cached_data)
1521 free(hf->footers[i].cached_data);
1522 }
1523 free(hf->footers);
1524 }
1525
1526 if (hf->afp) {
1527 int i;
1528
1529 for (i = 0; i < hf->narchives; i++)
1530 if (hf->afp[i] && hf->afp[i] != hf->hfp)
1531 fclose(hf->afp[i]);
1532
1533 if (hf->afp != &hf->hfp)
1534 free(hf->afp);
1535 }
1536
1537 if (hf->hfp)
1538 fclose(hf->hfp);
1539
1540 free(hf);
1541 }
1542
1543
1544 /*
1545 * Opens a specific archive number.
1546 * Returns 0 on success,
1547 * -1 on failure
1548 */
HashFileOpenArchive(HashFile * hf,int archive_no)1549 static int HashFileOpenArchive(HashFile *hf, int archive_no) {
1550 if (hf->narchives && archive_no > hf->narchives)
1551 return -1;
1552
1553 if (hf->afp[archive_no])
1554 return 0;
1555
1556 if (NULL == (hf->afp[archive_no] = fopen(hf->archives[archive_no], "rb")))
1557 return -1;
1558
1559 return 0;
1560 }
1561
1562
1563 /*
1564 * Extracts the contents for a file out of the HashFile.
1565 */
HashFileExtract(HashFile * hf,char * fname,size_t * len)1566 char *HashFileExtract(HashFile *hf, char *fname, size_t *len) {
1567 HashFileItem hfi;
1568 size_t sz, pos;
1569 char *data;
1570 HashFileSection *head = NULL, *foot = NULL;
1571
1572 /* Find out if and where the item is in the archive */
1573 if (-1 == HashFileQuery(hf, (uint8_t *)fname, strlen(fname), &hfi))
1574 return NULL;
1575
1576 /* Work out the size including header/footer and allocate */
1577 sz = hfi.size;
1578 if (hfi.header) {
1579 head = &hf->headers[hfi.header-1];
1580 sz += head->size;
1581 }
1582 if (hfi.footer) {
1583 foot = &hf->footers[hfi.footer-1];
1584 sz += foot->size;
1585 }
1586 *len = sz;
1587
1588 if (NULL == (data = (char *)malloc(sz+1)))
1589 return NULL;
1590 data[sz] = 0;
1591
1592 /* Header */
1593 pos = 0;
1594 if (head) {
1595 HashFileOpenArchive(hf, head->archive_no);
1596 if (!hf->afp[head->archive_no])
1597 return NULL;
1598
1599 fseeko(hf->afp[head->archive_no], head->pos, SEEK_SET);
1600 if (1 != fread(&data[pos], head->size, 1, hf->afp[head->archive_no]))
1601 return NULL;
1602 pos += head->size;
1603 }
1604
1605 /* Main file */
1606 HashFileOpenArchive(hf, hfi.archive);
1607 if (!hf->afp[hfi.archive])
1608 return NULL;
1609
1610 fseeko(hf->afp[hfi.archive], hfi.pos, SEEK_SET);
1611 if (1 != fread(&data[pos], hfi.size, 1, hf->afp[hfi.archive]))
1612 return NULL;
1613 pos += hfi.size;
1614
1615 /* Footer */
1616 if (foot) {
1617 HashFileOpenArchive(hf, foot->archive_no);
1618 if (!hf->afp[foot->archive_no])
1619 return NULL;
1620
1621 fseeko(hf->afp[foot->archive_no], foot->pos, SEEK_SET);
1622 if (1 != fread(&data[pos], foot->size, 1, hf->afp[foot->archive_no]))
1623 return NULL;
1624 pos += foot->size;
1625 }
1626
1627 return data;
1628 }
1629
1630 /*
1631 * Iterates through members of a hash table returning items sequentially.
1632 *
1633 * Returns the next HashItem on success
1634 * NULL on failure.
1635 */
HashTableIterNext(HashTable * h,HashIter * iter)1636 HashItem *HashTableIterNext(HashTable *h, HashIter *iter) {
1637 do {
1638 if (iter->hi == NULL) {
1639 if (++iter->bnum >= h->nbuckets)
1640 break;
1641 iter->hi = h->bucket[iter->bnum];
1642 } else {
1643 iter->hi = iter->hi->next;
1644 }
1645 } while (!iter->hi);
1646
1647 return iter->hi;
1648 }
1649
HashTableIterReset(HashIter * iter)1650 void HashTableIterReset(HashIter *iter) {
1651 if (iter) {
1652 iter->bnum = -1;
1653 iter->hi = NULL;
1654 }
1655 }
1656
HashTableIterCreate(void)1657 HashIter *HashTableIterCreate(void) {
1658 HashIter *iter = (HashIter *)malloc(sizeof(*iter));
1659
1660 HashTableIterReset(iter);
1661 return iter;
1662 }
1663
HashTableIterDestroy(HashIter * iter)1664 void HashTableIterDestroy(HashIter *iter) {
1665 if (iter)
1666 free(iter);
1667 }
1668