1 /* 2 * Copyright (c) 2005-2010, 2013 Genome Research Ltd. 3 * Author(s): James Bonfield 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following 13 * disclaimer in the documentation and/or other materials provided 14 * with the distribution. 15 * 16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 17 * Institute nor the names of its contributors may be used to endorse 18 * or promote products derived from this software without specific 19 * prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS 22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH 25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #ifndef _HASH_TABLE_H_ 35 #define _HASH_TABLE_H_ 36 37 #include <inttypes.h> 38 #include <sys/types.h> 39 #include <stdio.h> 40 #include <stdlib.h> 41 42 #include "io_lib/pooled_alloc.h" 43 44 #ifdef __cplusplus 45 extern "C" { 46 #endif 47 48 /* The data referenced by the hash table */ 49 typedef union { 50 uint64_t i; 51 uint32_t i32[2]; 52 float f; 53 double d; 54 void *p; 55 } HashData; 56 57 /* A hash item with "next" pointer to use in a linked list */ 58 typedef struct HashItemStruct { 59 HashData data; /* user defined data attached to this key */ 60 char *key; /* key we hashed on */ 61 int key_len; /* and its length */ 62 struct HashItemStruct *next; 63 } HashItem; 64 65 /* The main hash table structure itself */ 66 typedef struct { 67 int options; /* HASH_FUNC & HASH_OPT macros */ 68 uint32_t nbuckets; /* Number of hash buckets; power of 2 */ 69 uint32_t mask; /* bit-mask equiv of nbuckets */ 70 int nused; /* How many hash entries we're storing */ 71 HashItem **bucket; /* The bucket "list heads" themselves */ 72 pool_alloc_t *hi_pool; /* Pool of allocated HashItem structs */ 73 } HashTable; 74 75 /* An iterator on HashTable items */ 76 typedef struct { 77 int bnum; 78 HashItem *hi; 79 } HashIter; 80 81 #define HASHFILE_MAGIC ".hsh" 82 #define HASHFILE_VERSION100 "1.00" 83 #define HASHFILE_VERSION "1.01" 84 #define HASHFILE_PREPEND -1 85 86 /* File format: the hash table header */ 87 typedef struct { 88 char magic[4]; 89 char vers[4]; 90 char hfunc; 91 unsigned char nheaders; 92 unsigned char nfooters; 93 unsigned char narchives; 94 uint32_t nbuckets; 95 int64_t offset; 96 uint32_t size; 97 } HashFileHeader; 98 99 /* sizeof(HashFileHeader) minus terminal padding */ 100 #define HHSIZE 28 101 102 typedef struct { 103 char magic[4]; 104 char offset[8]; 105 } HashFileFooter; 106 107 /* The data block attached to the hash table */ 108 typedef struct { 109 uint64_t pos; 110 uint32_t size; 111 unsigned char archive; 112 unsigned char header; /* zero if not set */ 113 unsigned char footer; /* zero if not set */ 114 } HashFileItem; 115 116 /* Common headers or footers to prepend to the archive contents */ 117 typedef struct { 118 unsigned char archive_no; 119 uint64_t pos; 120 uint32_t size; 121 unsigned char *cached_data; 122 } HashFileSection; 123 124 /* 125 * The main structure for the HashFile functions. 126 * 127 * We obtain an existing HashFile by opening a stored hash file or by 128 * loading the entire thing. 129 * New empty ones can be created using HashFileCreate. 130 */ 131 typedef struct { 132 HashFileHeader hh; /* on-disk file header */ 133 HashTable *h; /* the in-memory hash table */ 134 int nheaders; /* number of common file headers */ 135 HashFileSection *headers; /* on-disk common file headers struct */ 136 int nfooters; /* number of common file footers */ 137 HashFileSection *footers; /* on-disk common file footers struct */ 138 int narchives; /* number of archive files, 0 if inline file */ 139 char **archives; /* archive filenames */ 140 FILE *hfp; /* hash FILE */ 141 FILE **afp; /* archive FILE(s) */ 142 int header_size; /* size of header + filename + N(head/feet) */ 143 off_t hf_start; /* location of HashFile header in file */ 144 } HashFile; 145 146 /* Functions to to use HashTable.options */ 147 #define HASH_FUNC_HSIEH 0 148 #define HASH_FUNC_TCL 1 149 #define HASH_FUNC_JENKINS 2 150 #define HASH_FUNC_JENKINS3 3 151 #define HASH_FUNC_MASK 7 152 153 /* Other HashTable.options values */ 154 #define HASH_NONVOLATILE_KEYS (1<<3) 155 #define HASH_ALLOW_DUP_KEYS (1<<4) 156 #define HASH_DYNAMIC_SIZE (1<<5) 157 #define HASH_OWN_KEYS (1<<6) 158 #define HASH_POOL_ITEMS (1<<7) 159 #define HASH_INT_KEYS (1<<8) 160 161 /* Hashing prototypes */ 162 uint32_t hash(int func, uint8_t *key, int key_len); 163 uint64_t hash64(int func, uint8_t *key, int key_len); 164 uint32_t HashJenkins(uint8_t *k, int length); 165 uint32_t HashTcl(uint8_t *data, int len); 166 uint32_t HashHsieh(uint8_t *k, int length); 167 168 /* HashTable management prototypes */ 169 HashTable *HashTableCreate(int size, int options); 170 void HashTableDestroy(HashTable *h, int deallocate_date); 171 int HashTableResize(HashTable *h, int newsize); 172 HashItem *HashTableAdd(HashTable *h, char *key, int key_len, 173 HashData data, int *added); 174 int HashTableDel(HashTable *h, HashItem *hi, int deallocate_data); 175 int HashTableRemove(HashTable *h, char *key, int key_len, int deallocate_data); 176 HashItem *HashTableSearch(HashTable *h, char *key, int key_len); 177 HashItem *HashTableNext(HashItem *hi, char *key, int key_len); 178 HashItem *HashTableNextInt(HashItem *hi, char *key, int key_len); 179 180 void HashTableStats(HashTable *h, FILE *fp); 181 void HashTableDump(HashTable *h, FILE *fp, char *prefix); 182 183 /* Iterator prototypes */ 184 HashIter *HashTableIterCreate(void); 185 void HashTableIterDestroy(HashIter *iter); 186 HashItem *HashTableIterNext(HashTable *h, HashIter *iter); 187 void HashTableIterReset(HashIter *iter); 188 189 /* HashFile prototypes */ 190 uint64_t HashFileSave(HashFile *hf, FILE *fp, int64_t offset); 191 HashFile *HashFileLoad(FILE *fp); 192 int HashFileQuery(HashFile *hf, uint8_t *key, int key_len, HashFileItem *item); 193 char *HashFileExtract(HashFile *hf, char *fname, size_t *len); 194 195 196 HashFile *HashFileCreate(int size, int options); 197 void HashFileDestroy(HashFile *hf); 198 HashFile *HashFileOpen(char *fname); 199 HashFile *HashFileFopen(FILE *fp); 200 201 #ifdef __cplusplus 202 } 203 #endif 204 205 #endif /* _HASH_TABLE_H_ */ 206