1 /*
2  * Copyright (c) 2005-2010, 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef _HASH_TABLE_H_
35 #define _HASH_TABLE_H_
36 
37 #include <inttypes.h>
38 #include <sys/types.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 
42 #include "io_lib/pooled_alloc.h"
43 
44 #ifdef __cplusplus
45 extern "C" {
46 #endif
47 
48 /* The data referenced by the hash table */
49 typedef union {
50     uint64_t i;
51     uint32_t i32[2];
52     float f;
53     double d;
54     void *p;
55 } HashData;
56 
57 /* A hash item with "next" pointer to use in a linked list */
58 typedef struct HashItemStruct {
59     HashData data;        /* user defined data attached to this key */
60     char    *key;         /* key we hashed on */
61     int      key_len;     /* and its length */
62     struct HashItemStruct *next;
63 } HashItem;
64 
65 /* The main hash table structure itself */
66 typedef struct {
67     int       options;  /* HASH_FUNC & HASH_OPT macros */
68     uint32_t  nbuckets; /* Number of hash buckets; power of 2 */
69     uint32_t  mask;	/* bit-mask equiv of nbuckets */
70     int       nused;    /* How many hash entries we're storing */
71     HashItem **bucket;  /* The bucket "list heads" themselves */
72     pool_alloc_t *hi_pool; /* Pool of allocated HashItem structs */
73 } HashTable;
74 
75 /* An iterator on HashTable items */
76 typedef struct {
77     int bnum;
78     HashItem *hi;
79 } HashIter;
80 
81 #define HASHFILE_MAGIC ".hsh"
82 #define HASHFILE_VERSION100 "1.00"
83 #define HASHFILE_VERSION "1.01"
84 #define HASHFILE_PREPEND -1
85 
86 /* File format: the hash table header */
87 typedef struct {
88     char magic[4];
89     char vers[4];
90     char hfunc;
91     unsigned char nheaders;
92     unsigned char nfooters;
93     unsigned char narchives;
94     uint32_t nbuckets;
95     int64_t offset;
96     uint32_t size;
97 } HashFileHeader;
98 
99 /* sizeof(HashFileHeader) minus terminal padding */
100 #define HHSIZE 28
101 
102 typedef struct {
103     char magic[4];
104     char offset[8];
105 } HashFileFooter;
106 
107 /* The data block attached to the hash table */
108 typedef struct {
109     uint64_t pos;
110     uint32_t size;
111     unsigned char archive;
112     unsigned char header; /* zero if not set */
113     unsigned char footer; /* zero if not set */
114 } HashFileItem;
115 
116 /* Common headers or footers to prepend to the archive contents */
117 typedef struct {
118     unsigned char archive_no;
119     uint64_t pos;
120     uint32_t size;
121     unsigned char *cached_data;
122 } HashFileSection;
123 
124 /*
125  * The main structure for the HashFile functions.
126  *
127  * We obtain an existing HashFile by opening a stored hash file or by
128  * loading the entire thing.
129  * New empty ones can be created using HashFileCreate.
130  */
131 typedef struct {
132     HashFileHeader hh;		/* on-disk file header */
133     HashTable *h;		/* the in-memory hash table */
134     int nheaders;		/* number of common file headers */
135     HashFileSection *headers;	/* on-disk common file headers struct */
136     int nfooters;		/* number of common file footers */
137     HashFileSection *footers;	/* on-disk common file footers struct */
138     int narchives;		/* number of archive files, 0 if inline file */
139     char **archives;		/* archive filenames */
140     FILE *hfp;			/* hash FILE */
141     FILE **afp;			/* archive FILE(s) */
142     int header_size;		/* size of header + filename + N(head/feet) */
143     off_t hf_start;		/* location of HashFile header in file */
144 } HashFile;
145 
146 /* Functions to to use HashTable.options */
147 #define HASH_FUNC_HSIEH       0
148 #define HASH_FUNC_TCL         1
149 #define HASH_FUNC_JENKINS     2
150 #define HASH_FUNC_JENKINS3    3
151 #define HASH_FUNC_MASK        7
152 
153 /* Other HashTable.options values */
154 #define HASH_NONVOLATILE_KEYS (1<<3)
155 #define HASH_ALLOW_DUP_KEYS   (1<<4)
156 #define HASH_DYNAMIC_SIZE     (1<<5)
157 #define HASH_OWN_KEYS	      (1<<6)
158 #define HASH_POOL_ITEMS       (1<<7)
159 #define HASH_INT_KEYS 	      (1<<8)
160 
161 /* Hashing prototypes */
162 uint32_t hash(int func, uint8_t *key, int key_len);
163 uint64_t hash64(int func, uint8_t *key, int key_len);
164 uint32_t HashJenkins(uint8_t *k, int length);
165 uint32_t HashTcl(uint8_t *data, int len);
166 uint32_t HashHsieh(uint8_t *k, int length);
167 
168 /* HashTable management prototypes */
169 HashTable *HashTableCreate(int size, int options);
170 void HashTableDestroy(HashTable *h, int deallocate_date);
171 int HashTableResize(HashTable *h, int newsize);
172 HashItem *HashTableAdd(HashTable *h, char *key, int key_len,
173 		       HashData data, int *added);
174 int HashTableDel(HashTable *h, HashItem *hi, int deallocate_data);
175 int HashTableRemove(HashTable *h, char *key, int key_len, int deallocate_data);
176 HashItem *HashTableSearch(HashTable *h, char *key, int key_len);
177 HashItem *HashTableNext(HashItem *hi, char *key, int key_len);
178 HashItem *HashTableNextInt(HashItem *hi, char *key, int key_len);
179 
180 void HashTableStats(HashTable *h, FILE *fp);
181 void HashTableDump(HashTable *h, FILE *fp, char *prefix);
182 
183 /* Iterator prototypes */
184 HashIter *HashTableIterCreate(void);
185 void HashTableIterDestroy(HashIter *iter);
186 HashItem *HashTableIterNext(HashTable *h, HashIter *iter);
187 void HashTableIterReset(HashIter *iter);
188 
189 /* HashFile prototypes */
190 uint64_t HashFileSave(HashFile *hf, FILE *fp, int64_t offset);
191 HashFile *HashFileLoad(FILE *fp);
192 int HashFileQuery(HashFile *hf, uint8_t *key, int key_len, HashFileItem *item);
193 char *HashFileExtract(HashFile *hf, char *fname, size_t *len);
194 
195 
196 HashFile *HashFileCreate(int size, int options);
197 void HashFileDestroy(HashFile *hf);
198 HashFile *HashFileOpen(char *fname);
199 HashFile *HashFileFopen(FILE *fp);
200 
201 #ifdef __cplusplus
202 }
203 #endif
204 
205 #endif /* _HASH_TABLE_H_ */
206