1 #include "graph.h"
2 #include "chm.h"
3 #include "cmph_structs.h"
4 #include "chm_structs.h"
5 #include "hash.h"
6 #include "bitbool.h"
7
8 #include <math.h>
9 #include <stdlib.h>
10 #include <stdio.h>
11 #include <assert.h>
12 #include <string.h>
13
14 //#define DEBUG
15 #include "debug.h"
16
17 static int chm_gen_edges(cmph_config_t *mph);
18 static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v);
19
chm_config_new(void)20 chm_config_data_t *chm_config_new(void)
21 {
22 chm_config_data_t *chm = NULL;
23 chm = (chm_config_data_t *)malloc(sizeof(chm_config_data_t));
24 if (!chm) return NULL;
25 memset(chm, 0, sizeof(chm_config_data_t));
26 chm->hashfuncs[0] = CMPH_HASH_JENKINS;
27 chm->hashfuncs[1] = CMPH_HASH_JENKINS;
28 chm->g = NULL;
29 chm->graph = NULL;
30 chm->hashes = NULL;
31 return chm;
32 }
chm_config_destroy(cmph_config_t * mph)33 void chm_config_destroy(cmph_config_t *mph)
34 {
35 chm_config_data_t *data = (chm_config_data_t *)mph->data;
36 DEBUGP("Destroying algorithm dependent data\n");
37 free(data);
38 }
39
chm_config_set_hashfuncs(cmph_config_t * mph,CMPH_HASH * hashfuncs)40 void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
41 {
42 chm_config_data_t *chm = (chm_config_data_t *)mph->data;
43 CMPH_HASH *hashptr = hashfuncs;
44 cmph_uint32 i = 0;
45 while(*hashptr != CMPH_HASH_COUNT)
46 {
47 if (i >= 2) break; //chm only uses two hash functions
48 chm->hashfuncs[i] = *hashptr;
49 ++i, ++hashptr;
50 }
51 }
52
chm_new(cmph_config_t * mph,double c)53 cmph_t *chm_new(cmph_config_t *mph, double c)
54 {
55 cmph_t *mphf = NULL;
56 chm_data_t *chmf = NULL;
57
58 cmph_uint32 i;
59 cmph_uint32 iterations = 20;
60 cmph_uint8 *visited = NULL;
61 chm_config_data_t *chm = (chm_config_data_t *)mph->data;
62 chm->m = mph->key_source->nkeys;
63 if (c == 0) c = 2.09;
64 chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys);
65 DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c);
66 chm->graph = graph_new(chm->n, chm->m);
67 DEBUGP("Created graph\n");
68
69 chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
70 for(i = 0; i < 3; ++i) chm->hashes[i] = NULL;
71 //Mapping step
72 if (mph->verbosity)
73 {
74 fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", chm->m, chm->n);
75 }
76 while(1)
77 {
78 int ok;
79 chm->hashes[0] = hash_state_new(chm->hashfuncs[0], chm->n);
80 chm->hashes[1] = hash_state_new(chm->hashfuncs[1], chm->n);
81 ok = chm_gen_edges(mph);
82 if (!ok)
83 {
84 --iterations;
85 hash_state_destroy(chm->hashes[0]);
86 chm->hashes[0] = NULL;
87 hash_state_destroy(chm->hashes[1]);
88 chm->hashes[1] = NULL;
89 DEBUGP("%u iterations remaining\n", iterations);
90 if (mph->verbosity)
91 {
92 fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations);
93 }
94 if (iterations == 0) break;
95 }
96 else break;
97 }
98 if (iterations == 0)
99 {
100 graph_destroy(chm->graph);
101 return NULL;
102 }
103
104 //Assignment step
105 if (mph->verbosity)
106 {
107 fprintf(stderr, "Starting assignment step\n");
108 }
109 DEBUGP("Assignment step\n");
110 visited = (cmph_uint8 *)malloc((size_t)(chm->n/8 + 1));
111 memset(visited, 0, (size_t)(chm->n/8 + 1));
112 free(chm->g);
113 chm->g = (cmph_uint32 *)malloc(chm->n * sizeof(cmph_uint32));
114 assert(chm->g);
115 for (i = 0; i < chm->n; ++i)
116 {
117 if (!GETBIT(visited,i))
118 {
119 chm->g[i] = 0;
120 chm_traverse(chm, visited, i);
121 }
122 }
123 graph_destroy(chm->graph);
124 free(visited);
125 chm->graph = NULL;
126
127 mphf = (cmph_t *)malloc(sizeof(cmph_t));
128 mphf->algo = mph->algo;
129 chmf = (chm_data_t *)malloc(sizeof(chm_data_t));
130 chmf->g = chm->g;
131 chm->g = NULL; //transfer memory ownership
132 chmf->hashes = chm->hashes;
133 chm->hashes = NULL; //transfer memory ownership
134 chmf->n = chm->n;
135 chmf->m = chm->m;
136 mphf->data = chmf;
137 mphf->size = chm->m;
138 DEBUGP("Successfully generated minimal perfect hash\n");
139 if (mph->verbosity)
140 {
141 fprintf(stderr, "Successfully generated minimal perfect hash function\n");
142 }
143 return mphf;
144 }
145
chm_traverse(chm_config_data_t * chm,cmph_uint8 * visited,cmph_uint32 v)146 static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v)
147 {
148
149 graph_iterator_t it = graph_neighbors_it(chm->graph, v);
150 cmph_uint32 neighbor = 0;
151 SETBIT(visited,v);
152
153 DEBUGP("Visiting vertex %u\n", v);
154 while((neighbor = graph_next_neighbor(chm->graph, &it)) != GRAPH_NO_NEIGHBOR)
155 {
156 DEBUGP("Visiting neighbor %u\n", neighbor);
157 if(GETBIT(visited,neighbor)) continue;
158 DEBUGP("Visiting neighbor %u\n", neighbor);
159 DEBUGP("Visiting edge %u->%u with id %u\n", v, neighbor, graph_edge_id(chm->graph, v, neighbor));
160 chm->g[neighbor] = graph_edge_id(chm->graph, v, neighbor) - chm->g[v];
161 DEBUGP("g is %u (%u - %u mod %u)\n", chm->g[neighbor], graph_edge_id(chm->graph, v, neighbor), chm->g[v], chm->m);
162 chm_traverse(chm, visited, neighbor);
163 }
164 }
165
chm_gen_edges(cmph_config_t * mph)166 static int chm_gen_edges(cmph_config_t *mph)
167 {
168 cmph_uint32 e;
169 chm_config_data_t *chm = (chm_config_data_t *)mph->data;
170 int cycles = 0;
171
172 DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", chm->n, cmph_hash_names[chm->hashfuncs[0]], cmph_hash_names[chm->hashfuncs[1]]);
173 graph_clear_edges(chm->graph);
174 mph->key_source->rewind(mph->key_source->data);
175 for (e = 0; e < mph->key_source->nkeys; ++e)
176 {
177 cmph_uint32 h1, h2;
178 cmph_uint32 keylen;
179 char *key;
180 mph->key_source->read(mph->key_source->data, &key, &keylen);
181 h1 = hash(chm->hashes[0], key, keylen) % chm->n;
182 h2 = hash(chm->hashes[1], key, keylen) % chm->n;
183 if (h1 == h2) if (++h2 >= chm->n) h2 = 0;
184 if (h1 == h2)
185 {
186 if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e);
187 mph->key_source->dispose(mph->key_source->data, key, keylen);
188 return 0;
189 }
190 DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key);
191 mph->key_source->dispose(mph->key_source->data, key, keylen);
192 graph_add_edge(chm->graph, h1, h2);
193 }
194 cycles = graph_is_cyclic(chm->graph);
195 if (mph->verbosity && cycles) fprintf(stderr, "Cyclic graph generated\n");
196 DEBUGP("Looking for cycles: %u\n", cycles);
197
198 return ! cycles;
199 }
200
chm_dump(cmph_t * mphf,FILE * fd)201 int chm_dump(cmph_t *mphf, FILE *fd)
202 {
203 char *buf = NULL;
204 cmph_uint32 buflen;
205 cmph_uint32 two = 2; //number of hash functions
206 chm_data_t *data = (chm_data_t *)mphf->data;
207 register size_t nbytes;
208
209 __cmph_dump(mphf, fd);
210
211 nbytes = fwrite(&two, sizeof(cmph_uint32), (size_t)1, fd);
212 hash_state_dump(data->hashes[0], &buf, &buflen);
213 DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
214 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
215 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
216 free(buf);
217
218 hash_state_dump(data->hashes[1], &buf, &buflen);
219 DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
220 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
221 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
222 free(buf);
223
224 nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
225 nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd);
226
227 nbytes = fwrite(data->g, sizeof(cmph_uint32)*data->n, (size_t)1, fd);
228 /* #ifdef DEBUG
229 fprintf(stderr, "G: ");
230 for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
231 fprintf(stderr, "\n");
232 #endif*/
233 return 1;
234 }
235
chm_load(FILE * f,cmph_t * mphf)236 void chm_load(FILE *f, cmph_t *mphf)
237 {
238 cmph_uint32 nhashes;
239 char *buf = NULL;
240 cmph_uint32 buflen;
241 cmph_uint32 i;
242 chm_data_t *chm = (chm_data_t *)malloc(sizeof(chm_data_t));
243 register size_t nbytes;
244 DEBUGP("Loading chm mphf\n");
245 mphf->data = chm;
246 nbytes = fread(&nhashes, sizeof(cmph_uint32), (size_t)1, f);
247 chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1));
248 chm->hashes[nhashes] = NULL;
249 DEBUGP("Reading %u hashes\n", nhashes);
250 for (i = 0; i < nhashes; ++i)
251 {
252 hash_state_t *state = NULL;
253 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
254 DEBUGP("Hash state has %u bytes\n", buflen);
255 buf = (char *)malloc((size_t)buflen);
256 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
257 state = hash_state_load(buf, buflen);
258 chm->hashes[i] = state;
259 free(buf);
260 }
261
262 DEBUGP("Reading m and n\n");
263 nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f);
264 nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f);
265
266 chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n);
267 nbytes = fread(chm->g, chm->n*sizeof(cmph_uint32), (size_t)1, f);
268 #ifdef DEBUG
269 fprintf(stderr, "G: ");
270 for (i = 0; i < chm->n; ++i) fprintf(stderr, "%u ", chm->g[i]);
271 fprintf(stderr, "\n");
272 #endif
273 return;
274 }
275
276
chm_search(cmph_t * mphf,const char * key,cmph_uint32 keylen)277 cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
278 {
279 chm_data_t *chm = (chm_data_t *)mphf->data;
280 cmph_uint32 h1 = hash(chm->hashes[0], key, keylen) % chm->n;
281 cmph_uint32 h2 = hash(chm->hashes[1], key, keylen) % chm->n;
282 DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
283 if (h1 == h2 && ++h2 >= chm->n) h2 = 0;
284 DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, chm->g[h1], chm->g[h2], chm->m);
285 return (chm->g[h1] + chm->g[h2]) % chm->m;
286 }
chm_destroy(cmph_t * mphf)287 void chm_destroy(cmph_t *mphf)
288 {
289 chm_data_t *data = (chm_data_t *)mphf->data;
290 free(data->g);
291 hash_state_destroy(data->hashes[0]);
292 hash_state_destroy(data->hashes[1]);
293 free(data->hashes);
294 free(data);
295 free(mphf);
296 }
297
298 /** \fn void chm_pack(cmph_t *mphf, void *packed_mphf);
299 * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
300 * \param mphf pointer to the resulting mphf
301 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
302 */
chm_pack(cmph_t * mphf,void * packed_mphf)303 void chm_pack(cmph_t *mphf, void *packed_mphf)
304 {
305 chm_data_t *data = (chm_data_t *)mphf->data;
306 cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf;
307
308 // packing h1 type
309 CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
310 *((cmph_uint32 *) ptr) = h1_type;
311 ptr += sizeof(cmph_uint32);
312
313 // packing h1
314 hash_state_pack(data->hashes[0], ptr);
315 ptr += hash_state_packed_size(h1_type);
316
317 // packing h2 type
318 CMPH_HASH h2_type = hash_get_type(data->hashes[1]);
319 *((cmph_uint32 *) ptr) = h2_type;
320 ptr += sizeof(cmph_uint32);
321
322 // packing h2
323 hash_state_pack(data->hashes[1], ptr);
324 ptr += hash_state_packed_size(h2_type);
325
326 // packing n
327 *((cmph_uint32 *) ptr) = data->n;
328 ptr += sizeof(data->n);
329
330 // packing m
331 *((cmph_uint32 *) ptr) = data->m;
332 ptr += sizeof(data->m);
333
334 // packing g
335 memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n);
336 }
337
338 /** \fn cmph_uint32 chm_packed_size(cmph_t *mphf);
339 * \brief Return the amount of space needed to pack mphf.
340 * \param mphf pointer to a mphf
341 * \return the size of the packed function or zero for failures
342 */
chm_packed_size(cmph_t * mphf)343 cmph_uint32 chm_packed_size(cmph_t *mphf)
344 {
345 chm_data_t *data = (chm_data_t *)mphf->data;
346 CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
347 CMPH_HASH h2_type = hash_get_type(data->hashes[1]);
348
349 return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) +
350 4*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n);
351 }
352
353 /** cmph_uint32 chm_search(void *packed_mphf, const char *key, cmph_uint32 keylen);
354 * \brief Use the packed mphf to do a search.
355 * \param packed_mphf pointer to the packed mphf
356 * \param key key to be hashed
357 * \param keylen key legth in bytes
358 * \return The mphf value
359 */
chm_search_packed(void * packed_mphf,const char * key,cmph_uint32 keylen)360 cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
361 {
362 register cmph_uint8 *h1_ptr = (cmph_uint8 *)packed_mphf;
363 register CMPH_HASH h1_type = (CMPH_HASH)(*((cmph_uint32 *)h1_ptr));
364 h1_ptr += 4;
365
366 register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
367 register CMPH_HASH h2_type = (CMPH_HASH)(*((cmph_uint32 *)h2_ptr));
368 h2_ptr += 4;
369
370 register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type));
371
372 register cmph_uint32 n = *g_ptr++;
373 register cmph_uint32 m = *g_ptr++;
374
375 register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n;
376 register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n;
377 DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
378 if (h1 == h2 && ++h2 >= n) h2 = 0;
379 DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m);
380 return (g_ptr[h1] + g_ptr[h2]) % m;
381 }
382