1 #include "graph.h"
2 #include "chm.h"
3 #include "cmph_structs.h"
4 #include "chm_structs.h"
5 #include "hash.h"
6 #include "bitbool.h"
7 
8 #include <math.h>
9 #include <stdlib.h>
10 #include <stdio.h>
11 #include <assert.h>
12 #include <string.h>
13 
14 //#define DEBUG
15 #include "debug.h"
16 
17 static int chm_gen_edges(cmph_config_t *mph);
18 static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v);
19 
chm_config_new(void)20 chm_config_data_t *chm_config_new(void)
21 {
22 	chm_config_data_t *chm = NULL;
23 	chm = (chm_config_data_t *)malloc(sizeof(chm_config_data_t));
24         if (!chm) return NULL;
25 	memset(chm, 0, sizeof(chm_config_data_t));
26 	chm->hashfuncs[0] = CMPH_HASH_JENKINS;
27 	chm->hashfuncs[1] = CMPH_HASH_JENKINS;
28 	chm->g = NULL;
29 	chm->graph = NULL;
30 	chm->hashes = NULL;
31 	return chm;
32 }
chm_config_destroy(cmph_config_t * mph)33 void chm_config_destroy(cmph_config_t *mph)
34 {
35 	chm_config_data_t *data = (chm_config_data_t *)mph->data;
36 	DEBUGP("Destroying algorithm dependent data\n");
37 	free(data);
38 }
39 
chm_config_set_hashfuncs(cmph_config_t * mph,CMPH_HASH * hashfuncs)40 void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
41 {
42 	chm_config_data_t *chm = (chm_config_data_t *)mph->data;
43 	CMPH_HASH *hashptr = hashfuncs;
44 	cmph_uint32 i = 0;
45 	while(*hashptr != CMPH_HASH_COUNT)
46 	{
47 		if (i >= 2) break; //chm only uses two hash functions
48 		chm->hashfuncs[i] = *hashptr;
49 		++i, ++hashptr;
50 	}
51 }
52 
chm_new(cmph_config_t * mph,double c)53 cmph_t *chm_new(cmph_config_t *mph, double c)
54 {
55 	cmph_t *mphf = NULL;
56 	chm_data_t *chmf = NULL;
57 
58 	cmph_uint32 i;
59 	cmph_uint32 iterations = 20;
60 	cmph_uint8 *visited = NULL;
61 	chm_config_data_t *chm = (chm_config_data_t *)mph->data;
62 	chm->m = mph->key_source->nkeys;
63 	if (c == 0) c = 2.09;
64 	chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys);
65 	DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c);
66 	chm->graph = graph_new(chm->n, chm->m);
67 	DEBUGP("Created graph\n");
68 
69 	chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
70 	for(i = 0; i < 3; ++i) chm->hashes[i] = NULL;
71 	//Mapping step
72 	if (mph->verbosity)
73 	{
74 		fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", chm->m, chm->n);
75 	}
76 	while(1)
77 	{
78 		int ok;
79 		chm->hashes[0] = hash_state_new(chm->hashfuncs[0], chm->n);
80 		chm->hashes[1] = hash_state_new(chm->hashfuncs[1], chm->n);
81 		ok = chm_gen_edges(mph);
82 		if (!ok)
83 		{
84 			--iterations;
85 			hash_state_destroy(chm->hashes[0]);
86 			chm->hashes[0] = NULL;
87 			hash_state_destroy(chm->hashes[1]);
88 			chm->hashes[1] = NULL;
89 			DEBUGP("%u iterations remaining\n", iterations);
90 			if (mph->verbosity)
91 			{
92 				fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations);
93 			}
94 			if (iterations == 0) break;
95 		}
96 		else break;
97 	}
98 	if (iterations == 0)
99 	{
100 		graph_destroy(chm->graph);
101 		return NULL;
102 	}
103 
104 	//Assignment step
105 	if (mph->verbosity)
106 	{
107 		fprintf(stderr, "Starting assignment step\n");
108 	}
109 	DEBUGP("Assignment step\n");
110  	visited = (cmph_uint8 *)malloc((size_t)(chm->n/8 + 1));
111 	memset(visited, 0, (size_t)(chm->n/8 + 1));
112 	free(chm->g);
113 	chm->g = (cmph_uint32 *)malloc(chm->n * sizeof(cmph_uint32));
114 	assert(chm->g);
115 	for (i = 0; i < chm->n; ++i)
116 	{
117 	        if (!GETBIT(visited,i))
118 		{
119 			chm->g[i] = 0;
120 			chm_traverse(chm, visited, i);
121 		}
122 	}
123 	graph_destroy(chm->graph);
124 	free(visited);
125 	chm->graph = NULL;
126 
127 	mphf = (cmph_t *)malloc(sizeof(cmph_t));
128 	mphf->algo = mph->algo;
129 	chmf = (chm_data_t *)malloc(sizeof(chm_data_t));
130 	chmf->g = chm->g;
131 	chm->g = NULL; //transfer memory ownership
132 	chmf->hashes = chm->hashes;
133 	chm->hashes = NULL; //transfer memory ownership
134 	chmf->n = chm->n;
135 	chmf->m = chm->m;
136 	mphf->data = chmf;
137 	mphf->size = chm->m;
138 	DEBUGP("Successfully generated minimal perfect hash\n");
139 	if (mph->verbosity)
140 	{
141 		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
142 	}
143 	return mphf;
144 }
145 
chm_traverse(chm_config_data_t * chm,cmph_uint8 * visited,cmph_uint32 v)146 static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v)
147 {
148 
149 	graph_iterator_t it = graph_neighbors_it(chm->graph, v);
150 	cmph_uint32 neighbor = 0;
151 	SETBIT(visited,v);
152 
153 	DEBUGP("Visiting vertex %u\n", v);
154 	while((neighbor = graph_next_neighbor(chm->graph, &it)) != GRAPH_NO_NEIGHBOR)
155 	{
156 		DEBUGP("Visiting neighbor %u\n", neighbor);
157 		if(GETBIT(visited,neighbor)) continue;
158 		DEBUGP("Visiting neighbor %u\n", neighbor);
159 		DEBUGP("Visiting edge %u->%u with id %u\n", v, neighbor, graph_edge_id(chm->graph, v, neighbor));
160 		chm->g[neighbor] = graph_edge_id(chm->graph, v, neighbor) - chm->g[v];
161 		DEBUGP("g is %u (%u - %u mod %u)\n", chm->g[neighbor], graph_edge_id(chm->graph, v, neighbor), chm->g[v], chm->m);
162 		chm_traverse(chm, visited, neighbor);
163 	}
164 }
165 
chm_gen_edges(cmph_config_t * mph)166 static int chm_gen_edges(cmph_config_t *mph)
167 {
168 	cmph_uint32 e;
169 	chm_config_data_t *chm = (chm_config_data_t *)mph->data;
170 	int cycles = 0;
171 
172 	DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", chm->n, cmph_hash_names[chm->hashfuncs[0]], cmph_hash_names[chm->hashfuncs[1]]);
173 	graph_clear_edges(chm->graph);
174 	mph->key_source->rewind(mph->key_source->data);
175 	for (e = 0; e < mph->key_source->nkeys; ++e)
176 	{
177 		cmph_uint32 h1, h2;
178 		cmph_uint32 keylen;
179 		char *key;
180 		mph->key_source->read(mph->key_source->data, &key, &keylen);
181 		h1 = hash(chm->hashes[0], key, keylen) % chm->n;
182 		h2 = hash(chm->hashes[1], key, keylen) % chm->n;
183 		if (h1 == h2) if (++h2 >= chm->n) h2 = 0;
184 		if (h1 == h2)
185 		{
186 			if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e);
187 			mph->key_source->dispose(mph->key_source->data, key, keylen);
188 			return 0;
189 		}
190 		DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key);
191 		mph->key_source->dispose(mph->key_source->data, key, keylen);
192 		graph_add_edge(chm->graph, h1, h2);
193 	}
194 	cycles = graph_is_cyclic(chm->graph);
195 	if (mph->verbosity && cycles) fprintf(stderr, "Cyclic graph generated\n");
196 	DEBUGP("Looking for cycles: %u\n", cycles);
197 
198 	return ! cycles;
199 }
200 
chm_dump(cmph_t * mphf,FILE * fd)201 int chm_dump(cmph_t *mphf, FILE *fd)
202 {
203 	char *buf = NULL;
204 	cmph_uint32 buflen;
205 	cmph_uint32 two = 2; //number of hash functions
206 	chm_data_t *data = (chm_data_t *)mphf->data;
207 	register size_t nbytes;
208 
209 	__cmph_dump(mphf, fd);
210 
211 	nbytes = fwrite(&two, sizeof(cmph_uint32), (size_t)1, fd);
212 	hash_state_dump(data->hashes[0], &buf, &buflen);
213 	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
214 	nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
215 	nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
216 	free(buf);
217 
218 	hash_state_dump(data->hashes[1], &buf, &buflen);
219 	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
220 	nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
221 	nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
222 	free(buf);
223 
224 	nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
225 	nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd);
226 
227 	nbytes = fwrite(data->g, sizeof(cmph_uint32)*data->n, (size_t)1, fd);
228 /*	#ifdef DEBUG
229 	fprintf(stderr, "G: ");
230 	for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
231 	fprintf(stderr, "\n");
232 	#endif*/
233 	return 1;
234 }
235 
chm_load(FILE * f,cmph_t * mphf)236 void chm_load(FILE *f, cmph_t *mphf)
237 {
238 	cmph_uint32 nhashes;
239 	char *buf = NULL;
240 	cmph_uint32 buflen;
241 	cmph_uint32 i;
242 	chm_data_t *chm = (chm_data_t *)malloc(sizeof(chm_data_t));
243 	register size_t nbytes;
244 	DEBUGP("Loading chm mphf\n");
245 	mphf->data = chm;
246 	nbytes = fread(&nhashes, sizeof(cmph_uint32), (size_t)1, f);
247 	chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1));
248 	chm->hashes[nhashes] = NULL;
249 	DEBUGP("Reading %u hashes\n", nhashes);
250 	for (i = 0; i < nhashes; ++i)
251 	{
252 		hash_state_t *state = NULL;
253 		nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
254 		DEBUGP("Hash state has %u bytes\n", buflen);
255 		buf = (char *)malloc((size_t)buflen);
256 		nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
257 		state = hash_state_load(buf, buflen);
258 		chm->hashes[i] = state;
259 		free(buf);
260 	}
261 
262 	DEBUGP("Reading m and n\n");
263 	nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f);
264 	nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f);
265 
266 	chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n);
267 	nbytes = fread(chm->g, chm->n*sizeof(cmph_uint32), (size_t)1, f);
268 	#ifdef DEBUG
269 	fprintf(stderr, "G: ");
270 	for (i = 0; i < chm->n; ++i) fprintf(stderr, "%u ", chm->g[i]);
271 	fprintf(stderr, "\n");
272 	#endif
273 	return;
274 }
275 
276 
chm_search(cmph_t * mphf,const char * key,cmph_uint32 keylen)277 cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
278 {
279 	chm_data_t *chm = (chm_data_t *)mphf->data;
280 	cmph_uint32 h1 = hash(chm->hashes[0], key, keylen) % chm->n;
281 	cmph_uint32 h2 = hash(chm->hashes[1], key, keylen) % chm->n;
282 	DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
283 	if (h1 == h2 && ++h2 >= chm->n) h2 = 0;
284 	DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, chm->g[h1], chm->g[h2], chm->m);
285 	return (chm->g[h1] + chm->g[h2]) % chm->m;
286 }
chm_destroy(cmph_t * mphf)287 void chm_destroy(cmph_t *mphf)
288 {
289 	chm_data_t *data = (chm_data_t *)mphf->data;
290 	free(data->g);
291 	hash_state_destroy(data->hashes[0]);
292 	hash_state_destroy(data->hashes[1]);
293 	free(data->hashes);
294 	free(data);
295 	free(mphf);
296 }
297 
298 /** \fn void chm_pack(cmph_t *mphf, void *packed_mphf);
299  *  \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
300  *  \param mphf pointer to the resulting mphf
301  *  \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
302  */
chm_pack(cmph_t * mphf,void * packed_mphf)303 void chm_pack(cmph_t *mphf, void *packed_mphf)
304 {
305 	chm_data_t *data = (chm_data_t *)mphf->data;
306 	cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf;
307 
308 	// packing h1 type
309 	CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
310 	*((cmph_uint32 *) ptr) = h1_type;
311 	ptr += sizeof(cmph_uint32);
312 
313 	// packing h1
314 	hash_state_pack(data->hashes[0], ptr);
315 	ptr += hash_state_packed_size(h1_type);
316 
317 	// packing h2 type
318 	CMPH_HASH h2_type = hash_get_type(data->hashes[1]);
319 	*((cmph_uint32 *) ptr) = h2_type;
320 	ptr += sizeof(cmph_uint32);
321 
322 	// packing h2
323 	hash_state_pack(data->hashes[1], ptr);
324 	ptr += hash_state_packed_size(h2_type);
325 
326 	// packing n
327 	*((cmph_uint32 *) ptr) = data->n;
328 	ptr += sizeof(data->n);
329 
330 	// packing m
331 	*((cmph_uint32 *) ptr) = data->m;
332 	ptr += sizeof(data->m);
333 
334 	// packing g
335 	memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n);
336 }
337 
338 /** \fn cmph_uint32 chm_packed_size(cmph_t *mphf);
339  *  \brief Return the amount of space needed to pack mphf.
340  *  \param mphf pointer to a mphf
341  *  \return the size of the packed function or zero for failures
342  */
chm_packed_size(cmph_t * mphf)343 cmph_uint32 chm_packed_size(cmph_t *mphf)
344 {
345 	chm_data_t *data = (chm_data_t *)mphf->data;
346 	CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
347 	CMPH_HASH h2_type = hash_get_type(data->hashes[1]);
348 
349 	return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) +
350 			4*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n);
351 }
352 
353 /** cmph_uint32 chm_search(void *packed_mphf, const char *key, cmph_uint32 keylen);
354  *  \brief Use the packed mphf to do a search.
355  *  \param  packed_mphf pointer to the packed mphf
356  *  \param key key to be hashed
357  *  \param keylen key legth in bytes
358  *  \return The mphf value
359  */
chm_search_packed(void * packed_mphf,const char * key,cmph_uint32 keylen)360 cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
361 {
362 	register cmph_uint8 *h1_ptr = (cmph_uint8 *)packed_mphf;
363 	register CMPH_HASH h1_type  = (CMPH_HASH)(*((cmph_uint32 *)h1_ptr));
364 	h1_ptr += 4;
365 
366 	register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
367 	register CMPH_HASH h2_type  = (CMPH_HASH)(*((cmph_uint32 *)h2_ptr));
368 	h2_ptr += 4;
369 
370 	register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type));
371 
372 	register cmph_uint32 n = *g_ptr++;
373 	register cmph_uint32 m = *g_ptr++;
374 
375 	register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n;
376 	register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n;
377 	DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
378 	if (h1 == h2 && ++h2 >= n) h2 = 0;
379 	DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m);
380 	return (g_ptr[h1] + g_ptr[h2]) % m;
381 }
382