1 #include<stdio.h>
2 #include<stdlib.h>
3 #include<string.h>
4 #include<math.h>
5 #include<time.h>
6 #include<assert.h>
7 #include<limits.h>
8 
9 #include "cmph_structs.h"
10 #include "chd_structs_ph.h"
11 #include "chd_ph.h"
12 #include"miller_rabin.h"
13 #include"bitbool.h"
14 
15 
16 //#define DEBUG
17 #include "debug.h"
18 
19 // NO_ELEMENT is equivalent to null pointer
20 #ifndef NO_ELEMENT
21 #define NO_ELEMENT UINT_MAX
22 #endif
23 
24 // struct used to represent items at mapping, ordering and searching phases
25 struct _chd_ph_item_t
26 {
27 	cmph_uint32 f;
28 	cmph_uint32 h;
29 };
30 typedef struct _chd_ph_item_t chd_ph_item_t;
31 
32 // struct to represent the items at mapping phase only.
33 struct _chd_ph_map_item_t
34 {
35 	cmph_uint32 f;
36 	cmph_uint32 h;
37 	cmph_uint32 bucket_num;
38 };
39 typedef struct _chd_ph_map_item_t chd_ph_map_item_t;
40 
41 // struct to represent a bucket
42 struct _chd_ph_bucket_t
43 {
44 	cmph_uint32 items_list; // offset
45 	union
46 	{
47 		cmph_uint32 size;
48 		cmph_uint32 bucket_id;
49 	};
50 };
51 
52 typedef struct _chd_ph_bucket_t chd_ph_bucket_t;
53 
54 struct _chd_ph_sorted_list_t
55 {
56 	cmph_uint32 buckets_list;
57 	cmph_uint32 size;
58 };
59 
60 typedef struct _chd_ph_sorted_list_t chd_ph_sorted_list_t;
61 
62 
63 static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets);
64 static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets);
65 static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets);
66 
chd_ph_bucket_new(cmph_uint32 nbuckets)67 chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets)
68 {
69     chd_ph_bucket_t * buckets = (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t));
70     return buckets;
71 }
72 
chd_ph_bucket_clean(chd_ph_bucket_t * buckets,cmph_uint32 nbuckets)73 void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets)
74 {
75 	register cmph_uint32 i = 0;
76 	assert(buckets);
77 	for(i = 0; i < nbuckets; i++)
78 		buckets[i].size = 0;
79 }
chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items,chd_ph_item_t * items,cmph_uint32 nbuckets,cmph_uint32 item_idx)80 static cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items, chd_ph_item_t * items,
81 				cmph_uint32 nbuckets,cmph_uint32 item_idx)
82 {
83 	register cmph_uint32 i = 0;
84 	register chd_ph_item_t * tmp_item;
85 	register chd_ph_map_item_t * tmp_map_item = map_items + item_idx;
86 	register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num;
87 	tmp_item = items + bucket->items_list;
88 
89 	for(i = 0; i < bucket->size; i++)
90 	{
91 		if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h)
92 		{
93 			DEBUGP("Item not added\n");
94 			return 0;
95 		};
96 		tmp_item++;
97 	};
98 	tmp_item->f = tmp_map_item->f;
99 	tmp_item->h = tmp_map_item->h;
100 	bucket->size++;
101 	return 1;
102 };
chd_ph_bucket_destroy(chd_ph_bucket_t * buckets)103 void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets)
104 {
105     free(buckets);
106 }
107 
108 static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items,
109 					cmph_uint32 *max_bucket_size);
110 
111 static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items,
112 				cmph_uint32 nbuckets,cmph_uint32 nitems, cmph_uint32 max_bucket_size);
113 
114 static cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
115 	cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table);
116 
chd_ph_space_lower_bound(cmph_uint32 _n,cmph_uint32 _r)117 static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r)
118 {
119 	double r = _r, n = _n;
120 	return (1 + (r/n - 1.0 + 1.0/(2.0*n))*log(1 - n/r))/log(2);
121 };
122 
123 /* computes the entropy of non empty buckets.*/
chd_ph_get_entropy(cmph_uint32 * disp_table,cmph_uint32 n,cmph_uint32 max_probes)124 static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes)
125 {
126 	register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32));
127 	register cmph_uint32 i;
128 	register double entropy = 0;
129 
130 	for(i = 0; i < n; i++)
131 	{
132 		probe_counts[disp_table[i]]++;
133 	};
134 
135 	for(i = 0; i < max_probes; i++)
136 	{
137 		if(probe_counts[i] > 0)
138 			entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2);
139 	};
140 	free(probe_counts);
141 	return entropy;
142 };
143 
chd_ph_config_new(void)144 chd_ph_config_data_t *chd_ph_config_new(void)
145 {
146 	chd_ph_config_data_t *chd_ph;
147 	chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t));
148         if (!chd_ph) return NULL;
149 	memset(chd_ph, 0, sizeof(chd_ph_config_data_t));
150 
151 	chd_ph->hashfunc = CMPH_HASH_JENKINS;
152 	chd_ph->cs = NULL;
153 	chd_ph->nbuckets = 0;
154 	chd_ph->n = 0;
155 	chd_ph->hl = NULL;
156 
157 	chd_ph->m = 0;
158 	chd_ph->use_h = 1;
159 	chd_ph->keys_per_bin = 1;
160 	chd_ph->keys_per_bucket = 4;
161 	chd_ph->occup_table = 0;
162 
163 	return chd_ph;
164 }
165 
chd_ph_config_destroy(cmph_config_t * mph)166 void chd_ph_config_destroy(cmph_config_t *mph)
167 {
168 	chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data;
169 	DEBUGP("Destroying algorithm dependent data\n");
170 	if(data->occup_table)
171 	{
172 		free(data->occup_table);
173 		data->occup_table = NULL;
174 	}
175 	free(data);
176 }
177 
178 
chd_ph_config_set_hashfuncs(cmph_config_t * mph,CMPH_HASH * hashfuncs)179 void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
180 {
181 	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
182 	CMPH_HASH *hashptr = hashfuncs;
183 	cmph_uint32 i = 0;
184 	while(*hashptr != CMPH_HASH_COUNT)
185 	{
186 		if (i >= 1) break; //chd_ph only uses one linear hash function
187 		chd_ph->hashfunc = *hashptr;
188 		++i, ++hashptr;
189 	}
190 }
191 
192 
chd_ph_config_set_b(cmph_config_t * mph,cmph_uint32 keys_per_bucket)193 void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
194 {
195 	assert(mph);
196 	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
197 	if(keys_per_bucket < 1 || keys_per_bucket >= 15)
198 	{
199 	    keys_per_bucket = 4;
200 	}
201 	chd_ph->keys_per_bucket = keys_per_bucket;
202 }
203 
204 
chd_ph_config_set_keys_per_bin(cmph_config_t * mph,cmph_uint32 keys_per_bin)205 void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
206 {
207 	assert(mph);
208 	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
209 	if(keys_per_bin <= 1 || keys_per_bin >= 128)
210 	{
211 	    keys_per_bin = 1;
212 	}
213 	chd_ph->keys_per_bin = keys_per_bin;
214 }
215 
chd_ph_mapping(cmph_config_t * mph,chd_ph_bucket_t * buckets,chd_ph_item_t * items,cmph_uint32 * max_bucket_size)216 cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size)
217 {
218 	register cmph_uint32 i = 0, g = 0;
219 	cmph_uint32 hl[3];
220 	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
221 	char * key = NULL;
222 	cmph_uint32 keylen = 0;
223 	chd_ph_map_item_t * map_item;
224 	chd_ph_map_item_t * map_items = (chd_ph_map_item_t *)malloc(chd_ph->m*sizeof(chd_ph_map_item_t));
225 	register cmph_uint32 mapping_iterations = 1000;
226 	*max_bucket_size = 0;
227 	while(1)
228 	{
229 		mapping_iterations--;
230 		if (chd_ph->hl) hash_state_destroy(chd_ph->hl);
231 		chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m);
232 
233 		chd_ph_bucket_clean(buckets, chd_ph->nbuckets);
234 
235 		mph->key_source->rewind(mph->key_source->data);
236 
237 		for(i = 0; i < chd_ph->m; i++)
238 		{
239 			mph->key_source->read(mph->key_source->data, &key, &keylen);
240 			hash_vector(chd_ph->hl, key, keylen, hl);
241 
242 			map_item = (map_items + i);
243 
244 			g = hl[0] % chd_ph->nbuckets;
245 			map_item->f = hl[1] % chd_ph->n;
246 			map_item->h = hl[2] % (chd_ph->n - 1) + 1;
247 			map_item->bucket_num=g;
248 			mph->key_source->dispose(mph->key_source->data, key, keylen);
249 // 			if(buckets[g].size == (chd_ph->keys_per_bucket << 2))
250 // 			{
251 // 				DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2));
252 // 				goto error;
253 // 			}
254 			buckets[g].size++;
255 			if(buckets[g].size > *max_bucket_size)
256 			{
257 				  *max_bucket_size = buckets[g].size;
258 			}
259 		}
260 		buckets[0].items_list = 0;
261 		for(i = 1; i < chd_ph->nbuckets; i++)
262 		{
263 			buckets[i].items_list = buckets[i-1].items_list + buckets[i - 1].size;
264 			buckets[i - 1].size = 0;
265 		};
266 		buckets[i - 1].size = 0;
267 		for(i = 0; i < chd_ph->m; i++)
268 		{
269 			map_item = (map_items + i);
270 			if(!chd_ph_bucket_insert(buckets, map_items, items, chd_ph->nbuckets, i))
271 				break;
272 		}
273 		if(i == chd_ph->m)
274 		{
275 			free(map_items);
276 			return 1; // SUCCESS
277 		}
278 
279 		if(mapping_iterations == 0)
280 		{
281 		      goto error;
282 		}
283 	}
284 error:
285 	free(map_items);
286 	hash_state_destroy(chd_ph->hl);
287 	chd_ph->hl = NULL;
288 	return 0; // FAILURE
289 }
290 
chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** _items,cmph_uint32 nbuckets,cmph_uint32 nitems,cmph_uint32 max_bucket_size)291 chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_t ** _items,
292 	cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size)
293 {
294 	chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t));
295 
296 	chd_ph_bucket_t * input_buckets = (*_buckets);
297 	chd_ph_bucket_t * output_buckets;
298 	chd_ph_item_t * input_items = (*_items);
299 	chd_ph_item_t * output_items;
300 	register cmph_uint32 i, j, bucket_size, position, position2;
301 // 	cmph_uint32 non_empty_buckets;
302 	DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size);
303 	// Determine size of each list of buckets
304 	for(i = 0; i < nbuckets; i++)
305 	{
306 		bucket_size = input_buckets[i].size;
307 		if(bucket_size == 0)
308 			continue;
309 		sorted_lists[bucket_size].size++;
310 	};
311 	sorted_lists[1].buckets_list = 0;
312 	// Determine final position of list of buckets into the contiguous array that will store all the buckets
313 	for(i = 2; i <= max_bucket_size; i++)
314 	{
315 		sorted_lists[i].buckets_list = sorted_lists[i-1].buckets_list + sorted_lists[i-1].size;
316 		sorted_lists[i-1].size = 0;
317 	};
318 	sorted_lists[i-1].size = 0;
319 	// Store the buckets in a new array which is sorted by bucket sizes
320 	output_buckets = (chd_ph_bucket_t *)calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero
321 //  	non_empty_buckets = nbuckets;
322 
323 	for(i = 0; i < nbuckets; i++)
324 	{
325 		bucket_size = input_buckets[i].size;
326 		if(bucket_size == 0)
327 		{
328 // 			non_empty_buckets--;
329 			continue;
330 		};
331 		position = sorted_lists[bucket_size].buckets_list + sorted_lists[bucket_size].size;
332 		output_buckets[position].bucket_id = i;
333 		output_buckets[position].items_list = input_buckets[i].items_list;
334 		sorted_lists[bucket_size].size++;
335 	};
336 /*	for(i = non_empty_buckets; i < nbuckets; i++)
337 		output_buckets[i].size=0;*/
338 	// Return the buckets sorted in new order and free the old buckets sorted in old order
339 	free(input_buckets);
340 	(*_buckets) = output_buckets;
341 
342 
343 	// Store the items according to the new order of buckets.
344 	output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t));
345 	position = 0;
346 	i = 0;
347 	for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
348 	{
349 		for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++)
350 		{
351 			position2 = output_buckets[i].items_list;
352 			output_buckets[i].items_list = position;
353 			for(j = 0; j < bucket_size; j++)
354 			{
355 				output_items[position].f = input_items[position2].f;
356 				output_items[position].h = input_items[position2].h;
357 				position++;
358 				position2++;
359 			};
360 		};
361 	};
362 	//Return the items sorted in new order and free the old items sorted in old order
363 	free(input_items);
364 	(*_items) = output_items;
365 	return sorted_lists;
366 };
367 
place_bucket_probe(chd_ph_config_data_t * chd_ph,chd_ph_bucket_t * buckets,chd_ph_item_t * items,cmph_uint32 probe0_num,cmph_uint32 probe1_num,cmph_uint32 bucket_num,cmph_uint32 size)368 static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets,
369 					    chd_ph_item_t *items, cmph_uint32 probe0_num, cmph_uint32 probe1_num,
370 					    cmph_uint32 bucket_num, cmph_uint32 size)
371 {
372 	register cmph_uint32 i;
373 	register chd_ph_item_t * item;
374 	register cmph_uint32 position;
375 
376 	item = items + buckets[bucket_num].items_list;
377 	// try place bucket with probe_num
378 	if(chd_ph->keys_per_bin > 1)
379 	{
380 		for(i = 0; i < size; i++) // placement
381 		{
382 			position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
383 			if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
384 			{
385 				break;
386 			}
387 			(chd_ph->occup_table[position])++;
388 			item++;
389 		};
390 	} else
391 	{
392 		for(i = 0; i < size; i++) // placement
393 		{
394 			position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
395 			if(GETBIT32(((cmph_uint32 *)chd_ph->occup_table), position))
396 			{
397 				break;
398 			}
399 			SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
400 			item++;
401 		};
402 	};
403 	if(i != size) // Undo the placement
404 	{
405 		item = items + buckets[bucket_num].items_list;
406 		if(chd_ph->keys_per_bin > 1)
407 		{
408 			while(1)
409 			{
410 				if(i == 0)
411 				{
412 					break;
413 				}
414 				position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
415 				(chd_ph->occup_table[position])--;
416 				item++;
417 				i--;
418 			};
419 		} else
420 		{
421 			while(1)
422 			{
423 				if(i == 0)
424 				{
425 					break;
426 				}
427 				position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
428 				UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
429 
430 // 				([position/32]^=(1<<(position%32));
431 				item++;
432 				i--;
433 			};
434 		};
435 		return 0;
436 	}
437 	return 1;
438 };
439 
place_bucket(chd_ph_config_data_t * chd_ph,chd_ph_bucket_t * buckets,chd_ph_item_t * items,cmph_uint32 max_probes,cmph_uint32 * disp_table,cmph_uint32 bucket_num,cmph_uint32 size)440 static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes,
441                                       cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size)
442 
443 {
444 	register cmph_uint32 probe0_num, probe1_num, probe_num;
445 	probe0_num = 0;
446 	probe1_num = 0;
447 	probe_num = 0;
448 
449 	while(1)
450 	{
451 		if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size))
452 		{
453 			disp_table[buckets[bucket_num].bucket_id] = probe0_num + probe1_num * chd_ph->n;
454 			return 1;
455 		}
456 		probe0_num++;
457 		if(probe0_num >= chd_ph->n)
458 		{
459 			probe0_num -= chd_ph->n;
460 			probe1_num++;
461 		};
462 		probe_num++;
463 		if(probe_num >= max_probes || probe1_num >= chd_ph->n)
464 		{
465 			return 0;
466 		};
467 	};
468 	return 0;
469 };
470 
place_buckets1(chd_ph_config_data_t * chd_ph,chd_ph_bucket_t * buckets,chd_ph_item_t * items,cmph_uint32 max_bucket_size,chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_probes,cmph_uint32 * disp_table)471 static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items,
472 					cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
473 					cmph_uint32 * disp_table)
474 {
475 	register cmph_uint32 i = 0;
476 	register cmph_uint32 curr_bucket = 0;
477 
478 	for(i = max_bucket_size; i > 0; i--)
479 	{
480 		curr_bucket = sorted_lists[i].buckets_list;
481 		while(curr_bucket < sorted_lists[i].size + sorted_lists[i].buckets_list)
482 		{
483 			if(!place_bucket(chd_ph, buckets, items, max_probes, disp_table, curr_bucket, i))
484 			{
485 				return 0;
486 			}
487 			curr_bucket++;
488 		};
489 	};
490 	return 1;
491 };
492 
place_buckets2(chd_ph_config_data_t * chd_ph,chd_ph_bucket_t * buckets,chd_ph_item_t * items,cmph_uint32 max_bucket_size,chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_probes,cmph_uint32 * disp_table)493 static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items,
494 					cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
495 					cmph_uint32 * disp_table)
496 {
497 	register cmph_uint32 i,j, non_placed_bucket;
498 	register cmph_uint32 curr_bucket;
499 	register cmph_uint32 probe_num, probe0_num, probe1_num;
500 	cmph_uint32 sorted_list_size;
501 #ifdef DEBUG
502 	cmph_uint32 items_list;
503 	cmph_uint32 bucket_id;
504 #endif
505 	DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n");
506 	for(i = max_bucket_size; i > 0; i--)
507 	{
508 		probe_num = 0;
509 		probe0_num = 0;
510 		probe1_num = 0;
511 		sorted_list_size = sorted_lists[i].size;
512 		while(sorted_lists[i].size != 0)
513 		{
514 			curr_bucket = sorted_lists[i].buckets_list;
515 			for(j = 0, non_placed_bucket = 0; j < sorted_lists[i].size; j++)
516 			{
517 				// if bucket is successfully placed remove it from list
518 				if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i))
519 				{
520 					disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n;
521 // 					DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]);
522 				}
523 				else
524 				{
525 // 					DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket);
526 #ifdef DEBUG
527 					items_list = buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list;
528 					bucket_id = buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id;
529 #endif
530 					buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list;
531 					buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id;
532 #ifdef DEBUG
533 					buckets[curr_bucket].items_list=items_list;
534 					buckets[curr_bucket].bucket_id=bucket_id;
535 #endif
536 					non_placed_bucket++;
537 				}
538 				curr_bucket++;
539 			};
540 			sorted_lists[i].size = non_placed_bucket;
541 			probe0_num++;
542 			if(probe0_num >= chd_ph->n)
543 			{
544 				probe0_num -= chd_ph->n;
545 				probe1_num++;
546 			};
547 			probe_num++;
548 			if(probe_num >= max_probes || probe1_num >= chd_ph->n)
549 			{
550 				sorted_lists[i].size = sorted_list_size;
551 				return 0;
552 			};
553 		};
554 		sorted_lists[i].size = sorted_list_size;
555 	};
556 	return 1;
557 };
558 
chd_ph_searching(chd_ph_config_data_t * chd_ph,chd_ph_bucket_t * buckets,chd_ph_item_t * items,cmph_uint32 max_bucket_size,chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_probes,cmph_uint32 * disp_table)559 cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
560 			    cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
561 			    cmph_uint32 * disp_table)
562 {
563 	if(chd_ph->use_h)
564 	{
565 		return place_buckets2(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
566 	}
567 	else
568 	{
569 		return place_buckets1(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
570 	}
571 
572 }
573 
chd_ph_check_bin_hashing(chd_ph_config_data_t * chd_ph,chd_ph_bucket_t * buckets,chd_ph_item_t * items,cmph_uint32 * disp_table,chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size)574 static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items,
575                                                   cmph_uint32 * disp_table, chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size)
576 {
577 	register cmph_uint32 bucket_size, i, j;
578 	register cmph_uint32 position, probe0_num, probe1_num;
579 	register cmph_uint32 m = 0;
580 	register chd_ph_item_t * item;
581 	if(chd_ph->keys_per_bin > 1)
582 		memset(chd_ph->occup_table, 0, chd_ph->n);
583 	else
584 		memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
585 
586 	for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
587 		for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size +
588 				sorted_lists[bucket_size].buckets_list; i++)
589 		{
590 			j = bucket_size;
591 			item = items + buckets[i].items_list;
592 			probe0_num = disp_table[buckets[i].bucket_id] % chd_ph->n;
593 			probe1_num = disp_table[buckets[i].bucket_id] / chd_ph->n;
594 			for(; j > 0; j--)
595 			{
596 				m++;
597 				position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
598 				if(chd_ph->keys_per_bin > 1)
599 				{
600 					if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
601 					{
602 						return 0;
603 					}
604 					(chd_ph->occup_table[position])++;
605 				}
606 				else
607 				{
608 					if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position))
609 					{
610 						return 0;
611 					}
612 					SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
613 				};
614 				item++;
615 			};
616 		};
617 	DEBUGP("We were able to place m = %u keys\n", m);
618 	return 1;
619 };
620 
621 
chd_ph_new(cmph_config_t * mph,double c)622 cmph_t *chd_ph_new(cmph_config_t *mph, double c)
623 {
624 	cmph_t *mphf = NULL;
625 	chd_ph_data_t *chd_phf = NULL;
626 	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
627 
628 	register double load_factor = c;
629 	register cmph_uint8 searching_success = 0;
630 	register cmph_uint32 max_probes = 1 << 20; // default value for max_probes
631 	register cmph_uint32 iterations = 100;
632 	chd_ph_bucket_t * buckets = NULL;
633 	chd_ph_item_t * items = NULL;
634 	register cmph_uint8 failure = 0;
635 	cmph_uint32 max_bucket_size = 0;
636 	chd_ph_sorted_list_t * sorted_lists = NULL;
637 	cmph_uint32 * disp_table = NULL;
638 	register double space_lower_bound = 0;
639 	#ifdef CMPH_TIMING
640 	double construction_time_begin = 0.0;
641 	double construction_time = 0.0;
642 	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
643 	#endif
644 
645 
646 	chd_ph->m = mph->key_source->nkeys;
647 	DEBUGP("m = %u\n", chd_ph->m);
648 
649 	chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1;
650 	DEBUGP("nbuckets = %u\n", chd_ph->nbuckets);
651 
652 	if(load_factor < 0.5 )
653 	{
654 		load_factor = 0.5;
655 	}
656 
657 	if(load_factor >= 0.99)
658 	{
659 		load_factor = 0.99;
660 	}
661 
662 	DEBUGP("load_factor = %.3f\n", load_factor);
663 
664 	chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1;
665 
666 	//Round the number of bins to the prime immediately above
667 	if(chd_ph->n % 2 == 0) chd_ph->n++;
668 	for(;;)
669 	{
670 		if(check_primality(chd_ph->n) == 1)
671 			break;
672 		chd_ph->n += 2; // just odd numbers can be primes for n > 2
673 
674 	};
675 
676 	DEBUGP("n = %u \n", chd_ph->n);
677 	if(chd_ph->keys_per_bin == 1)
678 	{
679 		space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n);
680 	}
681 
682 	if(mph->verbosity)
683 	{
684 		fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound);
685 	}
686 
687        	// We allocate the working tables
688 	buckets = chd_ph_bucket_new(chd_ph->nbuckets);
689 	items   = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));
690 
691 	max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes);
692 
693 	if(chd_ph->keys_per_bin == 1)
694 		chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));
695 	else
696 		chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
697 
698 	disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));
699 //
700 // 	init_genrand(time(0));
701 
702 	while(1)
703 	{
704 		iterations --;
705 		if (mph->verbosity)
706 		{
707 			fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n);
708 		}
709 
710 		if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size))
711 		{
712 			if (mph->verbosity)
713 			{
714 				fprintf(stderr, "Failure in mapping step\n");
715 			}
716 			failure = 1;
717 			goto cleanup;
718 		}
719 
720 		if (mph->verbosity)
721 		{
722 			fprintf(stderr, "Starting ordering step\n");
723 		}
724 		if(sorted_lists)
725 		{
726 			free(sorted_lists);
727 		}
728 
729         	sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size);
730 
731 		if (mph->verbosity)
732 		{
733 			fprintf(stderr, "Starting searching step\n");
734 		}
735 
736 		searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
737 		if(searching_success) break;
738 
739 		// reset occup_table
740 		if(chd_ph->keys_per_bin > 1)
741 			memset(chd_ph->occup_table, 0, chd_ph->n);
742 		else
743 			memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
744 		if(iterations == 0)
745 		{
746 			// Cleanup memory
747 			if (mph->verbosity)
748 			{
749 				fprintf(stderr, "Failure because the max trials was exceeded\n");
750 			}
751 			failure = 1;
752 			goto cleanup;
753 		};
754 	}
755 
756 	#ifdef DEBUG
757 	{
758 		if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size))
759 		{
760 
761 			DEBUGP("Error for bin packing generation");
762 			failure = 1;
763 			goto cleanup;
764 		}
765 	}
766 	#endif
767 
768 	if (mph->verbosity)
769 	{
770 		fprintf(stderr, "Starting compressing step\n");
771 	}
772 
773 	if(chd_ph->cs)
774 	{
775 		free(chd_ph->cs);
776 	}
777 	chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
778 	compressed_seq_init(chd_ph->cs);
779 	compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
780 
781 	#ifdef CMPH_TIMING
782 	ELAPSED_TIME_IN_SECONDS(&construction_time);
783 	register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
784 	DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m);
785 	#endif
786 
787 cleanup:
788 	chd_ph_bucket_destroy(buckets);
789 	free(items);
790 	free(sorted_lists);
791 	free(disp_table);
792 	if(failure)
793 	{
794 		if(chd_ph->hl)
795 		{
796 			hash_state_destroy(chd_ph->hl);
797 		}
798 		chd_ph->hl = NULL;
799 		return NULL;
800 	}
801 
802 	mphf = (cmph_t *)malloc(sizeof(cmph_t));
803 	mphf->algo = mph->algo;
804 	chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
805 
806 	chd_phf->cs = chd_ph->cs;
807 	chd_ph->cs = NULL; //transfer memory ownership
808 	chd_phf->hl = chd_ph->hl;
809 	chd_ph->hl = NULL; //transfer memory ownership
810 	chd_phf->n = chd_ph->n;
811 	chd_phf->nbuckets = chd_ph->nbuckets;
812 
813 	mphf->data = chd_phf;
814 	mphf->size = chd_ph->n;
815 
816 	DEBUGP("Successfully generated minimal perfect hash\n");
817 	if (mph->verbosity)
818 	{
819 		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
820 	}
821 
822 	#ifdef CMPH_TIMING
823 	register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8;
824 	construction_time = construction_time - construction_time_begin;
825 	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m);
826 	#endif
827 
828 	return mphf;
829 }
830 
831 
832 
chd_ph_load(FILE * fd,cmph_t * mphf)833 void chd_ph_load(FILE *fd, cmph_t *mphf)
834 {
835 	char *buf = NULL;
836 	cmph_uint32 buflen;
837 	register size_t nbytes;
838 	chd_ph_data_t *chd_ph = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
839 
840 	DEBUGP("Loading chd_ph mphf\n");
841 	mphf->data = chd_ph;
842 
843 	nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
844 	DEBUGP("Hash state has %u bytes\n", buflen);
845 	buf = (char *)malloc((size_t)buflen);
846 	nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
847 	chd_ph->hl = hash_state_load(buf, buflen);
848 	free(buf);
849 
850 	nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
851 	DEBUGP("Compressed sequence structure has %u bytes\n", buflen);
852 	buf = (char *)malloc((size_t)buflen);
853 	nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
854 	chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
855 	compressed_seq_load(chd_ph->cs, buf, buflen);
856 	free(buf);
857 
858 	// loading n and nbuckets
859 	DEBUGP("Reading n and nbuckets\n");
860 	nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd);
861 	nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
862 }
863 
chd_ph_dump(cmph_t * mphf,FILE * fd)864 int chd_ph_dump(cmph_t *mphf, FILE *fd)
865 {
866 	char *buf = NULL;
867 	cmph_uint32 buflen;
868 	register size_t nbytes;
869 	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
870 
871 	__cmph_dump(mphf, fd);
872 
873 	hash_state_dump(data->hl, &buf, &buflen);
874 	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
875 	nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
876 	nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
877 	free(buf);
878 
879 	compressed_seq_dump(data->cs, &buf, &buflen);
880 	DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", buflen);
881 	nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
882 	nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
883 	free(buf);
884 
885 	// dumping n and nbuckets
886 	nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
887 	nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
888 	return 1;
889 }
890 
chd_ph_destroy(cmph_t * mphf)891 void chd_ph_destroy(cmph_t *mphf)
892 {
893 	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
894 	compressed_seq_destroy(data->cs);
895 	free(data->cs);
896 	hash_state_destroy(data->hl);
897 	free(data);
898 	free(mphf);
899 
900 }
901 
chd_ph_search(cmph_t * mphf,const char * key,cmph_uint32 keylen)902 cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
903 {
904 	register chd_ph_data_t * chd_ph = (chd_ph_data_t *)mphf->data;
905 	cmph_uint32 hl[3];
906 	register cmph_uint32 disp,position;
907 	register cmph_uint32 probe0_num,probe1_num;
908 	register cmph_uint32 f,g,h;
909 	hash_vector(chd_ph->hl, key, keylen, hl);
910 	g = hl[0] % chd_ph->nbuckets;
911 	f = hl[1] % chd_ph->n;
912 	h = hl[2] % (chd_ph->n-1) + 1;
913 
914 	disp = compressed_seq_query(chd_ph->cs, g);
915 	probe0_num = disp % chd_ph->n;
916 	probe1_num = disp/chd_ph->n;
917 	position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % chd_ph->n);
918 	return position;
919 }
920 
chd_ph_pack(cmph_t * mphf,void * packed_mphf)921 void chd_ph_pack(cmph_t *mphf, void *packed_mphf)
922 {
923 	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
924 	cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf;
925 
926 	// packing hl type
927 	CMPH_HASH hl_type = hash_get_type(data->hl);
928 	*((cmph_uint32 *) ptr) = hl_type;
929 	ptr += sizeof(cmph_uint32);
930 
931 	// packing hl
932 	hash_state_pack(data->hl, ptr);
933 	ptr += hash_state_packed_size(hl_type);
934 
935 	// packing n
936 	*((cmph_uint32 *) ptr) = data->n;
937 	ptr += sizeof(data->n);
938 
939 	// packing nbuckets
940 	*((cmph_uint32 *) ptr) = data->nbuckets;
941 	ptr += sizeof(data->nbuckets);
942 
943 	// packing cs
944 	compressed_seq_pack(data->cs, ptr);
945 	//ptr += compressed_seq_packed_size(data->cs);
946 
947 }
948 
chd_ph_packed_size(cmph_t * mphf)949 cmph_uint32 chd_ph_packed_size(cmph_t *mphf)
950 {
951 	register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
952 	register CMPH_HASH hl_type = hash_get_type(data->hl);
953 	register cmph_uint32 hash_state_pack_size =  hash_state_packed_size(hl_type);
954 	register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs);
955 
956 	return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32));
957 
958 }
959 
chd_ph_search_packed(void * packed_mphf,const char * key,cmph_uint32 keylen)960 cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
961 {
962 	register CMPH_HASH hl_type  = (CMPH_HASH)*(cmph_uint32 *)packed_mphf;
963 	register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4;
964 
965 	register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type));
966 	register cmph_uint32 n = *ptr++;
967 	register cmph_uint32 nbuckets = *ptr++;
968 	cmph_uint32 hl[3];
969 
970 	register cmph_uint32 disp,position;
971 	register cmph_uint32 probe0_num,probe1_num;
972 	register cmph_uint32 f,g,h;
973 
974 	hash_vector_packed(hl_ptr, hl_type, key, keylen, hl);
975 
976 	g = hl[0] % nbuckets;
977 	f = hl[1] % n;
978 	h = hl[2] % (n-1) + 1;
979 
980 	disp = compressed_seq_query_packed(ptr, g);
981 	probe0_num = disp % n;
982 	probe1_num = disp/n;
983 	position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n);
984 	return position;
985 }
986