1 #include "graph.h"
2 #include "fch.h"
3 #include "fch_structs.h"
4 #include "bmz8.h"
5 #include "bmz8_structs.h"
6 #include "brz.h"
7 #include "cmph_structs.h"
8 #include "brz_structs.h"
9 #include "buffer_manager.h"
10 #include "cmph.h"
11 #include "hash.h"
12 #include "bitbool.h"
13 #include <math.h>
14 #include <stdlib.h>
15 #include <stdio.h>
16 #include <assert.h>
17 #include <string.h>
18 #define MAX_BUCKET_SIZE 255
19 //#define DEBUG
20 #include "debug.h"
21
22 static int brz_gen_mphf(cmph_config_t *mph);
23 static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n);
24 static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys);
25 static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen);
26 static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen);
brz_config_new(void)27 brz_config_data_t *brz_config_new(void)
28 {
29 brz_config_data_t *brz = NULL;
30 brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t));
31 if (!brz) return NULL;
32 brz->algo = CMPH_FCH;
33 brz->b = 128;
34 brz->hashfuncs[0] = CMPH_HASH_JENKINS;
35 brz->hashfuncs[1] = CMPH_HASH_JENKINS;
36 brz->hashfuncs[2] = CMPH_HASH_JENKINS;
37 brz->size = NULL;
38 brz->offset = NULL;
39 brz->g = NULL;
40 brz->h1 = NULL;
41 brz->h2 = NULL;
42 brz->h0 = NULL;
43 brz->memory_availability = 1024*1024;
44 brz->tmp_dir = (cmph_uint8 *)calloc((size_t)10, sizeof(cmph_uint8));
45 brz->mphf_fd = NULL;
46 strcpy((char *)(brz->tmp_dir), "/var/tmp/");
47 assert(brz);
48 return brz;
49 }
50
brz_config_destroy(cmph_config_t * mph)51 void brz_config_destroy(cmph_config_t *mph)
52 {
53 brz_config_data_t *data = (brz_config_data_t *)mph->data;
54 free(data->tmp_dir);
55 DEBUGP("Destroying algorithm dependent data\n");
56 free(data);
57 }
58
brz_config_set_hashfuncs(cmph_config_t * mph,CMPH_HASH * hashfuncs)59 void brz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
60 {
61 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
62 CMPH_HASH *hashptr = hashfuncs;
63 cmph_uint32 i = 0;
64 while(*hashptr != CMPH_HASH_COUNT)
65 {
66 if (i >= 3) break; //brz only uses three hash functions
67 brz->hashfuncs[i] = *hashptr;
68 ++i, ++hashptr;
69 }
70 }
71
brz_config_set_memory_availability(cmph_config_t * mph,cmph_uint32 memory_availability)72 void brz_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability)
73 {
74 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
75 if(memory_availability > 0) brz->memory_availability = memory_availability*1024*1024;
76 }
77
brz_config_set_tmp_dir(cmph_config_t * mph,cmph_uint8 * tmp_dir)78 void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir)
79 {
80 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
81 if(tmp_dir)
82 {
83 size_t len = strlen((char *)tmp_dir);
84 free(brz->tmp_dir);
85 if(tmp_dir[len-1] != '/')
86 {
87 brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+2, sizeof(cmph_uint8));
88 sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir);
89 }
90 else
91 {
92 brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+1, sizeof(cmph_uint8));
93 sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir);
94 }
95
96 }
97 }
98
brz_config_set_mphf_fd(cmph_config_t * mph,FILE * mphf_fd)99 void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd)
100 {
101 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
102 brz->mphf_fd = mphf_fd;
103 assert(brz->mphf_fd);
104 }
105
brz_config_set_b(cmph_config_t * mph,cmph_uint32 b)106 void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b)
107 {
108 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
109 if(b <= 64 || b >= 175)
110 {
111 b = 128;
112 }
113 brz->b = (cmph_uint8)b;
114 }
115
brz_config_set_algo(cmph_config_t * mph,CMPH_ALGO algo)116 void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo)
117 {
118 if (algo == CMPH_BMZ8 || algo == CMPH_FCH) // supported algorithms
119 {
120 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
121 brz->algo = algo;
122 }
123 }
124
brz_new(cmph_config_t * mph,double c)125 cmph_t *brz_new(cmph_config_t *mph, double c)
126 {
127 cmph_t *mphf = NULL;
128 brz_data_t *brzf = NULL;
129 cmph_uint32 i;
130 cmph_uint32 iterations = 20;
131
132 DEBUGP("c: %f\n", c);
133 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
134 switch(brz->algo) // validating restrictions over parameter c.
135 {
136 case CMPH_BMZ8:
137 if (c == 0 || c >= 2.0) c = 1;
138 break;
139 case CMPH_FCH:
140 if (c <= 2.0) c = 2.6;
141 break;
142 default:
143 assert(0);
144 }
145 brz->c = c;
146 brz->m = mph->key_source->nkeys;
147 DEBUGP("m: %u\n", brz->m);
148 brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b));
149 DEBUGP("k: %u\n", brz->k);
150 brz->size = (cmph_uint8 *) calloc((size_t)brz->k, sizeof(cmph_uint8));
151
152 // Clustering the keys by graph id.
153 if (mph->verbosity)
154 {
155 fprintf(stderr, "Partioning the set of keys.\n");
156 }
157
158 while(1)
159 {
160 int ok;
161 DEBUGP("hash function 3\n");
162 brz->h0 = hash_state_new(brz->hashfuncs[2], brz->k);
163 DEBUGP("Generating graphs\n");
164 ok = brz_gen_mphf(mph);
165 if (!ok)
166 {
167 --iterations;
168 hash_state_destroy(brz->h0);
169 brz->h0 = NULL;
170 DEBUGP("%u iterations remaining to create the graphs in a external file\n", iterations);
171 if (mph->verbosity)
172 {
173 fprintf(stderr, "Failure: A graph with more than 255 keys was created - %u iterations remaining\n", iterations);
174 }
175 if (iterations == 0) break;
176 }
177 else break;
178 }
179 if (iterations == 0)
180 {
181 DEBUGP("Graphs with more than 255 keys were created in all 20 iterations\n");
182 free(brz->size);
183 return NULL;
184 }
185 DEBUGP("Graphs generated\n");
186
187 brz->offset = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32));
188 for (i = 1; i < brz->k; ++i)
189 {
190 brz->offset[i] = brz->size[i-1] + brz->offset[i-1];
191 }
192 // Generating a mphf
193 mphf = (cmph_t *)malloc(sizeof(cmph_t));
194 mphf->algo = mph->algo;
195 brzf = (brz_data_t *)malloc(sizeof(brz_data_t));
196 brzf->g = brz->g;
197 brz->g = NULL; //transfer memory ownership
198 brzf->h1 = brz->h1;
199 brz->h1 = NULL; //transfer memory ownership
200 brzf->h2 = brz->h2;
201 brz->h2 = NULL; //transfer memory ownership
202 brzf->h0 = brz->h0;
203 brz->h0 = NULL; //transfer memory ownership
204 brzf->size = brz->size;
205 brz->size = NULL; //transfer memory ownership
206 brzf->offset = brz->offset;
207 brz->offset = NULL; //transfer memory ownership
208 brzf->k = brz->k;
209 brzf->c = brz->c;
210 brzf->m = brz->m;
211 brzf->algo = brz->algo;
212 mphf->data = brzf;
213 mphf->size = brz->m;
214 DEBUGP("Successfully generated minimal perfect hash\n");
215 if (mph->verbosity)
216 {
217 fprintf(stderr, "Successfully generated minimal perfect hash function\n");
218 }
219 return mphf;
220 }
221
brz_gen_mphf(cmph_config_t * mph)222 static int brz_gen_mphf(cmph_config_t *mph)
223 {
224 cmph_uint32 i, e, error;
225 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
226 cmph_uint32 memory_usage = 0;
227 cmph_uint32 nkeys_in_buffer = 0;
228 cmph_uint8 *buffer = (cmph_uint8 *)malloc((size_t)brz->memory_availability);
229 cmph_uint32 *buckets_size = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32));
230 cmph_uint32 *keys_index = NULL;
231 cmph_uint8 **buffer_merge = NULL;
232 cmph_uint32 *buffer_h0 = NULL;
233 cmph_uint32 nflushes = 0;
234 cmph_uint32 h0;
235 register size_t nbytes;
236 FILE * tmp_fd = NULL;
237 buffer_manager_t * buff_manager = NULL;
238 char *filename = NULL;
239 char *key = NULL;
240 cmph_uint32 keylen;
241 cmph_uint32 cur_bucket = 0;
242 cmph_uint8 nkeys_vd = 0;
243 cmph_uint8 ** keys_vd = NULL;
244
245 mph->key_source->rewind(mph->key_source->data);
246 DEBUGP("Generating graphs from %u keys\n", brz->m);
247 // Partitioning
248 for (e = 0; e < brz->m; ++e)
249 {
250 mph->key_source->read(mph->key_source->data, &key, &keylen);
251
252 /* Buffers management */
253 if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers
254 {
255 if(mph->verbosity)
256 {
257 fprintf(stderr, "Flushing %u\n", nkeys_in_buffer);
258 }
259 cmph_uint32 value = buckets_size[0];
260 cmph_uint32 sum = 0;
261 cmph_uint32 keylen1 = 0;
262 buckets_size[0] = 0;
263 for(i = 1; i < brz->k; i++)
264 {
265 if(buckets_size[i] == 0) continue;
266 sum += value;
267 value = buckets_size[i];
268 buckets_size[i] = sum;
269
270 }
271 memory_usage = 0;
272 keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32));
273 for(i = 0; i < nkeys_in_buffer; i++)
274 {
275 memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1));
276 h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k;
277 keys_index[buckets_size[h0]] = memory_usage;
278 buckets_size[h0]++;
279 memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1);
280 }
281 filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
282 sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes);
283 tmp_fd = fopen(filename, "wb");
284 free(filename);
285 filename = NULL;
286 for(i = 0; i < nkeys_in_buffer; i++)
287 {
288 memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1));
289 nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd);
290 }
291 nkeys_in_buffer = 0;
292 memory_usage = 0;
293 memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32));
294 nflushes++;
295 free(keys_index);
296 fclose(tmp_fd);
297 }
298 memcpy(buffer + memory_usage, &keylen, sizeof(keylen));
299 memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen);
300 memory_usage += keylen + (cmph_uint32)sizeof(keylen);
301 h0 = hash(brz->h0, key, keylen) % brz->k;
302
303 if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0])))
304 {
305 free(buffer);
306 free(buckets_size);
307 return 0;
308 }
309 brz->size[h0] = (cmph_uint8)(brz->size[h0] + 1U);
310 buckets_size[h0] ++;
311 nkeys_in_buffer++;
312 mph->key_source->dispose(mph->key_source->data, key, keylen);
313 }
314 if (memory_usage != 0) // flush buffers
315 {
316 if(mph->verbosity)
317 {
318 fprintf(stderr, "Flushing %u\n", nkeys_in_buffer);
319 }
320 cmph_uint32 value = buckets_size[0];
321 cmph_uint32 sum = 0;
322 cmph_uint32 keylen1 = 0;
323 buckets_size[0] = 0;
324 for(i = 1; i < brz->k; i++)
325 {
326 if(buckets_size[i] == 0) continue;
327 sum += value;
328 value = buckets_size[i];
329 buckets_size[i] = sum;
330 }
331 memory_usage = 0;
332 keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32));
333 for(i = 0; i < nkeys_in_buffer; i++)
334 {
335 memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1));
336 h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k;
337 keys_index[buckets_size[h0]] = memory_usage;
338 buckets_size[h0]++;
339 memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1);
340 }
341 filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
342 sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes);
343 tmp_fd = fopen(filename, "wb");
344 free(filename);
345 filename = NULL;
346 for(i = 0; i < nkeys_in_buffer; i++)
347 {
348 memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1));
349 nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd);
350 }
351 nkeys_in_buffer = 0;
352 memory_usage = 0;
353 memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32));
354 nflushes++;
355 free(keys_index);
356 fclose(tmp_fd);
357 }
358
359 free(buffer);
360 free(buckets_size);
361 if(nflushes > 1024) return 0; // Too many files generated.
362 // mphf generation
363 if(mph->verbosity)
364 {
365 fprintf(stderr, "\nMPHF generation \n");
366 }
367 /* Starting to dump to disk the resultant MPHF: __cmph_dump function */
368 nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd);
369 nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd);
370 nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd);
371 nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd);
372 nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs
373 nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd);
374
375 //tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *));
376 buff_manager = buffer_manager_new(brz->memory_availability, nflushes);
377 buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *));
378 buffer_h0 = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32));
379
380 memory_usage = 0;
381 for(i = 0; i < nflushes; i++)
382 {
383 filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
384 sprintf(filename, "%s%u.cmph",brz->tmp_dir, i);
385 buffer_manager_open(buff_manager, i, filename);
386 free(filename);
387 filename = NULL;
388 key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
389 h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k;
390 buffer_h0[i] = h0;
391 buffer_merge[i] = (cmph_uint8 *)key;
392 key = NULL; //transfer memory ownership
393 }
394 e = 0;
395 keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *));
396 nkeys_vd = 0;
397 error = 0;
398 while(e < brz->m)
399 {
400 i = brz_min_index(buffer_h0, nflushes);
401 cur_bucket = buffer_h0[i];
402 key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
403 if(key)
404 {
405 while(key)
406 {
407 //keylen = strlen(key);
408 h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k;
409 if (h0 != buffer_h0[i]) break;
410 keys_vd[nkeys_vd++] = (cmph_uint8 *)key;
411 key = NULL; //transfer memory ownership
412 e++;
413 key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
414 }
415 if (key)
416 {
417 assert(nkeys_vd < brz->size[cur_bucket]);
418 keys_vd[nkeys_vd++] = buffer_merge[i];
419 buffer_merge[i] = NULL; //transfer memory ownership
420 e++;
421 buffer_h0[i] = h0;
422 buffer_merge[i] = (cmph_uint8 *)key;
423 }
424 }
425 if(!key)
426 {
427 assert(nkeys_vd < brz->size[cur_bucket]);
428 keys_vd[nkeys_vd++] = buffer_merge[i];
429 buffer_merge[i] = NULL; //transfer memory ownership
430 e++;
431 buffer_h0[i] = UINT_MAX;
432 }
433
434 if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket.
435 {
436 cmph_io_adapter_t *source = NULL;
437 cmph_config_t *config = NULL;
438 cmph_t *mphf_tmp = NULL;
439 char *bufmphf = NULL;
440 cmph_uint32 buflenmphf = 0;
441 // Source of keys
442 source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd);
443 config = cmph_config_new(source);
444 cmph_config_set_algo(config, brz->algo);
445 //cmph_config_set_algo(config, CMPH_BMZ8);
446 cmph_config_set_graphsize(config, brz->c);
447 mphf_tmp = cmph_new(config);
448 if (mphf_tmp == NULL)
449 {
450 if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k);
451 error = 1;
452 cmph_config_destroy(config);
453 brz_destroy_keys_vd(keys_vd, nkeys_vd);
454 cmph_io_byte_vector_adapter_destroy(source);
455 break;
456 }
457 if(mph->verbosity)
458 {
459 if (cur_bucket % 1000 == 0)
460 {
461 fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k);
462 }
463 }
464 switch(brz->algo)
465 {
466 case CMPH_FCH:
467 {
468 fch_data_t * fchf = NULL;
469 fchf = (fch_data_t *)mphf_tmp->data;
470 bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf);
471 }
472 break;
473 case CMPH_BMZ8:
474 {
475 bmz8_data_t * bmzf = NULL;
476 bmzf = (bmz8_data_t *)mphf_tmp->data;
477 bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket, &buflenmphf);
478 }
479 break;
480 default: assert(0);
481 }
482 nbytes = fwrite(bufmphf, (size_t)buflenmphf, (size_t)1, brz->mphf_fd);
483 free(bufmphf);
484 bufmphf = NULL;
485 cmph_config_destroy(config);
486 brz_destroy_keys_vd(keys_vd, nkeys_vd);
487 cmph_destroy(mphf_tmp);
488 cmph_io_byte_vector_adapter_destroy(source);
489 nkeys_vd = 0;
490 }
491 }
492 buffer_manager_destroy(buff_manager);
493 free(keys_vd);
494 free(buffer_merge);
495 free(buffer_h0);
496 if (error) return 0;
497 return 1;
498 }
499
brz_min_index(cmph_uint32 * vector,cmph_uint32 n)500 static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n)
501 {
502 cmph_uint32 i, min_index = 0;
503 for(i = 1; i < n; i++)
504 {
505 if(vector[i] < vector[min_index]) min_index = i;
506 }
507 return min_index;
508 }
509
brz_destroy_keys_vd(cmph_uint8 ** keys_vd,cmph_uint32 nkeys)510 static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys)
511 {
512 cmph_uint8 i;
513 for(i = 0; i < nkeys; i++) { free(keys_vd[i]); keys_vd[i] = NULL;}
514 }
515
brz_copy_partial_fch_mphf(brz_config_data_t * brz,fch_data_t * fchf,cmph_uint32 index,cmph_uint32 * buflen)516 static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen)
517 {
518 cmph_uint32 i = 0;
519 cmph_uint32 buflenh1 = 0;
520 cmph_uint32 buflenh2 = 0;
521 char * bufh1 = NULL;
522 char * bufh2 = NULL;
523 char * buf = NULL;
524 cmph_uint32 n = fchf->b;//brz->size[index];
525 hash_state_dump(fchf->h1, &bufh1, &buflenh1);
526 hash_state_dump(fchf->h2, &bufh2, &buflenh2);
527 *buflen = buflenh1 + buflenh2 + n + 2U * (cmph_uint32)sizeof(cmph_uint32);
528 buf = (char *)malloc((size_t)(*buflen));
529 memcpy(buf, &buflenh1, sizeof(cmph_uint32));
530 memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1);
531 memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32));
532 memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2);
533 for (i = 0; i < n; i++) memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2+i,(fchf->g + i), (size_t)1);
534 free(bufh1);
535 free(bufh2);
536 return buf;
537 }
brz_copy_partial_bmz8_mphf(brz_config_data_t * brz,bmz8_data_t * bmzf,cmph_uint32 index,cmph_uint32 * buflen)538 static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen)
539 {
540 cmph_uint32 buflenh1 = 0;
541 cmph_uint32 buflenh2 = 0;
542 char * bufh1 = NULL;
543 char * bufh2 = NULL;
544 char * buf = NULL;
545 cmph_uint32 n = (cmph_uint32)ceil(brz->c * brz->size[index]);
546 hash_state_dump(bmzf->hashes[0], &bufh1, &buflenh1);
547 hash_state_dump(bmzf->hashes[1], &bufh2, &buflenh2);
548 *buflen = buflenh1 + buflenh2 + n + 2U * (cmph_uint32)sizeof(cmph_uint32);
549 buf = (char *)malloc((size_t)(*buflen));
550 memcpy(buf, &buflenh1, sizeof(cmph_uint32));
551 memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1);
552 memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32));
553 memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2);
554 memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2,bmzf->g, (size_t)n);
555 free(bufh1);
556 free(bufh2);
557 return buf;
558 }
559
560
brz_dump(cmph_t * mphf,FILE * fd)561 int brz_dump(cmph_t *mphf, FILE *fd)
562 {
563 brz_data_t *data = (brz_data_t *)mphf->data;
564 char *buf = NULL;
565 cmph_uint32 buflen;
566 register size_t nbytes;
567 DEBUGP("Dumping brzf\n");
568 // The initial part of the MPHF have already been dumped to disk during construction
569 // Dumping h0
570 hash_state_dump(data->h0, &buf, &buflen);
571 DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
572 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
573 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
574 free(buf);
575 // Dumping m and the vector offset.
576 nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd);
577 nbytes = fwrite(data->offset, sizeof(cmph_uint32)*(data->k), (size_t)1, fd);
578 return 1;
579 }
580
brz_load(FILE * f,cmph_t * mphf)581 void brz_load(FILE *f, cmph_t *mphf)
582 {
583 char *buf = NULL;
584 cmph_uint32 buflen;
585 register size_t nbytes;
586 cmph_uint32 i, n;
587 brz_data_t *brz = (brz_data_t *)malloc(sizeof(brz_data_t));
588
589 DEBUGP("Loading brz mphf\n");
590 mphf->data = brz;
591 nbytes = fread(&(brz->c), sizeof(double), (size_t)1, f);
592 nbytes = fread(&(brz->algo), sizeof(brz->algo), (size_t)1, f); // Reading algo.
593 nbytes = fread(&(brz->k), sizeof(cmph_uint32), (size_t)1, f);
594 brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k);
595 nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f);
596 brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k);
597 brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k);
598 brz->g = (cmph_uint8 **) calloc((size_t)brz->k, sizeof(cmph_uint8 *));
599 DEBUGP("Reading c = %f k = %u algo = %u \n", brz->c, brz->k, brz->algo);
600 //loading h_i1, h_i2 and g_i.
601 for(i = 0; i < brz->k; i++)
602 {
603 // h1
604 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
605 DEBUGP("Hash state 1 has %u bytes\n", buflen);
606 buf = (char *)malloc((size_t)buflen);
607 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
608 brz->h1[i] = hash_state_load(buf, buflen);
609 free(buf);
610 //h2
611 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
612 DEBUGP("Hash state 2 has %u bytes\n", buflen);
613 buf = (char *)malloc((size_t)buflen);
614 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
615 brz->h2[i] = hash_state_load(buf, buflen);
616 free(buf);
617 switch(brz->algo)
618 {
619 case CMPH_FCH:
620 n = fch_calc_b(brz->c, brz->size[i]);
621 break;
622 case CMPH_BMZ8:
623 n = (cmph_uint32)ceil(brz->c * brz->size[i]);
624 break;
625 default: assert(0);
626 }
627 DEBUGP("g_i has %u bytes\n", n);
628 brz->g[i] = (cmph_uint8 *)calloc((size_t)n, sizeof(cmph_uint8));
629 nbytes = fread(brz->g[i], sizeof(cmph_uint8)*n, (size_t)1, f);
630 }
631 //loading h0
632 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
633 DEBUGP("Hash state has %u bytes\n", buflen);
634 buf = (char *)malloc((size_t)buflen);
635 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
636 brz->h0 = hash_state_load(buf, buflen);
637 free(buf);
638
639 //loading c, m, and the vector offset.
640 nbytes = fread(&(brz->m), sizeof(cmph_uint32), (size_t)1, f);
641 brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k);
642 nbytes = fread(brz->offset, sizeof(cmph_uint32)*(brz->k), (size_t)1, f);
643 return;
644 }
645
brz_bmz8_search(brz_data_t * brz,const char * key,cmph_uint32 keylen,cmph_uint32 * fingerprint)646 static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
647 {
648 register cmph_uint32 h0;
649
650 hash_vector(brz->h0, key, keylen, fingerprint);
651 h0 = fingerprint[2] % brz->k;
652
653 register cmph_uint32 m = brz->size[h0];
654 register cmph_uint32 n = (cmph_uint32)ceil(brz->c * m);
655 register cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % n;
656 register cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % n;
657 register cmph_uint8 mphf_bucket;
658
659 if (h1 == h2 && ++h2 >= n) h2 = 0;
660 mphf_bucket = (cmph_uint8)(brz->g[h0][h1] + brz->g[h0][h2]);
661 DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0);
662 DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, brz->g[h0][h1], brz->g[h0][h2], brz->offset[h0], brz->m);
663 DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]);
664 return (mphf_bucket + brz->offset[h0]);
665 }
666
brz_fch_search(brz_data_t * brz,const char * key,cmph_uint32 keylen,cmph_uint32 * fingerprint)667 static cmph_uint32 brz_fch_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
668 {
669 register cmph_uint32 h0;
670
671 hash_vector(brz->h0, key, keylen, fingerprint);
672 h0 = fingerprint[2] % brz->k;
673
674 register cmph_uint32 m = brz->size[h0];
675 register cmph_uint32 b = fch_calc_b(brz->c, m);
676 register double p1 = fch_calc_p1(m);
677 register double p2 = fch_calc_p2(b);
678 register cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % m;
679 register cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % m;
680 register cmph_uint8 mphf_bucket = 0;
681 h1 = mixh10h11h12(b, p1, p2, h1);
682 mphf_bucket = (cmph_uint8)((h2 + brz->g[h0][h1]) % m);
683 return (mphf_bucket + brz->offset[h0]);
684 }
685
brz_search(cmph_t * mphf,const char * key,cmph_uint32 keylen)686 cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
687 {
688 brz_data_t *brz = (brz_data_t *)mphf->data;
689 cmph_uint32 fingerprint[3];
690 switch(brz->algo)
691 {
692 case CMPH_FCH:
693 return brz_fch_search(brz, key, keylen, fingerprint);
694 case CMPH_BMZ8:
695 return brz_bmz8_search(brz, key, keylen, fingerprint);
696 default: assert(0);
697 }
698 return 0;
699 }
brz_destroy(cmph_t * mphf)700 void brz_destroy(cmph_t *mphf)
701 {
702 cmph_uint32 i;
703 brz_data_t *data = (brz_data_t *)mphf->data;
704 if(data->g)
705 {
706 for(i = 0; i < data->k; i++)
707 {
708 free(data->g[i]);
709 hash_state_destroy(data->h1[i]);
710 hash_state_destroy(data->h2[i]);
711 }
712 free(data->g);
713 free(data->h1);
714 free(data->h2);
715 }
716 hash_state_destroy(data->h0);
717 free(data->size);
718 free(data->offset);
719 free(data);
720 free(mphf);
721 }
722
723 /** \fn void brz_pack(cmph_t *mphf, void *packed_mphf);
724 * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
725 * \param mphf pointer to the resulting mphf
726 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
727 */
brz_pack(cmph_t * mphf,void * packed_mphf)728 void brz_pack(cmph_t *mphf, void *packed_mphf)
729 {
730 brz_data_t *data = (brz_data_t *)mphf->data;
731 cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf;
732 cmph_uint32 i,n;
733
734 // packing internal algo type
735 memcpy(ptr, &(data->algo), sizeof(data->algo));
736 ptr += sizeof(data->algo);
737
738 // packing h0 type
739 CMPH_HASH h0_type = hash_get_type(data->h0);
740 memcpy(ptr, &h0_type, sizeof(h0_type));
741 ptr += sizeof(h0_type);
742
743 // packing h0
744 hash_state_pack(data->h0, ptr);
745 ptr += hash_state_packed_size(h0_type);
746
747 // packing k
748 memcpy(ptr, &(data->k), sizeof(data->k));
749 ptr += sizeof(data->k);
750
751 // packing c
752 *((cmph_uint64 *)ptr) = (cmph_uint64)data->c;
753 ptr += sizeof(data->c);
754
755 // packing h1 type
756 CMPH_HASH h1_type = hash_get_type(data->h1[0]);
757 memcpy(ptr, &h1_type, sizeof(h1_type));
758 ptr += sizeof(h1_type);
759
760 // packing h2 type
761 CMPH_HASH h2_type = hash_get_type(data->h2[0]);
762 memcpy(ptr, &h2_type, sizeof(h2_type));
763 ptr += sizeof(h2_type);
764
765 // packing size
766 memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k);
767 ptr += data->k;
768
769 // packing offset
770 memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k);
771 ptr += sizeof(cmph_uint32)*data->k;
772
773 #if defined (__ia64) || defined (__x86_64__)
774 cmph_uint64 * g_is_ptr = (cmph_uint64 *)ptr;
775 #else
776 cmph_uint32 * g_is_ptr = (cmph_uint32 *)ptr;
777 #endif
778
779 cmph_uint8 * g_i = (cmph_uint8 *) (g_is_ptr + data->k);
780
781 for(i = 0; i < data->k; i++)
782 {
783 #if defined (__ia64) || defined (__x86_64__)
784 *g_is_ptr++ = (cmph_uint64)g_i;
785 #else
786 *g_is_ptr++ = (cmph_uint32)g_i;
787 #endif
788 // packing h1[i]
789 hash_state_pack(data->h1[i], g_i);
790 g_i += hash_state_packed_size(h1_type);
791
792 // packing h2[i]
793 hash_state_pack(data->h2[i], g_i);
794 g_i += hash_state_packed_size(h2_type);
795
796 // packing g_i
797 switch(data->algo)
798 {
799 case CMPH_FCH:
800 n = fch_calc_b(data->c, data->size[i]);
801 break;
802 case CMPH_BMZ8:
803 n = (cmph_uint32)ceil(data->c * data->size[i]);
804 break;
805 default: assert(0);
806 }
807 memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n);
808 g_i += n;
809
810 }
811
812 }
813
814 /** \fn cmph_uint32 brz_packed_size(cmph_t *mphf);
815 * \brief Return the amount of space needed to pack mphf.
816 * \param mphf pointer to a mphf
817 * \return the size of the packed function or zero for failures
818 */
brz_packed_size(cmph_t * mphf)819 cmph_uint32 brz_packed_size(cmph_t *mphf)
820 {
821 cmph_uint32 i;
822 cmph_uint32 size = 0;
823 brz_data_t *data = (brz_data_t *)mphf->data;
824 CMPH_HASH h0_type = hash_get_type(data->h0);
825 CMPH_HASH h1_type = hash_get_type(data->h1[0]);
826 CMPH_HASH h2_type = hash_get_type(data->h2[0]);
827 size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) +
828 sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k);
829 // pointers to g_is
830 #if defined (__ia64) || defined (__x86_64__)
831 size += (cmph_uint32) sizeof(cmph_uint64)*data->k;
832 #else
833 size += (cmph_uint32) sizeof(cmph_uint32)*data->k;
834 #endif
835
836 size += hash_state_packed_size(h1_type) * data->k;
837 size += hash_state_packed_size(h2_type) * data->k;
838
839 cmph_uint32 n = 0;
840 for(i = 0; i < data->k; i++)
841 {
842 switch(data->algo)
843 {
844 case CMPH_FCH:
845 n = fch_calc_b(data->c, data->size[i]);
846 break;
847 case CMPH_BMZ8:
848 n = (cmph_uint32)ceil(data->c * data->size[i]);
849 break;
850 default: assert(0);
851 }
852 size += n;
853 }
854 return size;
855 }
856
857
858
brz_bmz8_search_packed(cmph_uint32 * packed_mphf,const char * key,cmph_uint32 keylen,cmph_uint32 * fingerprint)859 static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
860 {
861 register CMPH_HASH h0_type = (CMPH_HASH)*packed_mphf++;
862 register cmph_uint32 *h0_ptr = packed_mphf;
863 packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type));
864
865 register cmph_uint32 k = *packed_mphf++;
866
867 register double c = (double)(*((cmph_uint64*)packed_mphf));
868 packed_mphf += 2;
869
870 register CMPH_HASH h1_type = (CMPH_HASH)*packed_mphf++;
871
872 register CMPH_HASH h2_type = (CMPH_HASH)*packed_mphf++;
873
874 register cmph_uint8 * size = (cmph_uint8 *) packed_mphf;
875 packed_mphf = (cmph_uint32 *)(size + k);
876
877 register cmph_uint32 * offset = packed_mphf;
878 packed_mphf += k;
879
880 register cmph_uint32 h0;
881
882 hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint);
883 h0 = fingerprint[2] % k;
884
885 register cmph_uint32 m = size[h0];
886 register cmph_uint32 n = (cmph_uint32)ceil(c * m);
887
888 #if defined (__ia64) || defined (__x86_64__)
889 register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf;
890 #else
891 register cmph_uint32 * g_is_ptr = packed_mphf;
892 #endif
893
894 register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0];
895
896 register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
897
898 register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type);
899
900 register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n;
901 register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n;
902
903 register cmph_uint8 mphf_bucket;
904
905 if (h1 == h2 && ++h2 >= n) h2 = 0;
906 mphf_bucket = (cmph_uint8)(g[h1] + g[h2]);
907 DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0);
908 DEBUGP("Address: %u\n", mphf_bucket + offset[h0]);
909 return (mphf_bucket + offset[h0]);
910 }
911
brz_fch_search_packed(cmph_uint32 * packed_mphf,const char * key,cmph_uint32 keylen,cmph_uint32 * fingerprint)912 static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
913 {
914 register CMPH_HASH h0_type = (CMPH_HASH)*packed_mphf++;
915
916 register cmph_uint32 *h0_ptr = packed_mphf;
917 packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type));
918
919 register cmph_uint32 k = *packed_mphf++;
920
921 register double c = (double)(*((cmph_uint64*)packed_mphf));
922 packed_mphf += 2;
923
924 register CMPH_HASH h1_type = (CMPH_HASH)*packed_mphf++;
925
926 register CMPH_HASH h2_type = (CMPH_HASH)*packed_mphf++;
927
928 register cmph_uint8 * size = (cmph_uint8 *) packed_mphf;
929 packed_mphf = (cmph_uint32 *)(size + k);
930
931 register cmph_uint32 * offset = packed_mphf;
932 packed_mphf += k;
933
934 register cmph_uint32 h0;
935
936 hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint);
937 h0 = fingerprint[2] % k;
938
939 register cmph_uint32 m = size[h0];
940 register cmph_uint32 b = fch_calc_b(c, m);
941 register double p1 = fch_calc_p1(m);
942 register double p2 = fch_calc_p2(b);
943
944 #if defined (__ia64) || defined (__x86_64__)
945 register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf;
946 #else
947 register cmph_uint32 * g_is_ptr = packed_mphf;
948 #endif
949
950 register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0];
951
952 register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
953
954 register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type);
955
956 register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m;
957 register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m;
958
959 register cmph_uint8 mphf_bucket = 0;
960 h1 = mixh10h11h12(b, p1, p2, h1);
961 mphf_bucket = (cmph_uint8)((h2 + g[h1]) % m);
962 return (mphf_bucket + offset[h0]);
963 }
964
965 /** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen);
966 * \brief Use the packed mphf to do a search.
967 * \param packed_mphf pointer to the packed mphf
968 * \param key key to be hashed
969 * \param keylen key legth in bytes
970 * \return The mphf value
971 */
brz_search_packed(void * packed_mphf,const char * key,cmph_uint32 keylen)972 cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
973 {
974 register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf;
975 register CMPH_ALGO algo = (CMPH_ALGO)*ptr++;
976 cmph_uint32 fingerprint[3];
977 switch(algo)
978 {
979 case CMPH_FCH:
980 return brz_fch_search_packed(ptr, key, keylen, fingerprint);
981 case CMPH_BMZ8:
982 return brz_bmz8_search_packed(ptr, key, keylen, fingerprint);
983 default: assert(0);
984 }
985 }
986