1 #ifdef WIN32
2 #include "wingetopt.h"
3 #else
4 #include <getopt.h>
5 #endif
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <errno.h>
9 #include <string.h>
10 #include <time.h>
11 #include <limits.h>
12 #include <assert.h>
13 #include "cmph.h"
14 #include "hash.h"
15
16 #ifdef WIN32
17 #define VERSION "0.8"
18 #else
19 #include "config.h"
20 #endif
21
22
usage(const char * prg)23 void usage(const char *prg)
24 {
25 fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
26 }
usage_long(const char * prg)27 void usage_long(const char *prg)
28 {
29 cmph_uint32 i;
30 fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
31 fprintf(stderr, "Minimum perfect hashing tool\n\n");
32 fprintf(stderr, " -h\t print this help message\n");
33 fprintf(stderr, " -c\t c value determines:\n");
34 fprintf(stderr, " \t * the number of vertices in the graph for the algorithms BMZ and CHM\n");
35 fprintf(stderr, " \t * the number of bits per key required in the FCH algorithm\n");
36 fprintf(stderr, " \t * the load factor in the CHD_PH algorithm\n");
37 fprintf(stderr, " -a\t algorithm - valid values are\n");
38 for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]);
39 fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n");
40 for (i = 0; i < CMPH_HASH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_hash_names[i]);
41 fprintf(stderr, " -V\t print version number and exit\n");
42 fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n");
43 fprintf(stderr, " -k\t number of keys\n");
44 fprintf(stderr, " -g\t generation mode\n");
45 fprintf(stderr, " -s\t random seed\n");
46 fprintf(stderr, " -m\t minimum perfect hash function file \n");
47 fprintf(stderr, " -M\t main memory availability (in MB) used in BRZ algorithm \n");
48 fprintf(stderr, " -d\t temporary directory used in BRZ algorithm \n");
49 fprintf(stderr, " -b\t the meaning of this parameter depends on the algorithm selected in the -a option:\n");
50 fprintf(stderr, " \t * For BRZ it is used to make the maximal number of keys in a bucket lower than 256.\n");
51 fprintf(stderr, " \t In this case its value should be an integer in the range [64,175]. Default is 128.\n\n");
52 fprintf(stderr, " \t * For BDZ it is used to determine the size of some precomputed rank\n");
53 fprintf(stderr, " \t information and its value should be an integer in the range [3,10]. Default\n");
54 fprintf(stderr, " \t is 7. The larger is this value, the more compact are the resulting functions\n");
55 fprintf(stderr, " \t and the slower are them at evaluation time.\n\n");
56 fprintf(stderr, " \t * For CHD and CHD_PH it is used to set the average number of keys per bucket\n");
57 fprintf(stderr, " \t and its value should be an integer in the range [1,32]. Default is 4. The\n");
58 fprintf(stderr, " \t larger is this value, the slower is the construction of the functions.\n");
59 fprintf(stderr, " \t This parameter has no effect for other algorithms.\n\n");
60 fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n");
61 fprintf(stderr, " \t hash function allows at most t collisions in a given bin. This parameter applies\n");
62 fprintf(stderr, " \t only to the CHD and CHD_PH algorithms. Its value should be an integer in the\n");
63 fprintf(stderr, " \t range [1,128]. Defaul is 1\n");
64 fprintf(stderr, " keysfile\t line separated file with keys\n");
65 }
66
main(int argc,char ** argv)67 int main(int argc, char **argv)
68 {
69 cmph_uint32 verbosity = 0;
70 char generate = 0;
71 char *mphf_file = NULL;
72 FILE *mphf_fd = stdout;
73 const char *keys_file = NULL;
74 FILE *keys_fd;
75 cmph_uint32 nkeys = UINT_MAX;
76 cmph_uint32 seed = UINT_MAX;
77 CMPH_HASH *hashes = NULL;
78 cmph_uint32 nhashes = 0;
79 cmph_uint32 i;
80 CMPH_ALGO mph_algo = CMPH_CHM;
81 double c = 0;
82 cmph_config_t *config = NULL;
83 cmph_t *mphf = NULL;
84 char * tmp_dir = NULL;
85 cmph_io_adapter_t *source;
86 cmph_uint32 memory_availability = 0;
87 cmph_uint32 b = 0;
88 cmph_uint32 keys_per_bin = 1;
89 while (1)
90 {
91 char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
92 if (ch == -1) break;
93 switch (ch)
94 {
95 case 's':
96 {
97 char *cptr;
98 seed = (cmph_uint32)strtoul(optarg, &cptr, 10);
99 if(*cptr != 0) {
100 fprintf(stderr, "Invalid seed %s\n", optarg);
101 exit(1);
102 }
103 }
104 break;
105 case 'c':
106 {
107 char *endptr;
108 c = strtod(optarg, &endptr);
109 if(*endptr != 0) {
110 fprintf(stderr, "Invalid c value %s\n", optarg);
111 exit(1);
112 }
113 }
114 break;
115 case 'g':
116 generate = 1;
117 break;
118 case 'k':
119 {
120 char *endptr;
121 nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
122 if(*endptr != 0) {
123 fprintf(stderr, "Invalid number of keys %s\n", optarg);
124 exit(1);
125 }
126 }
127 break;
128 case 'm':
129 mphf_file = strdup(optarg);
130 break;
131 case 'd':
132 tmp_dir = strdup(optarg);
133 break;
134 case 'M':
135 {
136 char *cptr;
137 memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10);
138 if(*cptr != 0) {
139 fprintf(stderr, "Invalid memory availability %s\n", optarg);
140 exit(1);
141 }
142 }
143 break;
144 case 'b':
145 {
146 char *cptr;
147 b = (cmph_uint32)strtoul(optarg, &cptr, 10);
148 if(*cptr != 0) {
149 fprintf(stderr, "Parameter b was not found: %s\n", optarg);
150 exit(1);
151 }
152 }
153 break;
154 case 't':
155 {
156 char *cptr;
157 keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
158 if(*cptr != 0) {
159 fprintf(stderr, "Parameter t was not found: %s\n", optarg);
160 exit(1);
161 }
162 }
163 break;
164 case 'v':
165 ++verbosity;
166 break;
167 case 'V':
168 printf("%s\n", VERSION);
169 return 0;
170 case 'h':
171 usage_long(argv[0]);
172 return 0;
173 case 'a':
174 {
175 char valid = 0;
176 for (i = 0; i < CMPH_COUNT; ++i)
177 {
178 if (strcmp(cmph_names[i], optarg) == 0)
179 {
180 mph_algo = (CMPH_ALGO)i;
181 valid = 1;
182 break;
183 }
184 }
185 if (!valid)
186 {
187 fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION);
188 return -1;
189 }
190 }
191 break;
192 case 'f':
193 {
194 char valid = 0;
195 for (i = 0; i < CMPH_HASH_COUNT; ++i)
196 {
197 if (strcmp(cmph_hash_names[i], optarg) == 0)
198 {
199 hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
200 hashes[nhashes] = (CMPH_HASH)i;
201 hashes[nhashes + 1] = CMPH_HASH_COUNT;
202 ++nhashes;
203 valid = 1;
204 break;
205 }
206 }
207 if (!valid)
208 {
209 fprintf(stderr, "Invalid hash function: %s\n", optarg);
210 return -1;
211 }
212 }
213 break;
214 default:
215 usage(argv[0]);
216 return 1;
217 }
218 }
219
220 if (optind != argc - 1)
221 {
222 usage(argv[0]);
223 return 1;
224 }
225 keys_file = argv[optind];
226
227 if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
228 srand(seed);
229 int ret = 0;
230 if (mphf_file == NULL)
231 {
232 mphf_file = (char *)malloc(strlen(keys_file) + 5);
233 memcpy(mphf_file, keys_file, strlen(keys_file));
234 memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
235 }
236
237 keys_fd = fopen(keys_file, "r");
238
239 if (keys_fd == NULL)
240 {
241 fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
242 return -1;
243 }
244
245 if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
246 if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
247 else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
248 if (generate)
249 {
250 //Create mphf
251 mphf_fd = fopen(mphf_file, "w");
252 config = cmph_config_new(source);
253 cmph_config_set_algo(config, mph_algo);
254 if (nhashes) cmph_config_set_hashfuncs(config, hashes);
255 cmph_config_set_verbosity(config, verbosity);
256 cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir);
257 cmph_config_set_mphf_fd(config, mphf_fd);
258 cmph_config_set_memory_availability(config, memory_availability);
259 cmph_config_set_b(config, b);
260 cmph_config_set_keys_per_bin(config, keys_per_bin);
261
262 //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
263 if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15;
264 if (c != 0) cmph_config_set_graphsize(config, c);
265 mphf = cmph_new(config);
266
267 cmph_config_destroy(config);
268 if (mphf == NULL)
269 {
270 fprintf(stderr, "Unable to create minimum perfect hashing function\n");
271 //cmph_config_destroy(config);
272 free(mphf_file);
273 return -1;
274 }
275
276 if (mphf_fd == NULL)
277 {
278 fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
279 free(mphf_file);
280 return -1;
281 }
282 cmph_dump(mphf, mphf_fd);
283 cmph_destroy(mphf);
284 fclose(mphf_fd);
285 }
286 else
287 {
288 cmph_uint8 * hashtable = NULL;
289 mphf_fd = fopen(mphf_file, "r");
290 if (mphf_fd == NULL)
291 {
292 fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
293 free(mphf_file);
294 return -1;
295 }
296 mphf = cmph_load(mphf_fd);
297 fclose(mphf_fd);
298 if (!mphf)
299 {
300 fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
301 free(mphf_file);
302 return -1;
303 }
304 cmph_uint32 siz = cmph_size(mphf);
305 hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
306 memset(hashtable, 0,(size_t) siz);
307 //check all keys
308 for (i = 0; i < source->nkeys; ++i)
309 {
310 cmph_uint32 h;
311 char *buf;
312 cmph_uint32 buflen = 0;
313 source->read(source->data, &buf, &buflen);
314 h = cmph_search(mphf, buf, buflen);
315 if (!(h < siz))
316 {
317 fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf);
318 ret = 1;
319 } else if(hashtable[h] >= keys_per_bin)
320 {
321 fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
322 fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf);
323 ret = 1;
324 } else hashtable[h]++;
325
326 if (verbosity)
327 {
328 printf("%s -> %u\n", buf, h);
329 }
330 source->dispose(source->data, buf, buflen);
331 }
332
333 cmph_destroy(mphf);
334 free(hashtable);
335 }
336 fclose(keys_fd);
337 free(mphf_file);
338 free(tmp_dir);
339 cmph_io_nlfile_adapter_destroy(source);
340 return ret;
341
342 }
343