1 #ifdef WIN32
2 #include "wingetopt.h"
3 #else
4 #include <getopt.h>
5 #endif
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <errno.h>
9 #include <string.h>
10 #include <time.h>
11 #include <limits.h>
12 #include <assert.h>
13 #include "cmph.h"
14 #include "hash.h"
15 
16 #ifdef WIN32
17 #define VERSION "0.8"
18 #else
19 #include "config.h"
20 #endif
21 
22 
usage(const char * prg)23 void usage(const char *prg)
24 {
25 	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph]  keysfile\n", prg);
26 }
usage_long(const char * prg)27 void usage_long(const char *prg)
28 {
29 	cmph_uint32 i;
30 	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
31 	fprintf(stderr, "Minimum perfect hashing tool\n\n");
32 	fprintf(stderr, "  -h\t print this help message\n");
33 	fprintf(stderr, "  -c\t c value determines:\n");
34 	fprintf(stderr, "    \t  * the number of vertices in the graph for the algorithms BMZ and CHM\n");
35 	fprintf(stderr, "    \t  * the number of bits per key required in the FCH algorithm\n");
36 	fprintf(stderr, "    \t  * the load factor in the CHD_PH algorithm\n");
37 	fprintf(stderr, "  -a\t algorithm - valid values are\n");
38 	for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, "    \t  * %s\n", cmph_names[i]);
39 	fprintf(stderr, "  -f\t hash function (may be used multiple times) - valid values are\n");
40 	for (i = 0; i < CMPH_HASH_COUNT; ++i) fprintf(stderr, "    \t  * %s\n", cmph_hash_names[i]);
41 	fprintf(stderr, "  -V\t print version number and exit\n");
42 	fprintf(stderr, "  -v\t increase verbosity (may be used multiple times)\n");
43 	fprintf(stderr, "  -k\t number of keys\n");
44 	fprintf(stderr, "  -g\t generation mode\n");
45 	fprintf(stderr, "  -s\t random seed\n");
46 	fprintf(stderr, "  -m\t minimum perfect hash function file \n");
47 	fprintf(stderr, "  -M\t main memory availability (in MB) used in BRZ algorithm \n");
48 	fprintf(stderr, "  -d\t temporary directory used in BRZ algorithm \n");
49 	fprintf(stderr, "  -b\t the meaning of this parameter depends on the algorithm selected in the -a option:\n");
50 	fprintf(stderr, "    \t  * For BRZ it is used to make the maximal number of keys in a bucket lower than 256.\n");
51 	fprintf(stderr, "    \t    In this case its value should be an integer in the range [64,175]. Default is 128.\n\n");
52 	fprintf(stderr, "    \t  * For BDZ it is used to determine the size of some precomputed rank\n");
53 	fprintf(stderr, "    \t    information and its value should be an integer in the range [3,10]. Default\n");
54 	fprintf(stderr, "    \t    is 7. The larger is this value, the more compact are the resulting functions\n");
55 	fprintf(stderr, "    \t    and the slower are them at evaluation time.\n\n");
56 	fprintf(stderr, "    \t  * For CHD and CHD_PH it is used to set the average number of keys per bucket\n");
57 	fprintf(stderr, "    \t    and its value should be an integer in the range [1,32]. Default is 4. The\n");
58 	fprintf(stderr, "    \t    larger is this value, the slower is the construction of the functions.\n");
59 	fprintf(stderr, "    \t    This parameter has no effect for other algorithms.\n\n");
60 	fprintf(stderr, "  -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n");
61 	fprintf(stderr, "    \t hash function allows at most t collisions in a given bin. This parameter applies\n");
62 	fprintf(stderr, "    \t only to the CHD and CHD_PH algorithms. Its value should be an integer in the\n");
63 	fprintf(stderr, "    \t range [1,128]. Defaul is 1\n");
64 	fprintf(stderr, "  keysfile\t line separated file with keys\n");
65 }
66 
main(int argc,char ** argv)67 int main(int argc, char **argv)
68 {
69 	cmph_uint32 verbosity = 0;
70 	char generate = 0;
71 	char *mphf_file = NULL;
72 	FILE *mphf_fd = stdout;
73 	const char *keys_file = NULL;
74 	FILE *keys_fd;
75 	cmph_uint32 nkeys = UINT_MAX;
76 	cmph_uint32 seed = UINT_MAX;
77 	CMPH_HASH *hashes = NULL;
78 	cmph_uint32 nhashes = 0;
79 	cmph_uint32 i;
80 	CMPH_ALGO mph_algo = CMPH_CHM;
81 	double c = 0;
82 	cmph_config_t *config = NULL;
83 	cmph_t *mphf = NULL;
84 	char * tmp_dir = NULL;
85 	cmph_io_adapter_t *source;
86 	cmph_uint32 memory_availability = 0;
87 	cmph_uint32 b = 0;
88 	cmph_uint32 keys_per_bin = 1;
89 	while (1)
90 	{
91 		char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
92 		if (ch == -1) break;
93 		switch (ch)
94 		{
95 			case 's':
96 				{
97 					char *cptr;
98 					seed = (cmph_uint32)strtoul(optarg, &cptr, 10);
99 					if(*cptr != 0) {
100 						fprintf(stderr, "Invalid seed %s\n", optarg);
101 						exit(1);
102 					}
103 				}
104 				break;
105 			case 'c':
106 				{
107 					char *endptr;
108 					c = strtod(optarg, &endptr);
109 					if(*endptr != 0) {
110 						fprintf(stderr, "Invalid c value %s\n", optarg);
111 						exit(1);
112 					}
113 				}
114 				break;
115 			case 'g':
116 				generate = 1;
117 				break;
118 			case 'k':
119 			        {
120 					char *endptr;
121 					nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
122 					if(*endptr != 0) {
123 						fprintf(stderr, "Invalid number of keys %s\n", optarg);
124 						exit(1);
125 					}
126 				}
127 				break;
128 			case 'm':
129 				mphf_file = strdup(optarg);
130 				break;
131 			case 'd':
132 				tmp_dir = strdup(optarg);
133 				break;
134 			case 'M':
135 				{
136 					char *cptr;
137 					memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10);
138 					if(*cptr != 0) {
139 						fprintf(stderr, "Invalid memory availability %s\n", optarg);
140 						exit(1);
141 					}
142 				}
143 				break;
144 			case 'b':
145 				{
146 					char *cptr;
147 					b =  (cmph_uint32)strtoul(optarg, &cptr, 10);
148 					if(*cptr != 0) {
149 						fprintf(stderr, "Parameter b was not found: %s\n", optarg);
150 						exit(1);
151 					}
152 				}
153 				break;
154 			case 't':
155 				{
156 					char *cptr;
157 					keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
158 					if(*cptr != 0) {
159 						fprintf(stderr, "Parameter t was not found: %s\n", optarg);
160 						exit(1);
161 					}
162 				}
163 				break;
164 			case 'v':
165 				++verbosity;
166 				break;
167 			case 'V':
168 				printf("%s\n", VERSION);
169 				return 0;
170 			case 'h':
171 				usage_long(argv[0]);
172 				return 0;
173 			case 'a':
174 				{
175 				char valid = 0;
176 				for (i = 0; i < CMPH_COUNT; ++i)
177 				{
178 					if (strcmp(cmph_names[i], optarg) == 0)
179 					{
180 						mph_algo = (CMPH_ALGO)i;
181 						valid = 1;
182 						break;
183 					}
184 				}
185 				if (!valid)
186 				{
187 					fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION);
188 					return -1;
189 				}
190 				}
191 				break;
192 			case 'f':
193 				{
194 				char valid = 0;
195 				for (i = 0; i < CMPH_HASH_COUNT; ++i)
196 				{
197 					if (strcmp(cmph_hash_names[i], optarg) == 0)
198 					{
199 						hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
200 						hashes[nhashes] = (CMPH_HASH)i;
201 						hashes[nhashes + 1] = CMPH_HASH_COUNT;
202 						++nhashes;
203 						valid = 1;
204 						break;
205 					}
206 				}
207 				if (!valid)
208 				{
209 					fprintf(stderr, "Invalid hash function: %s\n", optarg);
210 					return -1;
211 				}
212 				}
213 				break;
214 			default:
215 				usage(argv[0]);
216 				return 1;
217 		}
218 	}
219 
220 	if (optind != argc - 1)
221 	{
222 		usage(argv[0]);
223 		return 1;
224 	}
225 	keys_file = argv[optind];
226 
227 	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
228 	srand(seed);
229 	int ret = 0;
230 	if (mphf_file == NULL)
231 	{
232 		mphf_file = (char *)malloc(strlen(keys_file) + 5);
233 		memcpy(mphf_file, keys_file, strlen(keys_file));
234 		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
235 	}
236 
237 	keys_fd = fopen(keys_file, "r");
238 
239 	if (keys_fd == NULL)
240 	{
241 		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
242 		return -1;
243 	}
244 
245 	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
246 	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
247 	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
248 	if (generate)
249 	{
250 		//Create mphf
251 		mphf_fd = fopen(mphf_file, "w");
252 		config = cmph_config_new(source);
253 		cmph_config_set_algo(config, mph_algo);
254 		if (nhashes) cmph_config_set_hashfuncs(config, hashes);
255 		cmph_config_set_verbosity(config, verbosity);
256 		cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir);
257 		cmph_config_set_mphf_fd(config, mphf_fd);
258 		cmph_config_set_memory_availability(config, memory_availability);
259 		cmph_config_set_b(config, b);
260 		cmph_config_set_keys_per_bin(config, keys_per_bin);
261 
262 		//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
263 		if(mph_algo == CMPH_BMZ  && c >= 2.0) c=1.15;
264 		if (c != 0) cmph_config_set_graphsize(config, c);
265 		mphf = cmph_new(config);
266 
267 		cmph_config_destroy(config);
268 		if (mphf == NULL)
269 		{
270 			fprintf(stderr, "Unable to create minimum perfect hashing function\n");
271 			//cmph_config_destroy(config);
272 			free(mphf_file);
273 			return -1;
274 		}
275 
276 		if (mphf_fd == NULL)
277 		{
278 			fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
279 			free(mphf_file);
280 			return -1;
281 		}
282 		cmph_dump(mphf, mphf_fd);
283 		cmph_destroy(mphf);
284 		fclose(mphf_fd);
285 	}
286 	else
287 	{
288 		cmph_uint8 * hashtable = NULL;
289 		mphf_fd = fopen(mphf_file, "r");
290 		if (mphf_fd == NULL)
291 		{
292 			fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
293 			free(mphf_file);
294 			return -1;
295 		}
296 		mphf = cmph_load(mphf_fd);
297 		fclose(mphf_fd);
298 		if (!mphf)
299 		{
300 			fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
301 			free(mphf_file);
302 			return -1;
303 		}
304 		cmph_uint32 siz = cmph_size(mphf);
305 		hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
306 		memset(hashtable, 0,(size_t) siz);
307 		//check all keys
308 		for (i = 0; i < source->nkeys; ++i)
309 		{
310 			cmph_uint32 h;
311 			char *buf;
312 			cmph_uint32 buflen = 0;
313 			source->read(source->data, &buf, &buflen);
314 			h = cmph_search(mphf, buf, buflen);
315 			if (!(h < siz))
316 			{
317 				fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf);
318 				ret = 1;
319 			} else if(hashtable[h] >= keys_per_bin)
320 			{
321 				fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
322 				fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf);
323 				ret = 1;
324 			} else hashtable[h]++;
325 
326 			if (verbosity)
327 			{
328 				printf("%s -> %u\n", buf, h);
329 			}
330 			source->dispose(source->data, buf, buflen);
331 		}
332 
333 		cmph_destroy(mphf);
334 		free(hashtable);
335 	}
336 	fclose(keys_fd);
337 	free(mphf_file);
338 	free(tmp_dir);
339         cmph_io_nlfile_adapter_destroy(source);
340 	return ret;
341 
342 }
343