1 #include <sys/types.h> /* for 'open' */
2 #include <sys/stat.h> /* for 'open' */
3 #include <fcntl.h> /* for 'open' */
4 #include <stdlib.h> /* for 'malloc' */
5 #include <stdio.h> /* for 'printf' */
6 #include <unistd.h> /* for 'read' */
7 #include <errno.h> /* for 'sterror' */
8 #include <sys/time.h> /* for 'gettimeofday' */
9 #include "uthash.h"
10
11 #undef uthash_noexpand_fyi
12 #define uthash_noexpand_fyi(t) die()
13 #define UNALIGNED_KEYS 0
14
die()15 static void die()
16 {
17 fprintf(stderr,"expansion inhibited\n");
18 exit(-1);
19 }
20
21 /* Windows doesn't have gettimeofday. While Cygwin and some
22 * versions of MinGW supply one, it is very coarse. This substitute
23 * gives much more accurate elapsed times under Windows. */
24 #if (( defined __CYGWIN__ ) || ( defined __MINGW32__ ))
25 #include <windows.h>
win_gettimeofday(struct timeval * p,void * tz)26 static void win_gettimeofday(struct timeval* p, void* tz /* IGNORED */)
27 {
28 LARGE_INTEGER q;
29 static long long freq;
30 static long long cyg_timer;
31 QueryPerformanceFrequency(&q);
32 freq = q.QuadPart;
33 QueryPerformanceCounter(&q);
34 cyg_timer = q.QuadPart;
35 p->tv_sec = (long)(cyg_timer / freq);
36 p->tv_usec = (long)(((cyg_timer % freq) * 1000000) / freq);
37 }
38 #define gettimeofday win_gettimeofday
39 #define MODE (O_RDONLY|O_BINARY)
40 #else
41 #define MODE (O_RDONLY)
42 #endif
43
44 #ifndef timersub
45 #define timersub(a, b, result) \
46 do { \
47 (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
48 (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
49 if ((result)->tv_usec < 0) { \
50 --(result)->tv_sec; \
51 (result)->tv_usec += 1000000; \
52 } \
53 } while (0)
54 #endif
55
56 typedef struct stat_key {
57 char *key;
58 unsigned len;
59 UT_hash_handle hh, hh2;
60 } stat_key;
61
62 #define CHAIN_0 0
63 #define CHAIN_5 1
64 #define CHAIN_10 2
65 #define CHAIN_20 3
66 #define CHAIN_100 4
67 #define CHAIN_MAX 5
hash_chain_len_histogram(const UT_hash_table * tbl)68 static void hash_chain_len_histogram(const UT_hash_table *tbl)
69 {
70 unsigned i, bkt_hist[CHAIN_MAX+1];
71 double pct = 100.0/(double)tbl->num_buckets;
72 memset(bkt_hist,0,sizeof(bkt_hist));
73 for(i=0; i < tbl->num_buckets; i++) {
74 unsigned count = tbl->buckets[i].count;
75 if (count == 0U) {
76 bkt_hist[CHAIN_0]++;
77 } else if (count < 5U) {
78 bkt_hist[CHAIN_5]++;
79 } else if (count < 10U) {
80 bkt_hist[CHAIN_10]++;
81 } else if (count < 20U) {
82 bkt_hist[CHAIN_20]++;
83 } else if (count < 100U) {
84 bkt_hist[CHAIN_100]++;
85 } else {
86 bkt_hist[CHAIN_MAX]++;
87 }
88 }
89 fprintf(stderr, "Buckets with 0 items: %.1f%%\n", (double)bkt_hist[CHAIN_0 ]*pct);
90 fprintf(stderr, "Buckets with < 5 items: %.1f%%\n", (double)bkt_hist[CHAIN_5 ]*pct);
91 fprintf(stderr, "Buckets with < 10 items: %.1f%%\n", (double)bkt_hist[CHAIN_10]*pct);
92 fprintf(stderr, "Buckets with < 20 items: %.1f%%\n", (double)bkt_hist[CHAIN_20]*pct);
93 fprintf(stderr, "Buckets with < 100 items: %.1f%%\n", (double)bkt_hist[CHAIN_100]*pct);
94 fprintf(stderr, "Buckets with > 100 items: %.1f%%\n", (double)bkt_hist[CHAIN_MAX]*pct);
95 }
96
main(int argc,char * argv[])97 int main(int argc, char *argv[])
98 {
99 int dups=0, rc, fd, done=0, err=0, want, i, padding=0, v=1, percent=100;
100 unsigned keylen, max_keylen=0, verbose=0;
101 const char *filename = "/dev/stdin";
102 char *dst;
103 stat_key *keyt, *keytmp, *keys=NULL, *keys2=NULL;
104 struct timeval start_tm, end_tm, elapsed_tm, elapsed_tm2, elapsed_tm3;
105
106 if ((argc >= 3) && (strcmp(argv[1],"-p") == 0)) {
107 percent = atoi(argv[2]);
108 v = 3;
109 }
110 if ((v < argc) && (strcmp(argv[v],"-v") == 0)) {
111 verbose=1;
112 v++;
113 }
114 if (v < argc) {
115 filename=argv[v];
116 }
117 fd=open(filename,MODE);
118
119 if ( fd == -1 ) {
120 fprintf(stderr,"open failed %s: %s\n", filename, strerror(errno));
121 return -1;
122 }
123
124 for(i=0; done==0; i++) {
125
126 want = sizeof(int);
127 dst = (char*)&keylen;
128 readmore1:
129 rc = read(fd,dst,want);
130 if (rc != want) {
131 if (rc == 0) {
132 done=1;
133 } else if (rc == -1) {
134 fprintf(stderr,"read failed: %s\n", strerror(errno));
135 err=1;
136 } else if (rc > 0) {
137 want -= rc;
138 dst += rc;
139 goto readmore1;
140 }
141 }
142
143 if (done || err) {
144 break;
145 }
146 if (keylen > max_keylen) {
147 max_keylen=keylen;
148 }
149
150 keyt = (stat_key*)malloc(sizeof(stat_key));
151 if (keyt == NULL) {
152 fprintf(stderr,"out of memory\n");
153 exit(-1);
154 }
155
156 /* read key */
157 #ifdef UNALIGNED_KEYS
158 padding = i%8;
159 #endif
160 keyt->key = (char*)malloc(padding+keylen);
161 if (keyt->key == NULL) {
162 fprintf(stderr,"out of memory\n");
163 exit(-1);
164 }
165 keyt->key += padding; /* forcibly alter the alignment of key */
166 keyt->len = keylen;
167
168 want = keylen;
169 dst = keyt->key;
170 readmore2:
171 rc = read(fd,dst,want);
172 if (rc != want) {
173 if (rc == -1) {
174 fprintf(stderr,"read failed: %s\n", strerror(errno));
175 err=1;
176 } else if (rc == 0) {
177 fprintf(stderr,"incomplete file\n");
178 err=1;
179 } else if (rc >= 0) {
180 want -= rc;
181 dst += rc;
182 goto readmore2;
183 }
184 }
185 if (err != 0) {
186 break;
187 }
188 /* if percent was set to something less than 100%, skip some keys*/
189 if (((rand()*1.0) / RAND_MAX) > ((percent*1.0)/100)) {
190 free(keyt->key-padding);
191 free(keyt);
192 continue;
193 }
194
195 /* eliminate dups */
196 HASH_FIND(hh,keys,keyt->key,keylen,keytmp);
197 if (keytmp != NULL) {
198 dups++;
199 free(keyt->key - padding);
200 free(keyt);
201 } else {
202 HASH_ADD_KEYPTR(hh,keys,keyt->key,keylen,keyt);
203 }
204 }
205
206 if (verbose != 0) {
207 unsigned key_count = HASH_COUNT(keys);
208 fprintf(stderr,"max key length: %u\n", max_keylen);
209 fprintf(stderr,"number unique keys: %u\n", key_count);
210 fprintf(stderr,"keystats memory: %u\n",
211 (unsigned)((sizeof(stat_key)+max_keylen)*key_count));
212 hash_chain_len_histogram(keys->hh.tbl);
213 }
214
215 /* add all keys to a new hash, so we can measure add time w/o malloc */
216 gettimeofday(&start_tm,NULL);
217 for(keyt = keys; keyt != NULL; keyt=(stat_key*)keyt->hh.next) {
218 HASH_ADD_KEYPTR(hh2,keys2,keyt->key,keyt->len,keyt);
219 }
220 gettimeofday(&end_tm,NULL);
221 timersub(&end_tm, &start_tm, &elapsed_tm);
222
223 /* now look up all keys in the new hash, again measuring elapsed time */
224 gettimeofday(&start_tm,NULL);
225 for(keyt = keys; keyt != NULL; keyt=(stat_key*)keyt->hh.next) {
226 HASH_FIND(hh2,keys2,keyt->key,keyt->len,keytmp);
227 if (keytmp == NULL) {
228 fprintf(stderr,"internal error, key not found\n");
229 }
230 }
231 gettimeofday(&end_tm,NULL);
232 timersub(&end_tm, &start_tm, &elapsed_tm2);
233
234 /* now delete all items in the new hash, measuring elapsed time */
235 gettimeofday(&start_tm,NULL);
236 while (keys2 != NULL) {
237 keytmp = keys2;
238 HASH_DELETE(hh2,keys2,keytmp);
239 }
240 gettimeofday(&end_tm,NULL);
241 timersub(&end_tm, &start_tm, &elapsed_tm3);
242
243 if (err == 0) {
244 printf("%.3f,%u,%u,%d,%s,%ld,%ld,%ld\n",
245 1-(1.0*keys->hh.tbl->nonideal_items/keys->hh.tbl->num_items),
246 keys->hh.tbl->num_items,
247 keys->hh.tbl->num_buckets,
248 dups,
249 (keys->hh.tbl->noexpand != 0U) ? "nx" : "ok",
250 (elapsed_tm.tv_sec * 1000000) + elapsed_tm.tv_usec,
251 (elapsed_tm2.tv_sec * 1000000) + elapsed_tm2.tv_usec,
252 (elapsed_tm3.tv_sec * 1000000) + elapsed_tm3.tv_usec );
253 }
254 return 0;
255 }
256