1 /*
2  * copyright 2010-2012 Edscott Wilson Garcia (GPL-license)
3  *
4  *
5  * Tests on a 4 GB ram box.
6  * Step 1: Create a database with over 4M records. Total key size > 4GB. Table size > 40GB
7  *         Check code, can DBH handle this?
8  *
9  *
10  * This is a very simple example program to test the 64-bit
11  * functions of the Disk Based Hash (DBH) and
12  * verify correct handling of dbh files greater than
13  * 2 Gb in size (up to 256^8/2).
14 
15  * A dbh file is created from a specified filesystem.
16  * Paths are indexed with g_string hash key
17  * Hash key collisions are noted in dbh file COLLISIONS
18  * path => Hash key<->path associations are noted in dbh file INDEX
19  * Hash key => file are noted in dbh file TABLE
20  *
21  * usage: ./filesystem path option
22  * Option can be:
23  *    "index" (create INDEX, COLLISIONS and TABLE dbh files)
24  *    "dump"  (do a foreach on all records and print summary)
25  *    "regen" (recreate TABLE dbh file with optimized physical structure)
26  *    "compare" (compare each file in TABLE with actual file on disk)
27  *    "parallel"
28  *    "thread"
29  *    "fulltest" (all of the above)
30 
31  */
32 #include "config.h"
33 #define _GNU_SOURCE             /* See feature_test_macros(7) */
34 #include <features.h>
35 #include <pthread.h>
36 
37 #ifdef HAVE_LSTAT
38 # define LSTAT lstat
39 #else
40 # define LSTAT stat
41 #endif
42 #include <string.h>
43 #include <stdlib.h>
44 #include <stdio.h>
45 #include <dbh.h>
46 #include <dirent.h>
47 #include <sys/types.h>
48 #include <inttypes.h>
49 
50 #ifdef HAVE_GDBM_H
51 # include <gdbm.h>
52 #endif
53 
54 #ifdef HAVE_SYS_WAIT_H
55 # include <sys/wait.h>
56 #endif
57 
58 #ifdef HAVE_SYS_RESOURCE_H
59 #include <sys/time.h>
60 #include <sys/resource.h>
61 #endif
62 
63 #include <sys/stat.h>
64 #include <fcntl.h>
65 #include <unistd.h>
66 #include <errno.h>
67 #ifdef HAVE_WINDOWS_H
68 #include <windows.h>
69 #endif
70 #ifndef O_BINARY
71 #define O_BINARY 0x0
72 #endif
73 
74 
75 #include <glib.h>
76 
77 #define SKIP_DIR "/home"
78 #define DIRECTORY "/home/edscott/testfiles/"
79 const gchar *directory=DIRECTORY;
80 
81 #define RANDOM_LIST DIRECTORY"randomlist.txt"
82 
83 #define COLLISIONS DIRECTORY"performance.collisions.dbh"
84 #define TABLE DIRECTORY"performance.table.dbh"
85 
86 #define QINDEX DIRECTORY"performance.qindex.dbh"
87 #define QTABLE DIRECTORY"performance.qtable.dbh"
88 
89 #define GCOLLISIONS DIRECTORY"performance.gcollisions.dbf"
90 #define GTABLE DIRECTORY"performance.gtable.dbf"
91 
92 #define REBUILT DIRECTORY"performance.index.rebuilt.dbh"
93 #define TEST_INDEX DIRECTORY"parperformance.index.dbh"
94 
95 #define HELP \
96 "     Options:\n"\
97 "       create: Create table files of items within the specified \"path\"\n"\
98 "       random: Create a random list for tests\n"\
99 "       regen: Regenerate the DBH table (sweep/fanout)\n"\
100 "       test:  Random r/w tests\n"\
101 " *To test a DBH table larger than 4 GB, choose a \"path\" with more than 4GB.\n"\
102 "  Do not alter any item within \"path\" during the test or error will occur."
103 
104 
105 typedef struct dump_t{
106     int original_count;
107     long long original_sum;
108     long long sum;
109     int which;
110     int count;
111     gint natural;
112 }dump_t;
113 
114 
115 static
get_hash_key(unsigned char bucket,const char * pre_key)116 gchar *get_hash_key(unsigned char bucket, const char *pre_key){
117     GString *gs = g_string_new(pre_key);
118     gchar *key;
119     key=g_strdup_printf("%c%10u", bucket, g_string_hash(gs));
120     g_string_free(gs, TRUE);
121     return key;
122 }
123 
/*
 * Report on stderr why path/file is being skipped; silent when reason is NULL.
 * Always returns NULL so callers can write "return skip_msg(...)".
 */
static gchar *
skip_msg(const gchar *path, const gchar *file, const gchar *reason)
{
    if (reason != NULL) {
        fprintf(stderr, "skipping \"%s/%s\" (%s)\n", path, file, reason);
    }
    return NULL;
}
128 
/*
 * Print one tab-separated line of getrusage() statistics to stdout.
 *
 * count    - number of items processed so far; 0 prints the column header instead
 * walltime - elapsed wall-clock value supplied by the caller
 *
 * Compiles to a no-op when HAVE_SYS_RESOURCE_H is not defined.
 */
void out(long count, long walltime)
{
#ifdef HAVE_SYS_RESOURCE_H
    struct rusage ru;

    if (getrusage(RUSAGE_SELF, &ru) != 0) {
        fprintf(stderr, "rusage(): %s\n", strerror(errno));
        return;
    }
    if (count == 0) {
        fputs("# count walltime usertime systime resident shared data stack page_r page_f swaps block_i block_o\n", stdout);
        return;
    }
    /* Column order matches the header line above. */
    fprintf(stdout,
            "%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\n",
            count,
            walltime,
            (long)ru.ru_utime.tv_sec,
            (long)ru.ru_stime.tv_sec,
            (long)ru.ru_maxrss,
            (long)ru.ru_ixrss,
            (long)ru.ru_idrss,
            (long)ru.ru_isrss,
            (long)ru.ru_minflt,
            (long)ru.ru_majflt,
            (long)ru.ru_nswap,
            (long)ru.ru_inblock,
            (long)ru.ru_oublock);
    fflush(stdout);
#endif
}
160 static gchar *
get_fullpath(const gchar * path,struct dirent * d,struct stat * st)161 get_fullpath(const gchar *path, struct dirent *d, struct stat *st){
162     if(strcmp(d->d_name, ".")==0)  return NULL; //skip_msg(path, d->d_name, NULL);
163     if(strcmp(d->d_name, "..")==0) return NULL; //skip_msg(path, d->d_name, NULL);
164     if(strstr(d->d_name, ".dbh"))  return NULL; //skip_msg(path, d->d_name, NULL);
165     if(strstr(d->d_name, ".dbf"))  return NULL; //skip_msg(path, d->d_name, NULL);
166     gchar *fullpath=g_build_filename(path,d->d_name,NULL);
167     if (LSTAT(fullpath,st)<0 ){
168         g_free(fullpath);
169         return skip_msg(path, d->d_name, "cannot stat");
170     }
171     if (st->st_size == 0) {
172         g_free(fullpath);
173         return skip_msg(path, d->d_name, NULL);
174         return skip_msg(path, d->d_name, "st->st_size == 0");
175     }
176     // Let's put a 0.2 MB limit for recordsize in the test.
177     if (st->st_size > 200000LL) {
178         g_free(fullpath);
179 
180         return skip_msg(path, d->d_name, NULL);
181 //        return skip_msg(path, d->d_name, "file is too big (this is arbitrary)");
182     }
183     if (!S_ISREG(st->st_mode) && !S_ISDIR(st->st_mode)) {
184         g_free(fullpath);
185         return NULL; //skip_msg(path, d->d_name, "non regular file");
186     }
187     return fullpath;
188 }
189 //////////////////////////////////////////////////////////////////////
190 
191 
192 static int
check_filesystem(const char * path,size_t limit,size_t * records_p,size_t * size_p,size_t * key_storage_p)193 check_filesystem(const char *path, size_t limit, size_t *records_p, size_t *size_p, size_t *key_storage_p)
194 {
195     DIR *directory;
196     struct dirent *d;
197 
198     errno=0;
199     directory = opendir(path);
200     if(!directory) {
201 	fprintf(stderr,"Cannot open %s (%s)\n" ,path, strerror(errno));
202 	return -1;
203     }
204 #define     _BSD_SOURCE 1
205 
206 while((d = readdir(directory)) != NULL)
207     {
208 
209 	gboolean is_dir=FALSE;
210         struct stat st;
211 
212         gchar *fullpath=get_fullpath(path, d, &st);
213         if (!fullpath) continue;
214 
215 	if (S_ISDIR(st.st_mode)) is_dir=TRUE;
216 
217         if (is_dir) {
218              if (strcmp(fullpath, SKIP_DIR)){
219 		int retval=0;
220 		retval = check_filesystem(fullpath, limit, records_p, size_p, key_storage_p);
221 		if (retval < 0) return -1;
222              }
223 	} else if (st.st_size <= limit){
224             (*records_p)++;
225             (*size_p) += st.st_size;
226             (*key_storage_p) += (strlen(fullpath));
227         }
228 
229 
230 	g_free(fullpath);
231     }
232     closedir(directory);
233     return (1);
234 }
235 
236 //////////////////////////////////////////////////////////////////////
237 
238 static int
qread_filesystem(DBHashTable * dbh,DBHashTable * index,const char * path,dump_t * dump_p)239 qread_filesystem(DBHashTable *dbh, DBHashTable *index, const char *path, dump_t *dump_p)
240 {
241     DIR *directory;
242     int count = 0;
243     struct dirent *d;
244 
245 
246     directory = opendir(path);
247     if(!directory) {
248 	fprintf(stderr,"Cannot open %s\n" ,path);
249 	return -1;
250     }
251 
252     // We add 1 to q_number to save the null terminating char.
253     unsigned char q_number[DBH_KEYLENGTH(dbh)+1];
254     unsigned char q_key[DBH_KEYLENGTH(index)+1];
255     //fprintf(stderr, "keylength=%d\n", DBH_KEYLENGTH(index));
256 
257 #define     _BSD_SOURCE 1
258 while((d = readdir(directory)) != NULL)
259     {
260         // allocate fullpath...
261         struct stat st;
262 
263         gchar *fullpath=get_fullpath(path, d, &st);
264         if (!fullpath) continue;
265 
266         if (S_ISDIR(st.st_mode)) {
267              // skip value
268              if (strcmp(fullpath, SKIP_DIR)){
269                 int retval;
270 		retval = qread_filesystem(dbh, index, fullpath, dump_p);
271 		if (retval > 0) count += retval;
272              }
273 	} else { // not a dir.
274              // This is useful if our data size grows over 1024 B:
275              if (DBH_MAXIMUM_RECORD_SIZE(dbh) < st.st_size) {
276                  dbh_set_size(dbh,st.st_size);
277                  fprintf(stderr, "dbh_set_size set to %lld (%s)\n",(long long)st.st_size, fullpath);
278              }
279              // This is binary mode in unix.
280              int fd=open(fullpath,O_RDONLY|O_BINARY);
281              if (fd < 0) {
282                 fprintf(stderr, "cannot open %s for read\n",fullpath);
283                 g_free(fullpath); continue;
284              }
285              // This works instead of dbh_set_data():
286              // read data directly into DBH_DATA(dbh)
287              dbh_set_recordsize(dbh,st.st_size);
288              if (read(fd,DBH_DATA(dbh),st.st_size) < 0){
289                 fprintf(stderr, "problem reading %lld bytes from %s\n",
290                         (long long)st.st_size,fullpath);
291                 close(fd);
292                 g_free(fullpath); continue;
293              }
294              close(fd);
295 
296              // index...
297              // set the index key
298              memset(q_key, 0, DBH_KEYLENGTH(index)+1);
299              strncpy((gchar *)q_key, fullpath, DBH_KEYLENGTH(index));
300              dbh_set_key(index, q_key);
301              // set the index data
302              memset(q_number, 0, DBH_KEYLENGTH(dbh)+1);
303              dbh_genkey(q_number, DBH_KEYLENGTH(dbh), dump_p->natural++);
304              gint recordsize = strlen((gchar *)q_number)+1;
305              //fprintf(stderr,"Recordsize: %d\n", recordsize);
306              dbh_set_recordsize(index, recordsize);
307              dbh_set_data(index, q_number, recordsize);
308              // update the index
309              //fprintf(stderr, "key: %s, data %s\n", (gchar *)DBH_KEY(index), (gchar *)DBH_DATA(index));
310              dbh_update(index);
311              //fprintf(stderr, "index q_number:\"%s\" data:\"%s\"\n", q_number, (char *)DBH_DATA(index));
312 
313              // table...
314              // use the q number as the access key
315              dbh_set_key(dbh, q_number);
316              dbh_update(dbh);
317              dump_p->sum += st.st_size;
318              count++;
319              //fprintf(stdout,"%s: adding %ld bytes of data to table\n",fullpath, st.st_size);
320 	 //
321 	}
322 
323 	g_free(fullpath);
324     }
325     closedir(directory);
326 //	printf ("%s -> %d files\n",path,count);
327     return (count);
328 }
329 
330 static int
read_filesystem(DBHashTable * dbh,DBHashTable * collisions,const char * path,dump_t * dump_p)331 read_filesystem(DBHashTable *dbh, DBHashTable *collisions, const char *path, dump_t *dump_p)
332 {
333     DIR *directory;
334     int count = 0;
335     struct dirent *d;
336 
337     directory = opendir(path);
338     if(!directory) {
339 	fprintf(stderr,"Cannot open %s\n" ,path);
340 	return -1;
341     }
342 #define     _BSD_SOURCE 1
343 while((d = readdir(directory)) != NULL)
344     {
345 
346 	gboolean is_dir=FALSE;
347 	unsigned char bucket='A';
348         struct stat st;
349 
350         gchar *fullpath=get_fullpath(path, d, &st);
351         if (!fullpath) continue;
352 
353 
354 	// while hash key is already used, keep moving to next bucket.
355 	gchar  *key=NULL;
356         while (1) {
357 	    key=get_hash_key(bucket,fullpath);
358             dbh_set_key (dbh,(unsigned char *)key);
359             if (!dbh_load(dbh)) break;
360 	    fprintf(stderr, "HASH colision: %s -> %s\n", key,fullpath);
361 	    bucket++;
362             g_free(key);
363 	}
364         g_free(key);
365 
366 
367 
368 
369         // If we are beyond first bucket, a collision has occured.
370         if (bucket > 'A'){
371 	    char collision_key[255];
372 	    memset(collision_key,0,255);
373 	    strncpy(collision_key,fullpath, 254);
374 
375 	    dbh_set_key (collisions,(unsigned char *)collision_key);
376 
377             dbh_set_size(collisions,DBH_KEYLENGTH(dbh));
378 	    dbh_set_data(collisions,(void *)DBH_KEY(dbh),DBH_KEYLENGTH(dbh));
379 	    dbh_update(collisions);
380 	}
381 
382 
383 
384 	if (S_ISDIR(st.st_mode)) is_dir=TRUE;
385 
386         if (!is_dir) {
387 	 // This is useful if our data size grows over 1024 B:
388 	 if (DBH_MAXIMUM_RECORD_SIZE(dbh) < st.st_size) {
389 	     dbh_set_size(dbh,st.st_size);
390 	     fprintf(stderr, "dbh_set_size set to %lld\n",(long long)st.st_size);
391 	 }
392 	 int fd=open(fullpath,O_RDONLY);
393 	 if (fd < 0) {
394 	    fprintf(stderr, "cannot open %s for read\n",fullpath);
395 	    g_free(fullpath); continue;
396 	 }
397 	 // This works instead of dbh_set_data():
398 	 if (read(fd,DBH_DATA(dbh),st.st_size) < 0){
399 	    fprintf(stderr, "problem reading %lld bytes from %s\n",
400 		    (long long)st.st_size,fullpath);
401 	    close(fd);
402 	    g_free(fullpath); continue;
403 	 }
404 	 close(fd);
405 	 dbh_set_recordsize(dbh,st.st_size);
406 	 dbh_update(dbh);
407 	 dump_p->sum += st.st_size;
408 	 count++;
409          //fprintf(stderr,"%s\n",fullpath);
410 	 //
411 	}
412         if (is_dir) {
413              if (strcmp(fullpath, SKIP_DIR)){
414 		int retval;
415 		retval = read_filesystem(dbh,collisions, fullpath, dump_p);
416 		if (retval > 0) count += retval;
417              }
418 	}
419 
420 	g_free(fullpath);
421     }
422     closedir(directory);
423 //	printf ("%s -> %d files\n",path,count);
424     return (count);
425 }
426 
427 
428 // Keep these variables global so that recursion won't gobble up all memory...
429     size_t data_size = 1024;
430     void *data_ptr = NULL;
431 
432 static int
gread_filesystem(GDBM_FILE dbf,GDBM_FILE collisions,const char * path,dump_t * dump_p)433 gread_filesystem(GDBM_FILE dbf, GDBM_FILE collisions, const char *path, dump_t *dump_p)
434 {
435     DIR *directory;
436     int count = 0;
437     struct dirent *d;
438 
439     directory = opendir(path);
440     if(!directory) {
441 	fprintf(stderr,"Cannot open %s\n" ,path);
442 	return -1;
443     }
444     if (!data_ptr) data_ptr = (void *)malloc(data_size);
445     if (!data_ptr) g_error("should not happen, malloc 1024\n");
446 
447     gchar  *key_s=NULL;
448 #define     _BSD_SOURCE 1
449 while((d = readdir(directory)) != NULL)
450     {
451 
452 	gboolean is_dir=FALSE;
453 	unsigned char bucket='A';
454         // allocate fullpath...
455         struct stat st;
456 
457         gchar *fullpath=get_fullpath(path, d, &st);
458         if (!fullpath) continue;
459 
460 
461 	// while hash key is already used, keep moving to next bucket.
462         datum key;
463         datum content;
464 
465         g_free(key_s);
466         while (1) {
467 	    key_s=get_hash_key(bucket,fullpath);
468             // Set key
469             key.dptr = key_s;
470             key.dsize = 11;
471             // Does the key collide?
472             if (!gdbm_exists(dbf, key)) break;
473 	    fprintf(stderr, "HASH colision: %s -> %s\n", key_s, fullpath);
474 	    bucket++;
475             g_free(key_s);
476 	}
477 
478         // up above and on eoc: g_free(key_s);
479 
480 
481 
482 
483         // If we are beyond first bucket, a collision has occured.
484         if (bucket > 'A'){
485 	    char collision_key[255];
486 	    memset(collision_key,0,255);
487 	    strncpy(collision_key,fullpath, 254);
488 
489             // set the collision key:
490             key.dptr = collision_key;
491             key.dsize = 254; // XXX we could probably do better here, using variable key size...
492 
493             // set the data:
494             content.dptr = key_s;
495             content.dsize = 11;
496 
497             // Update the record
498             gdbm_store(collisions, key, content, GDBM_INSERT);
499 
500 	}
501 
502 
503 
504 	if (S_ISDIR(st.st_mode)) is_dir=TRUE;
505 
506         if (!is_dir) {
507 
508 	 // This is useful if our data size grows over 1024 B:
509 	 if (data_size < st.st_size) {
510              g_free(data_ptr);
511              data_size = st.st_size;
512              data_ptr = (void *)malloc(data_size);
513              if (!data_ptr){
514                  fprintf(stderr, "Cannot allocate %lld bytes for gdbm data_ptr. Terminating now...\n", (long long)data_size);
515                  exit(1);
516              }
517 	     fprintf(stderr, "gdbm size set to %lld\n",(long long)st.st_size);
518 	 }
519 	 int fd=open(fullpath,O_RDONLY);
520 	 if (fd < 0) {
521 	    fprintf(stderr, "cannot open %s for read\n",fullpath);
522 	    g_free(fullpath); continue;
523 	 }
524 	 // This works instead of dbh_set_data():
525 	 if (read(fd, data_ptr, st.st_size) < 0){
526 	    fprintf(stderr, "problem reading %lld bytes from %s\n",
527 		    (long long)st.st_size,fullpath);
528 	    close(fd);
529 	    g_free(fullpath); continue;
530 	 }
531 	 close(fd);
532 
533          // Set key
534          key.dptr = key_s;
535          key.dsize = 11;
536 
537          // Set data
538          content.dptr = data_ptr;
539          content.dsize = st.st_size;
540 
541          // update data
542          gdbm_store (dbf, key, content, GDBM_INSERT);
543 
544          dump_p->sum += st.st_size;
545 	 count++;
546          //fprintf(stderr,"%s\n",fullpath);
547 	 //
548 	}
549         if (is_dir) {
550              if (strcmp(fullpath, SKIP_DIR)){
551 		int retval;
552 		retval = gread_filesystem(dbf,collisions, fullpath, dump_p);
553 		if (retval > 0) count += retval;
554              }
555 	}
556 
557 	g_free(fullpath);
558     }
559     closedir(directory);
560     // free final leftover
561     g_free(key_s);
562 
563 //	printf ("%s -> %d files\n",path,count);
564     return (count);
565 }
566 
#if 0

/* foreach callback: count the record and add its size to the dump_t totals. */
static void
operate(DBHashTable *dbh)
{
    dump_t *totals = dbh->user_data;

    totals->count++;
    totals->sum += DBH_RECORD_SIZE(dbh);
}

/* Table consulted by compare() to turn a record key into a path. */
static DBHashTable *dbh_key;

/*
 * foreach callback: look the record's key up in dbh_key, read the file named
 * by that entry and compare its bytes with the record data.  Mismatches are
 * printed; a dot is printed for every 1000th match.
 */
static void
compare(DBHashTable *dbh)
{
    dbh_set_key(dbh_key, (unsigned char *)DBH_KEY(dbh));
    dbh_load(dbh_key);
    char *path = DBH_DATA(dbh_key);

    int fd = open(path, O_RDONLY);
    if (fd < 0) {
        printf("cannot open %s for read\n", path);
        return;
    }

    struct stat st;
    LSTAT(path, &st);
    void *contents = malloc(st.st_size);
    if (contents == NULL) {
        fprintf(stderr, "malloc: %s\n", strerror(errno));
        exit(1);
    }
    if (read(fd, contents, st.st_size) < 0) {
        printf("problem reading %lld bytes from %s\n",
               (long long)st.st_size, path);
        close(fd);
        free(contents);
        return;
    }
    close(fd);

    if (memcmp(contents, DBH_DATA(dbh), st.st_size) == 0) {
        static int count = 0;
        if (count++ % 1000 == 0) {
            printf(".");
            fflush(stdout);
        }
    } else {
        printf("%s does not compare!\n", path);
    }
    free(contents);
}
#endif
613 
#if 0
/*
 * Walk every record of TABLE (sweep when dump_p->which is non-zero, fanout
 * otherwise), letting operate() refill dump_p->count and dump_p->sum, then
 * print the totals.  Always returns 1.
 * The former "fulltest" comparison against original_count/original_sum was
 * already commented out and is not reproduced here.
 */
static int
dump(dump_t *dump_p)
{
    const char *text = dump_p->which ? "Sweep" : "Fanout";

    dump_p->count = 0;
    dump_p->sum = 0;
    fprintf(stdout,"%s is now being performed by pid %d\n", text, getpid());

    // PARALLEL SAFE need not be specified on READ_ONLY
    DBHashTable *dbh = dbh_new(TABLE, NULL, DBH_READ_ONLY);
    dbh->user_data = dump_p;
    if (dump_p->which) {
        dbh_foreach_sweep(dbh, operate);
    } else {
        dbh_foreach_fanout(dbh, operate);
    }
    dbh_close(dbh);

    fprintf(stdout,
"  Sweep data:\n"\
"    Items in the DBH table (filesystem count) = %d\n"\
"    Sum of data items size saved in DBH table = %lld\n",
            dump_p->count, dump_p->sum);
    return 1;
}
#endif
654 
655 
#if 0
/* Exit with a warning unless both TABLE and COLLISIONS exist on disk. */
static void
check_files(void)
{
    if (!g_file_test(TABLE, G_FILE_TEST_EXISTS)) {
        g_warning("Index file %s has not yet been created\n",
                  TABLE);
        exit(1);
    }
    if (!g_file_test(COLLISIONS, G_FILE_TEST_EXISTS)) {
        g_warning("DBH table %s has not yet been created\n",
                  COLLISIONS);
        exit(1);
    }
}
#endif
671 
/*
 * Copy the current record (key and data) of dbh_thread into the table
 * stored in dbh_thread->user_data.  The whole copy-and-update sequence runs
 * between dbh_mutex_lock() and dbh_mutex_unlock() on the destination table.
 */
void rebuild(DBHashTable *dbh_thread)
{
    DBHashTable *target = dbh_thread->user_data;

    dbh_mutex_lock(target);

    dbh_set_key(target, DBH_KEY(dbh_thread));
    dbh_set_recordsize(target, DBH_RECORD_SIZE(dbh_thread));
    dbh_set_data(target, DBH_DATA(dbh_thread), DBH_RECORD_SIZE(dbh_thread));
    dbh_update(target);

    dbh_mutex_unlock(target);
}
687 
688 long long checksum=0;
689 GSList *random_list = NULL;
690 GSList *random_numlist = NULL;
691 GSList *random_qlist = NULL;
692 GSList *random_glist = NULL;
693 gint random_count=0;
get_random_list(DBHashTable * dbh,void * data)694 void get_random_list(DBHashTable *dbh, void *data){
695     // flip a coin...
696     //static struct drand48_data rand_buffer;
697     static unsigned int s=7;
698     if (random_list == NULL){
699         srand48(time(NULL));
700     }
701     double coin = drand48();
702     //fprintf(stderr, "coin 1= %4.2lf\n", coin);
703  //   if (coin < 0.5) return;
704 
705     coin = drand48();
706     //fprintf(stderr, "coin 2= %4.2lf\n", coin);
707     const gchar *path = (gchar *)DBH_KEY(dbh);
708     if (coin > 0.5){
709         random_list = g_slist_prepend(random_list, g_strdup(path));
710         random_numlist = g_slist_prepend(random_numlist, GINT_TO_POINTER(++random_count));
711     }
712     else {
713         random_list = g_slist_append(random_list, g_strdup(path));
714         random_numlist = g_slist_append(random_numlist, GINT_TO_POINTER(++random_count));
715     }
716     //fprintf(stderr, "random key in list: \"%s\"\n", path);
717     checksum += s;
718     s++;
719 }
#if 0
/* foreach callback: print the key and data of the current record as strings. */
void dump_qindex(DBHashTable *xdbh)
{
    fprintf(stderr, "DUMP key: %s, data: %s\n", (gchar *)DBH_KEY(xdbh), (gchar *)DBH_DATA(xdbh));
}
#endif
727 
728 
/*
 * Scan the directory named by argv[2] with check_filesystem() and report on
 * stderr how many files of at most 2000000 bytes it holds, their total size,
 * and the key storage needed for full paths versus fixed 256-byte keys.
 *
 * Returns 1 after reporting, or -1 when argv[2] is not a directory.
 */
static int score(char **argv){
    const char *root = argv[2];
    struct stat st;

    if (stat(root, &st) < 0 || !S_ISDIR(st.st_mode)){
        // Report the path that was actually tested and stop: scanning a
        // non-directory can only fail.
        fprintf(stderr, "%s is not a directory.\n", root);
        return -1;
    }

    const size_t limit = 2000000;
    size_t records = 0, size = 0, key_storage = 0;

    fprintf(stderr, "Checking %s for table creation with files <= %ld Mbytes...\n", root, (long)(limit/1000000));
    if (check_filesystem(root, limit, &records, &size, &key_storage) < 0) {
        fprintf(stderr, "Scan of %s stopped early; the totals below are partial.\n", root);
    }
    fprintf(stderr, "%s has %ld records for a total of %ld Mb\n", root, (long)records, (long)(size/1000000));
    fprintf(stderr, "full variable key storage=%ld KB versus fixed size = %ld KB \n",(long)key_storage/1000, (long)(256*records/1000));
    return (1);
}
741 
742 static int
create_gdbm_table(const gchar * path)743 create_gdbm_table(const gchar *path){
744     time_t gdbm_creation_time;
745     time_t start = time(NULL);
746 
747     dump_t dump_v;
748     memset(&dump_v, 0, sizeof(dump_t));
749 
750     fprintf(stderr, "///////////////////  GDBM g_hash key table generation /////////////////\n");
751     fprintf(stderr,"Creating index now, process 0x%x recursively reading %s\n", getpid(), path);
752     // This table is the bucket index file. The index file is also
753     // the data table.
754     //
755     GDBM_FILE dbf = gdbm_open(GTABLE, 0, GDBM_NEWDB, 0770, NULL);
756     // This table handles collisions. If a path is indexed here (first 254
757     // bytes of the string), then the data element is the actual hash table
758     // key. This avoids a collision with a path that has already been indexed.
759     GDBM_FILE collisions = gdbm_open(GCOLLISIONS, 0, GDBM_NEWDB, 0770, NULL);
760     // Read the filesystem data into the DBH table.
761     dump_v.sum = 0;
762     dump_v.original_count=gread_filesystem(dbf, collisions, path, &dump_v);
763     dump_v.original_sum = dump_v.sum;
764     gdbm_close(dbf);
765     gdbm_close(collisions);
766     gdbm_creation_time = time(NULL) - start;
767     fprintf(stderr,
768 "  Index created:\n"\
769 "    Items in the GDBM table (filesystem count) = %d\n"\
770 "    Sum of data items size saved in GDBM table = %lld\ntime = %lld s.",
771 	    dump_v.original_count, dump_v.original_sum,
772             (long long) gdbm_creation_time);
773 
774    fprintf(stderr, "gdbm creation time = %lld s.\n", (long long)gdbm_creation_time);
775    return (1);
776 
777 }
778 
779   // This DBH uses more than one bucket in order to handle hashtable
780   // key collisions.
781 static int
create_dbh_table(gchar * path)782 create_dbh_table(gchar *path){
783 
784     dump_t dump_v;
785     memset(&dump_v, 0, sizeof(dump_t));
786     time_t dbh_creation_time;
787     time_t start = time(NULL);
788     fprintf(stderr, "///////////////////  DBH g_hash key table generation /////////////////\n");
789     fprintf(stderr,"Creating index now, process 0x%x recursively reading %s\n", getpid(), path);
790     // This table is the bucket index file. The index file is also
791     // the data table.
792     //
793     // Our key length here is one extra byte to handle bucket id for collisions.
794     unsigned char key_length = 11;
795     DBHashTable *dbh = dbh_new(TABLE, &key_length, DBH_CREATE);
796     // This table handles collisions. If a path is indexed here (first 254
797     // bytes of the string), then the data element is the actual hash table
798     // key. This avoids a collision with a path that has already been indexed.
799     key_length = 254;
800     DBHashTable *collisions = dbh_new(COLLISIONS, &key_length, DBH_CREATE);
801     // Read the filesystem data into the DBH table.
802     dump_v.sum = 0;
803     dump_v.original_count=read_filesystem(dbh, collisions, path, &dump_v);
804     dump_v.original_sum = dump_v.sum;
805     dbh_close(dbh);
806     dbh_close(collisions);
807     fprintf(stderr,
808 "  Index created:\n"\
809 "    Items in the DBH table (filesystem count) = %d\n"\
810 "    Sum of data items size saved in DBH table = %lld\n",
811 	    dump_v.original_count, dump_v.original_sum);
812     dbh_creation_time = time(NULL) - start;
813     fprintf(stderr, "dbh creation time = %lld s.\n", (long long)dbh_creation_time);
814     return 1;
815 
816 }
817 
818 
819 // quantified key index.
820 static int
create_qdbh_table(gchar * path)821 create_qdbh_table(gchar *path)   {
822     dump_t dump_v;
823     memset(&dump_v, 0, sizeof(dump_t));
824     time_t q_creation_time;
825     time_t start = time(NULL);
826     fprintf(stderr, "///////////////////  DBH quantified key table generation ///////////////////////\n");
827     fprintf(stderr,"Creating qindex now, process 0x%x recursively reading %s\n", getpid(), path);
828     // This table is the bucket index file. The index file is also
829     // the data table.
830     unsigned char key_length = 10;
831     DBHashTable *dbh = dbh_new(QTABLE, &key_length, DBH_CREATE);
832     key_length = 254;
833     DBHashTable *index = dbh_new(QINDEX, &key_length, DBH_CREATE);
834     // Read the filesystem data into the DBH table.
835     dump_v.natural = 1;
836     dump_v.sum = 0;
837     dump_v.original_count=qread_filesystem(dbh,index, path,&dump_v);
838     dump_v.original_sum = dump_v.sum;
839     dbh_close(dbh);
840     dbh_close(index);
841     fprintf(stderr,
842 "  Q Index created: (%lld records, %lld data, %lld erased, %lld format)\n"\
843 "  Q Table created: (%lld records, %lld data, %lld erased, %lld format)\n"\
844 "    Items in the DBH table (filesystem count) = %d\n"\
845 "    Sum of data items size saved in DBH table = %lld\n",
846      DBH_RECORDS(index), DBH_DATA_SPACE(index), DBH_ERASED_SPACE(index), DBH_FORMAT_SPACE(index),
847      DBH_RECORDS(dbh), DBH_DATA_SPACE(dbh), DBH_ERASED_SPACE(dbh), DBH_FORMAT_SPACE(dbh),
848 	    dump_v.original_count, dump_v.original_sum);
849     q_creation_time = time(NULL) - start;
850     //index=dbh_new(QINDEX,&key_length, DBH_READ_ONLY);
851     //dbh_foreach_sweep(index, dump_qindex);
852     //dbh_close(index);
853     fprintf(stderr, "qdbh creation time = %lld s.\n", (long long)q_creation_time);
854     return 1;
855   }
856 
857 static int
mkdir_output(void)858 mkdir_output(void){
859     if (g_mkdir_with_parents(DIRECTORY, 0770) < 0){
860 	if (!g_file_test(DIRECTORY, G_FILE_TEST_IS_DIR)){
861 	    g_warning("mkdir(%s): %s\n", DIRECTORY, strerror(errno));
862 	    exit(1);
863 	}
864     }
865     if (!g_file_test(DIRECTORY, G_FILE_TEST_IS_DIR)){
866 	g_warning("Failed test: g_file_test(%s, G_FILE_TEST_IS_DIR)\n",
867 		DIRECTORY );
868 	exit(1);
869     }
870     return 1;
871 }
872 
load_grow_dbh(DBHashTable * in_table,const gchar * key)873 static gint load_grow_dbh(DBHashTable *in_table, const gchar *key){
874     DBHashTable *table;
875     if (in_table)table = in_table;
876     else table = dbh_new("grow.dbh", NULL, DBH_READ_ONLY);
877 
878     dbh_set_key(table, (unsigned char *)key);
879     int retval = 0;
880     if (!dbh_load(table)) {
881       fprintf(stderr, "cannot load table key  \"%s\"\n", key);
882       retval=-1;
883     }
884 
885     if (in_table==NULL) dbh_close(table);
886     return retval;
887 }
load_grow_dbf(GDBM_FILE in_gtable,const gchar * inkey)888 static gint load_grow_dbf(GDBM_FILE in_gtable, const gchar *inkey){
889     gint retval = 0;
890       GDBM_FILE gtable;
891       if (in_gtable) gtable = in_gtable;
892       else gtable = gdbm_open("grow.dbf", 0, GDBM_READER, 0770, NULL);
893 
894       datum key;
895 
896 
897 
898       int ik = atoi(inkey);
899         key.dptr = (void *)&ik;
900         key.dsize = sizeof(int);
901 
902       datum record = gdbm_fetch(gtable, key);
903       if (!record.dptr) {
904           fprintf(stderr, "cannot load gdbm table key  \"%s\"\n", inkey);
905           retval=-1;
906       }
907       g_free(record.dptr);
908       if (in_gtable==NULL) gdbm_close(gtable);
909       return retval;
910 }
911 
load_dbh_item(const gchar * item)912 static gint load_dbh_item(const gchar *item){
913     gint retval = 0;
914     gchar collision_key[256];
915     DBHashTable *collisions = dbh_new(COLLISIONS, NULL, DBH_READ_ONLY);
916     DBHashTable *table = dbh_new(TABLE, NULL, DBH_READ_ONLY);
917 
918     memset(collision_key, 0, DBH_KEYLENGTH(collisions)+1);
919     memcpy(collision_key, item, strlen(item));
920     dbh_set_key(collisions, (unsigned char *)collision_key);
921     if (!dbh_load(collisions)){
922       // get regular hash key in 'A' bucket.
923       gchar *k=get_hash_key('A', item);
924       dbh_set_key(table, (unsigned char *) k);
925       g_free(k);
926     } else {
927       dbh_set_key(table, DBH_DATA(collisions));
928     }
929     if (!dbh_load(table)) {
930       fprintf(stderr, "cannot load table key  \"%s\"\n", item);
931       retval=-1;
932     }
933 
934     dbh_close(table);
935     dbh_close(collisions);
936       return retval;
937 }
938 
load_qdbh_item(const gchar * item)939 static gint load_qdbh_item(const gchar *item){
940     gint retval = 0;
941       DBHashTable *table = dbh_new(QTABLE, NULL, DBH_READ_ONLY);
942       DBHashTable *index = dbh_new(QINDEX, NULL, DBH_READ_ONLY);
943     //fprintf(stderr, "keylength=%d/%d\n", DBH_KEYLENGTH(index),key_length);
944 
945       gchar index_key[DBH_KEYLENGTH(index)];
946 
947       memset(index_key, 0, DBH_KEYLENGTH(index));
948       memcpy(index_key, item, strlen(item));
949       dbh_set_key(index, (unsigned char *)index_key);
950       if (!dbh_load(index)){
951           fprintf(stderr, "Cannot load index key %s\n", item);
952           retval=-1;
953       } else {
954           dbh_set_key(table, DBH_DATA(index));
955           if (!dbh_load(table)) fprintf(stderr, "cannot load qtable key \"%s\"\n", item);
956       }
957 
958       dbh_close(index);
959       dbh_close(table);
960       return retval;
961 }
962 
load_gdbm_item(const gchar * item)963 static gint load_gdbm_item(const gchar *item){
964     gint retval = 0;
965       GDBM_FILE gcollisions = gdbm_open(GCOLLISIONS, 0, GDBM_READER, 0770, NULL);
966       GDBM_FILE gtable = gdbm_open(GTABLE, 0, GDBM_READER, 0770, NULL);
967 
968       gchar collision_key[256];
969       gint c_key_length=11;
970       memset(collision_key, 0, c_key_length+1);
971       memcpy(collision_key, item, strlen(item));
972       datum gkey;
973 
974       gkey.dptr = collision_key;
975       gkey.dsize = c_key_length;
976 
977       datum record = gdbm_fetch(gcollisions, gkey);
978       if (record.dptr == NULL){
979           // get regular hash key in 'A' bucket.
980           gchar *k=get_hash_key('A', item);
981           gkey.dptr = g_strdup(k);
982           gkey.dsize = 11;
983           g_free(k);
984       } else {
985           gkey.dptr = record.dptr;
986           gkey.dsize = 11;
987       }
988       record = gdbm_fetch(gtable, gkey);
989       if (!record.dptr) {
990           fprintf(stderr, "cannot load gdbm table key  \"%s\"\n", item);
991           retval=-1;
992       }
993       g_free(record.dptr);
994       g_free(gkey.dptr);
995       gdbm_close(gcollisions);
996       gdbm_close(gtable);
997       return retval;
998 }
999 
#if 0
/*
 * Compiled out: state record and dbh_foreach() callback of a
 * single-threaded "grow" variant.
 * NOTE(review): presumably superseded by w_grow_f()/grow_write_f() --
 * confirm before deleting.
 */
/* State handed to grow_f() through the callback data pointer. */
typedef struct grow_t {
        DBHashTable *dbh;   /* table the junk data is set on */
        long records;       /* number of callback invocations so far */
        long size;          /* sum of the random data sizes so far */
        time_t start;       /* reference time for the progress report */
} grow_t;
/*
 * Per-record callback: sets a random amount (0..255 bytes) of junk as
 * the data of grow_p->dbh, updates the counters and prints a progress
 * line every 1000 records.  The key generation and dbh_update() calls
 * are commented out, so nothing is written.  `junk` is never
 * initialized.
 */
static void grow_f(DBHashTable *dbh, void *data){
    char junk[256];
    grow_t *grow_p = data;
    //const gchar *path = (gchar *)DBH_KEY(dbh);
    double r = 1.0 * rand() /RAND_MAX;
    int size = r * 255;
    dbh_set_data(grow_p->dbh, junk, size);
    grow_p->size += size;
    grow_p->records++;
   // dbh_genkey (DBH_KEY(grow_p->dbh), DBH_KEYLENGTH(grow_p->dbh), grow_p->records);

   // dbh_update(grow_p->dbh);
    if (grow_p->records % 1000 == 0) {
        fprintf(stderr, "%ld records done, size=%ld (%ld s)\n",
                grow_p->records, grow_p->size, (long)(time(NULL) - grow_p->start));
    }

}
#endif
1026 
1027 
1028 
/* Tuning constants of the "grow" tests. */
#define KEY_LEN 10                  /* key length of the grow tables */
#define V_PER_THREAD 10000          /* records generated per worker batch */
/* Parenthesized so that expressions such as x / MAX_KEYS or
 * x % MAX_KEYS evaluate as intended. */
#define MAX_KEYS (25*1000*1000)     /* upper bound for record numbers */
#define MAX_THREADS 5               /* concurrent generator threads */
#define JUNK_LENGTH 64              /* largest data size of a record */
1034 
1035 
1036 typedef struct r2_t {
1037     time_t  start;
1038     gint  index;
1039     gint process;
1040     unsigned char key[V_PER_THREAD][KEY_LEN+1];
1041     int size[V_PER_THREAD];
1042 }r2_t;
1043 
w_grow_f(void * data)1044 void *w_grow_f(void *data){
1045     r2_t *r2_p = data;
1046 //    fprintf(stderr, "process=%d index=%d\n", r2_p->process, r2_p->index);
1047 //return data;
1048     gint i;
1049     unsigned char key[KEY_LEN+1];
1050     memset(key,0,KEY_LEN+1);
1051     for (i=0; i <  V_PER_THREAD; i++){
1052         dbh_genkey(key, KEY_LEN, i + r2_p->index);
1053         //strcpy(r2_p->key+i,key);
1054         memcpy(r2_p->key+i,key, KEY_LEN+1);
1055 	double r = 1.0 * rand() /RAND_MAX;
1056         r2_p->size[i] = r * JUNK_LENGTH;
1057         if (!r2_p->size[i]) r2_p->size[i]=5;
1058 
1059     }
1060 
1061     int quantum=0;
1062     unsigned char *p = key;
1063     for (;p && *p; p++) quantum += (*p-'0');
1064     i--;
1065     fprintf(stderr, "[%d] %d:%.2lfM: %s [%s](%ld s.)\n", r2_p->process, quantum, 1.0*(i+r2_p->index)/1000000.01,
1066                 key, (char *)(r2_p->key+(V_PER_THREAD-1)),
1067                 (long)(time(NULL) - r2_p->start));
1068 
1069     return data;
1070 }
1071 
grow_write_f(void * process_data,void * data)1072 void grow_write_f(void *process_data, void *data){
1073     void **arg = data;
1074     char junk[JUNK_LENGTH];
1075     r2_t *r2_p = process_data;
1076     DBHashTable *dbh = arg[0];
1077     GDBM_FILE dbf = arg[1];
1078     int k;
1079     for (k=0; k<V_PER_THREAD; k++){
1080         dbh_set_key(dbh, (unsigned char *)(r2_p->key+k));
1081         dbh_set_data(dbh, junk, r2_p->size[k]);
1082         dbh_update(dbh);
1083         datum key;
1084         datum content;
1085 	int ik = k+r2_p->index;
1086         key.dptr = (void *)&ik;
1087         key.dsize = sizeof(int);
1088 	content.dptr = junk;
1089 	content.dsize = r2_p->size[k];
1090         gdbm_store(dbf, key, content, GDBM_INSERT);
1091 
1092     }
1093     g_free(process_data);
1094 }
1095 
grow(void)1096 static void grow(void){
1097     gint max_threads = MAX_THREADS;
1098     gint current_threads = 0;
1099     pthread_t thread_id[max_threads];
1100     r2_t r2_v[max_threads];
1101 
1102     gint index = 0;
1103     time_t start = time(NULL);
1104 
1105     for (; current_threads < max_threads; current_threads++){
1106         if (!index) index = 1;
1107         else index += V_PER_THREAD;
1108         r2_v[current_threads].process = current_threads+1;
1109         r2_v[current_threads].index = index;
1110         r2_v[current_threads].start = start;
1111 
1112         pthread_create(thread_id+current_threads, NULL,
1113                 w_grow_f, (void *)(r2_v+current_threads));
1114     }
1115 
1116     unsigned char keylength = KEY_LEN;
1117     DBHashTable *dbh =
1118         dbh_new("/home/edscott/testfiles/grow.dbh", &keylength, DBH_CREATE);
1119     GDBM_FILE dbf = gdbm_open("/home/edscott/testfiles/grow.dbf", 0, GDBM_NEWDB|GDBM_SYNC, 0770, NULL);
1120     void *arg[]={dbh, dbf};
1121     gint i=0;
1122     GThreadPool *writepool = g_thread_pool_new(grow_write_f, arg, 1, TRUE, NULL);
1123     while (1){
1124         void *return_data;
1125         if (thread_id[i] && pthread_tryjoin_np(thread_id[i], &return_data)==0){
1126             fprintf(stderr, "joined thread [%d], now writing\n", i);
1127             // process data, threadpool
1128 	    r2_t *thread_r2_p = malloc(sizeof(r2_t));
1129 	    if (!thread_r2_p) g_error("Cannot malloc thread_r2_p: %s\n", strerror(errno));
1130 	    memcpy(thread_r2_p, return_data, sizeof(r2_t));
1131 
1132             g_thread_pool_push (writepool, thread_r2_p, NULL);
1133             r2_t *r2_p = return_data;
1134             // start new thread
1135             if (index < MAX_KEYS){
1136                 index += V_PER_THREAD;
1137                 r2_p->index = index;
1138                 pthread_create(thread_id+i, NULL, w_grow_f, (void *)(r2_p));
1139             } else {
1140                 fprintf(stderr, "Not starting new thread, limit reached: %d\n", index);
1141                 thread_id[i] = 0;
1142                 if (--current_threads == 0) {
1143                     fprintf(stderr, "All threads are done.\n");
1144                     break;
1145                 }
1146             }
1147         }
1148         if (++i >= max_threads){
1149             sleep(1);
1150             i=0;
1151         }
1152     }
1153     g_thread_pool_free(writepool, FALSE, TRUE);
1154     fprintf(stderr, "Threadpool write is complete %ld s. for %d records\n",
1155             (long)(time(NULL)-start), index);
1156     fprintf(stderr, "Threadpool write is complete.\n");
1157     dbh_close(dbh);
1158     gdbm_close(dbf);
1159 }
1160 
1161 
1162 
1163 #if 0
1164     grow_t grow_v;
1165     memset(&grow_v, 0, sizeof(grow_t));
1166     grow_v.start=time(NULL);
1167     // Go though q-index
1168     unsigned char keylength;
1169     DBHashTable *index = dbh_new(QINDEX, &keylength, 0);
1170    // unsigned char keylength = DBH_KEYLENGTH(index);
1171     grow_v.dbh = dbh_new("/home/edscott/testfiles/grow.dbh", &keylength, DBH_CREATE);
1172 
1173     fprintf(stderr, "growing file\n");
1174     dbh_foreach(index, grow_f, &grow_v);
1175     fprintf(stderr, "grow done: %ld seconds.\n",(long)(time(NULL) - grow_v.start));
1176     dbh_close(index);
1177     dbh_close(grow_v.dbh);
1178 #endif
1179 /*
1180 static void grow_test(){
1181     time_t dbh_time=0;
1182     time_t dbf_time=0;
1183     DBHashTable *dbh =
1184         dbh_new("/home/edscott/testfiles/grow.dbh", &keylength, DBH_READONLY);
1185     GDBM_FILE dbf =
1186         gdbm_open("/home/edscott/testfiles/grow.dbf", 0, 0, 0770, NULL);
1187     double r = (1.0)*rand()/RAND_MAX;
1188     r *= MAX_KEYS;
1189     int item = r;
1190     unsigned char key[KEY_LEN+1];
1191     memset(key,0,KEY_LEN+1);
1192     time_t start=time(NULL);
1193         dbh_genkey(key, KEY_LEN, item);
1194         dbh_set_key(dbh, (unsigned char *)(r2_p->key+k));
1195         dbh_load
1196         dbh_time += (time(NULL) - start);
1197 
1198 }
1199 */
1200 
1201 
main(int argc,char ** argv)1202 int main(int argc, char **argv){
1203 
1204   if (argc < 2) {
1205    fprintf(stderr,"insufficient arguments (%d < 2), usage: %s option [path] (option)\n%s\n",
1206 	   argc, argv[0], HELP);
1207    exit(1);
1208   }
1209 
1210     if (argc == 3 && strcmp(argv[1], "create")==0){
1211         if (argc < 3) {
1212             fprintf(stderr, "option %s requires a path\n", argv[1]);
1213             exit (1);
1214         }
1215         mkdir_output();
1216         // First, take a look at what we got to build a test datafile
1217         score(argv);
1218         // Create gdbm table
1219         create_gdbm_table(argv[2]);
1220         // Create DBH table
1221         create_dbh_table(argv[2]);
1222         // Create double DBH table
1223         create_qdbh_table(argv[2]);
1224         exit (0);
1225 
1226     }
1227 
1228 
1229 
1230   if (strcmp(argv[1],"regen")==0 ) {
1231     DBHashTable *dbh = dbh_new(QTABLE, NULL, 0);
1232     DBHashTable *index = dbh_new(QINDEX, NULL, 0);
1233     fprintf(stderr, "regen QINDEX\n");
1234     dbh_regen_sweep(&index);
1235     fprintf(stderr, "regen QTABLE\n");
1236     dbh_regen_sweep(&dbh);
1237     dbh_close(dbh);
1238     dbh_close(index);
1239 
1240     DBHashTable *table = dbh_new(TABLE, NULL, 0);
1241     DBHashTable *collisions = dbh_new(COLLISIONS, NULL, 0);
1242     fprintf(stderr, "regen COLLISIONS\n");
1243     dbh_regen_sweep(&collisions);
1244     fprintf(stderr, "regen TABLE\n");
1245     dbh_regen_sweep(&table);
1246     dbh_close(table);
1247     dbh_close(collisions);
1248     exit(0);
1249   }
1250 
1251   GSList *list = NULL;
1252   if (strcmp(argv[1],"random")==0 ) {
1253       fprintf(stderr, "preparing random list...\n");
1254       // prepare a random access list with 25% of data records.
1255       do {
1256           unsigned char key_length;
1257           random_count = 0;
1258           DBHashTable *random_src = dbh_new(QINDEX, &key_length, DBH_READ_ONLY);
1259           if (!random_src) {
1260               fprintf(stderr, "cannot create random list until tables are generated\n");
1261               fprintf(stderr, "%s found -> %d\n", QINDEX, g_file_test(QINDEX, G_FILE_TEST_EXISTS));
1262               exit(1);
1263           }
1264 
1265           dbh_foreach(random_src, get_random_list, NULL);
1266           fprintf(stderr, "random list has %d/%d items (randomness=%lld)\n",
1267                   g_slist_length(random_list), (int)DBH_RECORDS(random_src)/3, checksum);
1268           if (g_slist_length(random_list) < (int)DBH_RECORDS(random_src)/3){
1269               // free list data...
1270               GSList *tlist = random_list;
1271               for (;tlist && tlist->data; tlist = tlist->next) g_free(tlist->data);
1272               g_slist_free(random_list);
1273               g_slist_free(random_numlist);
1274               random_list=NULL;
1275               random_numlist=NULL;
1276           }
1277           dbh_close(random_src);
1278           checksum = 0;
1279       } while (random_list == NULL);
1280       fprintf(stderr,"Random list is ready. Now writing out...\n");
1281       FILE *outlist = fopen(RANDOM_LIST,"w");
1282       if (!outlist){
1283           fprintf(stderr, "cannot open %s for write\n", RANDOM_LIST);
1284           exit(1);
1285       }
1286       for (list=random_list; list && list->data; list = list->next){
1287         fprintf(outlist, "%s\n", (gchar *)list->data);
1288       }
1289       fclose(outlist);
1290       fprintf(stderr,"Random list done.\n");
1291       exit(0);
1292   }
1293 
1294 
1295   if (strstr(argv[1],"grow")) {
1296       if (strstr(argv[1],"growlist")){
1297          unsigned char key[KEY_LEN+1];
1298          memset(key,0,KEY_LEN+1);
1299          int i;
1300          time_t start = time(NULL);
1301          double r;
1302          for (i=0; i<MAX_KEYS/2; i++){
1303 retry:
1304             r = (1.0)*rand()/RAND_MAX;
1305             r *= MAX_KEYS;
1306             int item = r;
1307             if (!item) goto retry;
1308             unsigned char key[KEY_LEN+1];
1309             memset(key,0,KEY_LEN+1);
1310 
1311             dbh_genkey(key, KEY_LEN, item);
1312             fprintf(stdout, "%d:%s\n", item, key);
1313             if (i%100 == 0) {
1314                 double t =  (double)(time(NULL) - start)/60.0;
1315                 fprintf(stderr, "records: %d in %lf minutes\n", i, t);
1316             }
1317          }
1318 
1319       }
1320       if (strstr(argv[1],"growtest")) {
1321           fprintf(stderr, "doing growtest\n");
1322           if (!g_file_test("growlist.txt", G_FILE_TEST_EXISTS)){
1323               fprintf(stderr, "%s does not exist. Run random option first\n", "growlist.txt");
1324               exit(1);
1325           } else {
1326               FILE *inlist = fopen("growlist.txt","r");
1327               if (!inlist){
1328                   fprintf(stderr, "cannot open %s for read\n", "growlist.txt");
1329                   exit(1);
1330               }
1331               gchar buffer[300];
1332               fprintf(stderr, "Reading random list file...\n");
1333               gint count=1;
1334               while (fgets(buffer, 300, inlist) && !feof(inlist)){
1335                   if (strchr(buffer, '\n')) *strchr(buffer, '\n')=0;
1336                   random_list = g_slist_prepend(random_list, g_strdup(buffer));
1337                   count++;
1338                   //if (count > 50)break;
1339                   //if (count %10000 == 0) fprintf(stderr, "read %d records...\n", count);
1340 
1341               }
1342               fclose(inlist);
1343               fprintf(stderr, "Finished reading random list file (%d records).\n", count);
1344           }
1345           GSList *list = random_list;
1346           time_t start=time(NULL);
1347           // test dbh
1348           int count;
1349           DBHashTable *g_dbh=NULL;
1350           if (strstr(argv[1],"growtest2")) {
1351               g_dbh = dbh_new("grow.dbh", NULL, DBH_READ_ONLY);
1352           }
1353           for (list = random_list,count=1; list && list->data; list=list->next, count++){
1354               gchar *key = strchr((gchar *)(list->data),':');
1355               if (!key) {
1356                   g_warning("key is null for %s\n",(gchar *)(list->data));
1357                   continue;
1358               }
1359               key++;
1360               //fprintf(stderr,"key=%s data=%s\n", key, (gchar *)(list->data));
1361               load_grow_dbh(g_dbh, key);
1362               if (count % 10000 == 0){
1363                   fprintf(stdout, "%d  %ld\n", count, (long)(time(NULL)-start));
1364               }
1365                if (count % 1000 == 0){
1366                   fprintf(stderr, "dbh: %d records in  %ld seconds\n", count, (long)(time(NULL)-start));
1367               }
1368          }
1369           if (g_dbh) dbh_close(g_dbh);
1370           start=time(NULL);
1371 
1372           GDBM_FILE g_dbf=NULL;
1373           if (strstr(argv[1],"growtest2")) {
1374               g_dbf = gdbm_open("grow.dbf", 0, GDBM_READER, 0770, NULL);
1375           }
1376           for (list = random_list,count=1; list && list->data; list=list->next, count++){
1377               gchar *key = (gchar *)(list->data);
1378               *strchr(key,':') = 0;
1379               //fprintf(stderr,"key=%s data=%s\n", key, (gchar *)(list->data));
1380               load_grow_dbf(g_dbf,key);
1381               if (count % 10000 == 0){
1382                   fprintf(stdout, "%d  %ld\n", count, (long)(time(NULL)-start));
1383               }
1384               if (count % 1000 == 0){
1385                   fprintf(stderr, "dbf: %d records in  %ld seconds\n", count, (long)(time(NULL)-start));
1386               }
1387           }
1388           if (g_dbf) gdbm_close(g_dbf);
1389 
1390       }
1391 
1392 
1393 
1394       //else grow();
1395       exit(0);
1396   }
1397 
1398 
1399   if (!strstr(argv[1],"test")) {
1400       fprintf(stderr, "unknown option: %s\n", argv[1]);
1401       exit(1);
1402   }
1403 
1404 
1405    {
1406 
1407 
1408       if (!g_file_test(RANDOM_LIST, G_FILE_TEST_EXISTS)){
1409           fprintf(stderr, "%s does not exist. Run random option first\n", RANDOM_LIST);
1410           exit(1);
1411       } else {
1412           FILE *inlist = fopen(RANDOM_LIST,"r");
1413           if (!inlist){
1414               fprintf(stderr, "cannot open %s for read\n", RANDOM_LIST);
1415               exit(1);
1416           }
1417           gchar buffer[300];
1418           fprintf(stderr, "Reading random list file...\n");
1419           gint count=1;
1420           while (fgets(buffer, 300, inlist) && !feof(inlist)){
1421               if (strchr(buffer, '\n')) *strchr(buffer, '\n')=0;
1422               random_list = g_slist_prepend(random_list, g_strdup(buffer));
1423               count++;
1424               //if (count %10000 == 0) fprintf(stderr, "read %d records...\n", count);
1425 
1426           }
1427           fclose(inlist);
1428           fprintf(stderr, "Finished reading random list file (%d records).\n", count);
1429       }
1430   }
1431 
1432 
1433   dump_t dump_v;
1434   memset(&dump_v, 0, sizeof(dump_t));
1435 
1436   time_t start;
1437 
1438 
1439     fprintf(stderr, "Starting test...\n");
1440     //gchar *tests[]={"test-g","test-d","test-q",NULL};
1441     gchar **p;
1442     gchar **q;
1443 
1444     gchar *strings[256];
1445     memset(strings, 0, 256*sizeof(gchar **));
1446 
1447     srand((int)time(NULL));
1448 
1449     //for (p=tests; p && *p; p++)
1450     p=argv+1;
1451     fprintf(stdout, "# GNUplot output for %s: %s\n",
1452         *p,
1453         strstr(*p,"test-g")?"GDBM":
1454         strstr(*p,"test-q")?"QDBH":
1455         strstr(*p,"test-d")?"DBH":"wtf");
1456     {
1457       fprintf(stderr, "Testing %s...\n", *p);
1458       out(0, 0);
1459       start=time(NULL);
1460       long t_items=0;
1461       gint i;
1462       gint item = RAND_MAX;
1463       long items;
1464       long top_items = 100000;
1465       //long top_items = 100;
1466       gint k;
1467 
1468       for (items=10, k=0; items <= top_items; items *= 10, k++) {
1469         if (!strings[k]) strings[k] = g_strdup_printf("%ld", items);
1470         for(i=0; i< items; i++){
1471           while (item > g_slist_length(random_list) || !item) {
1472               double r = 1.0 * rand() / RAND_MAX * g_slist_length(random_list);
1473               item=r;
1474           }
1475           if (strstr(*p,"test-g"))
1476               load_gdbm_item((gchar *) ((g_slist_nth(random_list, item))->data));
1477           if (strstr(*p,"test-q"))
1478               load_qdbh_item((gchar *) ((g_slist_nth(random_list, item))->data));
1479           else
1480               load_dbh_item((gchar *) ((g_slist_nth(random_list, item))->data));
1481 
1482           item=0;
1483 
1484 
1485         }
1486         fprintf(stderr, "%s table loaded %ld items in %ld seconds\n",
1487         strstr(*p,"test-g")?"GDBM":
1488         strstr(*p,"test-q")?"QDBH":
1489         strstr(*p,"test-d")?"DBH":"wtf",
1490         items, (long)(time(NULL)-start));
1491         t_items += items;
1492         out(t_items, (long)(time(NULL)-start));
1493 
1494         gchar *g = g_strdup_printf("%s\t%ld", strings[k],(long)(time(NULL)-start));
1495         g_free(strings[k]);
1496         strings[k] = g;
1497 
1498         start=time(NULL);
1499 
1500       }
1501     }
1502     //fprintf(stdout, "# GNUplot output: column content:\n# records gdbm(s) dbh(s) qdbh(s)  \n");
1503     for (q=strings; q && *q; q++){
1504       //    fprintf(stdout, "%s\n", *q);fflush(stdout);
1505     }
1506 
1507 
1508 
1509 
1510       exit(0);
1511 
1512    for (list = random_list; list && list->data; list = list->next) g_free(list->data);
1513    g_slist_free(random_list);
1514 
1515   exit(0);
1516 }
1517 
1518 
1519 
1520 ////////////////////////////////////
1521 
1522 #if 0
1523   // Full or specific test follows.
1524   check_files();
1525   // Dump test
1526   if (strcmp(argv[2],"dump")==0 || strcmp(argv[2],"fulltest")==0) {
1527       // Find out how many items and total size of data records
1528       // a sweep/fanout of DBH table will find
1529       int i; for(i=1; i>=0; i--) {dump_v.which = i; dump(&dump_v);}
1530   }
1531 
1532   // Regen tests
1533   if (strcmp(argv[2],"regen")==0 || strcmp(argv[2],"fulltest")==0) {
1534     fprintf(stderr, "///////////////////  Serial tests //////////////////////////\n");
1535     fprintf(stdout,"Performing regen_sweep now...\n");
1536     DBHashTable *dbh;
1537     dbh=dbh_new(TABLE, NULL, 0);
1538     dbh_regen_sweep(&dbh);
1539     dbh_close(dbh);
1540     // Find out how many items and total size of data records
1541     // a sweep of DBH table will find
1542     dump_v.which = 1;
1543     dump(&dump_v);
1544     fprintf(stdout,"Performing regen_fanout now...\n");
1545     dbh=dbh_new(TABLE, NULL, 0);
1546     dbh_regen_fanout(&dbh);
1547     dbh_close(dbh);
1548     // Find out how many items and total size of data records
1549     // a sweep of DBH table will find
1550     dump_v.which = 0;
1551     dump(&dump_v);
1552   }
1553 #endif
1554 
1555       // DBH test with q number keys //////////////////////////////////
1556 #if 0
1557       // this will mislead following test...
1558       start=time(NULL);
1559       DBHashTable *table = dbh_new(QTABLE, NULL, DBH_READ_ONLY);
1560       GSList *list = random_numlist;
1561       unsigned char k[DBH_KEYLENGTH(table)+1];
1562       memset(k, 0, DBH_KEYLENGTH(table)+1);
1563       for (;list && list->data; list = list->next){
1564           dbh_genkey(k, DBH_KEYLENGTH(table), GPOINTER_TO_INT(list->data));
1565           dbh_set_key(table, k);
1566           if (!dbh_load(table)) fprintf(stderr, "q-numload cannot load item %s (%d)\n", k, GPOINTER_TO_INT(list->data));
1567           else loaded++;
1568       }
1569       dbh_close(table);
1570 
1571       fprintf(stderr,"q-numload loaded %d items, random access time = %ld s.\n", loaded, (long)(time(NULL) - start));
1572 
1573 #endif
1574