1 // @(#) $Revision: 4.1 $ $Source: /judy/test/manual/SLcompare.c $
2 //=======================================================================
3 // Author Douglas L. Baskins, June 2002.
4 // Permission to use this code is freely granted, provided that this
5 // statement is retained.
6 // email - doug@sourcejudy.com
7 //=======================================================================
8
9 // Compile for choice of algorithm:
10
11 // cc -static -O -o Judy -DJUDYMETHOD SLcompare.c -lJudy
12 // cc -static -O -o Hash -DHASHMETHOD SLcompare.c
13 // cc -static -O -o Splay -DSPLAYMETHOD SLcompare.c
14 // cc -static -O -o Redblack -DREDBLACKMETHOD SLcompare.c
15 //
16 // Note: -static will generally get better performance because string
17 // routines are slower with shared librarys.
18 //
19 // Usage:
20 //
21 // Judy <textfile>
22 // Hash <textfile>
23 // Splay <textfile>
24 // Redblack <textfile>
25 /*
26
27 This program will give you a chance to measure the speed/space
28 performance of 4 different string store/lookup algorithms on your data
29 set. All are very fast and generally can scale to very large data sets.
30 The memory efficiency is usually best with the Judy algorithm because it
31 uses the opportunity to compress data in a digital tree. The Hashing is
32 generally the fastest algorithm, however it looses its advantage when
33 the number of stored exceeds about 5X the size of the hash table.
34 Many thanks to J. Zobel for supplying the code for Hash, Splay and
35 Redblack algorithms.
36
37 I will be including more algorithms as people donate code for testing.
38
39 The Judy code is available at: <http://sourceforge.net/projects/judy>
40 9/2002 (dlb).
41
42 This is the output of this program when run on a 750Mhz P3 laptop.
43
44 This output is using a file available from www.google.com. It looks
45 like 'http' web pages and was one of the files used in a March 2002
46 contest by www.google.com
47
48 wc google/data/repos.00
49 3440314 14613027 162822437 google/data/repos.00
50
51 lines avg_linelen getline stored RAMused/line store/ln lookup/ln ADT
52 2807737 58.0 byts 0.856 uS 1455201 104.0 byts 4.859 uS 4.566 uS SPLAY
53 2807737 58.0 byts 0.862 uS 1455201 96.0 byts 2.523 uS 2.101 uS HASH
54 2807737 58.0 byts 0.856 uS 1455201 104.0 byts 4.601 uS 4.001 uS REDBLK
55 2807737 58.0 byts 0.875 uS 1455201 78.7 byts 3.898 uS 2.687 uS JUDY
56
57 With = 1.46 million lines stored and 2.81 million non-blank lines,
58 all the algorithms seem to keep their performance. The hash table is a
59 perfect size for this data set. I suspect only the HASH algorithm will
60 slow down with larger data sets unless the hash table is increased larger
61 than 2**20 million entrys. I have now tried a data set with about 10
62 million unique lines on a 64 bit machine (google/data/repos.*).
63 The speeds are about the same. Lookup times are in the 2-4 uSec range
64 and this is probably insignificant compared to the application needing
65 the data. I think all four methods are fine for speed, and Judy is a
66 standout for its memory efficiency -- frequently using less memory than
67 the strings alone.
68
69 I plan on tuning JudySL in the near future. This should improve the
70 performance of JudySL with short (a text word) strings. I have a data
71 base of all (28.8 million) domain names on the internet as a test case.
72
73 wc /data/domnames.txt
74 28826508 28826508 488227095 /data/domnames.txt
75
76 For the curious, JudySL (I.E JSLI/G macros in this program) is simply an
77 application subroutine that uses JudyL to the work. JudyL is a very
78 scaleable word to word (or pointer) mapper. JudySL is implemented as a
79 digital tree using JudyL arrays as 2^32 (or 2^64 in 64 bit machines)
80 wide (virtual) nodes. Each level in the digital tree decodes the next 4
81 (or 8) bytes of the line/string until only one entry would be in the
82 node. Then the remaining undecoded portion of the line is stored as a
83 string from the parent node. A digital tree does not need rotation or
84 balancing. In practice, digital trees are rarely use because of their
85 poor memory efficiency in the nodes and a tendency to have large depths.
86 These problems are solved in the Judy algorithm. For more details see
87 application notes on: www.sourcejudy.com. And for the really curious,
88 a technical paper is available in the application section. If you would
89 like to be a reviewer, contact Doug Baskins at <doug@sourcejudy.com>
90
91 */
92
93 #include <unistd.h>
94 #include <stdlib.h>
95 #include <stdio.h> // printf
96 #include <fcntl.h>
97 #include <string.h>
98 #include <errno.h> // errno
99 #include <sys/mman.h> // mmap()
100 #include <sys/stat.h> // stat()
101 #include <sys/time.h> // gettimeofday()
102
103 // remove all uses of this in production code.
104 int gDupCnt = 0; // counter for duplicate strings
105
106 //=======================================================================
107 // T I M I N G M A C R O S
108 //=======================================================================
109 // if your machine is one of the supported types in the following header
110 // file then uncomment this corresponding to what the header file says.
111 // This will give very high timing resolution.
112 //
113 // #define JU_xxxxxxx 1 // read timeit.h
114 // #define JU_LINUX_IA32 1 // I.E. IA32 Linux
115 //
116 //
117 // optional for high resolution times and compile with timeit.c
118 // cc -static -O -o Judy -DJUDYMETHOD SLcompare.c timeit.c -lJudy
119 //
120 //#include "timeit.h"
121
122 double DeltaUSec; // Global for remembering delta times
123
124 #ifndef _TIMEIT_H
125
126 // Note: I have found some Linux systems (2.4.18-6mdk) to have bugs in the
127 // gettimeofday() routine. Sometimes it returns a negative ~2840 microseconds
128 // instead of 0 or 1. If you use the above #include "timeit.h" and compile with
129 // timeit.c and use -DJU_LINUX_IA32, that problem will be eliminated. This is
130 // because for delta times less than .1 sec, the hardware free running timer
131 // is used instead of gettimeofday(). I have found the negative time problem
132 // appears about 40-50 times per second with consecutive gettimeofday() calls.
133
134 #define TIMER_vars(T) struct timeval __TVBeg_##T, __TVEnd_##T
135
136 #define STARTTm(T) gettimeofday(&__TVBeg_##T, NULL)
137
138 #define ENDTm(D,T) \
139 { \
140 gettimeofday(&__TVEnd_##T, NULL); \
141 (D) = (double)(__TVEnd_##T.tv_sec - __TVBeg_##T.tv_sec) * 1E6 + \
142 ((double)(__TVEnd_##T.tv_usec - __TVBeg_##T.tv_usec)); \
143 }
144
145 #endif // _TIMEIT_H
146
147 //=======================================================================
148 // M E M O R Y S I Z E M A C R O S
149 //=======================================================================
150 // Most mallocs have mallinfo()
151
152 // define this if your malloc does not have mallinfo();
153 #define NOMALLINFO 1
154
155 double DeltaMem; // for remembering
156
157 #ifndef NOMALLINFO
158
159 #include <malloc.h> // mallinfo()
160
161 struct mallinfo malStart;
162
163 #define STARTmem malStart = mallinfo() /* works with some mallocs */
164 #define ENDmem \
165 { \
166 struct mallinfo malEnd = mallinfo(); \
167 /* strange little dance from signed to unsigned to double */ \
168 unsigned int _un_int = malEnd.arena - malStart.arena; \
169 DeltaMem = _un_int; /* to double */ \
170 }
171
172 #else // MALLINFO
173
174 // this usually works for machines with less than 1-2Gb RAM.
175 // (it does not include memory ACQUIRED by mmap())
176
177 char *malStart;
178
179 #define STARTmem (malStart = (char *)sbrk(0))
180 #define ENDmem \
181 { \
182 char *malEnd = (char *)sbrk(0); \
183 DeltaMem = malEnd - malStart; \
184 }
185
186 #endif // MALLINFO
187
188
189 //=======================================================================
190 // G E T S T R I N G F R O M mmap() F I L E M A C R O
191 //=======================================================================
192 //
193 // From memory map POINTER store next '\n' terminated string to BUFFER
194 // Delete spaces, tabs, returns and resulting blank lines.
195
196 // POINTER must be check to be within memory mapped file range
197 //
198 // NOTE: This code will core-dump if a corrupt text file because
199 // POINTER is not checked to exceed end of file.
200
201 #define MAXLINE 10000 // max length line
202
203 #define GETLINE(BUFFER,POINTER) \
204 { \
205 char _chr; \
206 int _count = 0; \
207 for (;;) /* forever */ \
208 { \
209 switch (_chr = *POINTER++) \
210 { \
211 case ' ': /* eat spaces */ \
212 case '\t': /* eat tabs */ \
213 case '\r': /* eat returns */ \
214 continue; \
215 case '\n': /* eat blank lines */ \
216 if (_count == 0) continue; \
217 case '\0': /* Done */ \
218 BUFFER[_count++] = '\0'; \
219 break; \
220 default: /* copy char */ \
221 if (_count == (MAXLINE - 1)) \
222 { /* cut into 2 lines */ \
223 BUFFER[_count++] = '\0'; \
224 POINTER--; \
225 break; \
226 } \
227 BUFFER[_count++] = _chr; \
228 continue; \
229 } \
230 break; \
231 } \
232 }
233
234
235 //=======================================================================
236 // J U D Y M E T H O D
237 //=======================================================================
238
239 #ifdef JUDYMETHOD
240 #define ADTMETHOD "JUDY"
241
242 #include <Judy.h>
243
244 #define MEMOVD 0 /* no overhead in Judy */
245 #define INITARRAY Pvoid_t JudyArray = NULL
246
247 // store string into array
248 #define STORESTRING(STRING) \
249 { \
250 PWord_t _PValue; \
251 JSLI(_PValue, JudyArray, (uint8_t *) STRING); \
252 if (++(*_PValue) != 1) gDupCnt++; /* count dup strings */ \
253 }
254
255 // get pointer to Value associated with string
256
257 #define GETSTRING(STRING) \
258 { \
259 PWord_t _PValue; \
260 JSLG(_PValue, JudyArray, (uint8_t *) STRING); \
261 if (_PValue == NULL) /* not found -- bug */ \
262 { \
263 printf("GETSTRING failed(Judy) -- BUG!\n\n"); \
264 exit(1); \
265 } \
266 }
267
268 #endif // JUDYMETHOD
269
270 // Note: this optimization by J. Zobel should not be necessary
271 // with the -static compile option (linux). (dlb)
272
273 #ifdef SLOW_STRCMP
274 int
scmp(char * s1,char * s2)275 scmp(char *s1, char *s2)
276 {
277 while (*s1 != '\0' && *s1 == *s2)
278 {
279 s1++;
280 s2++;
281 }
282 return (*s1 - *s2);
283 }
284 #else // ! SLOW_STRCMP
285
286 #define scmp strcmp
287
288 #endif // ! SLOW_STRCMP
289
290 //=======================================================================
291 // H A S H M E T H O D
292 //=======================================================================
293
294 #ifdef HASHMETHOD
295 #define ADTMETHOD "HASH"
296
297 #define MEMOVD (sizeof(ht)) /* the hash table is overhead */
298
299 #define STORESTRING(STRING) hashinsert(ht, STRING)
300
301 #define GETSTRING(STRING) \
302 { \
303 HASHREC *_return; \
304 _return = hashsearch(ht, STRING); \
305 if (_return == NULL) /* not found -- bug */ \
306 { \
307 printf("GETSTRING(hash) failed -- BUG!\n\n"); \
308 exit(1); \
309 } \
310 }
311
312 /* Author J. Zobel, April 2001.
313 Permission to use this code is freely granted, provided that this
314 statement is retained. */
315
316 #define TSIZE (1LU << 20) /* many processors need this to be a pwr of 2 */
317 #define SEED 1159241
318 #define HASHFN bitwisehash
319
320 #define INITARRAY static HASHREC *ht[TSIZE]
321 #define SIZEOFINIT sizeof(ht[TSIZE])
322
323 typedef struct hashrec
324 {
325 char *word;
326 struct hashrec *next;
327 }
328 HASHREC;
329
330 /* Bitwise hash function. Note that tsize does not have to be prime. */
331 unsigned int
bitwisehash(char * word,int tsize,unsigned int seed)332 bitwisehash(char *word, int tsize, unsigned int seed)
333 {
334 char c;
335 unsigned int h;
336
337 h = seed;
338 for (; (c = *word) != '\0'; word++)
339 {
340 h ^= ((h << 5) + c + (h >> 2));
341 }
342 return ((unsigned int)((h & 0x7fffffff) % tsize));
343 }
344
345 #ifdef notdef // not needed if a static definition used in UNIX
346 /* Create hash table, initialise ptrs to NULL */
347 HASHREC **
inithashtable()348 inithashtable()
349 {
350 int i;
351 HASHREC **ht;
352
353 ht = (HASHREC **) malloc(sizeof(HASHREC *) * TSIZE);
354
355 for (i = 0; i < TSIZE; i++)
356 ht[i] = (HASHREC *) NULL;
357
358 return (ht);
359 }
360 #endif // notdef
361
362 /* Search hash table for given string, return record if found, else NULL */
363 HASHREC *
hashsearch(HASHREC ** ht,char * w)364 hashsearch(HASHREC ** ht, char *w)
365 {
366 HASHREC *htmp, *hprv;
367 unsigned int hval = HASHFN(w, TSIZE, SEED);
368
369 for (hprv = NULL, htmp = ht[hval];
370 htmp != NULL && scmp(htmp->word, w) != 0;
371 hprv = htmp, htmp = htmp->next)
372 {
373 ;
374 }
375
376 if (hprv != NULL && htmp != NULL) /* move to front on access */
377 {
378 hprv->next = htmp->next;
379 htmp->next = ht[hval];
380 ht[hval] = htmp;
381 }
382
383 return (htmp);
384 }
385
386 /* Search hash table for given string, insert if not found */
387 void
hashinsert(HASHREC ** ht,char * w)388 hashinsert(HASHREC ** ht, char *w)
389 {
390 HASHREC *htmp, *hprv;
391 unsigned int hval = HASHFN(w, TSIZE, SEED);
392
393 for (hprv = NULL, htmp = ht[hval];
394 htmp != NULL && scmp(htmp->word, w) != 0;
395 hprv = htmp, htmp = htmp->next)
396 {
397 ;
398 }
399
400 if (htmp == NULL)
401 {
402 htmp = (HASHREC *) malloc(sizeof(HASHREC));
403 htmp->word = (char *)malloc(strlen(w) + 1);
404 strcpy(htmp->word, w);
405 htmp->next = NULL;
406 if (hprv == NULL)
407 ht[hval] = htmp;
408 else
409 hprv->next = htmp;
410
411 /* new records are not moved to front */
412 }
413 else
414 {
415 if (hprv != NULL) /* move to front on access */
416 {
417 hprv->next = htmp->next;
418 htmp->next = ht[hval];
419 ht[hval] = htmp;
420 }
421 gDupCnt++; /* count duplicate strings */
422 }
423
424 return;
425 }
426
427 #endif // HASHMETHOD
428
429 //=======================================================================
430 // S P L A Y M E T H O D
431 //=======================================================================
432
433 #ifdef SPLAYMETHOD
434 #define ADTMETHOD "SPLAY"
435
436 #define MEMOVD 0 /* not enough overhead to count */
437
438 /* Author J. Zobel, April 2001.
439 Permission to use this code is freely granted, provided that this
440 statement is retained. */
441
442 #define ROTATEFAC 11
443
444 typedef struct wordrec
445 {
446 char *word;
447 struct wordrec *left, *right;
448 struct wordrec *par;
449 }
450 TREEREC;
451
452 typedef struct ansrec
453 {
454 struct wordrec *root;
455 struct wordrec *ans;
456 }
457 ANSREC;
458
459 #define ONELEVEL(PAR,CURR,DIR,RID) \
460 { \
461 PAR->DIR = CURR->RID; \
462 if(PAR->DIR!=NULL) \
463 PAR->DIR->PAR = PAR; \
464 CURR->RID = PAR; \
465 PAR->PAR = CURR; \
466 CURR->PAR = NULL; \
467 }
468
469 #define ZIGZIG(GPAR,PAR,CURR,DIR,RID) \
470 { \
471 CURR->PAR = GPAR->PAR; \
472 if (CURR->PAR != NULL) \
473 { \
474 if (CURR->PAR->DIR == GPAR) \
475 CURR->PAR->DIR = CURR; \
476 else \
477 CURR->PAR->RID = CURR; \
478 } \
479 GPAR->DIR = PAR->RID; \
480 if (GPAR->DIR != NULL) \
481 GPAR->DIR->PAR = GPAR; \
482 PAR->DIR = CURR->RID; \
483 if (CURR->RID != NULL) \
484 CURR->RID->PAR = PAR; \
485 CURR->RID = PAR; \
486 PAR->PAR = CURR; \
487 PAR->RID = GPAR; \
488 GPAR->PAR = PAR; \
489 }
490
491 #define ZIGZAG(GPAR,PAR,CURR,DIR,RID) \
492 { \
493 CURR->PAR = GPAR->PAR; \
494 if (CURR->PAR != NULL) \
495 { \
496 if (CURR->PAR->DIR == GPAR) \
497 CURR->PAR->DIR = CURR; \
498 else \
499 CURR->PAR->RID = CURR; \
500 } \
501 PAR->RID = CURR->DIR; \
502 if (PAR->RID != NULL) \
503 PAR->RID->PAR = PAR; \
504 GPAR->DIR = CURR->RID; \
505 if (GPAR->DIR != NULL) \
506 GPAR->DIR->PAR = GPAR; \
507 CURR->DIR = PAR; \
508 PAR->PAR = CURR; \
509 CURR->RID = GPAR; \
510 GPAR->PAR = CURR; \
511 }
512
513 int scount = ROTATEFAC;
514
515 /* Search for word in a splay tree. If word is found, bring it to
516 root, possibly intermittently. Structure ans is used to pass
517 in the root, and to pass back both the new root (which may or
518 may not be changed) and the looked-for record. */
519 void
splaysearch(ANSREC * ans,char * word)520 splaysearch(ANSREC * ans, char *word)
521 {
522 TREEREC *curr = ans->root, *par, *gpar;
523 int val;
524
525 scount--;
526
527 if (ans->root == NULL)
528 {
529 ans->ans = NULL;
530 return;
531 }
532 while (curr != NULL && (val = scmp(word, curr->word)) != 0)
533 {
534 if (val > 0)
535 curr = curr->right;
536 else
537 curr = curr->left;
538 }
539
540 ans->ans = curr;
541
542 if (curr == ans->root)
543 {
544 return;
545 }
546
547 if (scount <= 0 && curr != NULL) /* Move node towards root */
548 {
549 scount = ROTATEFAC;
550
551 while ((par = curr->par) != NULL)
552 {
553 if (par->left == curr)
554 {
555 if ((gpar = par->par) == NULL)
556 {
557 ONELEVEL(par, curr, left, right);
558 }
559 else if (gpar->left == par)
560 {
561 ZIGZIG(gpar, par, curr, left, right);
562 }
563 else
564 {
565 ZIGZAG(gpar, par, curr, right, left);
566 }
567 }
568 else
569 {
570 if ((gpar = par->par) == NULL)
571 {
572 ONELEVEL(par, curr, right, left);
573 }
574 else if (gpar->left == par)
575 {
576 ZIGZAG(gpar, par, curr, left, right);
577 }
578 else
579 {
580 ZIGZIG(gpar, par, curr, right, left);
581 }
582 }
583 }
584 ans->root = curr;
585 }
586
587 return;
588 }
589
590 /* Insert word into a splay tree. If word is already present, bring it to
591 root, possibly intermittently. Structure ans is used to pass
592 in the root, and to pass back both the new root (which may or
593 may not be changed) and the looked-for record. */
594
595 void
splayinsert(ANSREC * ans,char * word)596 splayinsert(ANSREC * ans, char *word)
597 {
598 TREEREC *curr = ans->root, *par, *gpar, *prev = NULL, *wcreate();
599 int val = 0;
600
601 scount--;
602
603 if (ans->root == NULL)
604 {
605 ans->ans = ans->root = wcreate(word, NULL);
606 return;
607 }
608
609 while (curr != NULL && (val = scmp(word, curr->word)) != 0)
610 {
611 prev = curr;
612 if (val > 0)
613 curr = curr->right;
614 else
615 curr = curr->left;
616 }
617
618 if (curr == NULL)
619 {
620 if (val > 0)
621 curr = prev->right = wcreate(word, prev);
622 else
623 curr = prev->left = wcreate(word, prev);
624 }
625 else
626 gDupCnt++; /* count duplicate strings */
627
628 ans->ans = curr;
629
630 if (scount <= 0) /* Move node towards root */
631 {
632 scount = ROTATEFAC;
633
634 while ((par = curr->par) != NULL)
635 {
636 if (par->left == curr)
637 {
638 if ((gpar = par->par) == NULL)
639 {
640 ONELEVEL(par, curr, left, right);
641 }
642 else if (gpar->left == par)
643 {
644 ZIGZIG(gpar, par, curr, left, right);
645 }
646 else
647 {
648 ZIGZAG(gpar, par, curr, right, left);
649 }
650 }
651 else
652 {
653 if ((gpar = par->par) == NULL)
654 {
655 ONELEVEL(par, curr, right, left);
656 }
657 else if (gpar->left == par)
658 {
659 ZIGZAG(gpar, par, curr, left, right);
660 }
661 else
662 {
663 ZIGZIG(gpar, par, curr, right, left);
664 }
665 }
666 }
667 ans->root = curr;
668 }
669 return;
670 }
671
672 /* Create a node to hold a word */
673 TREEREC *
wcreate(char * word,TREEREC * par)674 wcreate(char *word, TREEREC * par)
675 {
676 TREEREC *tmp;
677
678 tmp = (TREEREC *) malloc(sizeof(TREEREC));
679 tmp->word = (char *)malloc(strlen(word) + 1);
680 strcpy(tmp->word, word);
681 tmp->left = tmp->right = NULL;
682
683 tmp->par = par;
684
685 return (tmp);
686 }
687
688 #define INITARRAY ANSREC ans = {0}
689
690 #define STORESTRING(STRING) splayinsert(&ans, STRING)
691
692 #define GETSTRING(STRING) splaysearch(&ans, STRING)
693
694 #endif // SPLAYMETHOD
695
696 //=======================================================================
697 // R E D B L A C K M E T H O D
698 //=======================================================================
699
700 #ifdef REDBLACKMETHOD
701 #define ADTMETHOD "REDBLACK"
702
703 #define MEMOVD 0 /* not enough overhead to count */
704
705 /* Author J. Zobel, April 2001.
706 Permission to use this code is freely granted, provided that this
707 statement is retained. */
708
709 typedef struct wordrec
710 {
711 char *word;
712 struct wordrec *left, *right;
713 struct wordrec *par;
714 char colour;
715 }
716 TREEREC;
717
718 typedef struct ansrec
719 {
720 struct wordrec *root;
721 struct wordrec *ans;
722 }
723 ANSREC;
724
725 #define RED 0
726 #define BLACK 1
727
728 /* Find word in a redblack tree */
729
730 void
redblacksearch(ANSREC * ans,char * word)731 redblacksearch(ANSREC * ans, char *word)
732 {
733 TREEREC *curr = ans->root;
734 int val;
735
736 if (ans->root != NULL)
737 {
738 while (curr != NULL && (val = scmp(word, curr->word)) != 0)
739 {
740 if (val > 0)
741 curr = curr->right;
742 else
743 curr = curr->left;
744 }
745 }
746
747 ans->ans = curr;
748
749 return;
750 }
751
752 /* Rotate the right child of par upwards */
753 /* Could be written as a macro, but not really necessary
754 as it is only called on insertion */
755
756 void
leftrotate(ANSREC * ans,TREEREC * par)757 leftrotate(ANSREC * ans, TREEREC * par)
758 {
759 TREEREC *curr, *gpar;
760
761 if ((curr = par->right) != NULL)
762 {
763 par->right = curr->left;
764 if (curr->left != NULL)
765 curr->left->par = par;
766 curr->par = par->par;
767 if ((gpar = par->par) == NULL)
768 ans->root = curr;
769 else
770 {
771 if (par == gpar->left)
772 gpar->left = curr;
773 else
774 gpar->right = curr;
775 }
776 curr->left = par;
777 par->par = curr;
778 }
779 }
780
781 /* Rotate the left child of par upwards */
782 void
rightrotate(ANSREC * ans,TREEREC * par)783 rightrotate(ANSREC * ans, TREEREC * par)
784 {
785 TREEREC *curr, *gpar;
786
787 if ((curr = par->left) != NULL)
788 {
789 par->left = curr->right;
790 if (curr->right != NULL)
791 curr->right->par = par;
792 curr->par = par->par;
793 if ((gpar = par->par) == NULL)
794 ans->root = curr;
795 else
796 {
797 if (par == gpar->left)
798 gpar->left = curr;
799 else
800 gpar->right = curr;
801 }
802 curr->right = par;
803 par->par = curr;
804 }
805 }
806
807 /* Insert word into a redblack tree */
808 void
redblackinsert(ANSREC * ans,char * word)809 redblackinsert(ANSREC * ans, char *word)
810 {
811 TREEREC *curr = ans->root, *par, *gpar, *prev = NULL, *wcreate();
812 int val;
813
814 if (ans->root == NULL)
815 {
816 ans->ans = ans->root = wcreate(word, NULL);
817 return;
818 }
819 while (curr != NULL && (val = scmp(word, curr->word)) != 0)
820 {
821 prev = curr;
822 if (val > 0)
823 curr = curr->right;
824 else
825 curr = curr->left;
826 }
827
828 ans->ans = curr;
829
830 if (curr == NULL)
831 /* Insert a new node, rotate up if necessary */
832 {
833 if (val > 0)
834 curr = prev->right = wcreate(word, prev);
835 else
836 curr = prev->left = wcreate(word, prev);
837
838 curr->colour = RED;
839 while ((par = curr->par) != NULL
840 && (gpar = par->par) != NULL && curr->par->colour == RED)
841 {
842 if (par == gpar->left)
843 {
844 if (gpar->right != NULL && gpar->right->colour == RED)
845 {
846 par->colour = BLACK;
847 gpar->right->colour = BLACK;
848 gpar->colour = RED;
849 curr = gpar;
850 }
851 else
852 {
853 if (curr == par->right)
854 {
855 curr = par;
856 leftrotate(ans, curr);
857 par = curr->par;
858 }
859 par->colour = BLACK;
860 if ((gpar = par->par) != NULL)
861 {
862 gpar->colour = RED;
863 rightrotate(ans, gpar);
864 }
865 }
866 }
867 else
868 {
869 if (gpar->left != NULL && gpar->left->colour == RED)
870 {
871 par->colour = BLACK;
872 gpar->left->colour = BLACK;
873 gpar->colour = RED;
874 curr = gpar;
875 }
876 else
877 {
878 if (curr == par->left)
879 {
880 curr = par;
881 rightrotate(ans, curr);
882 par = curr->par;
883 }
884 par->colour = BLACK;
885 if ((gpar = par->par) != NULL)
886 {
887 gpar->colour = RED;
888 leftrotate(ans, gpar);
889 }
890 }
891 }
892 }
893 if (curr->par == NULL)
894 ans->root = curr;
895 ans->root->colour = BLACK;
896 }
897 else
898 gDupCnt++; /* count duplicate strings */
899
900 return;
901 }
902
903 /* Create a node to hold a word */
904 TREEREC *
wcreate(char * word,TREEREC * par)905 wcreate(char *word, TREEREC * par)
906 {
907 TREEREC *tmp;
908
909 tmp = (TREEREC *) malloc(sizeof(TREEREC));
910 tmp->word = (char *)malloc(strlen(word) + 1);
911 strcpy(tmp->word, word);
912 tmp->left = tmp->right = NULL;
913
914 tmp->par = par;
915
916 return (tmp);
917 }
918
919 #define INITARRAY ANSREC ans = {0}
920
921 #define STORESTRING(STRING) redblackinsert(&ans, STRING)
922
923 #define GETSTRING(STRING) redblacksearch(&ans, STRING)
924
925 #endif // REDBLACKMETHOD
926
927 //=======================================================================
928 // M A I N P R O G R A M -by- Doug Baskins
929 //=======================================================================
930
931 // error routine for system routines for accessing file
932
933 #define FILERROR \
934 { \
935 printf("%s: Cannot open file \"%s\": %s " \
936 "(errno = %d)\n", argv[0], argv[1], strerror(errno), \
937 errno); \
938 fprintf(stderr, "%s: Cannot open file \"%s\": %s " \
939 "(errno = %d)\n", argv[0], argv[1], strerror(errno), \
940 errno); \
941 exit(1); \
942 }
943
944
945 //=======================================================================
946 // M E A S U R E A D T S P E E D and M E M O R Y U S A G E
947 //=======================================================================
948
949 int
main(int argc,char * argv[])950 main(int argc, char *argv[])
951 {
952 TIMER_vars(tm); // declare timer variables
953 int fd; // to read file.
954 struct stat statbuf; // to get size of file
955 char *Pfile; // ram address of file
956 size_t fsize; // file size in bytes
957 char *FSmap; // start address of mapped file
958 char *FEmap; // end address+1 of mapped file
959 char String[MAXLINE]; // input buffer
960 int StrCnt; // line counter
961 int ii; // temp
962
963 INITARRAY;
964
965 // CHECK FOR REQUIRED INPUT FILE PARAMETER:
966
967 if (argc != 2)
968 {
969 printf("Usage: %s <text file>\n", argv[0]);
970 exit(1);
971 }
972
973 // GET FILE SIZE
974
975 if (stat(argv[1], &statbuf) == -1)
976 FILERROR;
977
978 fsize = statbuf.st_size;
979
980 // OPEN INPUT FILE:
981
982 if ((fd = open(argv[1], O_RDONLY)) == -1)
983 FILERROR;
984
985 // MEMORY MAP FILE
986
987 Pfile =
988 (char *)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
989 if (Pfile == (char *)-1)
990 FILERROR;
991
992 FEmap = Pfile + fsize; // set end+1 address
993
994 // print command name + run arguments
995
996 printf("# ");
997 for (ii = 0; ii < argc; ii++)
998 printf("%s ", argv[ii]);
999 printf("\n");
1000 fflush(stdout);
1001
1002 // file is in RAM, read again to measure time
1003
1004 STARTTm(tm); // start timer
1005
1006 for (StrCnt = 0, FSmap = Pfile; FSmap < FEmap; )
1007 {
1008 GETLINE(String, FSmap); // 'read' next string
1009
1010 StrCnt++;
1011 }
1012 ENDTm(DeltaUSec, tm); // end timer
1013 //
1014 // print header - 6 fields
1015
1016 printf("# lines avg_linelen getline stored");
1017 printf(" RAMused/line store/ln lookup/ln ADT\n");
1018
1019 // print numb lines filesize/line
1020
1021 printf("%8u", StrCnt); // Number of lines
1022 printf(" %6.1f byts", (double)fsize / (double)StrCnt); // filesize/line
1023 fflush(stdout);
1024
1025 // print time to read a line
1026
1027 printf(" %5.3f uS", DeltaUSec / (double)StrCnt);
1028 fflush(stdout);
1029
1030 // INPUT/STORE LOOP:
1031
1032 // read each input line into String and store into ADT
1033
1034 STARTmem; // current malloc() mem usage
1035 STARTTm(tm); // start timer
1036
1037 for (FSmap = Pfile; FSmap < FEmap; )
1038 {
1039 GETLINE(String, FSmap); // 'read' next string
1040
1041 STORESTRING(String);
1042 }
1043 ENDTm(DeltaUSec, tm); // end timer
1044 ENDmem; // current malloc() mem usage
1045
1046 // print number of non-duplicate lines
1047
1048 printf(" %8u", StrCnt - gDupCnt); // duplicate lines
1049
1050 // print RAM used by malloc() by ADT to store data
1051
1052 printf(" %7.1f byts", (double)(DeltaMem + MEMOVD) /
1053 (double)(StrCnt - gDupCnt));
1054
1055 // print time per line to store the data
1056
1057 printf(" %6.3f uS", DeltaUSec / (double)StrCnt);
1058 fflush(stdout);
1059
1060 // READ BACK LOOP
1061
1062 STARTTm(tm); // start timer
1063
1064 for (FSmap = Pfile; FSmap < FEmap; )
1065 {
1066 GETLINE(String, FSmap); // 'read' next string
1067
1068 GETSTRING(String);
1069 }
1070 ENDTm(DeltaUSec, tm); // end timer
1071
1072 // print time per line to lookup the data from the ADT
1073
1074 printf(" %6.3f uS", DeltaUSec / (double)StrCnt); // store time/line
1075 printf(" %s\n", ADTMETHOD);
1076
1077 return (0); // done
1078
1079 } // main()
1080