1 // @(#) $Revision: 4.1 $ $Source: /judy/test/manual/SLcompare.c $
2 //=======================================================================
3 //   Author Douglas L. Baskins, June 2002.
4 //   Permission to use this code is freely granted, provided that this
5 //   statement is retained.
6 //   email - doug@sourcejudy.com
7 //=======================================================================
8 
9 // Compile for choice of algorithm:
10 
11 //   cc -static -O -o Judy     -DJUDYMETHOD     SLcompare.c -lJudy
12 //   cc -static -O -o Hash     -DHASHMETHOD     SLcompare.c
13 //   cc -static -O -o Splay    -DSPLAYMETHOD    SLcompare.c
14 //   cc -static -O -o Redblack -DREDBLACKMETHOD SLcompare.c
15 //
16 // Note:  -static will generally get better performance because string
17 // routines are slower with shared librarys.
18 //
19 // Usage:
20 //
21 //   Judy     <textfile>
22 //   Hash     <textfile>
23 //   Splay    <textfile>
24 //   Redblack <textfile>
25 /*
26 
27 This program will give you a chance to measure the speed/space
28 performance of 4 different string store/lookup algorithms on your data
29 set.  All are very fast and generally can scale to very large data sets.
30 The memory efficiency is usually best with the Judy algorithm because it
31 uses the opportunity to compress data in a digital tree.  The Hashing is
32 generally the fastest algorithm, however it looses its advantage when
33 the number of stored exceeds about 5X the size of the hash table.
34 Many thanks to J.  Zobel for supplying the code for Hash, Splay and
35 Redblack algorithms.
36 
37 I will be including more algorithms as people donate code for testing.
38 
39 The Judy code is available at: <http://sourceforge.net/projects/judy>
40 9/2002 (dlb).
41 
42 This is the output of this program when run on a 750Mhz P3 laptop.
43 
44 This output is using a file available from www.google.com.  It looks
45 like 'http' web pages and was one of the files used in a March 2002
46 contest by www.google.com
47 
48  wc google/data/repos.00
49  3440314 14613027 162822437 google/data/repos.00
50 
51    lines avg_linelen  getline   stored RAMused/line  store/ln lookup/ln ADT
52  2807737   58.0 byts 0.856 uS  1455201   104.0 byts  4.859 uS  4.566 uS SPLAY
53  2807737   58.0 byts 0.862 uS  1455201    96.0 byts  2.523 uS  2.101 uS HASH
54  2807737   58.0 byts 0.856 uS  1455201   104.0 byts  4.601 uS  4.001 uS REDBLK
55  2807737   58.0 byts 0.875 uS  1455201    78.7 byts  3.898 uS  2.687 uS JUDY
56 
57 With = 1.46 million lines stored and 2.81 million non-blank lines,
58 all the algorithms seem to keep their performance.  The hash table is a
59 perfect size for this data set.  I suspect only the HASH algorithm will
60 slow down with larger data sets unless the hash table is increased larger
61 than 2**20 million entrys.  I have now tried a data set with about 10
62 million unique lines on a 64 bit machine (google/data/repos.*).
63 The speeds are about the same.  Lookup times are in the 2-4 uSec range
64 and this is probably insignificant compared to the application needing
65 the data.  I think all four methods are fine for speed, and Judy is a
66 standout for its memory efficiency -- frequently using less memory than
67 the strings alone.
68 
69 I plan on tuning JudySL in the near future.  This should improve the
70 performance of JudySL with short (a text word) strings.  I have a data
71 base of all (28.8 million) domain names on the internet as a test case.
72 
73 wc /data/domnames.txt
74 28826508 28826508 488227095 /data/domnames.txt
75 
76 For the curious, JudySL (I.E JSLI/G macros in this program) is simply an
77 application subroutine that uses JudyL to the work.  JudyL is a very
78 scaleable word to word (or pointer) mapper.  JudySL is implemented as a
79 digital tree using JudyL arrays as 2^32 (or 2^64 in 64 bit machines)
80 wide (virtual) nodes.  Each level in the digital tree decodes the next 4
81 (or 8) bytes of the line/string until only one entry would be in the
82 node.  Then the remaining undecoded portion of the line is stored as a
83 string from the parent node.  A digital tree does not need rotation or
84 balancing.  In practice, digital trees are rarely use because of their
85 poor memory efficiency in the nodes and a tendency to have large depths.
86 These problems are solved in the Judy algorithm.  For more details see
87 application notes on:  www.sourcejudy.com.  And for the really curious,
88 a technical paper is available in the application section.  If you would
89 like to be a reviewer, contact Doug Baskins at <doug@sourcejudy.com>
90 
91 */
92 
93 #include <unistd.h>
94 #include <stdlib.h>
95 #include <stdio.h>              // printf
96 #include <fcntl.h>
97 #include <string.h>
98 #include <errno.h>              // errno
99 #include <sys/mman.h>           // mmap()
100 #include <sys/stat.h>           // stat()
101 #include <sys/time.h>           // gettimeofday()
102 
103 // remove all uses of this in production code.
104 int       gDupCnt = 0;          // counter for duplicate strings
105 
106 //=======================================================================
107 //      T I M I N G   M A C R O S
108 //=======================================================================
109 // if your machine is one of the supported types in the following header
110 // file then uncomment this corresponding to what the header file says.
111 // This will give very high timing resolution.
112 //
113 // #define JU_xxxxxxx 1         // read timeit.h
114 // #define JU_LINUX_IA32 1      // I.E. IA32 Linux
115 //
116 //
117 // optional for high resolution times and compile with timeit.c
118 //   cc -static -O -o Judy -DJUDYMETHOD SLcompare.c timeit.c -lJudy
119 //
120 //#include "timeit.h"
121 
122 double    DeltaUSec;            // Global for remembering delta times
123 
124 #ifndef _TIMEIT_H
125 
126 // Note: I have found some Linux systems (2.4.18-6mdk) to have bugs in the
127 // gettimeofday() routine.  Sometimes it returns a negative ~2840 microseconds
128 // instead of 0 or 1.  If you use the above #include "timeit.h" and compile with
129 // timeit.c and use -DJU_LINUX_IA32, that problem will be eliminated.  This is
130 // because for delta times less than .1 sec, the hardware free running timer
131 // is used instead of gettimeofday().  I have found the negative time problem
132 // appears about 40-50 times per second with consecutive gettimeofday() calls.
133 
134 #define TIMER_vars(T) struct timeval __TVBeg_##T, __TVEnd_##T
135 
136 #define STARTTm(T) gettimeofday(&__TVBeg_##T, NULL)
137 
138 #define ENDTm(D,T)                                                      \
139 {                                                                       \
140     gettimeofday(&__TVEnd_##T, NULL);                                   \
141     (D) = (double)(__TVEnd_##T.tv_sec  - __TVBeg_##T.tv_sec) * 1E6 +    \
142          ((double)(__TVEnd_##T.tv_usec - __TVBeg_##T.tv_usec));         \
143 }
144 
145 #endif // _TIMEIT_H
146 
147 //=======================================================================
148 //      M E M O R Y   S I Z E   M A C R O S
149 //=======================================================================
150 //      Most mallocs have mallinfo()
151 
152 // define this if your malloc does not have mallinfo();
153 #define NOMALLINFO 1
154 
155 double    DeltaMem;             // for remembering
156 
157 #ifndef NOMALLINFO
158 
159 #include <malloc.h>             // mallinfo()
160 
161 struct mallinfo malStart;
162 
163 #define STARTmem malStart = mallinfo() /* works with some mallocs */
164 #define ENDmem                                                  \
165 {                                                               \
166     struct mallinfo malEnd = mallinfo();                        \
167 /* strange little dance from signed to unsigned to double */    \
168     unsigned int _un_int = malEnd.arena - malStart.arena;       \
169     DeltaMem = _un_int;      /* to double */                    \
170 }
171 
172 #else // MALLINFO
173 
174 // this usually works for machines with less than 1-2Gb RAM.
175 // (it does not include memory ACQUIRED by mmap())
176 
177 char     *malStart;
178 
179 #define STARTmem (malStart = (char *)sbrk(0))
180 #define ENDmem                                                  \
181 {                                                               \
182     char  *malEnd =  (char *)sbrk(0);                           \
183     DeltaMem = malEnd - malStart;                               \
184 }
185 
186 #endif // MALLINFO
187 
188 
189 //=======================================================================
190 //      G E T S T R I N G   F R O M   mmap()   F I L E   M A C R O
191 //=======================================================================
192 //
193 // From memory map POINTER store next '\n' terminated string to BUFFER
194 // Delete spaces, tabs, returns and resulting blank lines.
195 
196 // POINTER must be check to be within memory mapped file range
197 //
198 // NOTE: This code will core-dump if a corrupt text file because
199 // POINTER is not checked to exceed end of file.
200 
201 #define MAXLINE 10000   // max length line
202 
203 #define GETLINE(BUFFER,POINTER)			\
204 {                                               \
205     char _chr;                                  \
206     int  _count = 0;                            \
207     for (;;)	/* forever */			\
208     {                                           \
209         switch (_chr = *POINTER++)		\
210         {					\
211         case  ' ':	/* eat spaces  */	\
212         case '\t':	/* eat tabs    */	\
213         case '\r':	/* eat returns */	\
214             continue;				\
215 	case '\n':      /* eat blank lines */	\
216 	    if (_count == 0) continue;		\
217 	case '\0':	/* Done */		\
218             BUFFER[_count++] = '\0';              \
219 	    break;				\
220 	default:	/* copy char */		\
221             if (_count == (MAXLINE - 1))	\
222 	    { 	        /* cut into 2 lines */	\
223                 BUFFER[_count++] = '\0';	\
224 	        POINTER--;			\
225 	        break;				\
226 	    }					\
227             BUFFER[_count++] = _chr;	        \
228 	    continue;				\
229 	}					\
230 	break;					\
231     }						\
232 }
233 
234 
235 //=======================================================================
236 //      J U D Y   M E T H O D
237 //=======================================================================
238 
239 #ifdef JUDYMETHOD
240 #define ADTMETHOD       "JUDY"
241 
242 #include <Judy.h>
243 
244 #define MEMOVD          0       /* no overhead in Judy */
245 #define INITARRAY       Pvoid_t JudyArray = NULL
246 
247 // store string into array
248 #define STORESTRING(STRING)                                             \
249 {                                                                       \
250         PWord_t _PValue;                                                \
251         JSLI(_PValue, JudyArray, (uint8_t *) STRING);                   \
252         if (++(*_PValue) != 1) gDupCnt++;  /* count dup strings */      \
253 }
254 
255 // get pointer to Value associated with string
256 
257 #define GETSTRING(STRING)                                               \
258 {                                                                       \
259         PWord_t _PValue;                                                \
260         JSLG(_PValue, JudyArray, (uint8_t *) STRING);                   \
261         if (_PValue == NULL)                 /* not found -- bug */     \
262         {                                                               \
263                printf("GETSTRING failed(Judy) -- BUG!\n\n");            \
264                exit(1);                                                 \
265         }                                                               \
266 }
267 
268 #endif // JUDYMETHOD
269 
270 // Note: this optimization by J. Zobel should not be necessary
271 // with the -static compile option (linux). (dlb)
272 
273 #ifdef SLOW_STRCMP
274 int
scmp(char * s1,char * s2)275 scmp(char *s1, char *s2)
276 {
277     while (*s1 != '\0' && *s1 == *s2)
278     {
279         s1++;
280         s2++;
281     }
282     return (*s1 - *s2);
283 }
284 #else // ! SLOW_STRCMP
285 
286 #define scmp strcmp
287 
288 #endif // ! SLOW_STRCMP
289 
290 //=======================================================================
291 //      H A S H   M E T H O D
292 //=======================================================================
293 
294 #ifdef HASHMETHOD
295 #define ADTMETHOD       "HASH"
296 
297 #define MEMOVD	(sizeof(ht))     /* the hash table is overhead */
298 
299 #define STORESTRING(STRING)   hashinsert(ht, STRING)
300 
301 #define GETSTRING(STRING)                                               \
302 {                                                                       \
303         HASHREC *_return;                                               \
304         _return = hashsearch(ht, STRING);                               \
305         if (_return == NULL)              /* not found -- bug */        \
306         {                                                               \
307                printf("GETSTRING(hash) failed -- BUG!\n\n");            \
308                exit(1);                                                 \
309         }                                                               \
310 }
311 
312 /* Author J. Zobel, April 2001.
313    Permission to use this code is freely granted, provided that this
314    statement is retained. */
315 
316 #define TSIZE (1LU << 20)  /* many processors need this to be a pwr of 2 */
317 #define SEED	1159241
318 #define HASHFN  bitwisehash
319 
320 #define INITARRAY       static HASHREC *ht[TSIZE]
321 #define SIZEOFINIT      sizeof(ht[TSIZE])
322 
323 typedef struct hashrec
324 {
325     char     *word;
326     struct hashrec *next;
327 }
328 HASHREC;
329 
330 /* Bitwise hash function.  Note that tsize does not have to be prime. */
331 unsigned int
bitwisehash(char * word,int tsize,unsigned int seed)332 bitwisehash(char *word, int tsize, unsigned int seed)
333 {
334     char      c;
335     unsigned int h;
336 
337     h = seed;
338     for (; (c = *word) != '\0'; word++)
339     {
340         h ^= ((h << 5) + c + (h >> 2));
341     }
342     return ((unsigned int)((h & 0x7fffffff) % tsize));
343 }
344 
345 #ifdef notdef                   // not needed if a static definition used in UNIX
346 /* Create hash table, initialise ptrs to NULL */
347 HASHREC **
inithashtable()348 inithashtable()
349 {
350     int       i;
351     HASHREC **ht;
352 
353     ht = (HASHREC **) malloc(sizeof(HASHREC *) * TSIZE);
354 
355     for (i = 0; i < TSIZE; i++)
356         ht[i] = (HASHREC *) NULL;
357 
358     return (ht);
359 }
360 #endif // notdef
361 
362 /* Search hash table for given string, return record if found, else NULL */
363 HASHREC  *
hashsearch(HASHREC ** ht,char * w)364 hashsearch(HASHREC ** ht, char *w)
365 {
366     HASHREC  *htmp, *hprv;
367     unsigned int hval = HASHFN(w, TSIZE, SEED);
368 
369     for (hprv = NULL, htmp = ht[hval];
370          htmp != NULL && scmp(htmp->word, w) != 0;
371          hprv = htmp, htmp = htmp->next)
372     {
373         ;
374     }
375 
376     if (hprv != NULL && htmp != NULL) /* move to front on access */
377     {
378         hprv->next = htmp->next;
379         htmp->next = ht[hval];
380         ht[hval] = htmp;
381     }
382 
383     return (htmp);
384 }
385 
386 /* Search hash table for given string, insert if not found */
387 void
hashinsert(HASHREC ** ht,char * w)388 hashinsert(HASHREC ** ht, char *w)
389 {
390     HASHREC  *htmp, *hprv;
391     unsigned int hval = HASHFN(w, TSIZE, SEED);
392 
393     for (hprv = NULL, htmp = ht[hval];
394          htmp != NULL && scmp(htmp->word, w) != 0;
395          hprv = htmp, htmp = htmp->next)
396     {
397         ;
398     }
399 
400     if (htmp == NULL)
401     {
402         htmp = (HASHREC *) malloc(sizeof(HASHREC));
403         htmp->word = (char *)malloc(strlen(w) + 1);
404         strcpy(htmp->word, w);
405         htmp->next = NULL;
406         if (hprv == NULL)
407             ht[hval] = htmp;
408         else
409             hprv->next = htmp;
410 
411         /* new records are not moved to front */
412     }
413     else
414     {
415         if (hprv != NULL)       /* move to front on access */
416         {
417             hprv->next = htmp->next;
418             htmp->next = ht[hval];
419             ht[hval] = htmp;
420         }
421         gDupCnt++;              /* count duplicate strings */
422     }
423 
424     return;
425 }
426 
427 #endif // HASHMETHOD
428 
429 //=======================================================================
430 //      S P L A Y   M E T H O D
431 //=======================================================================
432 
433 #ifdef SPLAYMETHOD
434 #define ADTMETHOD       "SPLAY"
435 
436 #define MEMOVD          0       /* not enough overhead to count */
437 
438 /* Author J. Zobel, April 2001.
439    Permission to use this code is freely granted, provided that this
440    statement is retained. */
441 
442 #define ROTATEFAC 11
443 
444 typedef struct wordrec
445 {
446     char     *word;
447     struct wordrec *left, *right;
448     struct wordrec *par;
449 }
450 TREEREC;
451 
452 typedef struct ansrec
453 {
454     struct wordrec *root;
455     struct wordrec *ans;
456 }
457 ANSREC;
458 
459 #define ONELEVEL(PAR,CURR,DIR,RID)      \
460     {                                   \
461 	PAR->DIR = CURR->RID;           \
462 	if(PAR->DIR!=NULL)              \
463 	    PAR->DIR->PAR = PAR;        \
464 	CURR->RID = PAR;                \
465 	PAR->PAR = CURR;                \
466 	CURR->PAR = NULL;               \
467     }
468 
469 #define ZIGZIG(GPAR,PAR,CURR,DIR,RID)   \
470     {                                   \
471 	CURR->PAR = GPAR->PAR;          \
472 	if (CURR->PAR != NULL)          \
473 	{                               \
474 	    if (CURR->PAR->DIR == GPAR) \
475 		CURR->PAR->DIR = CURR;  \
476 	    else                        \
477 		CURR->PAR->RID = CURR;  \
478 	}                               \
479 	GPAR->DIR = PAR->RID;           \
480 	if (GPAR->DIR != NULL)          \
481 	    GPAR->DIR->PAR = GPAR;      \
482 	PAR->DIR = CURR->RID;           \
483 	if (CURR->RID != NULL)          \
484 	    CURR->RID->PAR = PAR;       \
485 	CURR->RID = PAR;                \
486 	PAR->PAR = CURR;                \
487 	PAR->RID = GPAR;                \
488 	GPAR->PAR = PAR;                \
489     }
490 
491 #define ZIGZAG(GPAR,PAR,CURR,DIR,RID)   \
492     {                                   \
493 	CURR->PAR = GPAR->PAR;          \
494 	if (CURR->PAR != NULL)          \
495 	{                               \
496 	    if (CURR->PAR->DIR == GPAR) \
497 		CURR->PAR->DIR = CURR;  \
498 	    else                        \
499 		CURR->PAR->RID = CURR;  \
500 	}                               \
501 	PAR->RID = CURR->DIR;           \
502 	if (PAR->RID != NULL)           \
503 	    PAR->RID->PAR = PAR;        \
504 	GPAR->DIR = CURR->RID;          \
505 	if (GPAR->DIR != NULL)          \
506 	    GPAR->DIR->PAR = GPAR;      \
507 	CURR->DIR = PAR;                \
508 	PAR->PAR = CURR;                \
509 	CURR->RID = GPAR;               \
510 	GPAR->PAR = CURR;               \
511     }
512 
513 int       scount = ROTATEFAC;
514 
515 /* Search for word in a splay tree.  If word is found, bring it to
516    root, possibly intermittently.  Structure ans is used to pass
517    in the root, and to pass back both the new root (which may or
518    may not be changed) and the looked-for record. */
519 void
splaysearch(ANSREC * ans,char * word)520 splaysearch(ANSREC * ans, char *word)
521 {
522     TREEREC  *curr = ans->root, *par, *gpar;
523     int       val;
524 
525     scount--;
526 
527     if (ans->root == NULL)
528     {
529         ans->ans = NULL;
530         return;
531     }
532     while (curr != NULL && (val = scmp(word, curr->word)) != 0)
533     {
534         if (val > 0)
535             curr = curr->right;
536         else
537             curr = curr->left;
538     }
539 
540     ans->ans = curr;
541 
542     if (curr == ans->root)
543     {
544         return;
545     }
546 
547     if (scount <= 0 && curr != NULL) /* Move node towards root */
548     {
549         scount = ROTATEFAC;
550 
551         while ((par = curr->par) != NULL)
552         {
553             if (par->left == curr)
554             {
555                 if ((gpar = par->par) == NULL)
556                 {
557                     ONELEVEL(par, curr, left, right);
558                 }
559                 else if (gpar->left == par)
560                 {
561                     ZIGZIG(gpar, par, curr, left, right);
562                 }
563                 else
564                 {
565                     ZIGZAG(gpar, par, curr, right, left);
566                 }
567             }
568             else
569             {
570                 if ((gpar = par->par) == NULL)
571                 {
572                     ONELEVEL(par, curr, right, left);
573                 }
574                 else if (gpar->left == par)
575                 {
576                     ZIGZAG(gpar, par, curr, left, right);
577                 }
578                 else
579                 {
580                     ZIGZIG(gpar, par, curr, right, left);
581                 }
582             }
583         }
584         ans->root = curr;
585     }
586 
587     return;
588 }
589 
590 /* Insert word into a splay tree.  If word is already present, bring it to
591    root, possibly intermittently.  Structure ans is used to pass
592    in the root, and to pass back both the new root (which may or
593    may not be changed) and the looked-for record. */
594 
595 void
splayinsert(ANSREC * ans,char * word)596 splayinsert(ANSREC * ans, char *word)
597 {
598     TREEREC  *curr = ans->root, *par, *gpar, *prev = NULL, *wcreate();
599     int       val = 0;
600 
601     scount--;
602 
603     if (ans->root == NULL)
604     {
605         ans->ans = ans->root = wcreate(word, NULL);
606         return;
607     }
608 
609     while (curr != NULL && (val = scmp(word, curr->word)) != 0)
610     {
611         prev = curr;
612         if (val > 0)
613             curr = curr->right;
614         else
615             curr = curr->left;
616     }
617 
618     if (curr == NULL)
619     {
620         if (val > 0)
621             curr = prev->right = wcreate(word, prev);
622         else
623             curr = prev->left = wcreate(word, prev);
624     }
625     else
626         gDupCnt++;              /* count duplicate strings */
627 
628     ans->ans = curr;
629 
630     if (scount <= 0)            /* Move node towards root */
631     {
632         scount = ROTATEFAC;
633 
634         while ((par = curr->par) != NULL)
635         {
636             if (par->left == curr)
637             {
638                 if ((gpar = par->par) == NULL)
639                 {
640                     ONELEVEL(par, curr, left, right);
641                 }
642                 else if (gpar->left == par)
643                 {
644                     ZIGZIG(gpar, par, curr, left, right);
645                 }
646                 else
647                 {
648                     ZIGZAG(gpar, par, curr, right, left);
649                 }
650             }
651             else
652             {
653                 if ((gpar = par->par) == NULL)
654                 {
655                     ONELEVEL(par, curr, right, left);
656                 }
657                 else if (gpar->left == par)
658                 {
659                     ZIGZAG(gpar, par, curr, left, right);
660                 }
661                 else
662                 {
663                     ZIGZIG(gpar, par, curr, right, left);
664                 }
665             }
666         }
667         ans->root = curr;
668     }
669     return;
670 }
671 
672 /* Create a node to hold a word */
673 TREEREC  *
wcreate(char * word,TREEREC * par)674 wcreate(char *word, TREEREC * par)
675 {
676     TREEREC  *tmp;
677 
678     tmp = (TREEREC *) malloc(sizeof(TREEREC));
679     tmp->word = (char *)malloc(strlen(word) + 1);
680     strcpy(tmp->word, word);
681     tmp->left = tmp->right = NULL;
682 
683     tmp->par = par;
684 
685     return (tmp);
686 }
687 
688 #define INITARRAY ANSREC      ans = {0}
689 
690 #define STORESTRING(STRING)   splayinsert(&ans, STRING)
691 
692 #define GETSTRING(STRING)     splaysearch(&ans, STRING)
693 
694 #endif // SPLAYMETHOD
695 
696 //=======================================================================
697 //      R E D B L A C K   M E T H O D
698 //=======================================================================
699 
700 #ifdef REDBLACKMETHOD
701 #define ADTMETHOD       "REDBLACK"
702 
703 #define MEMOVD          0       /* not enough overhead to count */
704 
705 /* Author J. Zobel, April 2001.
706    Permission to use this code is freely granted, provided that this
707    statement is retained. */
708 
709 typedef struct wordrec
710 {
711     char     *word;
712     struct wordrec *left, *right;
713     struct wordrec *par;
714     char      colour;
715 }
716 TREEREC;
717 
718 typedef struct ansrec
719 {
720     struct wordrec *root;
721     struct wordrec *ans;
722 }
723 ANSREC;
724 
725 #define RED		0
726 #define BLACK		1
727 
728 /* Find word in a redblack tree */
729 
730 void
redblacksearch(ANSREC * ans,char * word)731 redblacksearch(ANSREC * ans, char *word)
732 {
733     TREEREC  *curr = ans->root;
734     int       val;
735 
736     if (ans->root != NULL)
737     {
738         while (curr != NULL && (val = scmp(word, curr->word)) != 0)
739         {
740             if (val > 0)
741                 curr = curr->right;
742             else
743                 curr = curr->left;
744         }
745     }
746 
747     ans->ans = curr;
748 
749     return;
750 }
751 
752 /* Rotate the right child of par upwards */
753 /* Could be written as a macro, but not really necessary
754 as it is only called on insertion */
755 
756 void
leftrotate(ANSREC * ans,TREEREC * par)757 leftrotate(ANSREC * ans, TREEREC * par)
758 {
759     TREEREC  *curr, *gpar;
760 
761     if ((curr = par->right) != NULL)
762     {
763         par->right = curr->left;
764         if (curr->left != NULL)
765             curr->left->par = par;
766         curr->par = par->par;
767         if ((gpar = par->par) == NULL)
768             ans->root = curr;
769         else
770         {
771             if (par == gpar->left)
772                 gpar->left = curr;
773             else
774                 gpar->right = curr;
775         }
776         curr->left = par;
777         par->par = curr;
778     }
779 }
780 
781 /* Rotate the left child of par upwards */
782 void
rightrotate(ANSREC * ans,TREEREC * par)783 rightrotate(ANSREC * ans, TREEREC * par)
784 {
785     TREEREC  *curr, *gpar;
786 
787     if ((curr = par->left) != NULL)
788     {
789         par->left = curr->right;
790         if (curr->right != NULL)
791             curr->right->par = par;
792         curr->par = par->par;
793         if ((gpar = par->par) == NULL)
794             ans->root = curr;
795         else
796         {
797             if (par == gpar->left)
798                 gpar->left = curr;
799             else
800                 gpar->right = curr;
801         }
802         curr->right = par;
803         par->par = curr;
804     }
805 }
806 
807 /* Insert word into a redblack tree */
808 void
redblackinsert(ANSREC * ans,char * word)809 redblackinsert(ANSREC * ans, char *word)
810 {
811     TREEREC  *curr = ans->root, *par, *gpar, *prev = NULL, *wcreate();
812     int       val;
813 
814     if (ans->root == NULL)
815     {
816         ans->ans = ans->root = wcreate(word, NULL);
817         return;
818     }
819     while (curr != NULL && (val = scmp(word, curr->word)) != 0)
820     {
821         prev = curr;
822         if (val > 0)
823             curr = curr->right;
824         else
825             curr = curr->left;
826     }
827 
828     ans->ans = curr;
829 
830     if (curr == NULL)
831         /* Insert a new node, rotate up if necessary */
832     {
833         if (val > 0)
834             curr = prev->right = wcreate(word, prev);
835         else
836             curr = prev->left = wcreate(word, prev);
837 
838         curr->colour = RED;
839         while ((par = curr->par) != NULL
840                && (gpar = par->par) != NULL && curr->par->colour == RED)
841         {
842             if (par == gpar->left)
843             {
844                 if (gpar->right != NULL && gpar->right->colour == RED)
845                 {
846                     par->colour = BLACK;
847                     gpar->right->colour = BLACK;
848                     gpar->colour = RED;
849                     curr = gpar;
850                 }
851                 else
852                 {
853                     if (curr == par->right)
854                     {
855                         curr = par;
856                         leftrotate(ans, curr);
857                         par = curr->par;
858                     }
859                     par->colour = BLACK;
860                     if ((gpar = par->par) != NULL)
861                     {
862                         gpar->colour = RED;
863                         rightrotate(ans, gpar);
864                     }
865                 }
866             }
867             else
868             {
869                 if (gpar->left != NULL && gpar->left->colour == RED)
870                 {
871                     par->colour = BLACK;
872                     gpar->left->colour = BLACK;
873                     gpar->colour = RED;
874                     curr = gpar;
875                 }
876                 else
877                 {
878                     if (curr == par->left)
879                     {
880                         curr = par;
881                         rightrotate(ans, curr);
882                         par = curr->par;
883                     }
884                     par->colour = BLACK;
885                     if ((gpar = par->par) != NULL)
886                     {
887                         gpar->colour = RED;
888                         leftrotate(ans, gpar);
889                     }
890                 }
891             }
892         }
893         if (curr->par == NULL)
894             ans->root = curr;
895         ans->root->colour = BLACK;
896     }
897     else
898         gDupCnt++;              /* count duplicate strings */
899 
900     return;
901 }
902 
903 /* Create a node to hold a word */
904 TREEREC  *
wcreate(char * word,TREEREC * par)905 wcreate(char *word, TREEREC * par)
906 {
907     TREEREC  *tmp;
908 
909     tmp = (TREEREC *) malloc(sizeof(TREEREC));
910     tmp->word = (char *)malloc(strlen(word) + 1);
911     strcpy(tmp->word, word);
912     tmp->left = tmp->right = NULL;
913 
914     tmp->par = par;
915 
916     return (tmp);
917 }
918 
919 #define INITARRAY ANSREC      ans = {0}
920 
921 #define STORESTRING(STRING)   redblackinsert(&ans, STRING)
922 
923 #define GETSTRING(STRING)     redblacksearch(&ans, STRING)
924 
925 #endif // REDBLACKMETHOD
926 
927 //=======================================================================
928 //      M A I N   P R O G R A M  -by-  Doug Baskins
929 //=======================================================================
930 
931 // error routine for system routines for accessing file
932 
933 #define FILERROR                                                        \
934 {                                                                       \
935     printf("%s: Cannot open file \"%s\": %s "                           \
936 		"(errno = %d)\n", argv[0], argv[1], strerror(errno),    \
937 		errno);                                                 \
938     fprintf(stderr, "%s: Cannot open file \"%s\": %s "                  \
939 		"(errno = %d)\n", argv[0], argv[1], strerror(errno),    \
940 		errno);                                                 \
941     exit(1);                                                            \
942 }
943 
944 
945 //=======================================================================
946 //      M E A S U R E  A D T   S P E E D  and  M E M O R Y  U S A G E
947 //=======================================================================
948 
949 int
main(int argc,char * argv[])950 main(int argc, char *argv[])
951 {
952     TIMER_vars(tm);             // declare timer variables
953     int       fd;               // to read file.
954     struct stat statbuf;        // to get size of file
955     char     *Pfile;            // ram address of file
956     size_t    fsize;            // file size in bytes
957     char     *FSmap;            // start address of mapped file
958     char     *FEmap;            // end address+1 of mapped file
959     char      String[MAXLINE];  // input buffer
960     int       StrCnt;           // line counter
961     int       ii;               // temp
962 
963     INITARRAY;
964 
965 // CHECK FOR REQUIRED INPUT FILE PARAMETER:
966 
967     if (argc != 2)
968     {
969         printf("Usage: %s <text file>\n", argv[0]);
970         exit(1);
971     }
972 
973 // GET FILE SIZE
974 
975     if (stat(argv[1], &statbuf) == -1)
976         FILERROR;
977 
978     fsize = statbuf.st_size;
979 
980 // OPEN INPUT FILE:
981 
982     if ((fd = open(argv[1], O_RDONLY)) == -1)
983         FILERROR;
984 
985 // MEMORY MAP FILE
986 
987     Pfile =
988         (char *)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
989     if (Pfile == (char *)-1)
990         FILERROR;
991 
992     FEmap = Pfile + fsize;      // set end+1 address
993 
994 // print command name + run arguments
995 
996     printf("# ");
997     for (ii = 0; ii < argc; ii++)
998         printf("%s ", argv[ii]);
999     printf("\n");
1000     fflush(stdout);
1001 
1002 // file is in RAM, read again to measure time
1003 
1004     STARTTm(tm);                // start timer
1005 
1006     for (StrCnt = 0, FSmap = Pfile; FSmap < FEmap; )
1007     {
1008         GETLINE(String, FSmap); // 'read' next string
1009 
1010         StrCnt++;
1011     }
1012     ENDTm(DeltaUSec, tm);       // end timer
1013 //
1014 //  print header - 6 fields
1015 
1016     printf("#  lines avg_linelen  getline   stored");
1017     printf(" RAMused/line  store/ln lookup/ln ADT\n");
1018 
1019 //  print numb lines  filesize/line
1020 
1021     printf("%8u", StrCnt);      // Number of lines
1022     printf(" %6.1f byts", (double)fsize / (double)StrCnt); // filesize/line
1023     fflush(stdout);
1024 
1025 //  print time to read a line
1026 
1027     printf(" %5.3f uS", DeltaUSec / (double)StrCnt);
1028     fflush(stdout);
1029 
1030 // INPUT/STORE LOOP:
1031 
1032 // read each input line into String and store into ADT
1033 
1034     STARTmem;                   // current malloc() mem usage
1035     STARTTm(tm);                // start timer
1036 
1037     for (FSmap = Pfile; FSmap < FEmap; )
1038     {
1039         GETLINE(String, FSmap);  // 'read' next string
1040 
1041         STORESTRING(String);
1042     }
1043     ENDTm(DeltaUSec, tm);       // end timer
1044     ENDmem;                     // current malloc() mem usage
1045 
1046 //  print number of non-duplicate lines
1047 
1048     printf(" %8u", StrCnt - gDupCnt); // duplicate lines
1049 
1050 //  print RAM used by malloc() by ADT to store data
1051 
1052     printf(" %7.1f byts", (double)(DeltaMem + MEMOVD) /
1053             (double)(StrCnt - gDupCnt));
1054 
1055 //  print time per line to store the data
1056 
1057     printf(" %6.3f uS", DeltaUSec / (double)StrCnt);
1058     fflush(stdout);
1059 
1060 // READ BACK LOOP
1061 
1062     STARTTm(tm);                // start timer
1063 
1064     for (FSmap = Pfile; FSmap < FEmap; )
1065     {
1066         GETLINE(String, FSmap);  // 'read' next string
1067 
1068         GETSTRING(String);
1069     }
1070     ENDTm(DeltaUSec, tm);       // end timer
1071 
1072 //  print time per line to lookup the data from the ADT
1073 
1074     printf(" %6.3f uS", DeltaUSec / (double)StrCnt); // store time/line
1075     printf(" %s\n", ADTMETHOD);
1076 
1077     return (0);                 // done
1078 
1079 }                               // main()
1080