1 //  Judy arrays 13 DEC 2012 (judy64n.c from http://code.google.com/p/judyarray/ )
2 //  This code is public domain.
3 
4 //  Author Karl Malbrain, malbrain AT yahoo.com
5 //  with assistance from Jan Weiss.
6 //  modifications (and any bugs) by Mark Pictor, mpictor at gmail
7 
8 //  Simplified judy arrays for strings and integers
9 //  Adapted from the ideas of Douglas Baskins of HP.
10 
11 //  Map a set of keys to corresponding memory cells (uints).
12 //  Each cell must be set to a non-zero value by the caller.
13 
14 //  STANDALONE is defined to compile into a string sorter.
15 
16 //  String mappings are denoted by calling judy_open with zero as
17 //  the second argument.
18 
19 #ifndef STANDALONE
20 #  error must define STANDALONE while compiling this file and judy64.c
21 #endif
22 
23 #include "judy.h"
24 #include "sort.h"
25 
26 unsigned int MaxMem = 0;
27 
28 //    usage:
29 //    a.out [in-file] [out-file] [keysize] [recordlen] [keyoffset] [mergerecs]
30 //    where keysize is 10 to indicate pennysort files
31 
32 //    ASKITIS compilation:
33 //    g++ -O3 -fpermissive -fomit-frame-pointer -w -D STANDALONE -D ASKITIS -o judy64n judy64n.c
34 //    ./judy64n [input-file-to-build-judy] e.g. distinct_1 or skew1_1
35 
36 //    note:  the judy array is an in-memory data structure. As such, make sure you
37 //    have enough memory to hold the entire input file + data structure, otherwise
38 //    you'll have to break the input file into smaller pieces and load them in
39 //    on-by-one.
40 
41 //    Also, the file to search judy is hardcoded to skew1_1.
42 
main(int argc,char ** argv)43 int main( int argc, char ** argv ) {
44     unsigned char buff[1024];
45     JudySlot max = 0;
46     JudySlot * cell;
47     FILE * in, *out;
48     void * judy;
49     unsigned int len;
50     unsigned int idx;
51 #ifdef ASKITIS
52     char * askitis;
53     int prev, off;
54     float insert_real_time = 0.0;
55     float search_real_time = 0.0;
56     int size;
57 #if !defined(_WIN32)
58     timer start, stop;
59 #else
60     time_t start[1], stop[1];
61 #endif
62 #endif
63 
64     if( argc > 1 ) {
65         in = fopen( argv[1], "rb" );
66     } else {
67         in = stdin;
68     }
69 
70     if( argc > 2 ) {
71         out = fopen( argv[2], "wb" );
72     } else {
73         out = stdout;
74     }
75 
76     setvbuf( out, NULL, _IOFBF, 4096 * 1024 );
77 
78     if( !in ) {
79         fprintf( stderr, "unable to open input file\n" );
80     }
81 
82     if( !out ) {
83         fprintf( stderr, "unable to open output file\n" );
84     }
85 
86     if( argc > 6 ) {
87         PennyRecs = atoi( argv[6] );
88     }
89 
90     if( argc > 5 ) {
91         PennyOff = atoi( argv[5] );
92     }
93 
94     if( argc > 4 ) {
95         PennyLine = atoi( argv[4] );
96     }
97 
98     PennyMerge = ( unsigned long long )PennyLine * PennyRecs;
99 
100     if( argc > 3 ) {
101         PennyKey = atoi( argv[3] );
102         sort( in, argv[2] );
103         return merge( out, argv[2] );
104     }
105 
106 #ifdef ASKITIS
107     judy = judy_open( 1024, 0 );
108 
109 //    build judy array
110     size = lseek( fileno( in ), 0L, 2 );
111     askitis = malloc( size );
112     lseek( fileno( in ), 0L, 0 );
113     read( fileno( in ), askitis, size );
114     prev = 0;
115 //    naskitis.com:
116 //    Start the timer.
117 
118 #if !defined(_WIN32)
119     gettimeofday( &start, NULL );
120 #else
121     time( start );
122 #endif
123 
124     for( off = 0; off < size; off++ )
125         if( askitis[off] == '\n' ) {
126             *( judy_cell( judy, askitis + prev, off - prev ) ) += 1;   // count instances of string
127             prev = off + 1;
128         }
129 //    naskitis.com:
130 //    Stop the timer and do some math to compute the time required to insert the strings into the judy array.
131 
132 #if !defined(_WIN32)
133     gettimeofday( &stop, NULL );
134 
135     insert_real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * ( stop.tv_usec - start.tv_usec );
136     insert_real_time = insert_real_time / 1000.0;
137 #else
138     time( stop );
139     insert_real_time = *stop - *start;
140 #endif
141 
142 //    naskitis.com:
143 //    Free the input buffer used to store the first file.  We must do this before we get the process size below.
144     free( askitis );
145     fprintf( stderr, "JudyArray@Karl_Malbrain\nDASKITIS option enabled\n-------------------------------\n%-20s %.2f MB\n%-20s %.2f sec\n",
146              "Judy Array size:", MaxMem / 1000000., "Time to insert:", insert_real_time );
147     fprintf( stderr, "%-20s %d\n", "Words:", Words );
148     fprintf( stderr, "%-20s %d\n", "Inserts:", Inserts );
149     fprintf( stderr, "%-20s %d\n", "Found:", Found );
150 
151     Words = 0;
152     Inserts = 0;
153     Found = 0;
154 
155 //    search judy array
156     if( in = freopen( "skew1_1", "rb", in ) ) {
157         size = lseek( fileno( in ), 0L, 2 );
158     } else {
159         exit( 0 );
160     }
161     askitis = malloc( size );
162     lseek( fileno( in ), 0L, 0 );
163     read( fileno( in ), askitis, size );
164     prev = 0;
165 
166 #if !defined(_WIN32)
167     gettimeofday( &start, NULL );
168 #else
169     time( start );
170 #endif
171 
172     for( off = 0; off < size; off++ )
173         if( askitis[off] == '\n' ) {
174             *judy_cell( judy, askitis + prev, off - prev ) += 1;
175             prev = off + 1;
176         }
177 //    naskitis.com:
178 //    Stop the timer and do some math to compute the time required to search the judy array.
179 
180 #if !defined(_WIN32)
181     gettimeofday( &stop, NULL );
182     search_real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001
183                        * ( stop.tv_usec - start.tv_usec );
184     search_real_time = search_real_time / 1000.0;
185 #else
186     time( stop );
187     search_real_time = *stop - *start;
188 #endif
189 
190 //    naskitis.com:
191 //    To do: report a count on the number of strings found.
192 
193     fprintf( stderr, "\n%-20s %.2f MB\n%-20s %.2f sec\n",
194              "Judy Array size:", MaxMem / 1000000., "Time to search:", search_real_time );
195     fprintf( stderr, "%-20s %d\n", "Words:", Words );
196     fprintf( stderr, "%-20s %d\n", "Inserts:", Inserts );
197     fprintf( stderr, "%-20s %d\n", "Found:", Found );
198     exit( 0 );
199 #endif
200 
201     judy = judy_open( 1024, 0 );
202 
203     while( fgets( ( char * )buff, sizeof( buff ), in ) ) {
204         if( len = strlen( ( const char * )buff ) ) {
205             buff[--len] = 0;    // remove LF
206         }
207         *( judy_cell( judy, buff, len ) ) += 1;     // count instances of string
208         max++;
209     }
210 
211     fprintf( stderr, "%" PRIuint " memory used\n", MaxMem );
212 
213     cell = judy_strt( judy, NULL, 0 );
214 
215     if( cell ) do {
216             len = judy_key( judy, buff, sizeof( buff ) );
217             for( idx = 0; idx < *cell; idx++ ) {       // spit out duplicates
218                 fwrite( buff, len, 1, out );
219                 fputc( '\n', out );
220             }
221         } while( cell = judy_nxt( judy ) );
222 
223 #if 0
224     // test deletion all the way to an empty tree
225 
226     if( cell = judy_prv( judy ) )
227         do {
228             max -= *cell;
229         } while( cell = judy_del( judy ) );
230 
231     assert( max == 0 );
232 #endif
233     judy_close( judy );
234     return 0;
235 }
236 
237