1 // Judy arrays 13 DEC 2012 (judy64n.c from http://code.google.com/p/judyarray/ )
2 // This code is public domain.
3
4 // Author Karl Malbrain, malbrain AT yahoo.com
5 // with assistance from Jan Weiss.
6 // modifications (and any bugs) by Mark Pictor, mpictor at gmail
7
8 // Simplified judy arrays for strings and integers
9 // Adapted from the ideas of Douglas Baskins of HP.
10
11 // Map a set of keys to corresponding memory cells (uints).
12 // Each cell must be set to a non-zero value by the caller.
13
14 // STANDALONE is defined to compile into a string sorter.
15
16 // String mappings are denoted by calling judy_open with zero as
17 // the second argument.
18
19 #ifndef STANDALONE
20 # error must define STANDALONE while compiling this file and judy64.c
21 #endif
22
23 #include "judy.h"
24 #include "sort.h"
25
26 unsigned int MaxMem = 0;
27
28 // usage:
29 // a.out [in-file] [out-file] [keysize] [recordlen] [keyoffset] [mergerecs]
30 // where keysize is 10 to indicate pennysort files
31
32 // ASKITIS compilation:
33 // g++ -O3 -fpermissive -fomit-frame-pointer -w -D STANDALONE -D ASKITIS -o judy64n judy64n.c
34 // ./judy64n [input-file-to-build-judy] e.g. distinct_1 or skew1_1
35
36 // note: the judy array is an in-memory data structure. As such, make sure you
37 // have enough memory to hold the entire input file + data structure, otherwise
38 // you'll have to break the input file into smaller pieces and load them in
39 // on-by-one.
40
41 // Also, the file to search judy is hardcoded to skew1_1.
42
main(int argc,char ** argv)43 int main( int argc, char ** argv ) {
44 unsigned char buff[1024];
45 JudySlot max = 0;
46 JudySlot * cell;
47 FILE * in, *out;
48 void * judy;
49 unsigned int len;
50 unsigned int idx;
51 #ifdef ASKITIS
52 char * askitis;
53 int prev, off;
54 float insert_real_time = 0.0;
55 float search_real_time = 0.0;
56 int size;
57 #if !defined(_WIN32)
58 timer start, stop;
59 #else
60 time_t start[1], stop[1];
61 #endif
62 #endif
63
64 if( argc > 1 ) {
65 in = fopen( argv[1], "rb" );
66 } else {
67 in = stdin;
68 }
69
70 if( argc > 2 ) {
71 out = fopen( argv[2], "wb" );
72 } else {
73 out = stdout;
74 }
75
76 setvbuf( out, NULL, _IOFBF, 4096 * 1024 );
77
78 if( !in ) {
79 fprintf( stderr, "unable to open input file\n" );
80 }
81
82 if( !out ) {
83 fprintf( stderr, "unable to open output file\n" );
84 }
85
86 if( argc > 6 ) {
87 PennyRecs = atoi( argv[6] );
88 }
89
90 if( argc > 5 ) {
91 PennyOff = atoi( argv[5] );
92 }
93
94 if( argc > 4 ) {
95 PennyLine = atoi( argv[4] );
96 }
97
98 PennyMerge = ( unsigned long long )PennyLine * PennyRecs;
99
100 if( argc > 3 ) {
101 PennyKey = atoi( argv[3] );
102 sort( in, argv[2] );
103 return merge( out, argv[2] );
104 }
105
106 #ifdef ASKITIS
107 judy = judy_open( 1024, 0 );
108
109 // build judy array
110 size = lseek( fileno( in ), 0L, 2 );
111 askitis = malloc( size );
112 lseek( fileno( in ), 0L, 0 );
113 read( fileno( in ), askitis, size );
114 prev = 0;
115 // naskitis.com:
116 // Start the timer.
117
118 #if !defined(_WIN32)
119 gettimeofday( &start, NULL );
120 #else
121 time( start );
122 #endif
123
124 for( off = 0; off < size; off++ )
125 if( askitis[off] == '\n' ) {
126 *( judy_cell( judy, askitis + prev, off - prev ) ) += 1; // count instances of string
127 prev = off + 1;
128 }
129 // naskitis.com:
130 // Stop the timer and do some math to compute the time required to insert the strings into the judy array.
131
132 #if !defined(_WIN32)
133 gettimeofday( &stop, NULL );
134
135 insert_real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * ( stop.tv_usec - start.tv_usec );
136 insert_real_time = insert_real_time / 1000.0;
137 #else
138 time( stop );
139 insert_real_time = *stop - *start;
140 #endif
141
142 // naskitis.com:
143 // Free the input buffer used to store the first file. We must do this before we get the process size below.
144 free( askitis );
145 fprintf( stderr, "JudyArray@Karl_Malbrain\nDASKITIS option enabled\n-------------------------------\n%-20s %.2f MB\n%-20s %.2f sec\n",
146 "Judy Array size:", MaxMem / 1000000., "Time to insert:", insert_real_time );
147 fprintf( stderr, "%-20s %d\n", "Words:", Words );
148 fprintf( stderr, "%-20s %d\n", "Inserts:", Inserts );
149 fprintf( stderr, "%-20s %d\n", "Found:", Found );
150
151 Words = 0;
152 Inserts = 0;
153 Found = 0;
154
155 // search judy array
156 if( in = freopen( "skew1_1", "rb", in ) ) {
157 size = lseek( fileno( in ), 0L, 2 );
158 } else {
159 exit( 0 );
160 }
161 askitis = malloc( size );
162 lseek( fileno( in ), 0L, 0 );
163 read( fileno( in ), askitis, size );
164 prev = 0;
165
166 #if !defined(_WIN32)
167 gettimeofday( &start, NULL );
168 #else
169 time( start );
170 #endif
171
172 for( off = 0; off < size; off++ )
173 if( askitis[off] == '\n' ) {
174 *judy_cell( judy, askitis + prev, off - prev ) += 1;
175 prev = off + 1;
176 }
177 // naskitis.com:
178 // Stop the timer and do some math to compute the time required to search the judy array.
179
180 #if !defined(_WIN32)
181 gettimeofday( &stop, NULL );
182 search_real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001
183 * ( stop.tv_usec - start.tv_usec );
184 search_real_time = search_real_time / 1000.0;
185 #else
186 time( stop );
187 search_real_time = *stop - *start;
188 #endif
189
190 // naskitis.com:
191 // To do: report a count on the number of strings found.
192
193 fprintf( stderr, "\n%-20s %.2f MB\n%-20s %.2f sec\n",
194 "Judy Array size:", MaxMem / 1000000., "Time to search:", search_real_time );
195 fprintf( stderr, "%-20s %d\n", "Words:", Words );
196 fprintf( stderr, "%-20s %d\n", "Inserts:", Inserts );
197 fprintf( stderr, "%-20s %d\n", "Found:", Found );
198 exit( 0 );
199 #endif
200
201 judy = judy_open( 1024, 0 );
202
203 while( fgets( ( char * )buff, sizeof( buff ), in ) ) {
204 if( len = strlen( ( const char * )buff ) ) {
205 buff[--len] = 0; // remove LF
206 }
207 *( judy_cell( judy, buff, len ) ) += 1; // count instances of string
208 max++;
209 }
210
211 fprintf( stderr, "%" PRIuint " memory used\n", MaxMem );
212
213 cell = judy_strt( judy, NULL, 0 );
214
215 if( cell ) do {
216 len = judy_key( judy, buff, sizeof( buff ) );
217 for( idx = 0; idx < *cell; idx++ ) { // spit out duplicates
218 fwrite( buff, len, 1, out );
219 fputc( '\n', out );
220 }
221 } while( cell = judy_nxt( judy ) );
222
223 #if 0
224 // test deletion all the way to an empty tree
225
226 if( cell = judy_prv( judy ) )
227 do {
228 max -= *cell;
229 } while( cell = judy_del( judy ) );
230
231 assert( max == 0 );
232 #endif
233 judy_close( judy );
234 return 0;
235 }
236
237