1 // Judy arrays 13 DEC 2012 (judy64n.c from http://code.google.com/p/judyarray/ )
2 // This code is public domain.
3
4 // Author Karl Malbrain, malbrain AT yahoo.com
5 // with assistance from Jan Weiss.
6 // modifications (and any bugs) by Mark Pictor, mpictor at gmail
7
8 // Simplified judy arrays for strings and integers
9 // Adapted from the ideas of Douglas Baskins of HP.
10
11 // Map a set of keys to corresponding memory cells (uints).
12 // Each cell must be set to a non-zero value by the caller.
13
14 // STANDALONE is defined to compile into a string sorter.
15
16 // String mappings are denoted by calling judy_open with zero as
17 // the second argument. Integer mappings are denoted by calling
18 // judy_open with the Integer depth of the Judy Trie as the second
19 // argument.
20
21 #include "judy.h"
22 #include "sort.h"
23
24 // memory map input file and sort
25
26 // define pennysort parameters
27 unsigned int PennyRecs = ( 4096 * 400 ); // records to sort to temp files
28 unsigned int PennyLine = 100; // length of input record
29 unsigned int PennyKey = 10; // length of input key
30 unsigned int PennyOff = 0; // key offset in input record
31
32 unsigned long long PennyMerge; // PennyRecs * PennyLine = file map length
33 unsigned int PennyPasses; // number of intermediate files created
34 unsigned int PennySortTime; // cpu time to run sort
35 unsigned int PennyMergeTime; // cpu time to run merge
36
sort(FILE * infile,char * outname)37 void sort( FILE * infile, char * outname ) {
38 unsigned long long size, off, offset, part;
39 int ifd = fileno( infile );
40 char filename[512];
41 PennySort * line;
42 JudySlot * cell;
43 unsigned char * inbuff;
44 void * judy;
45 FILE * out;
46 #if defined(_WIN32)
47 HANDLE hndl, fm;
48 DWORD hiword;
49 FILETIME dummy[1];
50 FILETIME user[1];
51 #else
52 struct tms buff[1];
53 #endif
54 time_t start = time( NULL );
55
56 if( PennyOff + PennyKey > PennyLine ) {
57 fprintf( stderr, "Key Offset + Key Length > Record Length\n" ), exit( 1 );
58 }
59
60 offset = 0;
61 PennyPasses = 0;
62
63 #if defined(_WIN32)
64 hndl = ( HANDLE )_get_osfhandle( ifd );
65 size = GetFileSize( hndl, &hiword );
66 fm = CreateFileMapping( hndl, NULL, PAGE_READONLY, hiword, ( DWORD )size, NULL );
67 if( !fm ) {
68 fprintf( stderr, "CreateFileMapping error %d\n", GetLastError() ), exit( 1 );
69 }
70 size |= ( unsigned long long )hiword << 32;
71 #else
72 size = lseek( ifd, 0L, 2 );
73 #endif
74
75 while( offset < size ) {
76 #if defined(_WIN32)
77 part = offset + PennyMerge > size ? size - offset : PennyMerge;
78 inbuff = MapViewOfFile( fm, FILE_MAP_READ, offset >> 32, offset, part );
79 if( !inbuff ) {
80 fprintf( stderr, "MapViewOfFile error %d\n", GetLastError() ), exit( 1 );
81 }
82 #else
83 inbuff = mmap( NULL, PennyMerge, PROT_READ, MAP_SHARED, ifd, offset );
84
85 if( inbuff == MAP_FAILED ) {
86 fprintf( stderr, "mmap error %d\n", errno ), exit( 1 );
87 }
88
89 if( madvise( inbuff, PennyMerge, MADV_WILLNEED | MADV_SEQUENTIAL ) < 0 ) {
90 fprintf( stderr, "madvise error %d\n", errno );
91 }
92 #endif
93 judy = judy_open( PennyKey, 0 );
94
95 off = 0;
96
97 // build judy array from mapped input chunk
98
99 while( offset + off < size && off < PennyMerge ) {
100 line = judy_data( judy, sizeof( PennySort ) );
101 cell = judy_cell( judy, inbuff + off + PennyOff, PennyKey );
102 line->next = *( void ** )cell;
103 line->buff = inbuff + off;
104
105 *( PennySort ** )cell = line;
106 off += PennyLine;
107 }
108
109 sprintf( filename, "%s.%d", outname, PennyPasses );
110 out = fopen( filename, "wb" );
111 setvbuf( out, NULL, _IOFBF, 4096 * 1024 );
112
113 #ifndef _WIN32
114 if( madvise( inbuff, PennyMerge, MADV_WILLNEED | MADV_RANDOM ) < 0 ) {
115 fprintf( stderr, "madvise error %d\n", errno );
116 }
117 #endif
118
119 // write judy array in sorted order to temporary file
120
121 cell = judy_strt( judy, NULL, 0 );
122
123 if( cell ) do {
124 line = *( PennySort ** )cell;
125 do {
126 fwrite( line->buff, PennyLine, 1, out );
127 } while( line = line->next );
128 } while( cell = judy_nxt( judy ) );
129
130 #if defined(_WIN32)
131 UnmapViewOfFile( inbuff );
132 #else
133 munmap( inbuff, PennyMerge );
134 #endif
135 judy_close( judy );
136 offset += off;
137 fflush( out );
138 fclose( out );
139 PennyPasses++;
140 }
141 fprintf( stderr, "End Sort %llu secs", ( unsigned long long ) time( NULL ) - start );
142 #if defined(_WIN32)
143 CloseHandle( fm );
144 GetProcessTimes( GetCurrentProcess(), dummy, dummy, dummy, user );
145 PennySortTime = *( unsigned long long * )user / 10000000;
146 #else
147 times( buff );
148 PennySortTime = buff->tms_utime / 100;
149 #endif
150 fprintf( stderr, " Cpu %d\n", PennySortTime );
151 }
152
merge(FILE * out,char * outname)153 int merge( FILE * out, char * outname ) {
154 time_t start = time( NULL );
155 char filename[512];
156 JudySlot * cell;
157 unsigned int nxt, idx;
158 unsigned char ** line;
159 unsigned int * next;
160 void * judy;
161 FILE ** in;
162
163 next = calloc( PennyPasses + 1, sizeof( unsigned int ) );
164 line = calloc( PennyPasses, sizeof( void * ) );
165 in = calloc( PennyPasses, sizeof( void * ) );
166
167 judy = judy_open( PennyKey, 0 );
168
169 // initialize merge with one record from each temp file
170
171 for( idx = 0; idx < PennyPasses; idx++ ) {
172 sprintf( filename, "%s.%d", outname, idx );
173 in[idx] = fopen( filename, "rb" );
174 line[idx] = malloc( PennyLine );
175 setvbuf( in[idx], NULL, _IOFBF, 4096 * 1024 );
176 fread( line[idx], PennyLine, 1, in[idx] );
177 cell = judy_cell( judy, line[idx] + PennyOff, PennyKey );
178 next[idx + 1] = *( unsigned int * )cell;
179 *cell = idx + 1;
180 }
181
182 // output records, replacing smallest each time
183
184 while( cell = judy_strt( judy, NULL, 0 ) ) {
185 nxt = *( unsigned int * )cell;
186 judy_del( judy );
187
188 // process duplicates
189
190 while( idx = nxt ) {
191 nxt = next[idx--];
192 fwrite( line[idx], PennyLine, 1, out );
193
194 if( fread( line[idx], PennyLine, 1, in[idx] ) ) {
195 cell = judy_cell( judy, line[idx] + PennyOff, PennyKey );
196 next[idx + 1] = *( unsigned int * )cell;
197 *cell = idx + 1;
198 } else {
199 next[idx + 1] = 0;
200 }
201 }
202 }
203
204 for( idx = 0; idx < PennyPasses; idx++ ) {
205 fclose( in[idx] );
206 free( line[idx] );
207 }
208
209 free( line );
210 free( next );
211 free( in );
212
213 fprintf( stderr, "End Merge %llu secs", ( unsigned long long ) time( NULL ) - start );
214 #ifdef _WIN32
215 {
216 FILETIME dummy[1];
217 FILETIME user[1];
218 GetProcessTimes( GetCurrentProcess(), dummy, dummy, dummy, user );
219 PennyMergeTime = *( unsigned long long * )user / 10000000;
220 }
221 #else
222 {
223 struct tms buff[1];
224 times( buff );
225 PennyMergeTime = buff->tms_utime / 100;
226 }
227 #endif
228 fprintf( stderr, " Cpu %d\n", PennyMergeTime - PennySortTime );
229 judy_close( judy );
230 fflush( out );
231 fclose( out );
232 return 0;
233 }
234