1 //  Judy arrays 13 DEC 2012 (judy64n.c from http://code.google.com/p/judyarray/ )
2 //  This code is public domain.
3 
4 //  Author Karl Malbrain, malbrain AT yahoo.com
5 //  with assistance from Jan Weiss.
6 //  modifications (and any bugs) by Mark Pictor, mpictor at gmail
7 
8 //  Simplified judy arrays for strings and integers
9 //  Adapted from the ideas of Douglas Baskins of HP.
10 
11 //  Map a set of keys to corresponding memory cells (uints).
12 //  Each cell must be set to a non-zero value by the caller.
13 
14 //  STANDALONE is defined to compile into a string sorter.
15 
16 //  String mappings are denoted by calling judy_open with zero as
17 //  the second argument.  Integer mappings are denoted by calling
18 //  judy_open with the Integer depth of the Judy Trie as the second
19 //  argument.
20 
21 #include "judy.h"
22 #include "sort.h"
23 
24 //    memory map input file and sort
25 
26 //    define pennysort parameters
27 unsigned int PennyRecs = ( 4096 * 400 );  // records to sort to temp files
28 unsigned int PennyLine = 100;            // length of input record
29 unsigned int PennyKey = 10;                // length of input key
30 unsigned int PennyOff = 0;                // key offset in input record
31 
32 unsigned long long PennyMerge;    // PennyRecs * PennyLine = file map length
33 unsigned int PennyPasses;                // number of intermediate files created
34 unsigned int PennySortTime;                // cpu time to run sort
35 unsigned int PennyMergeTime;            // cpu time to run merge
36 
sort(FILE * infile,char * outname)37 void sort( FILE * infile, char * outname ) {
38     unsigned long long size, off, offset, part;
39     int ifd = fileno( infile );
40     char filename[512];
41     PennySort * line;
42     JudySlot * cell;
43     unsigned char * inbuff;
44     void * judy;
45     FILE * out;
46 #if defined(_WIN32)
47     HANDLE hndl, fm;
48     DWORD hiword;
49     FILETIME dummy[1];
50     FILETIME user[1];
51 #else
52     struct tms buff[1];
53 #endif
54     time_t start = time( NULL );
55 
56     if( PennyOff + PennyKey > PennyLine ) {
57         fprintf( stderr, "Key Offset + Key Length > Record Length\n" ), exit( 1 );
58     }
59 
60     offset = 0;
61     PennyPasses = 0;
62 
63 #if defined(_WIN32)
64     hndl = ( HANDLE )_get_osfhandle( ifd );
65     size = GetFileSize( hndl, &hiword );
66     fm = CreateFileMapping( hndl, NULL, PAGE_READONLY, hiword, ( DWORD )size, NULL );
67     if( !fm ) {
68         fprintf( stderr, "CreateFileMapping error %d\n", GetLastError() ), exit( 1 );
69     }
70     size |= ( unsigned long long )hiword << 32;
71 #else
72     size = lseek( ifd, 0L, 2 );
73 #endif
74 
75     while( offset < size ) {
76 #if defined(_WIN32)
77         part = offset + PennyMerge > size ? size - offset : PennyMerge;
78         inbuff = MapViewOfFile( fm, FILE_MAP_READ, offset >> 32, offset, part );
79         if( !inbuff ) {
80             fprintf( stderr, "MapViewOfFile error %d\n", GetLastError() ), exit( 1 );
81         }
82 #else
83         inbuff = mmap( NULL, PennyMerge, PROT_READ,  MAP_SHARED, ifd, offset );
84 
85         if( inbuff == MAP_FAILED ) {
86             fprintf( stderr, "mmap error %d\n", errno ), exit( 1 );
87         }
88 
89         if( madvise( inbuff, PennyMerge, MADV_WILLNEED | MADV_SEQUENTIAL ) < 0 ) {
90             fprintf( stderr, "madvise error %d\n", errno );
91         }
92 #endif
93         judy = judy_open( PennyKey, 0 );
94 
95         off = 0;
96 
97         //    build judy array from mapped input chunk
98 
99         while( offset + off < size && off < PennyMerge ) {
100             line = judy_data( judy, sizeof( PennySort ) );
101             cell = judy_cell( judy, inbuff + off + PennyOff, PennyKey );
102             line->next = *( void ** )cell;
103             line->buff = inbuff + off;
104 
105             *( PennySort ** )cell = line;
106             off += PennyLine;
107         }
108 
109         sprintf( filename, "%s.%d", outname, PennyPasses );
110         out = fopen( filename, "wb" );
111         setvbuf( out, NULL, _IOFBF, 4096 * 1024 );
112 
113 #ifndef _WIN32
114         if( madvise( inbuff, PennyMerge, MADV_WILLNEED | MADV_RANDOM ) < 0 ) {
115             fprintf( stderr, "madvise error %d\n", errno );
116         }
117 #endif
118 
119         //    write judy array in sorted order to temporary file
120 
121         cell = judy_strt( judy, NULL, 0 );
122 
123         if( cell ) do {
124                 line = *( PennySort ** )cell;
125                 do {
126                     fwrite( line->buff, PennyLine, 1, out );
127                 } while( line = line->next );
128             } while( cell = judy_nxt( judy ) );
129 
130 #if defined(_WIN32)
131         UnmapViewOfFile( inbuff );
132 #else
133         munmap( inbuff, PennyMerge );
134 #endif
135         judy_close( judy );
136         offset += off;
137         fflush( out );
138         fclose( out );
139         PennyPasses++;
140     }
141     fprintf( stderr, "End Sort %llu secs", ( unsigned long long ) time( NULL ) - start );
142 #if defined(_WIN32)
143     CloseHandle( fm );
144     GetProcessTimes( GetCurrentProcess(), dummy, dummy, dummy, user );
145     PennySortTime = *( unsigned long long * )user / 10000000;
146 #else
147     times( buff );
148     PennySortTime = buff->tms_utime / 100;
149 #endif
150     fprintf( stderr, " Cpu %d\n", PennySortTime );
151 }
152 
merge(FILE * out,char * outname)153 int merge( FILE * out, char * outname ) {
154     time_t start = time( NULL );
155     char filename[512];
156     JudySlot * cell;
157     unsigned int nxt, idx;
158     unsigned char ** line;
159     unsigned int * next;
160     void * judy;
161     FILE ** in;
162 
163     next = calloc( PennyPasses + 1, sizeof( unsigned int ) );
164     line = calloc( PennyPasses, sizeof( void * ) );
165     in = calloc( PennyPasses, sizeof( void * ) );
166 
167     judy = judy_open( PennyKey, 0 );
168 
169     // initialize merge with one record from each temp file
170 
171     for( idx = 0; idx < PennyPasses; idx++ ) {
172         sprintf( filename, "%s.%d", outname, idx );
173         in[idx] = fopen( filename, "rb" );
174         line[idx] = malloc( PennyLine );
175         setvbuf( in[idx], NULL, _IOFBF, 4096 * 1024 );
176         fread( line[idx], PennyLine, 1, in[idx] );
177         cell = judy_cell( judy, line[idx] + PennyOff, PennyKey );
178         next[idx + 1] = *( unsigned int * )cell;
179         *cell = idx + 1;
180     }
181 
182     //    output records, replacing smallest each time
183 
184     while( cell = judy_strt( judy, NULL, 0 ) ) {
185         nxt = *( unsigned int * )cell;
186         judy_del( judy );
187 
188         // process duplicates
189 
190         while( idx = nxt ) {
191             nxt = next[idx--];
192             fwrite( line[idx], PennyLine, 1, out );
193 
194             if( fread( line[idx], PennyLine, 1, in[idx] ) ) {
195                 cell = judy_cell( judy, line[idx] + PennyOff, PennyKey );
196                 next[idx + 1] = *( unsigned int * )cell;
197                 *cell = idx + 1;
198             } else {
199                 next[idx + 1] = 0;
200             }
201         }
202     }
203 
204     for( idx = 0; idx < PennyPasses; idx++ ) {
205         fclose( in[idx] );
206         free( line[idx] );
207     }
208 
209     free( line );
210     free( next );
211     free( in );
212 
213     fprintf( stderr, "End Merge %llu secs", ( unsigned long long ) time( NULL ) - start );
214 #ifdef _WIN32
215     {
216         FILETIME dummy[1];
217         FILETIME user[1];
218         GetProcessTimes( GetCurrentProcess(), dummy, dummy, dummy, user );
219         PennyMergeTime = *( unsigned long long * )user / 10000000;
220     }
221 #else
222     {
223         struct tms buff[1];
224         times( buff );
225         PennyMergeTime = buff->tms_utime / 100;
226     }
227 #endif
228     fprintf( stderr, " Cpu %d\n", PennyMergeTime - PennySortTime );
229     judy_close( judy );
230     fflush( out );
231     fclose( out );
232     return 0;
233 }
234