1 /* $NetBSD: fsort.c,v 1.20 2001/05/15 11:49:25 jdolecek Exp $ */ 2 3 /*- 4 * Copyright (c) 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Peter McIlroy. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 /* 40 * Read in the next bin. If it fits in one segment sort it; 41 * otherwise refine it by segment deeper by one character, 42 * and try again on smaller bins. Sort the final bin at this level 43 * of recursion to keep the head of fstack at 0. 44 * After PANIC passes, abort to merge sort. 45 */ 46 #include "sort.h" 47 #include "fsort.h" 48 49 #ifndef lint 50 __RCSID("$NetBSD: fsort.c,v 1.20 2001/05/15 11:49:25 jdolecek Exp $"); 51 __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93"); 52 #endif /* not lint */ 53 54 #include <stdlib.h> 55 #include <string.h> 56 57 static const u_char **keylist = 0; 58 u_char *buffer = 0, *linebuf = 0; 59 size_t bufsize = DEFBUFSIZE; 60 size_t linebuf_size; 61 struct tempfile fstack[MAXFCT]; 62 extern char *toutpath; 63 #define FSORTMAX 4 64 int PANIC = FSORTMAX; 65 66 #define MSTART (MAXFCT - MERGE_FNUM) 67 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1)) 68 69 void 70 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl) 71 int binno, depth, top; 72 struct filelist *filelist; 73 int nfiles; 74 FILE *outfp; 75 struct field *ftbl; 76 { 77 const u_char **keypos; 78 u_char *bufend, *tmpbuf; 79 u_char *weights; 80 int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0; 81 int c, nelem, base; 82 long sizes [NBINS+1]; 83 get_func_t get; 84 struct recheader *crec; 85 struct field tfield[2]; 86 FILE *prevfp, *tailfp[FSORTMAX+1]; 87 88 memset(tailfp, 0, sizeof(tailfp)); 89 prevfp = outfp; 90 memset(tfield, 0, sizeof(tfield)); 91 if (ftbl[0].flags & R) 92 tfield[0].weights = Rascii; 93 else 94 tfield[0].weights = ascii; 95 tfield[0].icol.num = 1; 96 weights = ftbl[0].weights; 97 if (!buffer) { 98 buffer = malloc(bufsize); 99 keylist = malloc(MAXNUM * sizeof(u_char *)); 100 memset(keylist, 0, MAXNUM * sizeof(u_char *)); 101 if (!SINGL_FLD) { 102 linebuf_size = DEFLLEN; 103 if ((linebuf = malloc(linebuf_size)) == NULL) 104 errx(2, "cannot allocate memory"); 105 } 106 } 107 bufend = buffer + bufsize; 108 if (binno >= 0) { 109 base = top + nfiles; 110 get = getnext; 111 } else { 112 base = 0; 113 if (SINGL_FLD) 114 get = makeline; 115 else 116 get = makekey; 117 } 118 for (;;) { 119 memset(sizes, 0, sizeof(sizes)); 120 c = ntfiles = 0; 121 if (binno == weights[REC_D] && 122 !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */ 123 rd_append(weights[REC_D], top, 124 nfiles, prevfp, buffer, bufend); 125 break; 126 } else if (binno == weights[REC_D]) { 127 depth = 0; /* start over on flat weights */ 128 ftbl = tfield; 129 weights = ftbl[0].weights; 130 } 131 while (c != EOF) { 132 keypos = keylist; 133 nelem = 0; 134 crec = (RECHEADER *) buffer; 135 136 do_read: 137 while((c = get(binno, top, filelist, nfiles, crec, 138 bufend, ftbl)) == 0) { 139 *keypos++ = crec->data + depth; 140 if (++nelem == MAXNUM) { 141 c = BUFFEND; 142 break; 143 } 144 crec =(RECHEADER *) ((char *) crec + 145 SALIGN(crec->length) + sizeof(TRECHEADER)); 146 } 147 148 if (c == BUFFEND && nelem < MAXNUM 149 && bufsize < MAXBUFSIZE) { 150 const u_char **keyp; 151 u_char *oldb = buffer; 152 153 /* buffer was too small for data, allocate 154 * bigger buffer */ 155 bufsize *= 2; 156 buffer = realloc(buffer, bufsize); 157 if (!buffer) { 158 err(2, "failed to realloc buffer to %ld bytes", 159 (unsigned long) bufsize); 160 } 161 bufend = buffer + bufsize; 162 163 /* patch up keylist[] */ 164 for(keyp = &keypos[-1]; keyp >= keylist; keyp--) 165 *keyp = buffer + (*keyp - oldb); 166 167 crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb)); 168 goto do_read; 169 } 170 171 if (c != BUFFEND && !ntfiles && !mfct) { 172 /* do not push */ 173 continue; 174 } 175 176 /* push */ 177 if (panic >= PANIC) { 178 fstack[MSTART + mfct].fp = ftmp(); 179 if ((stable_sort) 180 ? sradixsort(keylist, nelem, 181 weights, REC_D) 182 : radixsort(keylist, nelem, 183 weights, REC_D) ) 184 err(2, NULL); 185 append(keylist, nelem, depth, 186 fstack[MSTART + mfct].fp, putrec, 187 ftbl); 188 mfct++; 189 /* reduce number of open files */ 190 if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) { 191 /* 192 * Only copy extra incomplete crec 193 * data if there are any. 194 */ 195 int nodata = (bufend >= (u_char *)crec 196 && bufend <= crec->data); 197 198 if (!nodata) { 199 tmpbuf = malloc(bufend - 200 crec->data); 201 memmove(tmpbuf, crec->data, 202 bufend - crec->data); 203 } 204 205 fstack[base + ntfiles].fp = ftmp(); 206 fmerge(0, MSTART, filelist, 207 mfct, geteasy, fstack[base].fp, 208 putrec, ftbl); 209 ntfiles++; 210 mfct = 0; 211 212 if (!nodata) { 213 memmove(crec->data, tmpbuf, 214 bufend - crec->data); 215 free(tmpbuf); 216 } 217 } 218 } else { 219 fstack[base + ntfiles].fp= ftmp(); 220 onepass(keylist, depth, nelem, sizes, 221 weights, fstack[base + ntfiles].fp); 222 ntfiles++; 223 } 224 } 225 if (!ntfiles && !mfct) { /* everything in memory--pop */ 226 if (nelem > 1 227 && ((stable_sort) 228 ? sradixsort(keylist, nelem, weights, REC_D) 229 : radixsort(keylist, nelem, weights, REC_D) )) 230 err(2, NULL); 231 if (nelem > 0) 232 append(keylist, nelem, depth, outfp, putline, ftbl); 233 break; /* pop */ 234 } 235 if (panic >= PANIC) { 236 if (!ntfiles) 237 fmerge(0, MSTART, filelist, mfct, geteasy, 238 outfp, putline, ftbl); 239 else 240 fmerge(0, base, filelist, ntfiles, geteasy, 241 outfp, putline, ftbl); 242 break; 243 244 } 245 total = maxb = lastb = 0; /* find if one bin dominates */ 246 for (i = 0; i < NBINS; i++) 247 if (sizes[i]) { 248 if (sizes[i] > sizes[maxb]) 249 maxb = i; 250 lastb = i; 251 total += sizes[i]; 252 } 253 if (sizes[maxb] < max((total / 2) , BUFSIZE)) 254 maxb = lastb; /* otherwise pop after last bin */ 255 fstack[base].lastb = lastb; 256 fstack[base].maxb = maxb; 257 258 /* start refining next level. */ 259 getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */ 260 for (i = 0; i < maxb; i++) { 261 if (!sizes[i]) /* bin empty; step ahead file offset */ 262 getnext(i, base, NULL,ntfiles, crec, bufend, 0); 263 else 264 fsort(i, depth+1, base, filelist, ntfiles, 265 outfp, ftbl); 266 } 267 268 get = getnext; 269 270 if (lastb != maxb) { 271 if (prevfp != outfp) 272 tailfp[panic] = prevfp; 273 prevfp = ftmp(); 274 for (i = maxb+1; i <= lastb; i++) 275 if (!sizes[i]) 276 getnext(i, base, NULL, ntfiles, crec, 277 bufend,0); 278 else 279 fsort(i, depth+1, base, filelist, 280 ntfiles, prevfp, ftbl); 281 } 282 283 /* sort biggest (or last) bin at this level */ 284 depth++; 285 panic++; 286 binno = maxb; 287 top = base; 288 nfiles = ntfiles; /* so overwrite them */ 289 } 290 if (prevfp != outfp) { 291 concat(outfp, prevfp); 292 fclose(prevfp); 293 } 294 for (i = panic; i >= 0; --i) 295 if (tailfp[i]) { 296 concat(outfp, tailfp[i]); 297 fclose(tailfp[i]); 298 } 299 300 /* If on top level, free our structures */ 301 if (depth == 0) { 302 free(keylist), keylist = NULL; 303 free(buffer), buffer = NULL; 304 } 305 } 306 307 /* 308 * This is one pass of radix exchange, dumping the bins to disk. 309 */ 310 #define swap(a, b, t) t = a, a = b, b = t 311 void 312 onepass(a, depth, n, sizes, tr, fp) 313 const u_char **a; 314 int depth; 315 long n, sizes[]; 316 u_char *tr; 317 FILE *fp; 318 { 319 size_t tsizes[NBINS+1]; 320 const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp; 321 static int histo[256]; 322 int *hp; 323 int c; 324 const u_char **an, *t, **aj; 325 const u_char **ak, *r; 326 327 memset(tsizes, 0, sizeof(tsizes)); 328 depth += sizeof(TRECHEADER); 329 an = &a[n]; 330 for (ak = a; ak < an; ak++) { 331 histo[c = tr[**ak]]++; 332 tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length; 333 } 334 335 bin[0] = a; 336 bpmax = bin + 256; 337 tp = top, hp = histo; 338 for (bp = bin; bp < bpmax; bp++) { 339 *tp++ = *(bp+1) = *bp + (c = *hp); 340 *hp++ = 0; 341 if (c <= 1) 342 continue; 343 } 344 for (aj = a; aj < an; *aj = r, aj = bin[c+1]) 345 for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;) 346 swap(*ak, r, t); 347 348 for (ak = a, c = 0; c < 256; c++) { 349 an = bin[c+1]; 350 n = an - ak; 351 tsizes[c] += n * sizeof(TRECHEADER); 352 /* tell getnext how many elements in this bin, this segment. */ 353 EWRITE(&tsizes[c], sizeof(size_t), 1, fp); 354 sizes[c] += tsizes[c]; 355 for (; ak < an; ++ak) 356 putrec((const RECHEADER *) *ak, fp); 357 } 358 } 359