1 /* $NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $ */ 2 3 /*- 4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Ben Harris and Jaromir Dolecek. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1993 34 * The Regents of the University of California. All rights reserved. 35 * 36 * This code is derived from software contributed to Berkeley by 37 * Peter McIlroy. 38 * 39 * Redistribution and use in source and binary forms, with or without 40 * modification, are permitted provided that the following conditions 41 * are met: 42 * 1. Redistributions of source code must retain the above copyright 43 * notice, this list of conditions and the following disclaimer. 44 * 2. Redistributions in binary form must reproduce the above copyright 45 * notice, this list of conditions and the following disclaimer in the 46 * documentation and/or other materials provided with the distribution. 47 * 3. Neither the name of the University nor the names of its contributors 48 * may be used to endorse or promote products derived from this software 49 * without specific prior written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 61 * SUCH DAMAGE. 62 */ 63 64 #include "sort.h" 65 #include "fsort.h" 66 67 __RCSID("$NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $"); 68 69 #include <stdlib.h> 70 #include <string.h> 71 #include <unistd.h> 72 #include <util.h> 73 74 /* Subroutines using comparisons: merge sort and check order */ 75 #define DELETE (1) 76 77 typedef struct mfile { 78 FILE *fp; 79 get_func_t get; 80 RECHEADER *rec; 81 u_char *end; 82 } MFILE; 83 84 static int cmp(RECHEADER *, RECHEADER *); 85 static int insert(struct mfile **, struct mfile *, int, int); 86 static void merge_sort_fstack(FILE *, put_func_t, struct field *); 87 88 /* 89 * Number of files merge() can merge in one pass. 90 */ 91 #define MERGE_FNUM 16 92 93 static struct mfile fstack[MERGE_FNUM]; 94 static struct mfile fstack_1[MERGE_FNUM]; 95 static struct mfile fstack_2[MERGE_FNUM]; 96 static int fstack_count, fstack_1_count, fstack_2_count; 97 98 void 99 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl) 100 { 101 FILE *mfp, *mfp1, *mfp2; 102 103 if (fstack_count == MERGE_FNUM) { 104 /* Must reduce the number of temporary files */ 105 mfp = ftmp(); 106 merge_sort_fstack(mfp, putrec, ftbl); 107 /* Save output in next layer */ 108 if (fstack_1_count == MERGE_FNUM) { 109 mfp1 = ftmp(); 110 memcpy(fstack, fstack_1, sizeof fstack); 111 merge_sort_fstack(mfp1, putrec, ftbl); 112 if (fstack_2_count == MERGE_FNUM) { 113 /* More than 4096 files! */ 114 mfp2 = ftmp(); 115 memcpy(fstack, fstack_2, sizeof fstack); 116 merge_sort_fstack(mfp2, putrec, ftbl); 117 fstack_2[0].fp = mfp2; 118 fstack_2_count = 1; 119 } 120 fstack_2[fstack_2_count].fp = mfp1; 121 fstack_2[fstack_2_count].get = geteasy; 122 fstack_2_count++; 123 fstack_1_count = 0; 124 } 125 fstack_1[fstack_1_count].fp = mfp; 126 fstack_1[fstack_1_count].get = geteasy; 127 fstack_1_count++; 128 fstack_count = 0; 129 } 130 131 fstack[fstack_count].fp = fp; 132 fstack[fstack_count++].get = get; 133 } 134 135 void 136 fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl) 137 { 138 get_func_t get = SINGL_FLD ? makeline : makekey; 139 FILE *fp; 140 int i; 141 142 for (i = 0; i < nfiles; i++) { 143 fp = fopen(filelist->names[i], "r"); 144 if (fp == NULL) 145 err(2, "%s", filelist->names[i]); 146 save_for_merge(fp, get, ftbl); 147 } 148 149 merge_sort(outfp, putline, ftbl); 150 } 151 152 void 153 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl) 154 { 155 int count = fstack_1_count + fstack_2_count; 156 FILE *mfp; 157 int i; 158 159 if (count == 0) { 160 /* All files in initial array */ 161 merge_sort_fstack(outfp, put, ftbl); 162 return; 163 } 164 165 count += fstack_count; 166 167 /* Too many files for one merge sort */ 168 for (;;) { 169 /* Sort latest 16 files */ 170 i = count; 171 if (i > MERGE_FNUM) 172 i = MERGE_FNUM; 173 while (fstack_count > 0) 174 fstack[--i] = fstack[--fstack_count]; 175 while (i > 0 && fstack_1_count > 0) 176 fstack[--i] = fstack_1[--fstack_1_count]; 177 while (i > 0) 178 fstack[--i] = fstack_2[--fstack_2_count]; 179 if (count <= MERGE_FNUM) { 180 /* Got all the data */ 181 fstack_count = count; 182 merge_sort_fstack(outfp, put, ftbl); 183 return; 184 } 185 mfp = ftmp(); 186 fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count; 187 merge_sort_fstack(mfp, putrec, ftbl); 188 fstack[0].fp = mfp; 189 fstack[0].get = geteasy; 190 fstack_count = 1; 191 count -= MERGE_FNUM - 1; 192 } 193 } 194 195 static void 196 merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl) 197 { 198 struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile; 199 RECHEADER *new_rec; 200 u_char *new_end; 201 void *tmp; 202 int c, i, nfiles; 203 size_t sz; 204 205 /* Read one record from each file (read again if a duplicate) */ 206 for (nfiles = i = 0; i < fstack_count; i++) { 207 cfile = &fstack[i]; 208 if (cfile->rec == NULL) { 209 cfile->rec = allocrec(NULL, DEFLLEN); 210 cfile->end = (u_char *)cfile->rec + DEFLLEN; 211 } 212 rewind(cfile->fp); 213 214 for (;;) { 215 c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl); 216 if (c == EOF) 217 break; 218 219 if (c == BUFFEND) { 220 /* Double buffer size */ 221 sz = (cfile->end - (u_char *)cfile->rec) * 2; 222 cfile->rec = allocrec(cfile->rec, sz); 223 cfile->end = (u_char *)cfile->rec + sz; 224 continue; 225 } 226 227 if (nfiles != 0) { 228 if (insert(flist, cfile, nfiles, !DELETE)) 229 /* Duplicate removed */ 230 continue; 231 } else 232 flist[0] = cfile; 233 nfiles++; 234 break; 235 } 236 } 237 238 if (nfiles == 0) 239 return; 240 241 /* 242 * We now loop reading a new record from the file with the 243 * 'sorted first' existing record. 244 * As each record is added, the 'first' record is written to the 245 * output file - maintaining one record from each file in the sorted 246 * list. 247 */ 248 new_rec = allocrec(NULL, DEFLLEN); 249 new_end = (u_char *)new_rec + DEFLLEN; 250 for (;;) { 251 cfile = flist[0]; 252 c = cfile->get(cfile->fp, new_rec, new_end, ftbl); 253 if (c == EOF) { 254 /* Write out last record from now-empty input */ 255 put(cfile->rec, outfp); 256 if (--nfiles == 0) 257 break; 258 /* Replace from file with now-first sorted record. */ 259 /* (Moving base 'flist' saves copying everything!) */ 260 flist++; 261 continue; 262 } 263 if (c == BUFFEND) { 264 /* Buffer not large enough - double in size */ 265 sz = (new_end - (u_char *)new_rec) * 2; 266 new_rec = allocrec(new_rec, sz); 267 new_end = (u_char *)new_rec +sz; 268 continue; 269 } 270 271 /* Swap in new buffer, saving old */ 272 tmp = cfile->rec; 273 cfile->rec = new_rec; 274 new_rec = tmp; 275 tmp = cfile->end; 276 cfile->end = new_end; 277 new_end = tmp; 278 279 /* Add into sort, removing the original first entry */ 280 c = insert(flist, cfile, nfiles, DELETE); 281 if (c != 0 || (UNIQUE && cfile == flist[0] 282 && cmp(new_rec, cfile->rec) == 0)) { 283 /* Was an unwanted duplicate, restore buffer */ 284 tmp = cfile->rec; 285 cfile->rec = new_rec; 286 new_rec = tmp; 287 tmp = cfile->end; 288 cfile->end = new_end; 289 new_end = tmp; 290 continue; 291 } 292 293 /* Write out 'old' record */ 294 put(new_rec, outfp); 295 } 296 297 free(new_rec); 298 } 299 300 /* 301 * if delete: inserts rec in flist, deletes flist[0]; 302 * otherwise just inserts *rec in flist. 303 * Returns 1 if record is a duplicate to be ignored. 304 */ 305 static int 306 insert(struct mfile **flist, struct mfile *rec, int ttop, int delete) 307 { 308 int mid, top = ttop, bot = 0, cmpv = 1; 309 310 for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) { 311 cmpv = cmp(rec->rec, flist[mid]->rec); 312 if (cmpv == 0 ) { 313 if (UNIQUE) 314 /* Duplicate key, read another record */ 315 /* NB: This doesn't guarantee to keep any 316 * particular record. */ 317 return 1; 318 /* 319 * Apply sort by input file order. 320 * We could truncate the sort is the fileno are 321 * adjacent - but that is all too hard! 322 * The fileno cannot be equal, since we only have one 323 * record from each file (+ flist[0] which never 324 * comes here). 325 */ 326 cmpv = rec < flist[mid] ? -1 : 1; 327 if (REVERSE) 328 cmpv = -cmpv; 329 } 330 if (cmpv < 0) 331 top = mid; 332 else 333 bot = mid; 334 } 335 336 /* At this point we haven't yet compared against flist[0] */ 337 338 if (delete) { 339 /* flist[0] is ourselves, only the caller knows the old data */ 340 if (bot != 0) { 341 memmove(flist, flist + 1, bot * sizeof(MFILE *)); 342 flist[bot] = rec; 343 } 344 return 0; 345 } 346 347 /* Inserting original set of records */ 348 349 if (bot == 0 && cmpv != 0) { 350 /* Doesn't match flist[1], must compare with flist[0] */ 351 cmpv = cmp(rec->rec, flist[0]->rec); 352 if (cmpv == 0 && UNIQUE) 353 return 1; 354 /* Add matching keys in file order (ie new is later) */ 355 if (cmpv < 0) 356 bot = -1; 357 } 358 bot++; 359 memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *)); 360 flist[bot] = rec; 361 return 0; 362 } 363 364 /* 365 * check order on one file 366 */ 367 void 368 order(struct filelist *filelist, struct field *ftbl) 369 { 370 get_func_t get = SINGL_FLD ? makeline : makekey; 371 RECHEADER *crec, *prec, *trec; 372 u_char *crec_end, *prec_end, *trec_end; 373 FILE *fp; 374 int c; 375 376 fp = fopen(filelist->names[0], "r"); 377 if (fp == NULL) 378 err(2, "%s", filelist->names[0]); 379 380 crec = malloc(offsetof(RECHEADER, data[DEFLLEN])); 381 crec_end = crec->data + DEFLLEN; 382 prec = malloc(offsetof(RECHEADER, data[DEFLLEN])); 383 prec_end = prec->data + DEFLLEN; 384 385 /* XXX this does exit(0) for overlong lines */ 386 if (get(fp, prec, prec_end, ftbl) != 0) 387 exit(0); 388 while (get(fp, crec, crec_end, ftbl) == 0) { 389 if (0 < (c = cmp(prec, crec))) { 390 crec->data[crec->length-1] = 0; 391 errx(1, "found disorder: %s", crec->data+crec->offset); 392 } 393 if (UNIQUE && !c) { 394 crec->data[crec->length-1] = 0; 395 errx(1, "found non-uniqueness: %s", 396 crec->data+crec->offset); 397 } 398 /* 399 * Swap pointers so that this record is on place pointed 400 * to by prec and new record is read to place pointed to by 401 * crec. 402 */ 403 trec = prec; 404 prec = crec; 405 crec = trec; 406 trec_end = prec_end; 407 prec_end = crec_end; 408 crec_end = trec_end; 409 } 410 exit(0); 411 } 412 413 static int 414 cmp(RECHEADER *rec1, RECHEADER *rec2) 415 { 416 int len; 417 int r; 418 419 /* key is weights */ 420 len = min(rec1->keylen, rec2->keylen); 421 r = memcmp(rec1->data, rec2->data, len); 422 if (r == 0) 423 r = rec1->keylen - rec2->keylen; 424 if (REVERSE) 425 r = -r; 426 return r; 427 } 428