xref: /netbsd/usr.bin/sort/fsort.c (revision bf9ec67e)
1 /*	$NetBSD: fsort.c,v 1.20 2001/05/15 11:49:25 jdolecek Exp $	*/
2 
3 /*-
4  * Copyright (c) 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Peter McIlroy.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 /*
40  * Read in the next bin.  If it fits in one segment sort it;
41  * otherwise refine it by segment deeper by one character,
42  * and try again on smaller bins.  Sort the final bin at this level
43  * of recursion to keep the head of fstack at 0.
44  * After PANIC passes, abort to merge sort.
45  */
46 #include "sort.h"
47 #include "fsort.h"
48 
49 #ifndef lint
50 __RCSID("$NetBSD: fsort.c,v 1.20 2001/05/15 11:49:25 jdolecek Exp $");
51 __SCCSID("@(#)fsort.c	8.1 (Berkeley) 6/6/93");
52 #endif /* not lint */
53 
54 #include <stdlib.h>
55 #include <string.h>
56 
57 static const u_char **keylist = 0;
58 u_char *buffer = 0, *linebuf = 0;
59 size_t bufsize = DEFBUFSIZE;
60 size_t linebuf_size;
61 struct tempfile fstack[MAXFCT];
62 extern char *toutpath;
63 #define FSORTMAX 4
64 int PANIC = FSORTMAX;
65 
66 #define MSTART		(MAXFCT - MERGE_FNUM)
67 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1))
68 
69 void
70 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl)
71 	int binno, depth, top;
72 	struct filelist *filelist;
73 	int nfiles;
74 	FILE *outfp;
75 	struct field *ftbl;
76 {
77 	const u_char **keypos;
78 	u_char *bufend, *tmpbuf;
79 	u_char *weights;
80 	int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
81 	int c, nelem, base;
82 	long sizes [NBINS+1];
83 	get_func_t get;
84 	struct recheader *crec;
85 	struct field tfield[2];
86 	FILE *prevfp, *tailfp[FSORTMAX+1];
87 
88 	memset(tailfp, 0, sizeof(tailfp));
89 	prevfp = outfp;
90 	memset(tfield, 0, sizeof(tfield));
91 	if (ftbl[0].flags & R)
92 		tfield[0].weights = Rascii;
93 	else
94 		tfield[0].weights = ascii;
95 	tfield[0].icol.num = 1;
96 	weights = ftbl[0].weights;
97 	if (!buffer) {
98 		buffer = malloc(bufsize);
99 		keylist = malloc(MAXNUM * sizeof(u_char *));
100 		memset(keylist, 0, MAXNUM * sizeof(u_char *));
101 		if (!SINGL_FLD) {
102 			linebuf_size = DEFLLEN;
103 			if ((linebuf = malloc(linebuf_size)) == NULL)
104 				errx(2, "cannot allocate memory");
105 		}
106 	}
107 	bufend = buffer + bufsize;
108 	if (binno >= 0) {
109 		base = top + nfiles;
110 		get = getnext;
111 	} else {
112 		base = 0;
113 		if (SINGL_FLD)
114 			get = makeline;
115 		else
116 			get = makekey;
117 	}
118 	for (;;) {
119 		memset(sizes, 0, sizeof(sizes));
120 		c = ntfiles = 0;
121 		if (binno == weights[REC_D] &&
122 		    !(SINGL_FLD && ftbl[0].flags & F)) {	/* pop */
123 			rd_append(weights[REC_D], top,
124 			    nfiles, prevfp, buffer, bufend);
125 			break;
126 		} else if (binno == weights[REC_D]) {
127 			depth = 0;		/* start over on flat weights */
128 			ftbl = tfield;
129 			weights = ftbl[0].weights;
130 		}
131 		while (c != EOF) {
132 			keypos = keylist;
133 			nelem = 0;
134 			crec = (RECHEADER *) buffer;
135 
136 		   do_read:
137 			while((c = get(binno, top, filelist, nfiles, crec,
138 			    bufend, ftbl)) == 0) {
139 				*keypos++ = crec->data + depth;
140 				if (++nelem == MAXNUM) {
141 					c = BUFFEND;
142 					break;
143 				}
144 				crec =(RECHEADER *)	((char *) crec +
145 				SALIGN(crec->length) + sizeof(TRECHEADER));
146 			}
147 
148 			if (c == BUFFEND && nelem < MAXNUM
149 			    && bufsize < MAXBUFSIZE) {
150 				const u_char **keyp;
151 				u_char *oldb = buffer;
152 
153 				/* buffer was too small for data, allocate
154 				 * bigger buffer */
155 				bufsize *= 2;
156 				buffer = realloc(buffer, bufsize);
157 				if (!buffer) {
158 					err(2, "failed to realloc buffer to %ld bytes",
159 						(unsigned long) bufsize);
160 				}
161 				bufend = buffer + bufsize;
162 
163 				/* patch up keylist[] */
164 				for(keyp = &keypos[-1]; keyp >= keylist; keyp--)
165 					*keyp = buffer + (*keyp - oldb);
166 
167 				crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb));
168 				goto do_read;
169 			}
170 
171 			if (c != BUFFEND && !ntfiles && !mfct) {
172 				/* do not push */
173 				continue;
174 			}
175 
176 			/* push */
177 			if (panic >= PANIC) {
178 				fstack[MSTART + mfct].fp = ftmp();
179 				if ((stable_sort)
180 					? sradixsort(keylist, nelem,
181 						weights, REC_D)
182 					: radixsort(keylist, nelem,
183 						weights, REC_D) )
184 					err(2, NULL);
185 				append(keylist, nelem, depth,
186 				    fstack[MSTART + mfct].fp, putrec,
187 				    ftbl);
188 				mfct++;
189 				/* reduce number of open files */
190 				if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) {
191 					/*
192 					 * Only copy extra incomplete crec
193 					 * data if there are any.
194 					 */
195 					int nodata = (bufend >= (u_char *)crec
196 					    && bufend <= crec->data);
197 
198 					if (!nodata) {
199 						tmpbuf = malloc(bufend -
200 						    crec->data);
201 						memmove(tmpbuf, crec->data,
202 						    bufend - crec->data);
203 					}
204 
205 					fstack[base + ntfiles].fp = ftmp();
206 					fmerge(0, MSTART, filelist,
207 					    mfct, geteasy, fstack[base].fp,
208 					    putrec, ftbl);
209 					ntfiles++;
210 					mfct = 0;
211 
212 					if (!nodata) {
213 						memmove(crec->data, tmpbuf,
214 						    bufend - crec->data);
215 						free(tmpbuf);
216 					}
217 				}
218 			} else {
219 				fstack[base + ntfiles].fp= ftmp();
220 				onepass(keylist, depth, nelem, sizes,
221 				    weights, fstack[base + ntfiles].fp);
222 				ntfiles++;
223 			}
224 		}
225 		if (!ntfiles && !mfct) {	/* everything in memory--pop */
226 			if (nelem > 1
227 			    && ((stable_sort)
228 				? sradixsort(keylist, nelem, weights, REC_D)
229 				: radixsort(keylist, nelem, weights, REC_D) ))
230 				err(2, NULL);
231 			if (nelem > 0)
232 			  append(keylist, nelem, depth, outfp, putline, ftbl);
233 			break;					/* pop */
234 		}
235 		if (panic >= PANIC) {
236 			if (!ntfiles)
237 				fmerge(0, MSTART, filelist, mfct, geteasy,
238 				    outfp, putline, ftbl);
239 			else
240 				fmerge(0, base, filelist, ntfiles, geteasy,
241 				    outfp, putline, ftbl);
242 			break;
243 
244 		}
245 		total = maxb = lastb = 0;	/* find if one bin dominates */
246 		for (i = 0; i < NBINS; i++)
247 			if (sizes[i]) {
248 				if (sizes[i] > sizes[maxb])
249 					maxb = i;
250 				lastb = i;
251 				total += sizes[i];
252 			}
253 		if (sizes[maxb] < max((total / 2) , BUFSIZE))
254 			maxb = lastb;	/* otherwise pop after last bin */
255 		fstack[base].lastb = lastb;
256 		fstack[base].maxb = maxb;
257 
258 		/* start refining next level. */
259 		getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */
260 		for (i = 0; i < maxb; i++) {
261 			if (!sizes[i])	/* bin empty; step ahead file offset */
262 				getnext(i, base, NULL,ntfiles, crec, bufend, 0);
263 			else
264 				fsort(i, depth+1, base, filelist, ntfiles,
265 					outfp, ftbl);
266 		}
267 
268 		get = getnext;
269 
270 		if (lastb != maxb) {
271 			if (prevfp != outfp)
272 				tailfp[panic] = prevfp;
273 			prevfp = ftmp();
274 			for (i = maxb+1; i <= lastb; i++)
275 				if (!sizes[i])
276 					getnext(i, base, NULL, ntfiles, crec,
277 					    bufend,0);
278 				else
279 					fsort(i, depth+1, base, filelist,
280 					    ntfiles, prevfp, ftbl);
281 		}
282 
283 		/* sort biggest (or last) bin at this level */
284 		depth++;
285 		panic++;
286 		binno = maxb;
287 		top = base;
288 		nfiles = ntfiles;		/* so overwrite them */
289 	}
290 	if (prevfp != outfp) {
291 		concat(outfp, prevfp);
292 		fclose(prevfp);
293 	}
294 	for (i = panic; i >= 0; --i)
295 		if (tailfp[i]) {
296 			concat(outfp, tailfp[i]);
297 			fclose(tailfp[i]);
298 		}
299 
300 	/* If on top level, free our structures */
301 	if (depth == 0) {
302 		free(keylist), keylist = NULL;
303 		free(buffer), buffer = NULL;
304 	}
305 }
306 
307 /*
308  * This is one pass of radix exchange, dumping the bins to disk.
309  */
310 #define swap(a, b, t) t = a, a = b, b = t
311 void
312 onepass(a, depth, n, sizes, tr, fp)
313 	const u_char **a;
314 	int depth;
315 	long n, sizes[];
316 	u_char *tr;
317 	FILE *fp;
318 {
319 	size_t tsizes[NBINS+1];
320 	const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp;
321 	static int histo[256];
322 	int *hp;
323 	int c;
324 	const u_char **an, *t, **aj;
325 	const u_char **ak, *r;
326 
327 	memset(tsizes, 0, sizeof(tsizes));
328 	depth += sizeof(TRECHEADER);
329 	an = &a[n];
330 	for (ak = a; ak < an; ak++) {
331 		histo[c = tr[**ak]]++;
332 		tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length;
333 	}
334 
335 	bin[0] = a;
336 	bpmax = bin + 256;
337 	tp = top, hp = histo;
338 	for (bp = bin; bp < bpmax; bp++) {
339 		*tp++ = *(bp+1) = *bp + (c = *hp);
340 		*hp++ = 0;
341 		if (c <= 1)
342 			continue;
343 	}
344 	for (aj = a; aj < an; *aj = r, aj = bin[c+1])
345 		for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;)
346 			swap(*ak, r, t);
347 
348 	for (ak = a, c = 0; c < 256; c++) {
349 		an = bin[c+1];
350 		n = an - ak;
351 		tsizes[c] += n * sizeof(TRECHEADER);
352 		/* tell getnext how many elements in this bin, this segment. */
353 		EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
354 		sizes[c] += tsizes[c];
355 		for (; ak < an; ++ak)
356 			putrec((const RECHEADER *) *ak, fp);
357 	}
358 }
359