#!/usr/local/bin/perl

#
# dbsort
# Copyright (C) 1991-2001 by John Heidemann <johnh@isi.edu>
# $Id: dbsort,v 1.34 2004/05/19 18:34:27 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
    print <<END;
usage: $0 [-dnNrR] [-M MaxMemBytes] [-T TmpDir] column [column...]

Sort rows based on the specified columns.

	-r sort in reverse order (high to low)
	-R sort in normal order (low to high)
	-i sort case-insensitively  [not yet supported]
	-I sort case-sensitively  [not yet supported]
	-n sort numerically
	-N sort lexicographically
	-d debug mode
	-M MaxMemBytes    specify a limit on memory usage (in bytes)
	-T TmpDir	  where to put tmp files (or env var TMPDIR)

Flags (except for -d) can be interspersed with columns.

Dbsort consumes a bounded amount of memory regardless of input size.
(It reverts to temporary files on disk if necessary, based on the -M
and -T options.)

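For example, a hypothetical large sort with a 64MB memory budget and
tmp files placed in /var/tmp:
cat huge.jdb | dbsort -M 67108864 -T /var/tmp cname
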
Sample input:
#h cid cname
10 pascal
11 numanal
12 os

Sample command:
cat data.jdb | dbsort cname

Sample output:
#h      cid     cname
11 numanal
12 os
10 pascal
#  | dbsort cname
END
    exit 1;
}

BEGIN {
    $dblibdir = "/home/johnh/BIN/DB";
    push(@INC, $dblibdir);
}
require "$dblibdir/dblib.pl";
use DbGetopt;
use FileHandle;


my(@orig_argv) = @ARGV;
my($prog) = &progname;

$debug = 0;
my($max_mem) = 1024*1024*32;
my($mem_debug) = 0;
my($dbopts) = new DbGetopt("dM:nNrRiIT:?", \@ARGV);
$dbopts->opterr(0);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'd') {
	$debug++;
    } elsif ($ch eq 'M') {
	$max_mem = $dbopts->optarg;
    } elsif ($ch eq 'T') {
	$ENV{'TMPDIR'} = $dbopts->optarg;
    } elsif ($ch eq 'i' || $ch eq 'I') {
	die "$prog: -i and -I not yet supported in dbsort.\n";
    } elsif ($ch eq '?') {
	&usage;
    } else {
	# got a db op.  Push it back on and break.
	$dbopts->ungetopt;
	last;
    };
};
&usage if ($#ARGV < 0);
my($perl_mem_overhead) = 50;  # approx. bytes of overhead for each record in mem
$max_mem /= 4;   # perl seems to take about 4x more memory than the raw data, so scale the budget down
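# A rough worked example of the accounting (assuming the defaults):
# with the default -M of 32MB the effective budget is 8MB, and a
# 40-byte input row with one 10-byte sort column is charged
# 50 + 40 + 10 = 100 bytes by custom_memory_enter below, so a segment
# holds on the order of 80,000 such rows before spilling to disk.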

&readprocess_header;


&write_header();

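# sort_row_col_fn is handed to generate_compare_code below: given a
# row variable name and a sort-column index $n, it returns the Perl
# source text for that row's nth sort field (e.g., '$sf0[$a]'), for
# splicing into the eval'd compare and enter code.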
sub sort_row_col_fn {
    my($row, $colname, $n) = @_;
    return '$sf' . $n . '[$' . $row . ']';    # '
}
($compare_code, $enter_code, $num_cols) = &generate_compare_code('custom_compare', 'sort_row_col_fn', @ARGV);
my($mem_count) = 0;
die "$prog: no columns were specified as the sort key.\n" if ($num_cols < 0);

my($enter_memory_code) = "sub custom_enter {\n" .
			 "    my(\$i) = \@_;\n" .
			 $enter_code .
#			 "    print \"enter: \\\$sf0[\$i] = \$sf0[\$i]\\n\";\n" .
			 "}\n" .
			 "sub custom_memory_enter {\n" .
			 "    my(\$i, \$mem_ref) = \@_;\n" .
			 $enter_code;
#			 "    print \"cmenter: \\\$sf0[\$i] = \$sf0[\$i]\\n\";\n" .
$enter_memory_code .= "    print STDERR \${\$mem_ref},\"\\n\" if (\$mem_count++ % 1000 == 0);\n" if ($mem_debug);
# $enter_memory_code .= "    print STDERR \${\$mem_ref},\"\\n\";\n" if ($mem_debug);
$enter_memory_code .= "    \${\$mem_ref} += $perl_mem_overhead + length(\$rawdata[\$#rawdata])\n";
foreach (0..$num_cols) {
    $enter_memory_code .= "\t\t + length(" . sort_row_col_fn('i', undef, $_) . ')';
};
$enter_memory_code .= ";\n    &segment_overflow() if (\${\$mem_ref} > $max_mem);\n}";
eval $enter_memory_code;
$@ && die "$0: eval: $@\n";
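# For illustration, with one sort column the eval'd custom_memory_enter
# comes out roughly like this (a sketch only; the exact field-extraction
# line is whatever generate_compare_code in dblib.pl emits, and
# $perl_mem_overhead/$max_mem are interpolated as constants):
#
#   sub custom_memory_enter {
#       my($i, $mem_ref) = @_;
#       $sf0[$i] = ...;   # extract sort field 0 (assumed shape)
#       ${$mem_ref} += 50 + length($rawdata[$#rawdata])
#                    + length($sf0[$i]);
#       &segment_overflow() if (${$mem_ref} > 8388608);
#   }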

if ($debug) {
    print STDERR "COMPARE_CODE:\n$compare_code\nENTER_CODE:\n$enter_memory_code\n";
    exit(1) if ($debug > 1);
};


#
# Handle large things in pieces if necessary.
#
# Call &segment_start to init things,
#   &segment_overflow to close one segment and start the next,
#   &segment_merge to put them back together again.
#
# Note that we don't invoke the merge code unless the data
# exceeds some threshold size, so small sorts happen completely
# in memory.
#
# Once we give up on memory, all the merging happens by making
# passes over the disk files.
#

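# In outline, the result is a standard external merge sort:
#
#   &segment_start;                  # reset the in-memory buffers
#   # per input row: &custom_memory_enter($i, \$memory_used), which
#   #   calls &segment_overflow when the memory budget is exceeded
#   &segment_overflow;               # flush the final partial segment
#   &segment_merge;                  # binary-merge any spill files
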
my(@sortedp, @rawdata, @p, @files_to_merge, $i, $memory_used);

sub segment_start {
    $i = -1;
    $memory_used = 0;
    # undef(@sortedp, @rawdata, @p);   # free the mem (maybe next line does that too)
    undef @sortedp;
    undef @rawdata;
    undef @p;
    @sortedp = @rawdata = @p = ();
}

sub segment_overflow {
    my($done) = @_;

    # sort the segment
    @sortedp = sort custom_compare @p;

    # pass on the data, either to a tmp file or to stdout
    if ($#files_to_merge >= 0 || $memory_used > $max_mem) {
	push(@files_to_merge, db_tmpfile(OUT));
    } else {
	open(OUT, ">-") || die "$0: cannot reopen STDOUT.\n";
    };
    foreach (@sortedp) {
	print OUT $rawdata[$_];
    };
    close OUT;

    # clean up memory usage
    # and try again
    print STDERR "memory used: $memory_used\n" if ($debug);
    &segment_start;
}

#
# &segment_merge merges the on-disk files we've built up
# in the work queue @files_to_merge.
#
# Changed Nov. 2001: try to process the work queue in
# a file-system-cache-friendly order (based on ideas from
# "Information and Control in Gray-box Systems" by
# the Arpaci-Dusseaus at SOSP 2001).
#
# Idea:  each "pass" through the queue, reverse the processing
# order so that the most recent data (that's hopefully
# in memory) is handled first.
#
# This algorithm isn't perfect (sometimes, if there's an odd number
# of files in the queue, you reach way back in time, but most of
# the time it's quite good).
#
# Also, in an ideal world $max_mem would actually be some sizable
# percentage of memory, and so this whole optimization would
# be useless because there would be no spare memory for the file system
# cache.  But for safety reasons (because we don't know how much
# RAM there is, there is multiprocessing, etc.), $max_mem
# is almost always hugely conservative.  The default is 32MB
# (8MB effective after the 4x adjustment), but most workstations
# have >= 512MB of memory.
#
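# For example (a hypothetical trace with four spill files t1..t4):
#
#   queue t1 t2 t3 t4  -> reversed to t4 t3 t2 t1 at the pass start
#   merge t4+t3 -> t5;  merge t2+t1 -> t6;  queue is now t5 t6
#   next pass reverses to t6 t5;  merge t6+t5 -> stdout
#
# so each pass begins with the most recently written file, whose
# blocks are most likely still in the file system cache.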
sub segment_merge {
    return if ($#files_to_merge < 0);
    # Degenerate case: exactly one segment was spilled (e.g., the memory
    # budget overflowed on the very last input row); it is already
    # sorted, so just copy it through.
    if ($#files_to_merge == 0) {
	my($fn) = shift @files_to_merge;
	open(IN, "<$fn") || die "$0: cannot open $fn.\n";
	print while (<IN>);
	close IN;
	db_tmpfile_cleanup($fn);
	return;
    };
    # keep track of how often to reverse
    my($files_before_reversing_queue) = 0;
    # Merge the files in a binary tree.
    while ($#files_to_merge >= 0) {
	# Each "pass", reverse the queue to reuse stuff in memory.
	if ($files_before_reversing_queue <= 0) {
	    @files_to_merge = reverse @files_to_merge;
	    $files_before_reversing_queue = $#files_to_merge + 1;
	    print STDERR "reversed queue, $files_before_reversing_queue before next reverse.\n" if ($debug);
	};
	$files_before_reversing_queue -= 2;
	# Pick off the next two files for merging.
	my(@fns);
	die "$0: segment_merge, odd number of segments.\n" if ($#files_to_merge == 0);
	push(@fns, shift @files_to_merge);
	push(@fns, shift @files_to_merge);
	# send the output to another file, or stdout if we're done
	if ($#files_to_merge >= 0) {
	    push(@files_to_merge, db_tmpfile(OUT));
	} else {
	    open(OUT, ">-") || die "$0: cannot reopen STDOUT.\n";
	};
	print STDERR "merging $fns[0] and $fns[1] to " . ($#files_to_merge >= 0 ? $files_to_merge[$#files_to_merge] : "STDOUT") . "\n" if ($debug);
	merge_to_out(@fns);
	close OUT;
	# verify($files_to_merge[$#files_to_merge]) if ($#files_to_merge >= 0);
	foreach (@fns) {
	    db_tmpfile_cleanup($_);
	};
    };
}

# This function is very custom for debugging.
# sub verify {
#     my($fn) = @_;
#     open(F, "<$fn") || die;
#     my($last);
#     my($i) = 0;
#     while (<F>) {
# 	$i++;
# 	if (defined($last) && $last > $_) {
# 	    die "bogus on line $i\n";
# 	};
# 	$last = $_;
#     };
#     close F;
# }

sub merge_to_out {
    my(@fh) = qw(A B);
    my($j);
    foreach $j (0..1) {
	$fh[$j] = new FileHandle;
	$fh[$j]->open("<$_[$j]") || die "$0: cannot open $_[$j].\n";
	&merge_read_one($fh[$j], $j) || die "$0: $_[$j] is empty.\n";
    };
    my($winner);
    $a = 0; $b = 1;   # for custom_compare
    for (;;) {
	$winner = &custom_compare > 0 ? 1 : 0;
	# print "\$sf0[0] = $sf0[0], \$sf0[1] = $sf0[1], \$winner = $winner, $rawdata[$winner]";
	print OUT $rawdata[$winner];
	# refill buffer
	if (!&merge_read_one($fh[$winner], $winner)) {
	    # $winner is exhausted.  Drain !$winner's buffer, then break and finish below.
	    print OUT $rawdata[!$winner];
	    last;
	};
    };
    # finish up !$winner
    # while (<$fh[!$winner]>) returns "A"--a perl bug in 5.004_04;
    # work around: use the eof/getline methods.
    while (!$fh[!$winner]->eof) {
	# print "clearing $fh[!$winner]\n";
	print OUT $fh[!$winner]->getline;
    };
    foreach (0..1) {
	close $fh[$_];
    };
}

# Read one row from $fh into merge slot $index: stash the raw line in
# $rawdata[$index] and enter its sort fields via &custom_enter.
# Returns undef at end of file.
sub merge_read_one {
    my($fh, $index) = @_;
    $_ = scalar <$fh>;
    return undef if (!defined($_));   # out of data
    $rawdata[$index] = $_;
    &split_cols;
    # print "read from $fh into $i, $_";
    &custom_enter($index);
    return 1;
}

#
# read in and set up the data
#
&segment_start;
@files_to_merge = ();
while (<STDIN>) {
    # NEEDSWORK:  should buffer comments to a file, not memory.
    next if (&delayed_pass_comments);
    push (@rawdata, $_);
    $i++;
    push (@p, $i);
    &split_cols;
    &custom_memory_enter($i, \$memory_used);
    # $@ && die("$prog: internal eval error: $@.\n");
};
# handle end case
&segment_overflow if ($i >= 0);
&segment_merge;
&delayed_flush_comments;


print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;
