#!/usr/local/bin/perl

#
# dbsort
# Copyright (C) 1991-2001 by John Heidemann <johnh@isi.edu>
# $Id: dbsort,v 1.34 2004/05/19 18:34:27 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
    print <<END;
usage: $0 [-dnNrR] [-M MaxMemBytes] [-T TmpDir] column [column...]

Sort rows based on the specified columns.

	-r sort in reverse order (high to low)
	-R sort in normal order (low to high)
	-i sort case-insensitively  [not yet supported]
	-I sort case-sensitively  [not yet supported]
	-n sort numerically
	-N sort lexicographically
	-d debug mode
	-M MaxMemBytes    specify a limit on memory usage (in bytes)
	-T TmpDir	  where to put tmp files (or env var TMPDIR)

Flags (except for -d) can be interspersed with columns.

Dbsort consumes a bounded amount of memory regardless of input size.
(It reverts to temporary files on disk if necessary, based on the -M
and -T options.)

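For example, a hypothetical large sort with a 64MB memory budget and
tmp files placed in /var/tmp:
cat huge.jdb | dbsort -M 67108864 -T /var/tmp cname
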
Sample input:
#h cid cname
10 pascal
11 numanal
12 os

Sample command:
cat data.jdb | dbsort cname

Sample output:
#h      cid     cname
11 numanal
12 os
10 pascal
#  | dbsort cname
END
    exit 1;
}

BEGIN {
    $dblibdir = "/home/johnh/BIN/DB";
    push(@INC, $dblibdir);
}
require "$dblibdir/dblib.pl";
use DbGetopt;
use FileHandle;


my(@orig_argv) = @ARGV;
my($prog) = &progname;

$debug = 0;
my($max_mem) = 1024*1024*32;
my($mem_debug) = 0;
my($dbopts) = new DbGetopt("dM:nNrRiIT:?", \@ARGV);
$dbopts->opterr(0);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'd') {
	$debug++;
    } elsif ($ch eq 'M') {
	$max_mem = $dbopts->optarg;
    } elsif ($ch eq 'T') {
	$ENV{'TMPDIR'} = $dbopts->optarg;
    } elsif ($ch eq 'i' || $ch eq 'I') {
	die "$prog: -i and -I not yet supported in dbsort.\n";
    } elsif ($ch eq '?') {
	&usage;
    } else {
	# got a db op.  Push it back on and break.
	$dbopts->ungetopt;
	last;
    };
};
&usage if ($#ARGV < 0);
my($perl_mem_overhead) = 50;  # approx. bytes of overhead for each record in mem
$max_mem /= 4;   # perl seems to take about 4x more memory than the raw data, so scale the budget down
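# A rough worked example of the accounting (assuming the defaults):
# with the default -M of 32MB the effective budget is 8MB, and a
# 40-byte input row with one 10-byte sort column is charged
# 50 + 40 + 10 = 100 bytes by custom_memory_enter below, so a segment
# holds on the order of 80,000 such rows before spilling to disk.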

&readprocess_header;


&write_header();

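# sort_row_col_fn is handed to generate_compare_code below: given a
# row variable name and a sort-column index $n, it returns the Perl
# source text for that row's nth sort field (e.g., '$sf0[$a]'), for
# splicing into the eval'd compare and enter code.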
sub sort_row_col_fn {
    my($row, $colname, $n) = @_;
    return '$sf' . $n . '[$' . $row . ']';    # '
}
($compare_code, $enter_code, $num_cols) = &generate_compare_code('custom_compare', 'sort_row_col_fn', @ARGV);
my($mem_count) = 0;
die "$prog: no columns were specified as the sort key.\n" if ($num_cols < 0);

my($enter_memory_code) = "sub custom_enter {\n" .
			 "    my(\$i) = \@_;\n" .
			 $enter_code .
#			 "    print \"enter: \\\$sf0[\$i] = \$sf0[\$i]\\n\";\n" .
			 "}\n" .
			 "sub custom_memory_enter {\n" .
			 "    my(\$i, \$mem_ref) = \@_;\n" .
			 $enter_code;
#			 "    print \"cmenter: \\\$sf0[\$i] = \$sf0[\$i]\\n\";\n" .
$enter_memory_code .= "    print STDERR \${\$mem_ref},\"\\n\" if (\$mem_count++ % 1000 == 0);\n" if ($mem_debug);
# $enter_memory_code .= "    print STDERR \${\$mem_ref},\"\\n\";\n" if ($mem_debug);
$enter_memory_code .= "    \${\$mem_ref} += $perl_mem_overhead + length(\$rawdata[\$#rawdata])\n";
foreach (0..$num_cols) {
    $enter_memory_code .= "\t\t + length(" . sort_row_col_fn('i', undef, $_) . ')';
};
$enter_memory_code .= ";\n    &segment_overflow() if (\${\$mem_ref} > $max_mem);\n}";
eval $enter_memory_code;
$@ && die "$0: eval: $@\n";
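# For illustration, with one sort column the eval'd custom_memory_enter
# comes out roughly like this (a sketch only; the exact field-extraction
# line is whatever generate_compare_code in dblib.pl emits, and
# $perl_mem_overhead/$max_mem are interpolated as constants):
#
#   sub custom_memory_enter {
#       my($i, $mem_ref) = @_;
#       $sf0[$i] = ...;   # extract sort field 0 (assumed shape)
#       ${$mem_ref} += 50 + length($rawdata[$#rawdata])
#                    + length($sf0[$i]);
#       &segment_overflow() if (${$mem_ref} > 8388608);
#   }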

if ($debug) {
    print STDERR "COMPARE_CODE:\n$compare_code\nENTER_CODE:\n$enter_memory_code\n";
    exit(1) if ($debug > 1);
};


#
# Handle large things in pieces if necessary.
#
# Call &segment_start to init things,
#   &segment_overflow to close one segment and start the next,
#   &segment_merge to put them back together again.
#
# Note that we don't invoke the merge code unless the data
# exceeds some threshold size, so small sorts happen completely
# in memory.
#
# Once we give up on memory, all the merging happens by making
# passes over the disk files.
#

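# In outline, the result is a standard external merge sort:
#
#   &segment_start;                  # reset the in-memory buffers
#   # per input row: &custom_memory_enter($i, \$memory_used), which
#   #   calls &segment_overflow when the memory budget is exceeded
#   &segment_overflow;               # flush the final partial segment
#   &segment_merge;                  # binary-merge any spill files
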
my(@sortedp, @rawdata, @p, @files_to_merge, $i, $memory_used);

sub segment_start {
    $i = -1;
    $memory_used = 0;
    # undef(@sortedp, @rawdata, @p);   # free the mem (maybe next line does that too)
    undef @sortedp;
    undef @rawdata;
    undef @p;
    @sortedp = @rawdata = @p = ();
}

sub segment_overflow {
    my($done) = @_;

    # sort the segment
    @sortedp = sort custom_compare @p;

    # pass on the data, either to a tmp file or to stdout
    if ($#files_to_merge >= 0 || $memory_used > $max_mem) {
	push(@files_to_merge, db_tmpfile(OUT));
    } else {
	open(OUT, ">-") || die "$0: cannot reopen STDOUT.\n";
    };
    foreach (@sortedp) {
	print OUT $rawdata[$_];
    };
    close OUT;

    # clean up memory usage
    # and try again
    print STDERR "memory used: $memory_used\n" if ($debug);
    &segment_start;
}

#
# &segment_merge merges the on-disk files we've built up
# in the work queue @files_to_merge.
#
# Changed Nov. 2001: try to process the work queue in
# a file-system-cache-friendly order (based on ideas from
# "Information and Control in Gray-box Systems" by
# the Arpaci-Dusseaus at SOSP 2001).
#
# Idea:  each "pass" through the queue, reverse the processing
# order so that the most recent data (that's hopefully
# in memory) is handled first.
#
# This algorithm isn't perfect (sometimes, if there's an odd number
# of files in the queue, you reach way back in time, but most of
# the time it's quite good).
#
# Also, in an ideal world $max_mem would actually be some sizable
# percentage of memory, and so this whole optimization would
# be useless because there would be no spare memory for the file system
# cache.  But for safety reasons (because we don't know how much
# RAM there is, there is multiprocessing, etc.), $max_mem
# is almost always hugely conservative.  The default is 32MB
# (8MB effective after the 4x adjustment), but most workstations
# have >= 512MB of memory.
#
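# For example (a hypothetical trace with four spill files t1..t4):
#
#   queue t1 t2 t3 t4  -> reversed to t4 t3 t2 t1 at the pass start
#   merge t4+t3 -> t5;  merge t2+t1 -> t6;  queue is now t5 t6
#   next pass reverses to t6 t5;  merge t6+t5 -> stdout
#
# so each pass begins with the most recently written file, whose
# blocks are most likely still in the file system cache.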
sub segment_merge {
    return if ($#files_to_merge < 0);
    # Degenerate case: exactly one segment was spilled (e.g., the memory
    # budget overflowed on the very last input row); it is already
    # sorted, so just copy it through.
    if ($#files_to_merge == 0) {
	my($fn) = shift @files_to_merge;
	open(IN, "<$fn") || die "$0: cannot open $fn.\n";
	print while (<IN>);
	close IN;
	db_tmpfile_cleanup($fn);
	return;
    };
    # keep track of how often to reverse
    my($files_before_reversing_queue) = 0;
    # Merge the files in a binary tree.
    while ($#files_to_merge >= 0) {
	# Each "pass", reverse the queue to reuse stuff in memory.
	if ($files_before_reversing_queue <= 0) {
	    @files_to_merge = reverse @files_to_merge;
	    $files_before_reversing_queue = $#files_to_merge + 1;
	    print STDERR "reversed queue, $files_before_reversing_queue before next reverse.\n" if ($debug);
	};
	$files_before_reversing_queue -= 2;
	# Pick off the next two files for merging.
	my(@fns);
	die "$0: segment_merge, odd number of segments.\n" if ($#files_to_merge == 0);
	push(@fns, shift @files_to_merge);
	push(@fns, shift @files_to_merge);
	# send the output to another file, or stdout if we're done
	if ($#files_to_merge >= 0) {
	    push(@files_to_merge, db_tmpfile(OUT));
	} else {
	    open(OUT, ">-") || die "$0: cannot reopen STDOUT.\n";
	};
	print STDERR "merging $fns[0] and $fns[1] to " . ($#files_to_merge >= 0 ? $files_to_merge[$#files_to_merge] : "STDOUT") . "\n" if ($debug);
	merge_to_out(@fns);
	close OUT;
	# verify($files_to_merge[$#files_to_merge]) if ($#files_to_merge >= 0);
	foreach (@fns) {
	    db_tmpfile_cleanup($_);
	};
    };
}

# This function is very custom for debugging.
# sub verify {
#     my($fn) = @_;
#     open(F, "<$fn") || die;
#     my($last);
#     my($i) = 0;
#     while (<F>) {
# 	$i++;
# 	if (defined($last) && $last > $_) {
# 	    die "bogus on line $i\n";
# 	};
# 	$last = $_;
#     };
#     close F;
# }

sub merge_to_out {
    my(@fh) = qw(A B);
    my($j);
    foreach $j (0..1) {
	$fh[$j] = new FileHandle;
	$fh[$j]->open("<$_[$j]") || die "$0: cannot open $_[$j].\n";
	&merge_read_one($fh[$j], $j) || die "$0: $_[$j] is empty.\n";
    };
    my($winner);
    $a = 0; $b = 1;   # for custom_compare
    for (;;) {
	$winner = &custom_compare > 0 ? 1 : 0;
	# print "\$sf0[0] = $sf0[0], \$sf0[1] = $sf0[1], \$winner = $winner, $rawdata[$winner]";
	print OUT $rawdata[$winner];
	# refill buffer
	if (!&merge_read_one($fh[$winner], $winner)) {
	    # $winner is exhausted.  Drain !$winner's buffer, then break and finish below.
	    print OUT $rawdata[!$winner];
	    last;
	};
    };
    # finish up !$winner
    # while (<$fh[!$winner]>) returns "A"--a perl bug in 5.004_04;
    # work around: use the eof/getline methods.
    while (!$fh[!$winner]->eof) {
	# print "clearing $fh[!$winner]\n";
	print OUT $fh[!$winner]->getline;
    };
    foreach (0..1) {
	close $fh[$_];
    };
}

# Read one row from $fh into merge slot $index: stash the raw line in
# $rawdata[$index] and enter its sort fields via &custom_enter.
# Returns undef at end of file.
sub merge_read_one {
    my($fh, $index) = @_;
    $_ = scalar <$fh>;
    return undef if (!defined($_));   # out of data
    $rawdata[$index] = $_;
    &split_cols;
    # print "read from $fh into $i, $_";
    &custom_enter($index);
    return 1;
}

#
# read in and set up the data
#
&segment_start;
@files_to_merge = ();
while (<STDIN>) {
    # NEEDSWORK:  should buffer comments to a file, not memory.
    next if (&delayed_pass_comments);
    push (@rawdata, $_);
    $i++;
    push (@p, $i);
    &split_cols;
    &custom_memory_enter($i, \$memory_used);
    # $@ && die("$prog: internal eval error: $@.\n");
};
# handle end case
&segment_overflow if ($i >= 0);
&segment_merge;
&delayed_flush_comments;


print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;
