#!/usr/local/bin/perl

#
# dbsort
# Copyright (C) 1991-2001 by John Heidemann <johnh@isi.edu>
# $Id: dbsort,v 1.34 2004/05/19 18:34:27 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#

# Print a usage summary and exit with status 1.
# (Fixed: the synopsis previously advertised a nonexistent -m option
# and omitted -d/-M/-T; also corrected typos in the option text.)
sub usage {
    print <<END;
usage: $0 [-dnNrRiI] [-M MaxMemBytes] [-T TmpDir] column [column...]

Sort rows based on the specified columns.

    -r	sort in reverse order (high to low)
    -R	sort in normal order (low to high)
    -i	sort insensitively to case [not yet supported]
    -I	sort, being sensitive to case [not yet supported]
    -n	sort numerically
    -N	sort lexicographically
    -d	debug mode
    -M MaxMemBytes	specify a limit in memory usage (in bytes)
    -T TmpDir	where to put tmp files (or env var TMPDIR)

Flags (except for -d) can be interspersed with columns.

Dbsort now consumes a fixed amount of memory regardless of input size.
(It reverts to temporary files on disk if necessary, based on the -M
and -T options.)

Sample input:
#h cid cname
10 pascal
11 numanal
12 os

Sample command:
cat data.jdb | dbsort cname

Sample output:
#h cid cname
11 numanal
12 os
10 pascal
# | dbsort cname
END
    exit 1;
}

# Make the dblib support directory visible before the require below.
BEGIN {
    $dblibdir = "/home/johnh/BIN/DB";
    push(@INC, $dblibdir);
}
require "$dblibdir/dblib.pl";
use DbGetopt;
use FileHandle;


my(@orig_argv) = @ARGV;		# saved so we can echo the command line at the end
my($prog) = &progname;		# from dblib.pl

$debug = 0;
my($max_mem) = 1024*1024*32;	# default memory budget, in bytes (-M overrides)
my($mem_debug) = 0;
my($dbopts) = new DbGetopt("dM:nNrRiIT:?", \@ARGV);
$dbopts->opterr(0);
my($ch);
# Consume our own flags.  Any other option letter is assumed to be a
# per-column sort modifier (n/N/r/R, interpreted later by
# generate_compare_code), so push it back and stop parsing.
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'd') {
	$debug++;
    } elsif ($ch eq 'M') {
	$max_mem = $dbopts->optarg;
    } elsif ($ch eq 'T') {
	$ENV{'TMPDIR'} = $dbopts->optarg;
    } elsif ($ch eq 'i' || $ch eq 'I') {
	die "$prog: -i and -I not yet supported in dbsort.\n";
    } elsif ($ch eq '?') {
	&usage;
    } else {
	# got a db op.  Push it back on and break.
	$dbopts->ungetopt;
	last;
    };
};
&usage if ($#ARGV < 0);
my($perl_mem_overhead) = 50;	# approx. bytes of overhead for each record in mem
# Perl data structures cost much more than the raw bytes, so shrink the
# user-visible budget to a conservative internal one.
$max_mem /= 4;

&readprocess_header;	# from dblib.pl: read the #h header, set up column info


&write_header();

# Callback for generate_compare_code: return Perl source text that
# fetches split-field number $n of the row whose index is held in the
# variable named $row (e.g. row 'a', field 2 => '$sf2[$a]').
# $colname is accepted but unused here.
sub sort_row_col_fn {
    my($row, $colname, $n) = @_;
    return '$sf' . $n . '[$' . $row . ']';	# '
}
($compare_code, $enter_code, $num_cols) = &generate_compare_code ('custom_compare', 'sort_row_col_fn', @ARGV);
my($mem_count) = 0;
die "$prog: no columns were specified as the sort key.\n" if ($num_cols < 0);

# Build (as a source string, eval'd later) two subs:
#   custom_enter($i): record the current row's key fields at index $i;
#   custom_memory_enter($i, \$mem): same, plus memory accounting
#     (the accounting tail is appended below).
my($enter_memory_code) = "sub custom_enter {\n" .
    " my(\$i) = \@_;\n" .
    $enter_code .
#    " print \"enter: \\\$sf0[\$i] = \$sf0[\$i]\\n\";\n" .
    "}\n" .
    "sub custom_memory_enter {\n" .
    " my(\$i, \$mem_ref) = \@_;\n" .
    $enter_code;
#    " print \"cmenter: \\\$sf0[\$i] = \$sf0[\$i]\\n\";\n" .
# Finish the custom_memory_enter source started above: optionally trace
# the running byte count every 1000 records, then charge this record's
# raw line plus each extracted key field (plus the fixed per-record
# overhead) against the caller's accumulator, and spill the segment to
# disk via &segment_overflow when the budget is exceeded.
$enter_memory_code .= " print STDERR \${\$mem_ref},\"\\n\" if (\$mem_count++ % 1000 ==0);\n" if ($mem_debug);
# $enter_memory_code .= " print STDERR \${\$mem_ref},\"\\n\";\n" if ($mem_debug);
$enter_memory_code .= " \${\$mem_ref} += $perl_mem_overhead + length(\$rawdata[\$#rawdata])\n";
foreach (0..$num_cols) {
    # one length() term per sort-key field; 'i' names the index variable
    # inside the generated sub
    $enter_memory_code .= "\t\t + length(" . sort_row_col_fn('i', undef, $_) . ')';
};
$enter_memory_code .= ";\n &segment_overflow() if (\${\$mem_ref} > $max_mem);\n}";
# Compile custom_enter and custom_memory_enter into real subs.
eval $enter_memory_code;
$@ && die "$0: eval: $@\n";

# -d: dump the generated code; -d -d: stop after dumping it.
if ($debug) {
    print STDERR "COMPARE_CODE:\n$compare_code\nENTER_CODE:\n$enter_memory_code\n";
    exit(1) if ($debug > 1);
};
#
# Handle large things in pieces if necessary.
#
# call &segment_start to init things,
# &segment_overflow to close one segment and start the next
# &segment_merge to put them back together again.
#
# Note that we don't invoke the merge code unless the data
# exceeds some threshold size, so small sorts happen completely
# in memory.
#
# Once we give up on memory, all the merging happens by making
# passes over the disk files.
#

# Shared state: @rawdata holds input lines, @p their indices, @sortedp
# the indices in sorted order; @files_to_merge is the work queue of
# on-disk segment files; $i is the index of the last row entered;
# $memory_used is the running byte estimate for this segment.
my(@sortedp, @rawdata, @p, @files_to_merge, $i, $memory_used);

# Reset the in-memory segment to empty.
sub segment_start {
    $i = -1;
    $memory_used = 0;
    # undef(@sortedp, @rawdata, @p);  # free the mem (maybe next line does that too)
    undef @sortedp;
    undef @rawdata;
    undef @p;
    @sortedp = @rawdata = @p = ();
}

# Sort the current in-memory segment and write it out, then reset for
# the next segment.  Output goes to a temp file if we have already
# spilled or are over budget; otherwise (the whole input fit in
# memory) straight to stdout.
sub segment_overflow {
    my($done) = @_;	# NOTE(review): $done is accepted but never used here

    # sort the segment (custom_compare was eval-generated above)
    @sortedp = sort custom_compare @p;

    # pass on the data, either to a tmp file or stdout
    if ($#files_to_merge >= 0 || $memory_used > $max_mem) {
	push(@files_to_merge, db_tmpfile(OUT));
    } else {
	open(OUT, ">-") || die "$0: cannot reopen STDOUT.\n";
    };
    foreach (@sortedp) {
	print OUT $rawdata[$_];
    };
    close OUT;

    # clean up memory usage
    # and try again
    print "memory used: $memory_used\n" if ($debug);
    &segment_start;
}

#
# &segment_merge merges the on-disk files we've built up
# in the work queue @files_to_merge.
#
# Changed Nov. 2001:  try to process the work queue in
# a file-system-cache-friendly order (based on ideas from
# "Information and Control in Gray-box Systems" by
# the Arpaci-Dusseau's at SOSP 2001.
#
# Idea:  each "pass" through the queue, reverse the processing
# order so that the most recent data (that's hopefully
# in memory) is handled first.
#
# This algorithm isn't perfect (sometimes if there's an odd number
# of files in the queue you reach way back in time, but most of
# the time it's quite good).
#
# Also, in an ideal world $max_mem actually would be some sizable
# percentage of memory, and so this whole optimization would
# be useless because there would be no spare memory for the file system
# cache.  But for safety reasons (because we don't know how much
# RAM there is, and there is multiprocessing, etc.), $max_mem
# is almost always hugely conservative.  As of Nov. 2001 it defaults to
# 10MB, but most workstations have >= 512MB memory.
# NOTE(review): the default set above is 32MB (divided by 4 internally),
# so the "10MB" figure in this comment is stale.
#
sub segment_merge {
    return if ($#files_to_merge < 0);
    # keep track of how often to reverse
    my($files_before_reversing_queue) = 0;
    # Merge the files in a binary tree.
    while ($#files_to_merge >= 0) {
	# Each "pass", reverse the queue to reuse stuff in memory.
	if ($files_before_reversing_queue <= 0) {
	    @files_to_merge = reverse @files_to_merge;
	    $files_before_reversing_queue = $#files_to_merge + 1;
	    print "reversed queue, $files_before_reversing_queue before next reverse.\n" if ($debug);
	};
	$files_before_reversing_queue -= 2;
	# Pick off the two next files for merging.
	my(@fns);
	die "$0: segment_merge, odd number of segments.\n" if ($#files_to_merge == 0);
	push(@fns, shift @files_to_merge);
	push(@fns, shift @files_to_merge);
	# send the output to another file, or stdout if we're done
	if ($#files_to_merge >= 0) {
	    push(@files_to_merge, db_tmpfile(OUT));
	} else {
	    open(OUT, ">-") || die "$0: cannot reopen STDOUT.\n";
	};
	print "merging $fns[0] and $fns[1] to " . ($#files_to_merge >=0 ? $files_to_merge[$#files_to_merge] : "STDOUT") . "\n" if ($debug);
	merge_to_out(@fns);
	close OUT;
	# verify($files_to_merge[$#files_to_merge]) if ($#files_to_merge >= 0);
	foreach (@fns) {
	    db_tmpfile_cleanup($_);
	};
    };
}

# This function is very custom for debugging.
# sub verify {
#	my($fn) = @_;
#	open(F, "<$fn") || die;
#	my($last);
#	my($i) = 0;
#	while (<F>) {
#	    $i++;
#	    $last = $_ if (!defined($last));
#	    if ($last > $_) {
#		die "bogus on line $i\n";
#	    };
#	};
#	close F;
# }

# Merge the two sorted files named in @_ onto the OUT filehandle.
# Buffer slots 0 and 1 of @rawdata (and the generated $sf* key arrays)
# hold the current head line of each file; $a=0/$b=1 are set so the
# eval-generated custom_compare compares those two buffered heads.
sub merge_to_out {
    my(@fh) = qw(A B);
    my($j);
    foreach $j (0..1) {
	$fh[$j] = new FileHandle;
	$fh[$j]->open("<$_[$j]") || die "$0: cannot open $_[$j].\n";
	&merge_read_one($fh[$j], $j) || die "$0: $_[$j] is empty.\n";
    };
    my($winner);
    $a = 0;  $b = 1;	# for custom_compare
    for (;;) {
	# $winner is the index (0 or 1) of the smaller buffered row
	$winner = &custom_compare > 0 ? 1 : 0;
	# print "\$sf0[0] = $sf0[0], \$sf0[1] = $sf0[1], \$winner = $winner, $rawdata[$winner]";
	print OUT $rawdata[$winner];
	# refill buffer
	if (!&merge_read_one($fh[$winner], $winner)) {
	    # $winner is exhausted.  Drain !$winner's buffer, then break and finish below
	    # (!$winner is '' or 1, which index slots 0 and 1 respectively)
	    print OUT $rawdata[!$winner];
	    last;
	};
    };
    # finish up !$winner
    # while (<$fh[!$winner]>) returns "A"--a perl bug in 5.004_04
    # work around:  use eof/getline methods.
    while (!$fh[!$winner]->eof) {
	# print "clearing $fh[!$winner]\n";
	print OUT $fh[!$winner]->getline;
    };
    foreach (0..1) {
	close $fh[$_];
    };
}

# Read one line from $fh into buffer slot $index: stash the raw line,
# split it into fields ($_ is set for dblib's &split_cols), and record
# the sort-key fields via the generated custom_enter.
# Returns 1 on success, undef at end of file.
sub merge_read_one {
    my($fh, $index) = @_;
    $_ = scalar <$fh>;
    return undef if (!defined($_));	# out of data
    $rawdata[$index] = $_;
    &split_cols;
    # print "read from $fh into $i, $_";
    &custom_enter($index);;
    return 1;
}

#
# read in and set up the data
#
&segment_start;
@files_to_merge = ();
while (<STDIN>) {
    # NEEDSWORK: should buffer comments to a file, not memory.
    next if (&delayed_pass_comments);
    push (@rawdata, $_);
    $i++;
    push (@p, $i);
    &split_cols;
    # records the key fields and may trigger &segment_overflow when
    # the memory budget is exceeded
    &custom_memory_enter($i, \$memory_used);
    # $@ && die("$prog: internal eval error: $@.\n");
};
# handle end case: flush the final (possibly only) in-memory segment
&segment_overflow if ($i >= 0);
&segment_merge;
&delayed_flush_comments;


# trailer: record how this output was produced
print "# | $prog ", join(" ", @orig_argv), "\n";
exit 0;