1#!/usr/local/bin/perl -w 2 3# 4# dbcolmultiscale 5# Copyright (C) 2001 by John Heidemann <johnh@isi.edu> 6# $Id: dbcolmultiscale,v 1.6 2003/05/23 04:15:43 johnh Exp $ 7# 8# This program is distributed under terms of the GNU general 9# public license, version 2. See the file COPYING 10# in $dblibdir for details. 11# 12sub usage { 13 print STDERR <<END; 14usage: $0 [-d] [-s SmallestTimescale] TimeField DataField 15 16Dbcolmultiscale computes the sums and rates of a set of samples 17taken at times (given by TimeField) with values given by DataField 18across multiple timescales. Timescales start at the value given by 19SmallestTimescale (defaulting to 1 unit) and increasing by powers of 20two. 21 22The output is a table listing the time of each sample period, the 23timescale it sums, the timescale index, 24a jittered timescale index, the sum, and the rate (sum 25dividied by timescale). 26 27More formally, the output is 28 29 t_i s_i si_i si'_i sum_{t_i,s_i} rate_{t_i,s_i} 30 31where t_i is the time that the period begins 32s_i is the timescale for this period in units of time 33si_i is the timescale index (:= lg( (s_i - FirstTime) / SmallestTimescale) ) 34 (where FirstTime is the time of the first event) 35si'_i := si_i + h(t) where h(t) is some hash function to "jitter" the 36 scale index, possibly in some regular way (such as mod) 37sum_{t_i,s_i} := sum_{t_i,s_{i-1}} + sum{t_i+s_{i-1},s_{i-1}} 38rate_{t_i,s_i} := sum_{t_i,s_i} / s_i 39 40The input source must be sorted numerically by TimeField. 41By default dbcolmultiscale sorts its input; 42it will run more efficiently if given the -S option 43to tell it the data is already sorted. 44 45This program consumes O(lg duration) memory, 46where duration is defined as time time between the first and last events. 47If the data is not pre-sorted, the program requires O(number of records) 48disk space. 49 50Options: 51 -s LowestTimescale specify the shortest timescale to consider 52 (defaults to 1 unit) 53 -S data is pre-sorted 54 -d debugging 55 56Sample input: 57#h time size 58# This case has bursty data in the first half, and 59# smooth data in the second half. 600 0 611 8 628 1 639 1 6410 1 6511 1 6612 1 6713 1 6814 1 6915 1 70 71Sample command: 72cat TEST/dbcolmultiscale_bursty.in | dbcolmultiscale -S time size 73 74Sample output: 75#h period_start timescale timescale_index jittered_index sum rate 76# This case has bursty data in the first half, and 77# smooth data in the second half. 780 1 0 -0.15 0 0 791 1 0 -0.14 8 8 800 2 1 0.85 8 4 812 1 0 -0.13 0 0 823 1 0 -0.12 0 0 832 2 1 0.86 0 0 840 4 2 1.85 8 2 854 1 0 -0.11 0 0 865 1 0 -0.1 0 0 874 2 1 0.87 0 0 886 1 0 -0.09 0 0 897 1 0 -0.08 0 0 906 2 1 0.88 0 0 914 4 2 1.86 0 0 920 8 3 2.85 8 1 938 1 0 -0.07 1 1 949 1 0 -0.06 1 1 958 2 1 0.89 2 1 9610 1 0 -0.05 1 1 9711 1 0 -0.04 1 1 9810 2 1 0.9 2 1 998 4 2 1.87 4 1 10012 1 0 -0.03 1 1 10113 1 0 -0.02 1 1 10212 2 1 0.91 2 1 10314 1 0 -0.01 1 1 10415 1 0 0 1 1 10514 2 1 0.92 2 1 10612 4 2 1.88 4 1 1078 8 3 2.86 8 1 1080 16 4 3.85 16 1 109# | dbcolmultiscale -S time size 110 111Post-processing: 112To visualize with xgraph: 113 ... | dbcol jittered_index rate | xgraph -nl -p 114To do stats on the rates: 115 ... | dbmultistats -q 4 timescale_index rate 116Or both: 117 ... | dbcolmultiscale_to_gnuplot 118 119END 120# ' 121 exit 1; 122} 123 124BEGIN { 125 $dbbindir = "/home/johnh/BIN/DB"; 126 $dblibdir = "/home/johnh/BIN/DB"; 127 push(@INC, $dblibdir); 128} 129use DbGetopt; 130require "$dblibdir/dblib.pl"; 131 132@orig_argv = @ARGV; 133my($prog) = &progname; 134my($smallest_timescale) = 1; 135my($debug) = undef; 136my($sorting_required) = 1; 137my($dbopts) = new DbGetopt("ds:S?", \@ARGV); 138my($ch); 139while ($dbopts->getopt) { 140 $ch = $dbopts->opt; 141 if ($ch eq 's') { 142 $smallest_timescale = $dbopts->optarg; 143 } elsif ($ch eq 'd') { 144 $debug = 1; 145 } elsif ($ch eq 'S') { 146 $sorting_required = 0; 147 } else { 148 &usage; 149 }; 150}; 151 152&usage if ($#ARGV != 1); 153my($timecol, $datacol) = @ARGV; 154 155# handle sorting, if necessary 156if ($sorting_required) { 157 open(SORTED, "$dbbindir/dbsort -n $timecol |") || die("$prog: cannot run dbsort over input.\n"); 158 open(STDIN, "<&SORTED") || die "$0: cannot dup SORTED.\n"; 159}; 160 161&readprocess_header; 162die ("$prog: unknown column name ``$timecol''.\n") if (!defined($colnametonum{$timecol})); 163my($timef) = $colnametonum{$timecol}; 164die ("$prog: unknown column name ``$datacol''.\n") if (!defined($colnametonum{$datacol})); 165my($dataf) = $colnametonum{$datacol}; 166 167 168# 169# start the output 170# 171&write_header(qw(period_start timescale timescale_index jittered_index sum rate)); 172 173 174my($first_time, $last_time); 175my(@period_start) = undef; 176my(@timescale_duration) = ($smallest_timescale); 177my(@sum) = (0); 178my(@other_sum) = (undef); 179my(@other_period_start) = (undef); 180my($maxscale_summed_i) = 0; # counts from zero 181 182sub jitter { 183 my($index, $time, $timescale) = @_; 184 my($jitter) = int(($time - $first_time) / $timescale) % 30; 185 return $index + ($jitter - 15) / 100.0; 186} 187 188sub output_timescale { 189 my($index) = @_; 190 &write_these_cols($period_start[$index], 191 $timescale_duration[$index], 192 $index, 193 jitter($index, $period_start[$index], $timescale_duration[$index]), 194 $sum[$index], 195 $sum[$index] / $timescale_duration[$index]); 196}; 197 198sub complete_timescale { 199 my($index) = @_; 200 201 # are we finishing the first half or the second? 202 if (!defined($other_sum[$index])) { 203 # first half, just output the record 204 &output_timescale($index); 205 # now start work on the new record 206 $other_sum[$index] = $sum[$index]; 207 $other_period_start[$index] = $period_start[$index]; 208 } else { 209 # ok, we're doing the 2nd half, which means we have to recurse 210 # output the record 211 &output_timescale($index); 212 # compute the next level 213 $sum[$index+1] = $other_sum[$index] + $sum[$index]; 214 $timescale_duration[$index+1] = 2 * $timescale_duration[$index]; 215 $period_start[$index+1] = $other_period_start[$index]; 216 &complete_timescale($index+1); 217 # now reset our level 218 $other_sum[$index] = undef; 219 }; 220 $sum[$index] = 0; 221 $period_start[$index] += $timescale_duration[$index]; 222} 223 224 225 226# read data 227while (<STDIN>) { 228 &pass_comments() && next; 229 &split_cols; 230 231 my($time) = $f[$timef]; 232 my($data) = $f[$dataf]; 233 234 # initialization 235 if (!defined($first_time)) { 236 $first_time = $last_time = $period_start[0] = $time; 237 }; 238 die "$0: input is not sorted by time ($time < $last_time)\n" 239 if ($time < $last_time); 240 $last_time = $time; 241 # handle the zero level by hand 242 if ($time - $period_start[0] < $smallest_timescale) { 243 # just more data in the current period 244 $sum[0] += $data; 245 next; 246 }; 247 # ...overflowed period, so percolate if necessary 248 # until the data point is in the current period. 249 while ($time - $period_start[0] >= $smallest_timescale) { 250 &complete_timescale(0); # percolate 251 }; 252 # now initialize the new lowest level 253 $sum[0] = $data; 254}; 255 256die "$0: no input\n" if (!defined($first_time)); 257 258# Terminte the final entry, assuming it's complete 259# (even though it may not be since we might have missed 260# additional data that would have fitted in that timescale). 261# Note that this only "completes" the lowest timescale 262# and not necessarily any partially complete large timescales. 263# It's not clear that this is any more justified. 264# (But this additional call here does guarantee that if we have 265# exactly 2^n evenly spaced datapoints we get them all--- 266# see the dbcolmultiscale_flat test case for an example). 267&complete_timescale(0); 268 269 270# close up shop 271print "# | $prog ", join(" ", @orig_argv), "\n"; 272exit 0; 273 274if (0) { 275 <SORTED>; 276}; 277