1#!/usr/local/bin/perl -w
2
3#
4# dbcolmultiscale
5# Copyright (C) 2001 by John Heidemann <johnh@isi.edu>
6# $Id: dbcolmultiscale,v 1.6 2003/05/23 04:15:43 johnh Exp $
7#
8# This program is distributed under terms of the GNU general
9# public license, version 2.  See the file COPYING
10# in $dblibdir for details.
11#
12sub usage {
13	print STDERR <<END;
14usage: $0 [-d] [-s SmallestTimescale] TimeField DataField
15
16Dbcolmultiscale computes the sums and rates of a set of samples
17taken at times (given by TimeField) with values given by DataField
18across multiple timescales.  Timescales start at the value given by
19SmallestTimescale (defaulting to 1 unit) and increasing by powers of
20two.
21
22The output is a table listing the time of each sample period, the
23timescale it sums, the timescale index,
24a jittered timescale index, the sum, and the rate (sum
25dividied by timescale).
26
27More formally, the output is
28
29    t_i s_i si_i si'_i sum_{t_i,s_i} rate_{t_i,s_i}
30
31where t_i is the time that the period begins
32s_i is the timescale for this period in units of time
33si_i is the timescale index (:= lg( (s_i - FirstTime) / SmallestTimescale) )
34    (where FirstTime is the time of the first event)
35si'_i := si_i + h(t)  where h(t) is some hash function to "jitter" the
36  scale index, possibly in some regular way (such as mod)
37sum_{t_i,s_i} := sum_{t_i,s_{i-1}} + sum{t_i+s_{i-1},s_{i-1}}
38rate_{t_i,s_i} := sum_{t_i,s_i} / s_i
39
40The input source must be sorted numerically by TimeField.
41By default dbcolmultiscale sorts its input;
42it will run more efficiently if given the -S option
43to tell it the data is already sorted.
44
45This program consumes O(lg duration) memory,
46where duration is defined as time time between the first and last events.
47If the data is not pre-sorted, the program requires O(number of records)
48disk space.
49
50Options:
51    -s LowestTimescale    specify the shortest timescale to consider
52				(defaults to 1 unit)
53    -S                    data is pre-sorted
54    -d			  debugging
55
56Sample input:
57#h time size
58# This case has bursty data in the first half, and
59# smooth data in the second half.
600 0
611 8
628 1
639 1
6410 1
6511 1
6612 1
6713 1
6814 1
6915 1
70
71Sample command:
72cat TEST/dbcolmultiscale_bursty.in | dbcolmultiscale -S time size
73
74Sample output:
75#h period_start timescale timescale_index jittered_index sum rate
76# This case has bursty data in the first half, and
77# smooth data in the second half.
780	1	0	-0.15	0	0
791	1	0	-0.14	8	8
800	2	1	0.85	8	4
812	1	0	-0.13	0	0
823	1	0	-0.12	0	0
832	2	1	0.86	0	0
840	4	2	1.85	8	2
854	1	0	-0.11	0	0
865	1	0	-0.1	0	0
874	2	1	0.87	0	0
886	1	0	-0.09	0	0
897	1	0	-0.08	0	0
906	2	1	0.88	0	0
914	4	2	1.86	0	0
920	8	3	2.85	8	1
938	1	0	-0.07	1	1
949	1	0	-0.06	1	1
958	2	1	0.89	2	1
9610	1	0	-0.05	1	1
9711	1	0	-0.04	1	1
9810	2	1	0.9	2	1
998	4	2	1.87	4	1
10012	1	0	-0.03	1	1
10113	1	0	-0.02	1	1
10212	2	1	0.91	2	1
10314	1	0	-0.01	1	1
10415	1	0	0	1	1
10514	2	1	0.92	2	1
10612	4	2	1.88	4	1
1078	8	3	2.86	8	1
1080	16	4	3.85	16	1
109#  | dbcolmultiscale -S time size
110
111Post-processing:
112To visualize with xgraph:
113  ... | dbcol jittered_index rate | xgraph -nl -p
114To do stats on the rates:
115  ... | dbmultistats -q 4 timescale_index rate
116Or both:
117  ... | dbcolmultiscale_to_gnuplot
118
119END
120# '
121	exit 1;
122}
123
124BEGIN {
125    $dbbindir = "/home/johnh/BIN/DB";
126    $dblibdir = "/home/johnh/BIN/DB";
127    push(@INC, $dblibdir);
128}
129use DbGetopt;
130require "$dblibdir/dblib.pl";
131
132@orig_argv = @ARGV;
133my($prog) = &progname;
134my($smallest_timescale) = 1;
135my($debug) = undef;
136my($sorting_required) = 1;
137my($dbopts) = new DbGetopt("ds:S?", \@ARGV);
138my($ch);
139while ($dbopts->getopt) {
140    $ch = $dbopts->opt;
141    if ($ch eq 's') {
142	$smallest_timescale = $dbopts->optarg;
143    } elsif ($ch eq 'd') {
144	$debug = 1;
145    } elsif ($ch eq 'S') {
146	$sorting_required = 0;
147    } else {
148	&usage;
149    };
150};
151
152&usage if ($#ARGV != 1);
153my($timecol, $datacol) = @ARGV;
154
155# handle sorting, if necessary
156if ($sorting_required) {
157    open(SORTED, "$dbbindir/dbsort -n $timecol |") || die("$prog: cannot run dbsort over input.\n");
158    open(STDIN, "<&SORTED") || die "$0: cannot dup SORTED.\n";
159};
160
161&readprocess_header;
162die ("$prog: unknown column name ``$timecol''.\n") if (!defined($colnametonum{$timecol}));
163my($timef) = $colnametonum{$timecol};
164die ("$prog: unknown column name ``$datacol''.\n") if (!defined($colnametonum{$datacol}));
165my($dataf) = $colnametonum{$datacol};
166
167
168#
169# start the output
170#
171&write_header(qw(period_start timescale timescale_index jittered_index sum rate));
172
173
174my($first_time, $last_time);
175my(@period_start) = undef;
176my(@timescale_duration) = ($smallest_timescale);
177my(@sum) = (0);
178my(@other_sum) = (undef);
179my(@other_period_start) = (undef);
180my($maxscale_summed_i) = 0;  # counts from zero
181
182sub jitter {
183    my($index, $time, $timescale) = @_;
184    my($jitter) = int(($time - $first_time) / $timescale) % 30;
185    return $index + ($jitter - 15) / 100.0;
186}
187
188sub output_timescale {
189    my($index) = @_;
190    &write_these_cols($period_start[$index],
191	$timescale_duration[$index],
192	$index,
193	jitter($index, $period_start[$index], $timescale_duration[$index]),
194	$sum[$index],
195	$sum[$index] / $timescale_duration[$index]);
196};
197
198sub complete_timescale {
199    my($index) = @_;
200
201    # are we finishing the first half or the second?
202    if (!defined($other_sum[$index])) {
203	# first half, just output the record
204	&output_timescale($index);
205	# now start work on the new record
206	$other_sum[$index] = $sum[$index];
207	$other_period_start[$index] = $period_start[$index];
208    } else {
209        # ok, we're doing the 2nd half, which means we have to recurse
210        # output the record
211        &output_timescale($index);
212        # compute the next level
213        $sum[$index+1] = $other_sum[$index] + $sum[$index];
214        $timescale_duration[$index+1] = 2 * $timescale_duration[$index];
215        $period_start[$index+1] = $other_period_start[$index];
216        &complete_timescale($index+1);
217        # now reset our level
218        $other_sum[$index] = undef;
219    };
220    $sum[$index] = 0;
221    $period_start[$index] += $timescale_duration[$index];
222}
223
224
225
226# read data
227while (<STDIN>) {
228    &pass_comments() && next;
229    &split_cols;
230
231    my($time) = $f[$timef];
232    my($data) = $f[$dataf];
233
234    # initialization
235    if (!defined($first_time)) {
236	$first_time = $last_time = $period_start[0] = $time;
237    };
238    die "$0: input is not sorted by time ($time < $last_time)\n"
239        if ($time < $last_time);
240    $last_time = $time;
241    # handle the zero level by hand
242    if ($time - $period_start[0] < $smallest_timescale) {
243	# just more data in the current period
244	$sum[0] += $data;
245	next;
246    };
247    # ...overflowed period, so percolate if necessary
248    #   until the data point is in the current period.
249    while ($time - $period_start[0] >= $smallest_timescale) {
250	&complete_timescale(0);  # percolate
251    };
252    # now initialize the new lowest level
253    $sum[0] = $data;
254};
255
256die "$0: no input\n" if (!defined($first_time));
257
258# Terminte the final entry, assuming it's complete
259# (even though it may not be since we might have missed
260# additional data that would have fitted in that timescale).
261# Note that this only "completes" the lowest timescale
262# and not necessarily any partially complete large timescales.
263# It's not clear that this is any more justified.
264# (But this additional call here does guarantee that if we have
265# exactly 2^n evenly spaced datapoints we get them all---
266# see the dbcolmultiscale_flat test case for an example).
267&complete_timescale(0);
268
269
270# close up shop
271print "#  | $prog ", join(" ", @orig_argv), "\n";
272exit 0;
273
274if (0) {
275   <SORTED>;
276};
277