1#!/usr/bin/perl -w
2
3# settings are located in $HOME/.corpus
4
5use strict;
6use Getopt::Long;
7
8our ( $corpusdir, $opt_override, $opt_tag );
9GetOptions(
10    "tag=s" => \$opt_tag,
11    "dir=s" => \$corpusdir,
12    "override=s" => \$opt_override,
13);
14
15$opt_override ||= '';
16$opt_tag ||= 'n';       # nightly is the default
17
18
19use File::Path;
20use File::Copy;
21use Time::ParseDate;
22use Cwd;
23use POSIX qw(nice strftime);
24
25use constant WEEK => 60*60*24;
26nice(15);
27
28my $configuration = "$ENV{HOME}/.corpus";
29my %opt;
30my %revision = ();
31my %logs_by_rev = ();
32my %is_net_revision = ();
33my %time = ();
34my %revision_date = ();
35my @files;
36my @tmps = ();
37my $skip = '';
38my $time_start = time;
39$time_start -= ($time_start % 3600);
40my $output_revpath;
41
42&configure;
43&init;
44
45if ($corpusdir) {
46  print "reading logs from '$corpusdir'\n";
47}
48else {
49  $corpusdir = $opt{corpus};
50  &update_rsync;
51}
52
53&locate;
54&current;
55&clean_up;
56
57sub configure {
58  # does rough equivalent of source
59  open(C, $configuration) || die "open failed: $configuration: $!\n";
60  my $pwd = getcwd;
61
62  # add 'override' options
63  my @lines = (<C>, split(/\|/, $opt_override));
64
65  foreach $_ (@lines) {
66	chomp;
67	s/#.*//;
68	if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
69          my ($key, $val) = ($1, $2);
70          $val =~ s/\$PWD/$pwd/gs;
71	  $opt{$key} = $val;
72	}
73  }
74  close(C);
75}
76
77sub clean_up {
78  system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps);
79}
80
81sub init {
82  $SIG{INT} = \&clean_up;
83  $SIG{TERM} = \&clean_up;
84
85  $ENV{RSYNC_PASSWORD} = $opt{password};
86  $ENV{TIME} = '%e,%U,%S';
87  $ENV{TZ} = 'UTC';
88}
89
90sub update_rsync {
91  chdir $corpusdir;
92
93  # allow non-running of rsync under some circumstances
94  if ($opt{rsync_command}) {
95    system $opt{rsync_command};
96  } else {
97    system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .';
98  }
99
100  # this block is no longer required -- we do sensible things with modtime
101  # comparisons to work it out!
102  if (0 && !$opt{always_update_html}) {
103    if (-f "rsync.last") {
104      open(FIND, "find . -type f -newer rsync.last |");
105      my $files = "";
106      while(<FIND>) {
107        $files .= $_;
108      }
109      close(FIND);
110      if (! $files) {
111        print STDERR "no new corpus files\n";
112        if (rand(24) > 1) {
113          exit 0;
114        }
115        else {
116          print STDERR "updating anyway\n";
117        }
118      }
119    }
120  }
121
122  open(RSYNC, "> rsync.last");
123  close(RSYNC);
124  system "chmod +r *.log";
125}
126
127sub locate {
128  # chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
129
130  opendir(CORPUS, $corpusdir);
131  @files = sort readdir(CORPUS);
132  closedir(CORPUS);
133
134  @files = grep {
135    /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
136  } @files;
137
138  foreach my $file (@files) {
139    my $tag = 0;
140    my $revtime;
141    open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
142    while (my $line = <FILE>) {
143      last if $line !~ /^#/;
144      if ($line =~ m/^# Date:\s*(\S+)/) {
145        my $date_line = $1;
146        my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
147
148        my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
149                  GMT => 1, PREFER_PAST => 1);
150
151        $revtime = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
152                  GMT => 1, PREFER_PAST => 1);
153
154        $time{$file} = $timet;
155        print "$corpusdir/$file: time=$timet\n";
156
157      }
158      if ($line =~ m/^# SVN revision:\s*(\S+)/) {
159        my $rev = $1;
160        $revision{$file} = $rev;
161
162        $logs_by_rev{$rev} ||= [ ];
163        push (@{$logs_by_rev{$rev}}, $file);
164
165        if ($file =~ /-net-/) {
166          $is_net_revision{$rev} = 1;
167          print "$corpusdir/$file: rev=$rev (net)\n";
168        }
169        else {
170          print "$corpusdir/$file: rev=$rev (non-net)\n";
171        }
172      }
173    }
174    close(FILE);
175    if ($revtime) {
176      my $rev = $revision{$file};
177      $revision_date{$rev} = $revtime unless defined $revision_date{$rev};
178
179      if ($revtime < $revision_date{$rev}) {
180        $revision_date{$rev} = $revtime;
181      }
182    }
183  }
184}
185
186sub sort_all {
187  my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
188  my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
189  $a1 =~ s/^[\+\-]//;
190  $b1 =~ s/^[\+\-]//;
191
192  my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
193  if ($a1 =~ /^OVERALL/)			{ $n -= 1000; }
194  elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; }
195  elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; }
196  if ($b1 =~ /^OVERALL/)			{ $n += 1000; }
197  elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; }
198  elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; }
199  return $n;
200}
201
202sub time_filter {
203  my ($target, $after, $before) = @_;
204  if (/time=(\d+)/) {
205	return (($target - $1 >= WEEK * $after) &&
206		($target - $1 < WEEK * $before));
207  }
208  return 0;
209}
210
211sub current {
212  my $classes = $opt{output_classes};
213  $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age";
214
215  foreach my $entry (split(' ', $classes)) {
216    $entry =~ /^(\S+)\.(\S+)$/;
217    my $class = $1;
218    my $age = $2;
219    if (!$age) { warn "no age in $entry"; next; }
220
221    foreach my $rev (sort keys %logs_by_rev) {
222      next if ($rev eq 'unknown');
223
224      if ($class =~ /NET/) {
225        next unless $is_net_revision{$rev};
226      }
227
228      gen_class ($rev, $class, $age);
229    }
230  }
231}
232
233sub gen_class {
234  my ($rev, $class, $age) = @_;
235
236  print STDERR "\ngenerating r$rev $class.$age:\n";
237
238  next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
239
240  my @ham = grep { /^ham/ } @{$logs_by_rev{$rev}};
241  my @spam = grep { /^spam/ } @{$logs_by_rev{$rev}};
242
243  print STDERR "input h: " . join(' ', @ham) . "\n";
244  print STDERR "input s: " . join(' ', @spam) . "\n";
245
246  chdir $corpusdir;
247
248  # net vs. local
249  if ($class eq "NET") {
250    @ham = grep { /-net-/ } @ham;
251    @spam = grep { /-net-/ } @spam;
252  }
253  else {
254    # if both net and local exist, use newer
255    my %spam;
256    my %ham;
257
258    for my $file (@spam) {
259      $spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
260    }
261    for my $file (@ham) {
262      $ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
263    }
264    while (my ($user, $count) = each %ham) {
265      if ($count > 1) {
266        my $nightly = "ham-$user.log";
267        my $weekly = "ham-net-$user.log";
268        if ($revision{$nightly} >= $revision{$weekly}) {
269          @ham = grep { $_ ne $weekly } @ham;
270        }
271        else {
272          @ham = grep { $_ ne $nightly } @ham;
273        }
274      }
275    }
276    while (my ($user, $count) = each %spam) {
277      if ($count > 1) {
278        my $nightly = "spam-$user.log";
279        my $weekly = "spam-net-$user.log";
280        if ($revision{$nightly} >= $revision{$weekly}) {
281          @spam = grep { $_ ne $weekly } @spam;
282        }
283        else {
284          @spam = grep { $_ ne $nightly } @spam;
285        }
286      }
287    }
288  }
289
290  # age
291  if ($age =~ /(\d+)day/) {
292    my $mtime = $1;
293    @ham = grep { -M "$_" < $mtime } @ham;
294    @spam = grep { -M "$_" < $mtime } @spam;
295  }
296  elsif ($class ne 'NET' && $age =~ /^(?:new|all|age)$/)
297  {
298    # just ignore the tagtime stuff; since we now may be
299    # dealing with multiple mass-checks per day, just use svn rev data
300    # my $tt = (-M $opt{tagtime});
301    # @ham = grep { !defined($tt) || ((-M "$_") < $tt) } @ham;
302    # @spam = grep { !defined($tt) || ((-M "$_") < $tt) } @spam;
303  }
304
305  print STDERR "selected h: " . join(' ', @ham) . "\n";
306  print STDERR "selected s: " . join(' ', @spam) . "\n";
307
308  # we cannot continue if we have no files that match the criteria...
309  # demand at least 1 ham and 1 spam file
310  if (scalar @spam <= 0 || scalar @ham <= 0) {
311    warn "not enough files found matching criteria ($rev $class $age)\n";
312    return;
313  }
314
315  my $crev_time = $revision_date{$rev};
316  my $dir = create_outputdir($rev, $crev_time);
317
318  my $fname = "$dir/$class.$age";
319
320  # Look through corpus for files that have been added since last full build
321  # Update all class files on this
322
323  my $buildfile = "$dir/.buildtime";
324  my $last_build = 0;
325  my $needs_rebuild = 0;
326
327  if (-f $buildfile) {
328    open(BFILE, "$buildfile") or warn "cannot read $buildfile";
329    while (my $line = <BFILE>) {
330      last if $line !~ /^#/;
331      if ($line =~ m/^# BuildTime:\s*(\S+)/) {
332        $last_build = $1;
333      }
334    }
335    close(BFILE);
336  }
337
338  if ( !(-f $fname) || !$last_build || $last_build == $time_start) {
339    # No last build or we've already done the loop below
340    $needs_rebuild = 1;
341  } else {
342    foreach my $srcfile (@spam, @ham) {
343      my $file_time = (stat($srcfile))[9];
344      if ($file_time >= $last_build) {
345        $needs_rebuild = 1;
346        last;
347      }
348    }
349  }
350
351  if (!$needs_rebuild) {
352    print "last buildtime is fresher than sources\n";
353    return;
354  }
355
356  if ($last_build != $time_start) {
357    open(BFILE, "> $buildfile") or warn "cannot write to $buildfile";
358    print BFILE "# BuildTime: $time_start\n";
359    close(BFILE);
360  }
361
362  my $when = scalar localtime time;
363  print qq{creating: $fname
364  started $when...
365  };
366  my $bytes = 0;
367
368  if ($class eq 'LOGS') {
369    foreach my $f (@ham, @spam) {
370      $f =~ s/[^-\._A-Za-z0-9]+/_/gs;    # sanitize!
371      my $zf = "$fname-$f.gz";
372
373      system("pigz -c < $f > $zf.$$");
374      if ($? >> 8 != 0) {
375        warn "pigz -c < $f > $zf.$$ failed";
376      }
377
378      rename("$zf.$$", $zf) or
379                    warn "cannot rename $zf.$$ to $zf";
380      $bytes += (-s $zf);
381    }
382    my $tmpfname = "$fname.$$";
383    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
384    print OUT "# $$ \n";
385    close(OUT);
386    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
387  }
388  else {
389    my $tmpfname = "$fname.$$";
390
391    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
392    print OUT "# ham results used for $rev $class $age: " . join(" ", @ham) . "\n";
393    print OUT "# spam results used for $rev $class $age: " . join(" ", @spam) . "\n";
394    for (@ham) {
395      print OUT "# $_ was at r$revision{$_}\n";
396    }
397    for (@spam) {
398      print OUT "# $_ was at r$revision{$_}\n";
399    }
400
401    push (@tmps, $tmpfname);
402
403    my $flags = "";
404    $flags = "-t net -s 1" if $class eq "NET";
405    $flags = "-M HTML_MESSAGE" if $class eq "HTML";
406    $flags = "-o" if $class eq "OVERLAP";
407    $flags = "-S" if $class eq "SCOREMAP";
408    if ($opt{rules_dir}) {
409      $flags .= " -c '$opt{rules_dir}'";
410    }
411
412    if ($age eq "all") {
413      my %spam;
414      my %ham;
415      my @output;
416
417      for my $file (@spam) {
418        $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
419      }
420      for my $file (@ham) {
421        $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
422      }
423      unlink "$opt{tmp}/ham.log.$$";
424      unlink "$opt{tmp}/spam.log.$$";
425
426      if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
427        warn "no files found for $class.$age";
428        return;
429      }
430
431      chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
432      for my $user (sort keys %spam) {
433        next unless $ham{$user};
434        system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
435        system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
436        open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
437        while(<IN>) {
438          chomp;
439          push @output, "$_:$user\n";
440        }
441        close(IN);
442      }
443      open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
444      while(<IN>) {
445        push @output, $_;
446      }
447      close(IN);
448      for (sort sort_all @output) { print OUT; }
449    }
450    elsif ($age eq "age") {
451      my @output;
452
453      for my $which (("0-1", "1-2", "2-3", "3-6")) {
454        my ($after, $before) = split(/-/, $which);
455        # get and filter logs
456        chdir $corpusdir;
457        for my $type (("ham", "spam")) {
458          open(TMP, "> $opt{tmp}/$type.log.$$");
459          my @array = ($type eq "ham") ? @ham : @spam;
460          for my $file (@array) {
461            open(IN, $file) or warn "cannot read $file";
462            while (<IN>) {
463              print TMP $_ if time_filter($crev_time, $after, $before);
464            }
465            close(IN);
466          }
467          close (TMP);
468        }
469        # print out by age
470        chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
471        open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
472        while(<IN>) {
473          chomp;
474          push @output, "$_:$which\n";
475        }
476        close(IN);
477      }
478      for (sort sort_all @output) { print OUT; }
479    }
480    elsif (@ham && @spam) {
481      # get logs
482      system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
483      system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
484
485      chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
486      open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
487      while(<IN>) { print(OUT); }
488      close(IN);
489    }
490
491    $bytes = (-s OUT);
492    close(OUT);
493    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
494  }
495
496  $when = scalar localtime time;
497  print qq{created: $bytes bytes, finished at $when
498URL:
499
500  http://buildbot.spamassassin.org/ruleqa?daterev=$output_revpath
501
502};
503
504}
505
506sub create_outputdir {
507  my ($rev, $time) = @_;
508  my $revpath = strftime("%Y%m%d", gmtime($time)) . "/r$rev-$opt_tag";
509  my $dir = $opt{html} .'/'. $revpath;
510
511  # print "output dir: $dir\n";
512  if (!-d $dir) {
513    my $prevu = umask 0;
514    mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
515    umask $prevu;
516  }
517
518  $output_revpath = $revpath;       # set the global
519  $output_revpath =~ s/\//-/;       # looks nicer
520
521  return $dir;
522}
523
524