1#!/usr/bin/perl -w 2 3# settings are located in $HOME/.corpus 4 5use strict; 6use Getopt::Long; 7use File::Path; 8use File::Copy; 9use Time::ParseDate; 10use POSIX qw(nice strftime); 11use Cwd; 12 13our ( $opt_dir, $opt_override, $opt_tag ); 14GetOptions( 15 "tag=s" => \$opt_tag, 16 "dir=s" => \$opt_dir, 17 "override=s" => \$opt_override, 18); 19 20$opt_override ||= ''; 21$opt_tag ||= 'n'; # nightly is the default 22 23nice(15); 24 25# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before 26# the time of day when the mass-check tagging occurs; see 27# http://wiki.apache.org/spamassassin/DateRev for more details. 28use constant DATEREV_ADJ => - (8 * 60 * 60); 29 30# --------------------------------------------------------------------------- 31 32my $configuration = "$ENV{HOME}/.corpus"; 33my %cf; 34 35configure(); 36init(); 37 38if (!$opt_dir) { 39 $opt_dir = $cf{corpus}; 40 update_rsync(); 41} 42 43chdir $opt_dir; 44print "reading logs from '$opt_dir'\n"; 45 46my $linkdir = "$cf{html}/logs"; 47(-d $linkdir) or mkdir $linkdir; 48 49locate_and_link(); 50exit; 51 52sub configure { 53 # does rough equivalent of source 54 open(C, $configuration) || die "open failed: $configuration: $!\n"; 55 my $pwd = Cwd::getcwd; 56 57 # add 'override' options 58 my @lines = (<C>, split(/\|/, $opt_override)); 59 60 foreach $_ (@lines) { 61 chomp; 62 s/#.*//; 63 if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) { 64 my ($key, $val) = ($1, $2); 65 $val =~ s/\$PWD/$pwd/gs; 66 $cf{$key} = $val; 67 } 68 } 69 close(C); 70} 71 72sub init { 73 $ENV{RSYNC_PASSWORD} = $cf{password}; 74 $ENV{TIME} = '%e,%U,%S'; 75 $ENV{TZ} = 'UTC'; 76} 77 78sub update_rsync { 79 chdir $opt_dir; 80 if (!$cf{rsync_command}) { die "no 'rsync_command' set"; } 81 system $cf{rsync_command}; 82 system "chmod +r *.log > /dev/null 2>&1"; 83} 84 85sub locate_and_link { 86 opendir(CORPUS, $opt_dir); 87 my @files = sort readdir(CORPUS); 88 closedir(CORPUS); 89 90 print "Found ", $#files + 1, " files in $opt_dir\n"; 91 92 @files = grep { 93 /^(?:spam|ham)-(?:net-)?[-\w.]+\.log$/ && -f "$opt_dir/$_" && -M _ < 30 94 } @files; 95 96 print "Kept ", $#files + 1, " files\n"; 97 98 foreach my $file (@files) { 99 my $ftime; 100 my $frevision; 101 102 print "Processing $opt_dir/$file\n"; 103 open(FILE, "$opt_dir/$file") or warn "cannot read $opt_dir/$file"; 104 while (my $line = <FILE>) { 105 last if $line !~ /^#/; 106 if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) { 107 my ($datepre, $hh, $datepost) = ($1,$2,$3); 108 109 $ftime = Time::ParseDate::parsedate($datepre.$hh.$datepost, 110 GMT => 1, PREFER_PAST => 1); 111 } 112 elsif ($line =~ m/^# Date:\s*(\S+)/) { 113 # a better way to do the above. TODO: parse it instead 114 } 115 elsif ($line =~ m/^# SVN revision:\s*(\S+)/) { 116 $frevision = $1; 117 } 118 } 119 close(FILE); 120 121 if (!defined $ftime) { 122 warn "$opt_dir/$file: no time found, ignored\n"; next; 123 } 124 if (!defined $frevision) { 125 warn "$opt_dir/$file: no revision found, ignored\n"; next; 126 } 127 if ($frevision eq 'unknown') { 128 warn "$opt_dir/$file: not tagged with a revision, ignored\n"; next; 129 } 130 131 my $daterev = mk_daterev($ftime, $frevision, $opt_tag); 132 link_file($file, $daterev); 133 } 134} 135 136sub mk_daterev { 137 my ($timet, $rev, $tag) = @_; 138 return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag"; 139} 140 141sub link_file { 142 my ($file, $daterev) = @_; 143 144 my $f = "$opt_dir/$file"; 145 146 # /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ 147 my $linkfile = $file; 148 my $dr = $daterev; $dr =~ s/\//-/gs; $linkfile =~ s/\.log$/.$dr.log/i; 149 my $t = "$linkdir/$linkfile"; 150 151 print "ln $f $t\n"; 152 (-f $t) and unlink $t; 153 # cannot hardlink unless we have ownership or RW perms on the file 154 symlink $f, $t or die "cannot ln"; 155 system "/usr/bin/touch -h -r '$f' '$t'"; # preserve modtimes 156} 157 158