1#!/usr/bin/perl -w 2 3# settings are located in $HOME/.corpus 4 5use strict; 6use Getopt::Long; 7 8our ( $corpusdir, $opt_override, $opt_tag ); 9GetOptions( 10 "tag=s" => \$opt_tag, 11 "dir=s" => \$corpusdir, 12 "override=s" => \$opt_override, 13); 14 15$opt_override ||= ''; 16$opt_tag ||= 'n'; # nightly is the default 17 18 19use File::Path; 20use File::Copy; 21use Time::ParseDate; 22use Cwd; 23use POSIX qw(nice strftime); 24 25use constant WEEK => 60*60*24; 26nice(15); 27 28my $configuration = "$ENV{HOME}/.corpus"; 29my %opt; 30my %revision = (); 31my %logs_by_rev = (); 32my %is_net_revision = (); 33my %time = (); 34my %revision_date = (); 35my @files; 36my @tmps = (); 37my $skip = ''; 38my $time_start = time; 39$time_start -= ($time_start % 3600); 40my $output_revpath; 41 42&configure; 43&init; 44 45if ($corpusdir) { 46 print "reading logs from '$corpusdir'\n"; 47} 48else { 49 $corpusdir = $opt{corpus}; 50 &update_rsync; 51} 52 53&locate; 54¤t; 55&clean_up; 56 57sub configure { 58 # does rough equivalent of source 59 open(C, $configuration) || die "open failed: $configuration: $!\n"; 60 my $pwd = getcwd; 61 62 # add 'override' options 63 my @lines = (<C>, split(/\|/, $opt_override)); 64 65 foreach $_ (@lines) { 66 chomp; 67 s/#.*//; 68 if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) { 69 my ($key, $val) = ($1, $2); 70 $val =~ s/\$PWD/$pwd/gs; 71 $opt{$key} = $val; 72 } 73 } 74 close(C); 75} 76 77sub clean_up { 78 system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps); 79} 80 81sub init { 82 $SIG{INT} = \&clean_up; 83 $SIG{TERM} = \&clean_up; 84 85 $ENV{RSYNC_PASSWORD} = $opt{password}; 86 $ENV{TIME} = '%e,%U,%S'; 87 $ENV{TZ} = 'UTC'; 88} 89 90sub update_rsync { 91 chdir $corpusdir; 92 93 # allow non-running of rsync under some circumstances 94 if ($opt{rsync_command}) { 95 system $opt{rsync_command}; 96 } else { 97 system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .'; 98 } 99 100 # this block is no longer required -- we do sensible things with modtime 101 # comparisons to work it out! 102 if (0 && !$opt{always_update_html}) { 103 if (-f "rsync.last") { 104 open(FIND, "find . -type f -newer rsync.last |"); 105 my $files = ""; 106 while(<FIND>) { 107 $files .= $_; 108 } 109 close(FIND); 110 if (! $files) { 111 print STDERR "no new corpus files\n"; 112 if (rand(24) > 1) { 113 exit 0; 114 } 115 else { 116 print STDERR "updating anyway\n"; 117 } 118 } 119 } 120 } 121 122 open(RSYNC, "> rsync.last"); 123 close(RSYNC); 124 system "chmod +r *.log"; 125} 126 127sub locate { 128 # chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses"; 129 130 opendir(CORPUS, $corpusdir); 131 @files = sort readdir(CORPUS); 132 closedir(CORPUS); 133 134 @files = grep { 135 /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10 136 } @files; 137 138 foreach my $file (@files) { 139 my $tag = 0; 140 my $revtime; 141 open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file"; 142 while (my $line = <FILE>) { 143 last if $line !~ /^#/; 144 if ($line =~ m/^# Date:\s*(\S+)/) { 145 my $date_line = $1; 146 my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/; 147 148 my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0", 149 GMT => 1, PREFER_PAST => 1); 150 151 $revtime = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0", 152 GMT => 1, PREFER_PAST => 1); 153 154 $time{$file} = $timet; 155 print "$corpusdir/$file: time=$timet\n"; 156 157 } 158 if ($line =~ m/^# SVN revision:\s*(\S+)/) { 159 my $rev = $1; 160 $revision{$file} = $rev; 161 162 $logs_by_rev{$rev} ||= [ ]; 163 push (@{$logs_by_rev{$rev}}, $file); 164 165 if ($file =~ /-net-/) { 166 $is_net_revision{$rev} = 1; 167 print "$corpusdir/$file: rev=$rev (net)\n"; 168 } 169 else { 170 print "$corpusdir/$file: rev=$rev (non-net)\n"; 171 } 172 } 173 } 174 close(FILE); 175 if ($revtime) { 176 my $rev = $revision{$file}; 177 $revision_date{$rev} = $revtime unless defined $revision_date{$rev}; 178 179 if ($revtime < $revision_date{$rev}) { 180 $revision_date{$rev} = $revtime; 181 } 182 } 183 } 184} 185 186sub sort_all { 187 my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); 188 my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); 189 $a1 =~ s/^[\+\-]//; 190 $b1 =~ s/^[\+\-]//; 191 192 my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || '')); 193 if ($a1 =~ /^OVERALL/) { $n -= 1000; } 194 elsif ($a1 =~ /^\(all messages\)/) { $n -= 100; } 195 elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; } 196 if ($b1 =~ /^OVERALL/) { $n += 1000; } 197 elsif ($b1 =~ /^\(all messages\)/) { $n += 100; } 198 elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; } 199 return $n; 200} 201 202sub time_filter { 203 my ($target, $after, $before) = @_; 204 if (/time=(\d+)/) { 205 return (($target - $1 >= WEEK * $after) && 206 ($target - $1 < WEEK * $before)); 207 } 208 return 0; 209} 210 211sub current { 212 my $classes = $opt{output_classes}; 213 $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age"; 214 215 foreach my $entry (split(' ', $classes)) { 216 $entry =~ /^(\S+)\.(\S+)$/; 217 my $class = $1; 218 my $age = $2; 219 if (!$age) { warn "no age in $entry"; next; } 220 221 foreach my $rev (sort keys %logs_by_rev) { 222 next if ($rev eq 'unknown'); 223 224 if ($class =~ /NET/) { 225 next unless $is_net_revision{$rev}; 226 } 227 228 gen_class ($rev, $class, $age); 229 } 230 } 231} 232 233sub gen_class { 234 my ($rev, $class, $age) = @_; 235 236 print STDERR "\ngenerating r$rev $class.$age:\n"; 237 238 next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/); 239 240 my @ham = grep { /^ham/ } @{$logs_by_rev{$rev}}; 241 my @spam = grep { /^spam/ } @{$logs_by_rev{$rev}}; 242 243 print STDERR "input h: " . join(' ', @ham) . "\n"; 244 print STDERR "input s: " . join(' ', @spam) . "\n"; 245 246 chdir $corpusdir; 247 248 # net vs. local 249 if ($class eq "NET") { 250 @ham = grep { /-net-/ } @ham; 251 @spam = grep { /-net-/ } @spam; 252 } 253 else { 254 # if both net and local exist, use newer 255 my %spam; 256 my %ham; 257 258 for my $file (@spam) { 259 $spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/); 260 } 261 for my $file (@ham) { 262 $ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/); 263 } 264 while (my ($user, $count) = each %ham) { 265 if ($count > 1) { 266 my $nightly = "ham-$user.log"; 267 my $weekly = "ham-net-$user.log"; 268 if ($revision{$nightly} >= $revision{$weekly}) { 269 @ham = grep { $_ ne $weekly } @ham; 270 } 271 else { 272 @ham = grep { $_ ne $nightly } @ham; 273 } 274 } 275 } 276 while (my ($user, $count) = each %spam) { 277 if ($count > 1) { 278 my $nightly = "spam-$user.log"; 279 my $weekly = "spam-net-$user.log"; 280 if ($revision{$nightly} >= $revision{$weekly}) { 281 @spam = grep { $_ ne $weekly } @spam; 282 } 283 else { 284 @spam = grep { $_ ne $nightly } @spam; 285 } 286 } 287 } 288 } 289 290 # age 291 if ($age =~ /(\d+)day/) { 292 my $mtime = $1; 293 @ham = grep { -M "$_" < $mtime } @ham; 294 @spam = grep { -M "$_" < $mtime } @spam; 295 } 296 elsif ($class ne 'NET' && $age =~ /^(?:new|all|age)$/) 297 { 298 # just ignore the tagtime stuff; since we now may be 299 # dealing with multiple mass-checks per day, just use svn rev data 300 # my $tt = (-M $opt{tagtime}); 301 # @ham = grep { !defined($tt) || ((-M "$_") < $tt) } @ham; 302 # @spam = grep { !defined($tt) || ((-M "$_") < $tt) } @spam; 303 } 304 305 print STDERR "selected h: " . join(' ', @ham) . "\n"; 306 print STDERR "selected s: " . join(' ', @spam) . "\n"; 307 308 # we cannot continue if we have no files that match the criteria... 309 # demand at least 1 ham and 1 spam file 310 if (scalar @spam <= 0 || scalar @ham <= 0) { 311 warn "not enough files found matching criteria ($rev $class $age)\n"; 312 return; 313 } 314 315 my $crev_time = $revision_date{$rev}; 316 my $dir = create_outputdir($rev, $crev_time); 317 318 my $fname = "$dir/$class.$age"; 319 320 # Look through corpus for files that have been added since last full build 321 # Update all class files on this 322 323 my $buildfile = "$dir/.buildtime"; 324 my $last_build = 0; 325 my $needs_rebuild = 0; 326 327 if (-f $buildfile) { 328 open(BFILE, "$buildfile") or warn "cannot read $buildfile"; 329 while (my $line = <BFILE>) { 330 last if $line !~ /^#/; 331 if ($line =~ m/^# BuildTime:\s*(\S+)/) { 332 $last_build = $1; 333 } 334 } 335 close(BFILE); 336 } 337 338 if ( !(-f $fname) || !$last_build || $last_build == $time_start) { 339 # No last build or we've already done the loop below 340 $needs_rebuild = 1; 341 } else { 342 foreach my $srcfile (@spam, @ham) { 343 my $file_time = (stat($srcfile))[9]; 344 if ($file_time >= $last_build) { 345 $needs_rebuild = 1; 346 last; 347 } 348 } 349 } 350 351 if (!$needs_rebuild) { 352 print "last buildtime is fresher than sources\n"; 353 return; 354 } 355 356 if ($last_build != $time_start) { 357 open(BFILE, "> $buildfile") or warn "cannot write to $buildfile"; 358 print BFILE "# BuildTime: $time_start\n"; 359 close(BFILE); 360 } 361 362 my $when = scalar localtime time; 363 print qq{creating: $fname 364 started $when... 365 }; 366 my $bytes = 0; 367 368 if ($class eq 'LOGS') { 369 foreach my $f (@ham, @spam) { 370 $f =~ s/[^-\._A-Za-z0-9]+/_/gs; # sanitize! 371 my $zf = "$fname-$f.gz"; 372 373 system("pigz -c < $f > $zf.$$"); 374 if ($? >> 8 != 0) { 375 warn "pigz -c < $f > $zf.$$ failed"; 376 } 377 378 rename("$zf.$$", $zf) or 379 warn "cannot rename $zf.$$ to $zf"; 380 $bytes += (-s $zf); 381 } 382 my $tmpfname = "$fname.$$"; 383 open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname"; 384 print OUT "# $$ \n"; 385 close(OUT); 386 rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname"; 387 } 388 else { 389 my $tmpfname = "$fname.$$"; 390 391 open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname"; 392 print OUT "# ham results used for $rev $class $age: " . join(" ", @ham) . "\n"; 393 print OUT "# spam results used for $rev $class $age: " . join(" ", @spam) . "\n"; 394 for (@ham) { 395 print OUT "# $_ was at r$revision{$_}\n"; 396 } 397 for (@spam) { 398 print OUT "# $_ was at r$revision{$_}\n"; 399 } 400 401 push (@tmps, $tmpfname); 402 403 my $flags = ""; 404 $flags = "-t net -s 1" if $class eq "NET"; 405 $flags = "-M HTML_MESSAGE" if $class eq "HTML"; 406 $flags = "-o" if $class eq "OVERLAP"; 407 $flags = "-S" if $class eq "SCOREMAP"; 408 if ($opt{rules_dir}) { 409 $flags .= " -c '$opt{rules_dir}'"; 410 } 411 412 if ($age eq "all") { 413 my %spam; 414 my %ham; 415 my @output; 416 417 for my $file (@spam) { 418 $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/); 419 } 420 for my $file (@ham) { 421 $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/); 422 } 423 unlink "$opt{tmp}/ham.log.$$"; 424 unlink "$opt{tmp}/spam.log.$$"; 425 426 if (scalar keys %spam <= 0 || scalar keys %ham <= 0) { 427 warn "no files found for $class.$age"; 428 return; 429 } 430 431 chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses"; 432 for my $user (sort keys %spam) { 433 next unless $ham{$user}; 434 system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$"); 435 system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$"); 436 open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |"); 437 while(<IN>) { 438 chomp; 439 push @output, "$_:$user\n"; 440 } 441 close(IN); 442 } 443 open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); 444 while(<IN>) { 445 push @output, $_; 446 } 447 close(IN); 448 for (sort sort_all @output) { print OUT; } 449 } 450 elsif ($age eq "age") { 451 my @output; 452 453 for my $which (("0-1", "1-2", "2-3", "3-6")) { 454 my ($after, $before) = split(/-/, $which); 455 # get and filter logs 456 chdir $corpusdir; 457 for my $type (("ham", "spam")) { 458 open(TMP, "> $opt{tmp}/$type.log.$$"); 459 my @array = ($type eq "ham") ? @ham : @spam; 460 for my $file (@array) { 461 open(IN, $file) or warn "cannot read $file"; 462 while (<IN>) { 463 print TMP $_ if time_filter($crev_time, $after, $before); 464 } 465 close(IN); 466 } 467 close (TMP); 468 } 469 # print out by age 470 chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses"; 471 open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); 472 while(<IN>) { 473 chomp; 474 push @output, "$_:$which\n"; 475 } 476 close(IN); 477 } 478 for (sort sort_all @output) { print OUT; } 479 } 480 elsif (@ham && @spam) { 481 # get logs 482 system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$"); 483 system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$"); 484 485 chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses"; 486 open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); 487 while(<IN>) { print(OUT); } 488 close(IN); 489 } 490 491 $bytes = (-s OUT); 492 close(OUT); 493 rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname"; 494 } 495 496 $when = scalar localtime time; 497 print qq{created: $bytes bytes, finished at $when 498URL: 499 500 http://buildbot.spamassassin.org/ruleqa?daterev=$output_revpath 501 502}; 503 504} 505 506sub create_outputdir { 507 my ($rev, $time) = @_; 508 my $revpath = strftime("%Y%m%d", gmtime($time)) . "/r$rev-$opt_tag"; 509 my $dir = $opt{html} .'/'. $revpath; 510 511 # print "output dir: $dir\n"; 512 if (!-d $dir) { 513 my $prevu = umask 0; 514 mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir"; 515 umask $prevu; 516 } 517 518 $output_revpath = $revpath; # set the global 519 $output_revpath =~ s/\//-/; # looks nicer 520 521 return $dir; 522} 523 524