1#!/usr/bin/perl -w 2use strict; 3use MIME::Base64; 4 5# 6# <@LICENSE> 7# Licensed to the Apache Software Foundation (ASF) under one or more 8# contributor license agreements. See the NOTICE file distributed with 9# this work for additional information regarding copyright ownership. 10# The ASF licenses this file to you under the Apache License, Version 2.0 11# (the "License"); you may not use this file except in compliance with 12# the License. You may obtain a copy of the License at: 13# 14# http://www.apache.org/licenses/LICENSE-2.0 15# 16# Unless required by applicable law or agreed to in writing, software 17# distributed under the License is distributed on an "AS IS" BASIS, 18# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19# See the License for the specific language governing permissions and 20# limitations under the License. 21# </@LICENSE> 22 23sub aidbg; 24 25sub usage { 26 my $status = shift; 27 28 my $out = $status ? \*STDERR : \*STDOUT; 29 print $out <<EOF; 30usage: mass-check [options] target ... 31 32 -c=file set configuration/rules directory 33 -p=dir set user-prefs directory 34 -f=file read list of targets from <file> 35 -j=jobs specify the number of processes to run simultaneously 36 --net turn on network checks 37 --mid report Message-ID from each message 38 --debug[=LIST] report debugging information (default is all facilities, LIST 39 is a comma-separated list of facilities) 40 --rewrite=OUT save rewritten message to OUT (default is /tmp/out) 41 --rules=RE Only test rules matching the given regexp RE 42 --restart=N restart all of the children after processing N messages 43 --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they 44 were encapsulated by servers matching the regexp RE 45 (default = extract all SpamAssassin-encapsulated mails) 46 --lint check rules for syntax before running 47 --cf='config line' Additional line of configuration 48 --pre='config line' Additional line of ".pre" (prepended to configuration) 49 --run_post_scan='command' Run the named command after the 'scan' stage, 50 before starting the 'run' stage 51 52 verbosity options 53 --progress show progress updates during check 54 --noisy show noisier progress updates during check 55 --showdots print a dot for each scanned message 56 57 client/server mode options 58 --server=host:port 59 use server mode, running on the given hostname and port 60 --client=host:port 61 use client mode, connecting to the given hostname and port 62 --cs_conn_retries=N 63 only used in client mode. set the number of times to retry 64 the initial connection to the server, while waiting 60 65 seconds between connection attempts, default is 60 retries 66 --cs_max=N 67 at most, only ever request (client)/give out (server) a 68 maximum of N messages (defaults to 1000) 69 --cs_timeout=N 70 in client mode, try to connect to the server every N seconds 71 defaults to 120 72 in server mode, timeout messages after N seconds 73 defaults to 300 74 --cs_paths_only 75 only used in client mode. when making requests of the 76 server, only ask for paths to the messages and not the 77 messages themselves. useful when the client and server 78 have the same paths to the corpus data. 79 --cs_ssl use SSL to encrypt on-the-wire client-server traffic 80 (requires IO::Socket::SSL, see 81 http://wiki.apache.org/spamassassin/SslMassCheck for more) 82 --cs_verbose Log network bandwidth utilization figures and other statistics 83 --cs_schedule_cache 84 Distribute messages so that they are checked on clients who 85 have the messages in their local message cache, implies 86 --cs_cache; if --cs_schedule_cache is not enabled, but 87 --cs_cache is, clients running in --cs_paths_only mode will 88 opportunistically use the messages from their local cache 89 --cs_cache in client mode, enable the local message cache 90 in server mode, allows the clients to use cached messages 91 and/or add to their local caches 92 --cs_cachedir=dir 93 write cache info for --cs_cache in this directory tree 94 --cs_max_tries=N 95 maximum number of attempts to have a client scan a message 96 defaults to 3 97 98 log options 99 -o write all logs to stdout 100 --loghits log the text hit for patterns (useful for debugging) 101 --loguris log the URIs found 102 --logmem log the memory delta (only on Linux) 103 --hamlog=log use <log> as ham log ('ham.log' is default) 104 --spamlog=log use <log> as spam log ('spam.log' is default) 105 106 message selection options 107 -n no date sorting or spam/ham interleaving 108 --cache use cache information when selecting messages 109 --cachedir=dir write cache info for --cache in this directory tree 110 --all don't skip big messages 111 112 message selection options, can be specified for each target 113 --after=N only test mails received after time_t N (negative values 114 are an offset from current time, e.g. -86400 = last day) 115 or after date as parsed by Time::ParseDate (e.g. '-6 months') 116 --before=N same as --after, except received times are before time_t N 117 --scanprob=N probability of scanning a message, range 0.0 - 1.0 (default: 1.0) 118 119 message selection options, can be specified for each target class 120 --head=N only check first N ham and N spam (N messages if -n used) 121 --tail=N only check last N ham and N spam (N messages if -n used) 122 123 simple target options (implies -o and no ham/spam classification) 124 --dir subsequent targets are directories 125 --file subsequent targets are files in RFC 822 format 126 --mbox subsequent targets are mbox files 127 --mbx subsequent targets are mbx files 128 129 Just left over functions we should remove at some point: 130 --bayes report score from Bayesian classifier 131 132 options used during score generation process 133 --learn=N learn N% of messages as spam or ham 134 --reuse reuse network checks if X-Spam-Status: is present in messages 135 (note: both clients and servers in c/s mode need this) 136 137 non-option arguments are used as target names (mail files and folders), 138 the target format is: <class>:<format>:<location> 139 <class> is "spam" or "ham" 140 <format> is "dir", "file", "mbx", "mbox", or "detect" 141 <location> is a file or directory name. globbing of ~ and * is supported 142 143(see http://wiki.apache.org/spamassassin/MassCheck for more details.) 144 145EOF 146 exit($status); 147} 148 149########################################################################### 150 151our ($opt_c, $opt_p, $opt_f, $opt_j, $opt_n, $opt_o, $opt_all, $opt_bayes, 152 $opt_debug, $opt_format, $opt_hamlog, $opt_head, $opt_loghits, 153 $opt_mid, $opt_net, $opt_nosort, $opt_progress, $opt_showdots, 154 $opt_spamlog, $opt_tail, $opt_rules, $opt_restart, $opt_loguris, 155 $opt_logmem, $opt_after, $opt_before, $opt_rewrite, $opt_deencap, 156 $opt_learn, $opt_reuse, $opt_lint, $opt_cache, $opt_noisy, $opt_cf, 157 $total_messages, $statusevery, $opt_cachedir, $opt_scanprob, 158 $opt_client, $opt_cs_conn_retries, $opt_cs_max, $opt_cs_timeout, 159 $opt_cs_paths_only, $opt_server, %postdata, %real, $svn_revision, 160 $opt_cs_ssl, $opt_run_post_scan, $opt_cs_verbose, %client_caches, 161 %server_caches, @cache_tmp_files, %min_other_caches, 162 %unique_cache_completed, $opt_cs_schedule_cache, $opt_cs_cache, 163 $opt_cs_cachedir, $opt_cs_max_tries, $opt_pre, 164 $tmpfd); 165 166use FindBin; 167 168# use "blib" so that we can use e.g. @@LOCAL_STATE_DIR@@; "lib" doesn't 169# have that stuff substituted :( use lib too, though, as a backup, 170# since some users might be running mass-check without "make" first 171use lib "$FindBin::Bin/../lib"; 172use lib "$FindBin::Bin/../blib/lib"; 173 174use IO::Select; 175use IO::Socket; 176use Socket qw(); 177use Mail::SpamAssassin::ArchiveIterator; 178use Mail::SpamAssassin; 179use Mail::SpamAssassin::Logger; 180use File::Copy; 181use File::Spec; 182use Getopt::Long; 183use POSIX qw(strftime); 184use Fcntl qw(O_RDWR O_CREAT);; 185use Config; 186 187use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; }; 188use constant HAS_IO_ZLIB => eval { require IO::Zlib; }; 189use constant HAS_IO_SOCKET_SSL => eval { require IO::Socket::SSL; }; 190use constant HAS_TIME_HI_RES => eval { require Time::HiRes; }; 191use constant HAS_SDBM_FILE => eval { require SDBM_File; }; 192 193 194# default settings 195$opt_c = "$FindBin::Bin/../rules"; 196$opt_p = "$FindBin::Bin/spamassassin"; 197$opt_j = 1; 198$opt_head = 0; 199$opt_tail = 0; 200$opt_net = 0; 201$opt_hamlog = "ham.log"; 202$opt_spamlog = "spam.log"; 203$opt_learn = 0; 204$opt_cf = []; 205$opt_pre = []; 206 207my $rcvd_bytes = 0; 208my $sent_bytes = 0; 209my $t_first_msg = 0; 210my $t_last_msg = 0; 211my $msgs_processed = 0; 212my $failed_msgs = 0; 213my $cache_hits = 0; 214my $client_id = 0; 215 216my @ORIG_ARGV = @ARGV; 217GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug:s", 218 "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net", 219 "progress!", "rewrite:s", "showdots", "spamlog=s", "tail=i", 220 "rules=s", "restart=i", "loguris", "run_post_scan=s", 221 "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache", 222 "cachedir=s", "noisy", "scanprob=f", 223 "server=s", "cs_max=i", "cs_timeout=i", "cs_conn_retries=i", 224 "cs_paths_only", "client=s", "cs_ssl", "cs_verbose", 225 "cs_schedule_cache", "cs_cache", "cs_cachedir=s", "cs_max_tries=i", 226 "before=s" => \&deal_with_before_after, 227 "after=s" => \&deal_with_before_after, 228 "cf=s" => \@{$opt_cf}, 229 "pre=s" => \@{$opt_pre}, 230 "dir" => sub { $opt_format = "dir"; }, 231 "file" => sub { $opt_format = "file"; }, 232 "mbox" => sub { $opt_format = "mbox"; }, 233 "mbx" => sub { $opt_format = "mbx"; }, 234 "help" => sub { usage(0); }, 235 '<>' => \&target) or usage(1); 236 237# We need IO::Zlib for client-server mode! 238if ( ($opt_client || $opt_server) && ! HAS_IO_ZLIB ) { 239 die "IO::Zlib required for client/server mode!\n"; 240} 241 242if ($opt_cs_ssl && ! HAS_IO_SOCKET_SSL ) { 243 die "IO::Socket::SSL required for --cs_ssl!\n"; 244} 245 246if ($opt_noisy) { 247 $opt_progress = 1; # implies --progress 248} 249 250$opt_debug ||= 'all' if defined $opt_debug; 251 252if ($opt_cs_schedule_cache) { 253 $opt_cs_cache = 1; # implies --cs_cache 254} 255 256if ($opt_client && $opt_cs_cache && !$opt_cs_cachedir) { 257 warn "You must specify a local message cache directory with --cs_cachedir\n". 258 "when using the --cs_cache option.\n"; 259 exit; 260} 261 262if ($opt_server && $opt_cs_schedule_cache && !$opt_n) { 263 warn '*'x74 ."\n"; 264 warn '*'. ' 'x72 ."*\n"; 265 warn "* Corpus will be run un-sorted but with date stamp loggging which is *\n"; 266 warn "* needed for score generation log selection but is not available when *\n"; 267 warn "* using the -n option. This may affect the results of any bayes and *\n"; 268 warn "* AWL tests run during this mass-check. *\n"; 269 warn '*'. ' 'x72 ."*\n"; 270 warn '*'x74 ."\n"; 271} 272 273if ($opt_cs_schedule_cache && !HAS_SDBM_FILE) { 274 warn "--cs_schedule_cache requires the Perl module SDBM_File.\n"; 275 exit; 276} 277 278# --lint 279if ($opt_lint) { 280 # In theory we could probably use the same spamtest object as below, 281 # but since it's probably not expecting that, and we don't want 282 # strange things happening, create a local object. 283 my $spamlint = create_spamtest(); 284 $spamlint->debug_diagnostics(); 285 my $res = $spamlint->lint_rules(); 286 $spamlint->finish(); 287 if ($res) { 288 warn "lint: $res issues detected, ". 289 "please rerun with debug enabled for more information\n"; 290 exit 1; 291 } 292} 293 294# test messages for the mass-check 295my @targets; 296if (!$opt_client) { 297 if ($opt_f) { 298 open(F, $opt_f) || die "cannot read target $opt_f: $!"; 299 push(@targets, map { chomp; $_ } <F>); 300 close(F); 301 } 302 usage(1) if !@targets; 303} 304 305if ($opt_reuse) { 306 # if we have --reuse, don't bother testing DNS; we shouldn't be hitting 307 # the wire at all, and in fact we may be running without a net connection 308 push @{$opt_cf}, "dns_available yes\n"; 309 310 # need to load M:SA:Plugin:Reuse 311 push @{$opt_pre}, "loadplugin Mail::SpamAssassin::Plugin::Reuse\n"; 312 313} 314 315my $user_prefs = "$opt_p/user_prefs"; 316 317sub create_spamtest { 318 return Mail::SpamAssassin->new({ 319 'debug' => $opt_debug, 320 'rules_filename' => $opt_c, 321 'site_rules_filename' => "$opt_p/local.cf", 322 'userprefs_filename' => $user_prefs, 323 'userstate_dir' => $opt_p, 324 'save_pattern_hits' => $opt_loghits, 325 'dont_copy_prefs' => 1, 326 'local_tests_only' => $opt_net ? 0 : 1, 327 'only_these_rules' => $opt_rules, 328 'ignore_safety_expire_timeout' => 1, 329 'pre_config_text' => join("\n", @{$opt_pre})."\n", 330 'post_config_text' => join("\n", @{$opt_cf})."\n", 331 PREFIX => '', 332 DEF_RULES_DIR => $opt_c, 333 LOCAL_RULES_DIR => '', 334 }); 335} 336 337my $spamtest = create_spamtest(); 338$spamtest->compile_now(0); # 0 since we will be reading more configs 339$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); 340$spamtest->read_scoreonly_config($user_prefs); 341$spamtest->call_plugins("prefork_init"); # since SA 3.4.0 342 343my $who = `id -un 2>/dev/null`; 344my $where = `uname -n 2>/dev/null`; 345my $when = `date -u`; 346my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || $where || `hostname` || 'localhost'; 347chomp $who; 348chomp $where; 349chomp $when; 350chomp $host; 351$svn_revision = get_current_svn_revision(); 352 353# when displaying the commandline, quote any arguments which have 354# "questionable" characters such as spaces, pipes, etc. 355my $cmdline = join(' ',map { m@[^A-Za-z0-9_/\\.-]@ ? qq/"$_"/ : $_ } @ORIG_ARGV); $cmdline =~ s/\s+/ /gs; 356my $isowhen = strftime("%Y%m%dT%H%M%SZ", gmtime(time)); # better 357 358my $log_header = "# mass-check results from $who\@$where, on $when\n" . 359 "# M:SA version ".$spamtest->Version()."\n" . 360 "# SVN revision: $svn_revision\n" . 361 "# Date: $isowhen\n" . 362 "# Perl version: $] on $Config{archname}\n" . 363 "# Switches: '$cmdline'\n"; 364 365my $updates = ($opt_noisy ? 100 : 10); 366my $total_count = 0; 367my $spam_count = 0; 368my $ham_count = 0; 369my $init_results = 0; 370 371my $showdots_active = ($opt_showdots || $opt_noisy); 372my $showdots_counter = 0; 373my $showdots_every = ($opt_showdots ? 1 : 20); 374 375my $AIopts = { 376 'opt_all' => $opt_all, 377 'opt_skip_empty_messages' => 1, 378 }; 379 380if (!$opt_client) { 381 # Deal with --rewrite 382 if (defined $opt_rewrite) { 383 my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out"); 384 open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!"; 385 } 386 387 # ArchiveIterator options for non-client mode 388 $AIopts->{'opt_scanprob'} = $opt_scanprob; 389 $AIopts->{'opt_cache'} = $opt_cache; 390 $AIopts->{'opt_cachedir'} = $opt_cachedir; 391 $AIopts->{'opt_after'} = $opt_after; 392 $AIopts->{'opt_before'} = $opt_before; 393 $AIopts->{'scan_progress_sub'} = \&showdots_blip; 394 $AIopts->{'opt_want_date'} = ! $opt_n; 395 396 # ensure that scanprob stuff is predictable and reproducable 397 if (defined $opt_scanprob && $opt_scanprob < 1.0) { 398 srand(1); 399 } 400} 401else { 402 # ArchiveIterator options for client mode -- tends to be simple 403 $opt_n = 1; 404 $AIopts->{'opt_want_date'} = 0; 405} 406 407########################################################################### 408## SCAN MODE 409 410my $iter = Mail::SpamAssassin::ArchiveIterator->new($AIopts); 411 412# setup the AI functions 413if ($opt_client) { 414 $iter->set_functions(\&wanted, \&result_client); 415} 416elsif ($opt_server) { 417 $iter->set_functions(\&wanted_server, \&result); 418} 419else { 420 $iter->set_functions(\&wanted, \&result); 421} 422 423#AFTER SETTING ALL OPTS, RUN _set_default_message_selection_opts() TO GET SANE DEFAULTS 424$iter->_set_default_message_selection_opts(); 425 426my $messages; 427 428# normal mode as well as a server do scan mode and get a temp file 429if (!$opt_client) { 430 status('starting scan stage') if ($opt_progress); 431 432 # Make a temp file and delete it 433 my $tmpf; 434 ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 435 die 'mass-check: failed to create temp file' unless $tmpf; 436 unlink $tmpf or die "mass-check: unlink '$tmpf': $!"; 437 438 # having opt_j or server mode means do scan in a separate process 439 if ($opt_server || $opt_j) { 440 if ($tmpf = fork()) { 441 # parent 442 waitpid($tmpf, 0); 443 } 444 elsif (defined $tmpf) { 445 # child -- process using message_array 446 generate_queue(\@targets, $tmpfd); 447 exit; 448 } 449 else { 450 die "mass-check: cannot fork: $!"; 451 } 452 } 453 else { 454 # we get here if opt_j == 0, so scan in this process 455 generate_queue(\@targets, $tmpfd); 456 } 457 458 # we now have a temporary file with the messages to process 459 seek($tmpfd, 0, 0); 460 # the first line is the number of messages 461 $total_messages = read_line($tmpfd); 462 463 if (!$total_messages) { 464 die "mass-check: no messages to process\n"; 465 } 466 467 if ($opt_cs_schedule_cache) { 468 # create a tied hash database for the server to use for matching against 469 # client hashes 470 471 # while we're at it, count the total number of messages for ourself 472 # I suspect that AI gets it wrong sometimes, somehow 473 $total_messages = 0; 474 475 # get a temp file for the server database on messages to process 476 my ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 477 close $dbfd; 478 unlink $dbpath; 479 push @cache_tmp_files, $dbpath.'.pag'; 480 push @cache_tmp_files, $dbpath.'.dir'; 481 482 tie %{$server_caches{'to_process'}}, "SDBM_File", $dbpath, 483 O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!"; 484 485 # get a temp file for the server DB of messages that are not cached anywhere 486 ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 487 close $dbfd; 488 unlink $dbpath; 489 push @cache_tmp_files, $dbpath.'.pag'; 490 push @cache_tmp_files, $dbpath.'.dir'; 491 492 tie %{$server_caches{'not_cached'}}, "SDBM_File", $dbpath, 493 O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!"; 494 495 # get a temp file for the server database of cache hit counts per message 496 ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 497 close $dbfd; 498 unlink $dbpath; 499 push @cache_tmp_files, $dbpath.'.pag'; 500 push @cache_tmp_files, $dbpath.'.dir'; 501 502 tie %{$server_caches{'cache_count'}}, "SDBM_File", $dbpath, 503 O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!"; 504 505 # dump the server's list of messages to process to the DB_HASHes 506 while (my $msg = read_line($tmpfd)) { 507 my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg); 508 $server_caches{'to_process'}{$d[3]} = $msg; 509 $server_caches{'not_cached'}{$d[3]} = undef; 510 $total_messages++; 511 } 512 # and back to the first message (although we currently never use it again) 513 seek($tmpfd, 0, 0); 514 read_line($tmpfd); 515 } 516 517 showdots_finish(); 518 status("completed scan stage, $total_messages messages") if ($opt_progress); 519} 520 521########################################################################### 522## RUN MODE 523 524run_post_scan(); 525 526if ($opt_client) { 527 client_mode(); 528} 529else { 530 status('starting run stage') if ($opt_progress); 531 532 if ($opt_server) { 533 server_mode(); 534 } 535 else { 536 $t_first_msg = time; 537 run_through_messages(); 538 $t_last_msg = time; 539 } 540 541 status('completed run stage') if ($opt_progress); 542} 543 544# Even though we're about to exit, let's clean up after ourselves 545close($tmpfd) if ($tmpfd); 546showdots_finish(); 547 548if (defined $opt_rewrite) { 549 close(REWRITE); 550} 551 552$spamtest->finish(); 553 554if ($opt_cs_verbose) { 555 if ($opt_client || $opt_server) { 556 warn "network I/O: sent=$sent_bytes, received=$rcvd_bytes\n"; 557 } 558 $t_last_msg++ if $t_last_msg == $t_first_msg; 559 warn "processed $msgs_processed messages (failed to scan $failed_msgs) in ".($t_last_msg - $t_first_msg). 560 ' seconds ('.(sprintf("%.1f", ($msgs_processed / ($t_last_msg - $t_first_msg) * 3.6))). 561 " kmsgs/hr)\n"; 562 $msgs_processed ||= 1; 563 warn "cache hits: $cache_hits (" 564 .(sprintf("%.1f", ($cache_hits / $msgs_processed * 100)))."%)\n" if $opt_cs_cache; 565} 566 567# exit status: did we check at least one message correctly? 568exit(!($ham_count || $spam_count)); 569 570########################################################################### 571 572sub target { 573 my ($target) = @_; 574 575 # message-selection options; these can now be specified separately 576 # for each target 577 my %selopts = ( 578 opt_scanprob => $opt_scanprob, 579 opt_after => $opt_after, 580 opt_before => $opt_before 581 ); 582 583 if (!defined($opt_format)) { 584 push(@targets, { %selopts, target => $target }); 585 } 586 else { 587 $opt_o = 1; 588 push(@targets, { %selopts, target => "spam:$opt_format:$target" }); 589 } 590} 591 592########################################################################### 593 594sub init_results { 595 $init_results = 1; 596 597 showdots_finish(); 598 599 # now, showdots only happens if --showdots was used 600 $showdots_active = $opt_showdots; 601 602 if ($opt_progress) { 603 # round up since 100% will be caught at end already 604 $statusevery = int($total_messages / $updates + 1); 605 606 # if $messages < $updates, just give a status line per msg. 607 $statusevery ||= 1; 608 } 609 610 return if $opt_client; 611 612 if ($opt_o) { 613 autoflush STDOUT 1; 614 print STDOUT $log_header; 615 } 616 else { 617 open(HAM, "> $opt_hamlog") || die "open of $opt_hamlog failed: $!"; 618 open(SPAM, "> $opt_spamlog") || die "open of $opt_spamlog failed: $!"; 619 autoflush HAM 1; 620 autoflush SPAM 1; 621 print HAM $log_header; 622 print SPAM $log_header; 623 } 624} 625 626sub result { 627 my ($class, $result, $time) = @_; 628 629 # don't open results files until we get here to avoid overwriting files 630 init_results() if !$init_results; 631 632 if ($class eq "s") { 633 if ($opt_o) { print STDOUT $result; } else { print SPAM $result; } 634 $spam_count++; 635 } 636 elsif ($class eq "h") { 637 if ($opt_o) { print STDOUT $result; } else { print HAM $result; } 638 $ham_count++; 639 } 640 641 $total_count++; 642 643 if ($opt_progress) { 644 progress($time); 645 } 646} 647 648sub wanted { 649 my ($class, $id, $time, $dataref, $format) = @_; 650 my $out = ''; 651 652 # if origid is defined, it'll be the message number from server mode 653 my $origid; 654 655 # client mode is a little crazy because we need to kluge around the fact 656 # that the information needed to do the run is different than the 657 # information that goes into the results. 658 if ($opt_client) { 659 # change the format and id to the real version, make sure to remember 660 # the server's message number 661 # note: pop/push makes it work regardless of any changes to M::SA::AI 662 $origid = pop @{$real{$id}}; 663 $format = $real{$id}->[2]; 664 $id = $real{$id}->[3]; 665 } 666 667 memory_track_start() if ($opt_logmem); 668 $spamtest->timer_reset; 669 670 # parse the message, and force it to complete 671 my $ma = $spamtest->parse($dataref, 1); 672 673 # get X-Spam-Status: header for rule hit resue 674 my $x_spam_status; 675 my $reusing; 676 if ($opt_reuse) { 677 $x_spam_status = $ma->get_header("X-Spam-Status"); 678 $x_spam_status and $x_spam_status =~ s/,\s+/,/gs; 679 } 680 681 # remove SpamAssassin markup, if present and the mail was spam 682 my $header = $ma->get_header("Received"); 683 if ($header && $header =~ /\bwith SpamAssassin\b/) { 684 if (!$opt_deencap || message_should_be_deencapped($ma)) { 685 my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1); 686 $ma->finish(); 687 $ma = $new_ma; 688 } 689 } 690 691 if ($opt_reuse) { 692 if ($x_spam_status 693 && $x_spam_status =~ m/tests=(\S*)/ 694 && $x_spam_status !~ /\bshortcircuit=(?:ham|spam|default)\b/) 695 { 696 my @previous = split(/,/, $1); 697 # Bug 7709 698 # Amavis X-Spam-Status rules include score and are enclosed in [] 699 # Amavis: [RULENAME=0.01,RULENAME_2=0.01] 700 # Spamassassin: RULENAME,RULENAME_2 701 # .. also support compact RULE(hits), no need to count hits here 702 s/[\[\]]//, s/=.*//, s/\(\d+\)$// foreach (@previous); 703 $ma->{metadata}->{reuse_tests_hit} = { map {$_ => 1} @previous }; 704 $reusing = 1; 705 } 706 } 707 708 # plugin hook to cause us to skip messages 709 my $skip = $spamtest->call_plugins("mass_check_skip_message", { 710 class => $class, 711 'time' => $time, 712 'id' => $id, 713 msg => $ma 714 }); 715 if ($skip) { 716 $ma->finish(); 717 return; 718 } 719 720 # log-uris support 721 my $status; 722 my @uris; 723 my $before; 724 my $after; 725 if ($opt_loguris) { 726 my $pms = Mail::SpamAssassin::PerMsgStatus->new($spamtest, $ma); 727 @uris = $pms->get_uri_list(); 728 $pms->finish(); 729 730 } else { 731 $before = time; 732 $status = $spamtest->check($ma); 733 $after = time; 734 } 735 736 my @extra; 737 738 # sample-based learning 739 if ($opt_learn > 0) { 740 my $spam; 741 # spam learned as ham = 0.05% 742 if ($class eq 's' && rand(100) < 0.05) { 743 $spam = 0; 744 } 745 # ham learned as spam = 0.01% 746 elsif ($class eq 'h' && rand(100) < 0.01) { 747 $spam = 1; 748 } 749 # spam/ham learned correctly 750 elsif (rand(100) < $opt_learn) { 751 if ($class eq 's') { 752 $spam = 1; 753 } 754 elsif ($class eq 'h') { 755 $spam = 0; 756 } 757 else { 758 die "unknown class, learning failed"; 759 } 760 } 761 if (defined $spam) { 762 my $result = ($spam ? "spam" : "ham"); 763 my $status = $spamtest->learn($ma, undef, $spam, 0); 764 my $learned = $status->did_learn(); 765 $result = "undef" if !defined $learned; 766 push(@extra, "learn=".$result); 767 } 768 } 769 770 if (defined($time)) { 771 push(@extra, "time=".$time); 772 } 773 if ($status && defined $status->{bayes_score}) { 774 push(@extra, "bayes=".sprintf("%06f", $status->{bayes_score})); 775 } 776 if ($opt_mid) { 777 my $mid = $ma->get_header("Message-Id"); 778 if ($mid) { # message contains a Message-Id: 779 while($mid =~ s/\([^\(\)]*\)//s) {}; # remove comments and 780 $mid =~ s/^\s+|\s+$//sg; # leading and trailing spaces 781 $mid =~ s/\s.*$//s; # keep only the first token 782 } 783 else { # it doesn't have a Message-Id: 784 $mid = $id; # so build one from the id 785 $mid =~ s,^.*/,,; # remove the path 786 $mid = "<$mid\@$host.masses.spamassassin.org>"; # and put it together 787 } 788 $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c; # replace dangerous chars with . (so regexp search just works) 789 push(@extra, "mid=$mid"); 790 } 791 push(@extra, "scantime=" . ($after - $before)); 792 push(@extra, "format=$format"); 793 794 if ($opt_logmem) { 795 my $mem = memory_track_finish(); 796 if ($mem) { 797 push(@extra, $mem); 798 } 799 } 800 801 push(@extra, "reuse=" . ($reusing ? "yes" : "no")); 802 803 # log the scoreset we're in 804 { 805 my $set = 0; 806 if ($opt_net) { $set |= 1; } 807 if ($status && defined $status->{bayes_score}) { $set |= 2; } 808 push(@extra, "set=".$set); 809 } 810 811 if ($opt_client) { 812 push(@extra, "host=$where"); 813 } 814 815 my $yorn; 816 my $score; 817 my $tests; 818 my $extra; 819 820 if ($opt_loguris) { 821 $yorn = '.'; 822 $score = 0; 823 $tests = join(" ", sort @uris); 824 $extra = ''; 825 } else { 826 $yorn = $status->is_spam() ? 'Y' : '.'; 827 # don't bother adjusting scores for reuse 828 $score = $status->get_score(); 829 # list of tests hit 830 my %tests; 831 foreach (( 832 split(/,/, $status->get_names_of_tests_hit()), 833 split(/,/, $status->get_names_of_subtests_hit()) 834 )) { 835 $tests{$_}++; 836 } 837 my @tests; 838 foreach (sort keys %tests) { 839 # Use compact RULE(hitcount) format 840 push @tests, $tests{$_} > 1 ? "$_($tests{$_})" : $_; 841 } 842 $tests = join(",", @tests); 843 $extra = join(",", @extra); 844 } 845 846 if (defined $opt_rewrite) { 847 print REWRITE $status->rewrite_mail(); 848 } 849 850 $id =~ s/\s/_/g; 851 852 # if we have an origid set, it'll be the server mode's message number, so 853 # attach it to our result appropriately. 854 if (defined $origid) { 855 $out = "$origid "; 856 } 857 858 $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra); 859 860 if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) { 861 $out .= logkilled($ma, $id, "possible virus"); 862 } 863 864 if ($opt_loghits) { 865 my $log = ''; 866 foreach my $t (sort keys %{$status->{pattern_hits}}) { 867 $_ = $status->{pattern_hits}->{$t}; 868 $_ ||= ''; 869 s/\r/\\r/gs; # fix unprintables 870 s/\n/\\n/gs; 871 $log .= "$t=\"$_\" "; 872 } 873 if ($log) { 874 chomp $log; 875 $out .= "# $log\n"; 876 } 877 } 878 879 if (defined $status) { $status->finish(); } 880 $ma->finish(); 881 undef $ma; # clean 'em up 882 undef $status; 883 884 showdots_blip(); 885# print ">>>> out = $out\n"; 886 return $out; 887} 888 889sub showdots_blip { 890 return unless ($showdots_active); 891 892 $showdots_counter++; 893 if ($showdots_counter % $showdots_every == 0) { 894 print STDERR '.'; 895 if ($showdots_counter % (60 * $showdots_every) == 0) { 896 print STDERR "\n"; 897 } 898 } 899} 900 901sub showdots_finish { 902 print STDERR "\n" if ($showdots_active); 903 $showdots_counter = 0; 904} 905 906# ick. We have to go grovelling through the body parts to see if a message 907# is a report_safe-marked-up message, because a local scanner will overwrite 908# any remote scanner's X-Spam-Checker-Version header. 909# 910sub message_should_be_deencapped { 911 my ($ma) = @_; 912 913 # not sure why this is undefined, but it is sometimes 914 if (defined $ma->{body_parts} && scalar @{$ma->{body_parts}} > 0) { 915 my $firstpart = $ma->{body_parts}->[0]; 916 if (!$firstpart->{headers}->{'content-type'} 917 || $firstpart->{headers}->{'content-type'} ne 'text/plain') 918 { 919 return 0; # not a 'report_safe' encapsulation 920 } 921 922 if (scalar @{$firstpart->{raw}} < 3) { return 0; } # too short to be a report 923 924 # grab first 2 lines 925 my $text = $firstpart->{raw}->[0] . $firstpart->{raw}->[1]; 926 $text =~ s/\s+/ /gs; 927 if ($text =~ /^Spam detection software, running on the system \"(\S+)\"/) { 928 my $hname = $1; 929 if ($hname =~ /$opt_deencap/io) { 930 return 1; 931 } 932 } 933 } 934 935 return 0; # a different host marked it up. pass it through! 936} 937 938sub logkilled { 939 my ($ma, $id, $reason) = @_; 940 941 my $from = $ma->get_header("From") || 'undef'; 942 my $to = $ma->get_header("To") || 'undef'; 943 my $subj = $ma->get_header("Subject") || 'undef'; 944 my $mid = $ma->get_header("Message-Id") || 'undef'; 945 chomp ($from); 946 chomp ($to); 947 chomp ($subj); 948 chomp ($mid); 949 return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n"; 950} 951 952sub progress { 953 my ($time) = @_; 954 $time ||= 0; 955 956 # Are we at the end or otherwise at a point we should print status? Then do it. 957 if ($total_messages == $total_count || $total_count % $statusevery == 0) { 958 my $time = strftime("%Y-%m-%d", localtime($time)); 959 status(sprintf("%3d%% ham: %-6d spam: %-6d date: %s", 960 int(($total_count / $total_messages) * 100), $ham_count, $spam_count, $time)); 961 } 962} 963 964sub status { 965 my($str) = @_; 966 my $now = strftime("%Y-%m-%d %X", localtime(time)); 967 printf STDERR "status: %-48s now: %s\n", $str, $now; 968} 969 970########################################################################### 971 972our ($mem_size, $mem_rss, $mem_shared); 973 974sub memory_track_start { 975 if ($^O =~ /linux/i) { 976 if (open (IN, "</proc/$$/statm")) { 977 my $statm = <IN>; 978 close IN; 979 if ($statm =~ /^(\d+) (\d+) (\d+) /) { 980 $mem_size = $1; 981 $mem_rss = $2; 982 $mem_shared = $3; 983 } 984 } 985 } 986} 987 988sub memory_track_finish { 989 my $str = ''; 990 991 if ($^O =~ /linux/i) { 992 if (open (IN, "</proc/$$/statm")) { 993 my $statm = <IN>; 994 close IN; 995 if ($statm =~ /^(\d+) (\d+) (\d+) /) { 996 my $size = $1; 997 my $rss = $2; 998 my $shared = $3; 999 1000 $str = sprintf ("memsz=%d,memrss=%d,memshr=%d", 1001 ($size - $mem_size), 1002 ($rss - $mem_rss), 1003 ($shared - $mem_shared)); 1004 } 1005 } 1006 } 1007 return $str; 1008} 1009 1010sub get_current_svn_revision { 1011 my $revision; 1012 1013 # this is usually "${TOPDIR}/masses" 1014 my $dir = $FindBin::Bin || "."; 1015 1016 if (-d "$dir/../.svn" || -f "$dir/svninfo.tmp") { 1017 if (-f "$dir/svninfo.tmp") { 1018 # created by build/automc/buildbot_ready for chrooted mass-checks 1019 open (SVNINFO, "< $dir/svninfo.tmp"); 1020 } 1021 else { 1022 # note, ".." since we want to pick up changes outside 'masses' 1023 # too! 1024 open (SVNINFO, "( svn info --non-interactive $dir/.. || svn info $dir/.. ) 2>&1 |"); 1025 } 1026 1027 while (<SVNINFO>) { 1028 # Revision: 383822 1029 next unless /^Revision: (\d+)/; 1030 $revision = $1; 1031 last; 1032 } 1033 close SVNINFO; 1034 return $revision if $revision; 1035 } 1036 1037 # this probably will never work due to Rules Project changes TODO 1038 if (open(TESTING, "$opt_c/70_testing.cf")) { 1039 chomp($revision = <TESTING>); 1040 $revision =~ s/.*\$Rev:\s*(\S+).*/$1/; 1041 close(TESTING); 1042 return $revision if $revision; 1043 } 1044 1045 return $revision || "unknown"; 1046} 1047 1048############################################################################ 1049 1050## children processors, start and process, used when opt_j > 1 1051 1052sub start_children { 1053 my ($count, $child, $pid, $socket) = @_; 1054 1055 my $io = IO::Socket->new(); 1056 my $parent; 1057 1058 # create children 1059 for (my $i = 0; $i < $count; $i++) { 1060 ($child->[$i],$parent) = $io->socketpair(AF_UNIX,SOCK_STREAM,PF_UNSPEC) 1061 or die "mass-check: socketpair failed: $!"; 1062 if ($pid->[$i] = fork) { 1063 close $parent; 1064 1065 # disable caching for parent<->child relations 1066 my ($old) = select($child->[$i]); 1067 $|++; 1068 select($old); 1069 1070 $socket->add($child->[$i]); 1071 aidbg "mass-check: starting new child $i (pid ".$pid->[$i].")\n"; 1072 next; 1073 } 1074 elsif (defined $pid->[$i]) { 1075 my $result; 1076 my $line; 1077 1078 close $tmpfd if defined $tmpfd; 1079 1080 close $child->[$i]; 1081 select($parent); 1082 $| = 1; # print to parent by default, turn off buffering 1083 send_line($parent,"START"); 1084 while ($line = read_line($parent)) { 1085 if ($line eq "exit") { 1086 close $parent; 1087 exit; 1088 } 1089 1090 my($class, $format, $date, $where, $result) = $iter->_run_message($line); 1091 $result ||= ''; 1092 1093 # If determine_receive_date is not set, the original input date 1094 # wasn't calculated, but run_message would have done so, so reset 1095 # the packed version if possible ... use defined for date since 1096 # it could == 0. 1097 if (!$iter->{determine_receive_date} && $class && $format && defined $date && $where) { 1098 $line = Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $where); 1099 } 1100 1101 $result = encode_base64($result); 1102 send_line($parent,"$result\0RESULT $line"); 1103 } 1104 exit; 1105 } 1106 else { 1107 die "mass-check: cannot fork: $!"; 1108 } 1109 } 1110} 1111 1112## handling killing off the children 1113 1114sub reap_children { 1115 my ($count, $socket, $pid) = @_; 1116 1117 # If the child died, sending it the exit will generate a SIGPIPE, but we 1118 # don't really care since the readline will go undef (which is fine), 1119 # then we do the waitpid which will finish it off. So we end up in the 1120 # right state, in theory. 1121 local $SIG{'PIPE'} = 'IGNORE'; 1122 1123 for (my $i = 0; $i < $count; $i++) { 1124 aidbg "mass-check: killing child $i (pid ",$pid->[$i],")\n"; 1125 send_line($socket->[$i],"exit"); # tell the child to die. 1126 close $socket->[$i]; 1127 waitpid($pid->[$i], 0); # wait for the signal ... 1128 } 1129} 1130 1131# in server mode, this gets called to read in the HTTP request from a given 1132# socket, then return the information the client sent to us. 1133sub handle_http_request { 1134 my $socket = shift; 1135 1136 my $headers = {}; 1137 my $postdata = {}; 1138 1139 # read in the request (POST / HTTP/1.0) 1140 my $line = $socket->getline(); 1141 $line ||= ''; 1142 $rcvd_bytes += length($line); 1143 $line =~ s/\r\n$//; 1144 1145 my ($type, $URI, $VERS) = $line =~ /^([a-zA-Z]+)\s+(\S+)(?:\s*(\S+))/; 1146 unless ($type && $URI && $VERS) { 1147 $type ||= ''; 1148 $URI ||= ''; 1149 1150 return ($type, $URI, $headers, $postdata); 1151 } 1152 1153 $type = uc $type; 1154 1155 # read in headers, "key: value" up to a blank line 1156 do { 1157 $line = $socket->getline(); 1158 last unless defined $line; 1159 $rcvd_bytes += length($line); 1160 $line =~ s/\r\n$//; 1161 1162 if ($line) { 1163 my ($k,$v) = split(/:\s*/, $line, 2); 1164 $headers->{lc $k} = $v; 1165 } 1166 } while ($line !~ /^$/); 1167 1168 # if this is a POST request w/ content-length, there'll be a payload, deal 1169 # with it. we only support compressed payloads. 1170 my $postheader; 1171 if ($type eq 'POST' && $headers->{'content-length'}) { 1172 $rcvd_bytes += $headers->{'content-length'}; 1173 my $pd = ''; 1174 if ($headers->{'content-encoding'} eq 'x-gzip') { 1175 # assign an id to the client if it doesn't already have one 1176 $headers->{'client-id'} ||= ++$client_id; 1177 1178 my ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 1179 die "Can't make tempfile, exiting" unless $gzpath; 1180 1181 # TODO: don't read in the entire thing at once to avoid memory bloat 1182 my $rd; 1183 $socket->read($rd, $headers->{'content-length'}) || die "mass-check: error reading in data from client\n"; 1184 print $gzfd $rd; 1185 $gzfd->close; 1186 1187 my $fd = IO::Zlib->new($gzpath, "rb"); 1188 die "Can't open temp result file: $!" unless $fd; 1189 1190 if ($headers->{'action'}) { 1191 # different types of POST contents are packed different ways 1192 if ($headers->{'action'} eq 'sending cache') { 1193 1194 if ($opt_cs_schedule_cache) { 1195 # save the client cache to a tied hash 1196 1197 # get a temp file for the client cache database 1198 my ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 1199 close $dbfd; 1200 unlink $dbpath; 1201 push @cache_tmp_files, $dbpath.'.pag'; 1202 push @cache_tmp_files, $dbpath.'.dir'; 1203 1204 tie %{$client_caches{$headers->{'client-id'}}}, "SDBM_File", $dbpath, 1205 O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!\n"; 1206 1207 # dump the client's list of cached message paths to the DB_HASH 1208 my $client_cache = 0; 1209 my $client_cache_hits = 0; 1210 while (my $msg = read_line($fd)) { 1211 my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg); 1212 $client_cache++; 1213 if (exists $server_caches{'to_process'}{$d[3]}) { 1214 $client_caches{$headers->{'client-id'}}{$d[3]} = undef; 1215 $server_caches{'cache_count'}{$d[3]}++; 1216 delete $server_caches{'not_cached'}{$d[3]}; 1217 # log this client's cache hit rate 1218 $client_cache_hits++; 1219 } 1220 } 1221 status("client $headers->{'client-id'} has $client_cache msgs cached ($client_cache_hits usable)"); 1222 status("client $headers->{'client-id'} has ".(sprintf("%.1f", ($client_cache_hits / $total_messages * 100))).'% of required messages'); 1223 } 1224 } 1225 elsif ($headers->{'action'} eq 'sending results') { 1226 # process the results 1227 $pd = read_line($fd); 1228 1229 # key1=value1&key2=value2... 1230 %{$postdata} = map { 1231 my($k,$v) = split(/=/, $_, 2); 1232 1233 # we need to decode the key and value 1234 $k =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg; 1235 $v =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg; 1236 1237 $k => $v; 1238 } split(/\&/, $pd); 1239 1240 } 1241 } 1242 1243 $fd->close; 1244 undef $fd; 1245 $gzfd->close; 1246 unlink $gzpath; 1247 } 1248 } 1249 return($type, $URI, $headers, $postdata); 1250} 1251 1252# in server mode, generate a gzip compressed data stream with the messages and 1253# return the path to the compressed file which the server will read and pass 1254# to the client. 1255# 1256# Input: 1257# - Number of messages to generate (scalar) 1258# - Hash of Arrays of outstanding requests (reference to hash of array refs) 1259# timestamp# -> [ num1, num2, ... ] 1260# Used to quickly find outstanding/timed out messages to send to client. 1261# - Hash of outstanding messages and associated data (ref to hash of hash refs) 1262# num1 -> { data => 'binary data from scan mode', timestamp => timestamp# } 1263# Used later on to specify the timestamp entry to remove the entry from. 1264# - Paths only? If true, just include the original message data in the gzip 1265# file. Otherwise, include the message data. Useful if the client has the 1266# corpus available via the same paths as originally specified. 1267# 1268# Returns: scalar path to gzip file 1269# 1270sub generate_messages { 1271 my($msgs, $timestamps, $msgsout, $paths_only, $client_id) = @_; 1272 1273 # Hold the message numbers we'll be sending out 1274 my @tosend = (); 1275 my @sent = (); 1276 1277 # Find out if any of the messages we sent out before need to be sent out 1278 # again because we haven't seen a response within the timeout. 1279 my $tooold = time - $opt_cs_timeout; 1280 foreach (sort { $a <=> $b } keys %{$timestamps}) { 1281 # since we're going in numeric order, if the current entry is newer than 1282 # the timeout value, the rest will be too, so stop looking. 1283 last if ($_ > $tooold); 1284 1285 # messages that might be eligible for retry 1286 my @toretry = (); 1287 1288 # how many messages do we still need to fulfill the request? 1289 my $wanted = $msgs - @tosend; 1290 1291 if (@{$timestamps->{$_}} > $wanted) { 1292 # there are more entries in the timestamp list than we want, so just 1293 # grab that many off the list. 1294 push(@toretry, splice @{$timestamps->{$_}}, 0, $wanted); 1295 } 1296 else { 1297 # there are just enough, or not enough entries on the timestamp list to 1298 # satisfy our request, so take them all and we'll loop around. 1299 push(@toretry, @{$timestamps->{$_}}); 1300 delete $timestamps->{$_}; 1301 } 1302 1303 # limit retries 1304 foreach my $num (@toretry) { 1305 if ($msgsout->{$num}->{'count'} < $opt_cs_max_tries) { 1306 $msgsout->{$num}->{'count'}++; 1307 push @tosend, $num; 1308 } else { 1309 warn "skipping message num $num after $opt_cs_max_tries attempts, index: ". 1310 (Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$num}->{'data'}))[3]."\n"; 1311 delete $msgsout->{$num}; 1312 $failed_msgs++; 1313 } 1314 } 1315 1316 # Ok, we have enough messages so we can stop now. 1317 last if (@tosend == $msgs); 1318 } 1319 1320 if (!$opt_cs_schedule_cache) { 1321 # if we still have the temp file with the input messages open, we'll fillup 1322 # out message output queue with messages from there. 1323 if ($tmpfd) { 1324 while (@tosend < $msgs) { 1325 my $msg = read_line($tmpfd); 1326 1327 # no more messages from the temp file, close it out 1328 unless ($msg) { 1329 delete $msgsout->{'curnum'}; 1330 close $tmpfd; 1331 undef $tmpfd; 1332 last; 1333 } 1334 1335 # we got a result, so assign it a number (curnum) and store the data 1336 # appropriately, then add the new number to the queue. 1337 my $num = ++$msgsout->{'curnum'}; 1338 $msgsout->{$num}->{'data'} = $msg; 1339 $msgsout->{$num}->{'count'}++; 1340 push(@tosend, $num); 1341 } 1342 } 1343 } 1344 else { 1345 if (($msgs_processed + $failed_msgs) != $total_messages) { 1346 # select messages based on what the client*s* have cached 1347 1348 # if the client hasn't sent cache data, fake it 1349 unless (exists $client_caches{$client_id}) { 1350 %{$client_caches{$client_id}} = (); 1351 $unique_cache_completed{$client_id} = 1; 1352 } 1353 1354 # first: select messages that only the current client has cached 1355 MESSAGE: while (!$unique_cache_completed{$client_id} && 1356 @tosend < $msgs && 1357 (my($path, undef) = each %{$client_caches{$client_id}})) { 1358 # check that no other clients have it cached 1359 next MESSAGE if $server_caches{'cache_count'}{$path} > 1; 1360 1361 # we got a result, so assign it a number (curnum) and store the data 1362 # appropriately, then add the new number to the queue. 1363 my $num = ++$msgsout->{'curnum'}; 1364 $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path}; 1365 $msgsout->{$num}->{'count'}++; 1366 push(@tosend, [ $num, 1 ]); 1367 delete $server_caches{'to_process'}{$path}; 1368 delete $server_caches{'cache_count'}{$path}; 1369 delete $client_caches{$client_id}{$path}; 1370 $cache_hits++; 1371 } 1372 if (@tosend < $msgs) { 1373 $unique_cache_completed{$client_id} = 1; 1374 } 1375 1376 # second: hand out messages that no clients have cached 1377 while (@tosend < $msgs && 1378 (my($path, undef) = each %{$server_caches{'not_cached'}})) { 1379 # we got a result, so assign it a number (curnum) and store the data 1380 # appropriately, then add the new number to the queue. 1381 my $num = ++$msgsout->{'curnum'}; 1382 $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path}; 1383 $msgsout->{$num}->{'count'}++; 1384 push(@tosend, [ $num, 0 ]); 1385 delete $server_caches{'to_process'}{$path}; 1386 delete $server_caches{'cache_count'}{$path}; 1387 delete $server_caches{'not_cached'}{$path}; 1388 } 1389 1390 # third: hand out messages in the client's cache regardless of how many 1391 # other clients have them cached; smart scheduling takes too long 1392 while (@tosend < $msgs && 1393 (my($path, undef) = each %{$client_caches{$client_id}})) { 1394 # we got a result, so assign it a number (curnum) and store the data 1395 # appropriately, then add the new number to the queue. 1396 my $num = ++$msgsout->{'curnum'}; 1397 $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path}; 1398 $msgsout->{$num}->{'count'}++; 1399 push(@tosend, [ $num, 1 ]); 1400 delete $server_caches{'to_process'}{$path}; 1401 delete $server_caches{'cache_count'}{$path}; 1402 $cache_hits++; 1403 1404 # testing with 5 clients, this seems to be faster (in msgs/hr) than 1405 # deleting from each other clients' cache when we go to get their 1406 # messages; of course it's more I/O on the server this way 1407 my $cache_count = $server_caches{'cache_count'}{$path}; 1408 foreach my $cc (keys %client_caches) { 1409 delete $client_caches{$cc}{$path}; 1410 last unless --$cache_count; 1411 } 1412 } 1413 1414 # fourth: hand out messages that other clients have cached without 1415 # regard for how many clients have cached them; smart 1416 # scheduling takes too long (at least when using DBM) 1417 while (@tosend < $msgs && 1418 (my($path, $msg) = each %{$server_caches{'to_process'}})) { 1419 # we got a result, so assign it a number (curnum) and store the data 1420 # appropriately, then add the new number to the queue. 1421 my $num = ++$msgsout->{'curnum'}; 1422 $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path}; 1423 $msgsout->{$num}->{'count'}++; 1424 push(@tosend, [ $num, 0 ]); 1425 delete $server_caches{'to_process'}{$path}; 1426 delete $server_caches{'cache_count'}{$path}; 1427 1428 my $cache_count = $server_caches{'cache_count'}{$path}; 1429 foreach my $cc (keys %client_caches) { 1430 delete $client_caches{$cc}{$path}; 1431 last unless --$cache_count; 1432 } 1433 } 1434 } 1435 else { 1436 # close the tmpfd, etc so that the main loop knows we're done 1437 delete $msgsout->{'curnum'}; 1438 close $tmpfd; 1439 undef $tmpfd; 1440 } 1441 } 1442 1443 # ok, at this point, @tosend ought to have a list of numbers, pointers into 1444 # %{$msgsout}. turn that into a tar file. 1445 return '' unless @tosend; 1446 1447 my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 1448 die "Can't make tempfile, exiting" unless $gzpath; 1449 close($gzfd); 1450 1451 $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!"; 1452 1453 # first line is the number of messages included in the file 1454 send_line($gzfd, scalar @tosend) || die "mass-check: error when writing to gz temp file\n"; 1455 1456 # Generate an archive in the temp file 1457 foreach my $num (@tosend) { 1458 my $in_cache = 0; 1459 if (ref($num) eq 'ARRAY') { 1460 $in_cache = $num->[1]; 1461 $num = $num->[0]; 1462 } 1463 # Archive format, gzip compressed file w/ 4 parts per message: 1464 # 1- server message number in text format 1465 # 2- server index string, binary packed format 1466 # 3- a 1 if the message is included, 0 otherwise -- unless paths_only 1467 # 4- message content -- unless paths_only or #3 is 0 1468 my $data = $msgsout->{$num}->{'data'}; 1469 if (!$paths_only) { 1470 unless ($in_cache) { 1471 my $msg = ($iter->_run_message($data))[4]; 1472 if ($msg) { 1473 send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n"; 1474 send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n"; 1475 send_line($gzfd, '1') || die "mass-check: error when writing to gz temp file\n"; 1476 send_line($gzfd, (defined $msg ? join('', @{$msg}) : '')) || 1477 die "mass-check: error when writing to gz temp file\n"; 1478 push @sent, $num; 1479 } else { 1480 # if the message has an error (probably due to it being removed sometime during the 1481 # run) we send a message number of 0 to indicate to the client that the server had an 1482 # error retrieving the message; we need to do this since we have already added the 1483 # line telling the client how many messages to expect 1484 my $filename = (Mail::SpamAssassin::ArchiveIterator::_index_unpack($data))[3]; 1485 warn qq/mass-check: ArchiveIterator returned error for "$filename"\n/; 1486 send_line($gzfd, 0) || die "mass-check: error when writing to gz temp file\n"; 1487 delete $msgsout->{$num}; 1488 $failed_msgs++; 1489 } 1490 } else { 1491 send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n"; 1492 send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n"; 1493 send_line($gzfd, '0') || die "mass-check: error when writing to gz temp file\n"; 1494 push @sent, $num; 1495 } 1496 } else { 1497 send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n"; 1498 send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n"; 1499 # the client deals with missing messages on its own (sort of) 1500 push @sent, $num; 1501 } 1502 } 1503 1504 $gzfd->close; 1505 1506 # update timestamp entries 1507 my $ts = time; 1508 1509 # make sure the timestamp is unique! without Time::HiRes it is trivial and 1510 # common to have two reissueings of timedout messages in the same second 1511 # with Time::HiRes we'll check anyway since we'll waste less time checking 1512 # for uniqueness than one of us will waste debugging a report of mass-check 1513 # not completing due to someone's wacky clock (some (non-para-)virtualized 1514 # servers will have their clocks go forward and backward) 1515 while (exists $timestamps->{$ts}) { 1516 if (HAS_TIME_HI_RES) { 1517 $ts = Time::HiRes::time(); 1518 } else { 1519 $ts += 0.01; 1520 } 1521 } 1522 1523 foreach (@sent) { 1524 $msgsout->{$_}->{'timestamp'} = $ts; 1525 } 1526 1527 # conveniently, this list should be the only thing sent out w/ this 1528 # timestamp, so just set the reference appropriately. :) 1529 $timestamps->{$ts} = \@sent; 1530 1531 if ($opt_noisy) { 1532 status('sent '.scalar(@sent).' of '.scalar(@tosend).' intended messages'); 1533 } 1534 1535 return $gzpath; 1536} 1537 1538# we've gotten results posted, so clean up msgsout and timestamp hashes and 1539# process result... 1540sub handle_post_results { 1541 my($postdata, $timestamps, $msgsout) = @_; 1542 1543 # local version to batch the removals 1544 my %timestamps = (); 1545 1546 # $msgsout->{num}->{data|timestamp} 1547 # $timestamp{num} = [ msgout_nums ... ] 1548 # $postdata{num} = result_string 1549 1550 while( my($k,$v) = each %{$postdata} ) { 1551 # message run results will be \d+ => log entry 1552 next if ($k !~ /^\d+$/); 1553 1554 # if we've been waiting for this result, process it, otherwise throw it on 1555 # the ground. multiple clients could have been given the same messages to 1556 # process, and we take whatever the first responder sends us. 1557 if (exists $msgsout->{$k}) { 1558 # the result_sub will need parts of the message data, so get it ready 1559 my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$k}->{'data'}); 1560 1561 # go ahead and do the result 1562 &{$iter->{result_sub}}($d[1], $v, $d[0]); 1563 1564 # prep to get rid of the cached entries 1565 $timestamps{$msgsout->{$k}->{'timestamp'}}->{$k} = 1; 1566 delete $msgsout->{$k}; 1567 1568 $msgs_processed++; 1569 } 1570 } 1571 1572 # if we got any results, clean out the results from the timestamp arrays 1573 while ( my($k,$v) = each %timestamps ) { 1574 # trim out the result list from the timestamp sent list 1575 my @temp = grep(!exists $v->{$_}, @{$timestamps->{$k}}); 1576 1577 # if there are results left for a specific timestamp, update the array 1578 # pointer. otherwise, delete the timestamp entry since it's empty. 1579 if (@temp) { 1580 $timestamps->{$k} = \@temp; 1581 } 1582 else { 1583 delete $timestamps->{$k}; 1584 } 1585 } 1586} 1587 1588# This function reads from $tmpfd and processes the message as appropriate wrt 1589# $opt_j, $opt_restart, etc. 1590# 1591sub run_through_messages { 1592 # do everything in one process 1593 if ($opt_j <= 1 && !defined $opt_restart) { 1594 my $message; 1595 my $messages; 1596 my $total_count = 0; 1597 1598 while (($total_messages > $total_count) && ($message = read_line($tmpfd))) { 1599 my($class, undef, $date, undef, $result) = $iter->_run_message($message); 1600 if ($result) { 1601 &{$iter->{result_sub}}($class, $result, $date); 1602 } 1603 $total_count++; 1604 } 1605 $msgs_processed += $total_count; 1606 } 1607 # more than one process or one process with restarts 1608 else { 1609 my $select = IO::Select->new(); 1610 1611 my $total_count = 0; 1612 my $needs_restart = 0; 1613 my @child = (); 1614 my @pid = (); 1615 my $messages; 1616 1617 # start children processes 1618 start_children($opt_j, \@child, \@pid, $select); 1619 1620 # feed childen, make them work for it, repeat 1621 while ($select->count()) { 1622 foreach my $socket ($select->can_read()) { 1623 my $line = read_line($socket); 1624 1625 # some error happened during the read! 1626 if (!defined $line) { 1627 $needs_restart = 1; 1628 warn "mass-check: readline failed, attempting to recover\n"; 1629 $select->remove($socket); 1630 } 1631 elsif ($line =~ /^([^\0]*)\0RESULT (.+)$/s) { 1632 my $result = decode_base64($1); 1633 my ($date,$class,$type) = Mail::SpamAssassin::ArchiveIterator::_index_unpack($2); 1634 aidbg "mass-check: $class, $type, $date\n"; 1635 1636 if (defined $opt_restart && ($total_count % $opt_restart) == 0) { 1637 $needs_restart = 1; 1638 } 1639 1640 # if messages remain, and we don't need to restart, send message 1641 if (($total_messages > $total_count) && !$needs_restart) { 1642 my $line = read_line($tmpfd); 1643 unless ($line) { 1644 warn "mass-check: found short message list ($total_messages, $total_count)\n"; 1645 $select->remove($socket); 1646 next; 1647 } 1648 1649 send_line($socket, $line); 1650 $total_count++; 1651 aidbg "mass-check: $total_messages $total_count\n"; 1652 } 1653 else { 1654 # stop listening on this child since we're done with it 1655 aidbg "mass-check: $needs_restart $total_messages $total_count\n"; 1656 $select->remove($socket); 1657 } 1658 1659 # deal with the result we received 1660 if ($result) { 1661 &{$iter->{result_sub}}($class, $result, $date); 1662 } 1663 } 1664 elsif ($line eq "START") { 1665 if ($total_messages > $total_count) { 1666 # we still have messages, send one to child 1667 send_line($socket, read_line($tmpfd)); 1668 $total_count++; 1669 aidbg "mass-check: $total_messages $total_count\n"; 1670 } 1671 else { 1672 # no more messages, so stop listening on this child 1673 aidbg "mass-check: $needs_restart $total_messages $total_count\n"; 1674 $select->remove($socket); 1675 } 1676 } 1677 else { 1678 $needs_restart = 1; 1679 warn "mass-check: bad line from readline: $line\n"; 1680 $select->remove($socket); 1681 } 1682 } 1683 1684 aidbg "mass-check: out of loop, $total_messages $total_count $needs_restart ".$select->count()."\n"; 1685 1686 # If there are still messages to process, and we need to restart 1687 # the children, and all of the children are idle, let's go ahead. 1688 if ($needs_restart && $select->count == 0 && $total_messages > $total_count) { 1689 $needs_restart = 0; 1690 1691 aidbg "mass-check: needs restart, $total_messages total, $total_count done\n"; 1692 reap_children($opt_j, \@child, \@pid); 1693 @child=(); 1694 @pid=(); 1695 start_children($opt_j, \@child, \@pid, $select); 1696 } 1697 } 1698 $msgs_processed += $total_count; 1699 1700 # reap children 1701 reap_children($opt_j, \@child, \@pid); 1702 } 1703} 1704 1705# send an HTTP response to a socket based on the input result, headers, and 1706# data values. 1707sub http_response { 1708 my($socket, $result, $headers, $data) = @_; 1709 1710 $headers->{'Content-Length'} = length $data; 1711 $headers->{'Cache-Usage'} = $opt_cs_cache ? 'allowed' : 'disallowed'; 1712 1713 print $socket 1714 "HTTP/1.0 $result\r\n", 1715 "Pragma: no-cache\r\n", 1716 "Server: mass-check/$svn_revision\r\n", 1717 (map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}), "\r\n"; 1718 print $socket $data; 1719 1720 if ($opt_cs_verbose) { 1721 $sent_bytes += length( 1722 "HTTP/1.0 $result\r\n". 1723 "Pragma: no-cache\r\n". 1724 "Server: mass-check/$svn_revision\r\n"); 1725 # length of a map output isn't what the other end claims to see 1726 while (my($f,$v) = each %{$headers}) { 1727 $sent_bytes += length($f) + length($v) + 4; 1728 } 1729 $sent_bytes += 2 + length($data); 1730 } 1731} 1732 1733# the client needs to make a request to the server on a given socket. 1734sub http_make_request { 1735 my($socket, $type, $uri, $headers, $data) = @_; 1736 1737 $headers->{'Content-Length'} = length $data; 1738 1739 print $socket 1740 "$type $uri HTTP/1.0\r\n", 1741 "User-Agent: mass-check/$svn_revision\r\n", 1742 (map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}), "\r\n"; 1743 print $socket $data; 1744 1745 if ($opt_cs_verbose) { 1746 $sent_bytes += length( 1747 "$type $uri HTTP/1.0\r\n". 1748 "User-Agent: mass-check/$svn_revision\r\n"); 1749 # length of a map output isn't what the other end claims to see 1750 while (my($f,$v) = each %{$headers}) { 1751 $sent_bytes += length($f) + length($v) + 4; 1752 } 1753 $sent_bytes += 2 + length($data); 1754 } 1755 1756 # parse the response that the server sends us 1757 my $line = $socket->getline() || ''; 1758 $rcvd_bytes += length($line); 1759 my(undef, $code, $string) = split(/\s+/, $line, 3); 1760 return unless $code == 200; 1761 1762 my %headers = (); 1763 do { 1764 $line = $socket->getline(); 1765 last unless defined $line; 1766 $rcvd_bytes += length($line); 1767 $line =~ s/\r\n$//; 1768 1769 if ($line) { 1770 my ($k,$v) = split(/:\s*/, $line, 2); 1771 $headers{lc $k} = $v; 1772 } 1773 } while ($line !~ /^$/); 1774 $rcvd_bytes += $headers{'content-length'} if $headers{'content-length'}; 1775 $client_id ||= $headers{'client-id'} if $headers{'client-id'}; 1776 1777 # disable cache-usage if the server disallows cache usage 1778 $opt_cs_cache = 0 if $headers{'cache-usage'} eq 'disallowed'; 1779 1780 # the server has sent us notification that it's going to exit, so let's 1781 # follow suit. 1782 return 'finished' if ($headers{'finished'}); 1783 1784 my $gzpath = ''; 1785 if ($headers{'content-type'} eq 'application/x-gzip') { 1786 my $gzfd; 1787 ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 1788 die "Can't make tempfile, exiting" unless $gzpath; 1789 1790 my $rd; 1791 $socket->read($rd, $headers{'content-length'}) || die "mass-check: error reading in data from server\n"; 1792 print $gzfd $rd; 1793 close $gzfd; 1794 } 1795 1796 $socket->close(); 1797 return $gzpath; 1798} 1799 1800# Be conservative -- encode most things. 1801# we could encode spaces to plusses, then decode that later, but... 1802sub post_encode { 1803 my $string = shift; 1804 $string =~ s/([^a-zA-Z0-9_,.\/\\-])/sprintf "%%%02x",unpack("C",$1)/egx; 1805 return $string; 1806} 1807 1808# remove all of the files in a given directory, non-recursive 1809sub clean_dir { 1810 my $dir = shift; 1811 1812 unless (opendir(DIR, $dir)) { 1813 warn "error: can't opendir $dir: $!\n"; 1814 return; 1815 } 1816 while(my $file = readdir(DIR)) { 1817 $file =~ /^(.+)$/; # untaint 1818 $file = $1; 1819 1820 my $path = File::Spec->catfile($dir, $file); 1821 next unless (-f $path); 1822 1823 if (!unlink $path) { 1824 warn "error: can't remove file $path: $!\n"; 1825 closedir(DIR); 1826 return; 1827 } 1828 } 1829 closedir(DIR); 1830 return 1; 1831} 1832 1833############################################################################ 1834 1835# four bytes in network/vax format (little endian) as length of message 1836# the rest is the actual message 1837 1838sub read_line { 1839 my $fd = shift; 1840 my($length,$msg); 1841 1842 # read in the 4 byte length and unpack 1843 $fd->read($length, 4) || return; 1844 1845 $length = unpack("V", $length); 1846 return unless $length; 1847 1848 # read in the rest of the single message 1849 $fd->read($msg, $length) || return; 1850 1851 return $msg; 1852} 1853 1854sub send_line { 1855 my $fd = shift; 1856 foreach ( @_ ) { 1857 my $length = pack("V", length $_); 1858 $fd->print($length.$_) || return 0; 1859 } 1860 1861 return 1; 1862} 1863 1864############################################################################ 1865 1866# this is the function that implemented server mode. basically, sit and wait 1867# for connections to come in. when a client sends in a request, deal with any 1868# results that the client sent, then generate a response and send it back, 1869# and then go back to waiting. lather, rinse, repeat. 1870sub server_mode { 1871 $opt_cs_max ||= 1000; 1872 $opt_cs_timeout ||= 60 * 5; 1873 $opt_cs_max_tries ||= 3; 1874 1875 # IO::Socket::SSL isn't that smart 1876 $opt_server =~ s/:(\d+)$//; 1877 my $port = $1; 1878 1879 # ::SSL needs resolved hostnames, at least on Solaris. note: not IPv6-aware 1880 my $localaddr = (gethostbyname($opt_server))[4] || $opt_server; 1881 $localaddr = Socket::inet_ntoa($localaddr); 1882 1883 my $serv_socket; 1884 if ($opt_cs_ssl) { 1885 $serv_socket = IO::Socket::SSL->new( 1886 LocalAddr => $localaddr, 1887 LocalPort => $port, 1888 ReuseAddr => 1, 1889 Listen => 5, 1890 SSL_verify_mode => 0x02, 1891 SSL_use_cert => 1, 1892 SSL_version => "TLSv1", 1893 SSL_key_file => "spamassassin/server-key.pem", 1894 SSL_cert_file => "spamassassin/server-cert.pem", 1895 ); 1896 } 1897 else { 1898 $serv_socket = IO::Socket::INET->new( 1899 LocalAddr => $localaddr, 1900 LocalPort => $port, 1901 ReuseAddr => 1, 1902 Listen => 5, 1903 ); 1904 } 1905 1906 die "Could not create socket: $!\n" unless $serv_socket; 1907 1908 if ($opt_progress) { 1909 status('server ready for connections'); 1910 } 1911 1912 # Setup out "what messages have been sent out" hashes 1913 my $timestamps = {}; 1914 my $msgsout = { 'curnum' => 0 }; 1915 1916 # Generate an IO::Select object and put the server socket on the queue 1917 my $select = IO::Select->new( $serv_socket ); 1918 1919 # We'll keep looping while there's something to pay attention to 1920 while ($select->count()) { 1921 # Sit and block until there's something for us to read from 1922 foreach my $socket ($select->can_read()) { 1923 if ($socket == $serv_socket) { 1924 # it's the server socket, go ahead and accept the connection and add 1925 # it to the queue. 1926 $select->add($serv_socket->accept); 1927 } 1928 else { 1929 # it's some client, so deal with the request 1930 my($type, $URI, $headers, $postdata) = handle_http_request($socket); 1931 1932 # we don't do GET, so just send something back 1933 if ($type eq 'GET') { 1934 if ($opt_noisy) { 1935 status('GET request from '.$socket->peerhost); 1936 } 1937 1938 http_response($socket, "200 OK", { 1939 'Content-type' => 'text/plain', 1940 }, 1941 "Your GET request came from IP Address: ".$socket->peerhost."\n"); 1942 } 1943 elsif ($type eq 'POST') { 1944 # ooh, POST. deal with any results that the client sent 1945 handle_post_results($postdata, $timestamps, $msgsout); 1946 1947 if ($opt_noisy) { 1948 status('POST request from '.$socket->peerhost); 1949 } 1950 1951 # based on the number of messages that the client requested, 1952 # generate a gzip file with the appropriate data in it 1953 my $messages = ''; 1954 if ($postdata->{'max_messages'}) { 1955 my $msgnum = $postdata->{'max_messages'}; 1956 if ($msgnum > $opt_cs_max || $msgnum < 1) { 1957 $msgnum = $opt_cs_max; 1958 } 1959 1960 if ($opt_noisy) { 1961 status('client requested '.$postdata->{'max_messages'}.' messages'); 1962 } 1963 1964 $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'}, $headers->{'client-id'}); 1965 } 1966 1967 # $messages will contain the path to the gzip file if there are 1968 # messages to send out. 1969 if ($messages && open(MSG, $messages)) { 1970 $t_first_msg = time unless $t_first_msg; 1971 binmode(MSG); 1972 local $/ = undef; # go go slurp mode 1973 1974 # send the response 1975 http_response($socket, "200 OK", { 1976 'Content-Type' => 'application/x-gzip', 1977 'Content-Encoding' => 'x-gzip', 1978 'Client-ID' => $headers->{'client-id'}, 1979 }, 1980 scalar <MSG>); 1981 1982 close(MSG); 1983 1984 # we don't need the file anymore, so get rid of it 1985 unlink $messages; 1986 } 1987 elsif (!keys %{$msgsout} && !defined $tmpfd) { 1988 # we have no more outstanding messages and our original queue of 1989 # messages to process is empty, so tell the client to exit. 1990 http_response($socket, "200 OK", { 1991 "Content-type" => "text/plain", 1992 'Client-ID' => $headers->{'client-id'}, 1993 "Finished" => 1, 1994 }, 1995 'We are all done'); 1996 $t_last_msg = time; 1997 } 1998 else { 1999 # when in doubt, treat this like a GET 2000 http_response($socket, "200 OK", { 2001 "Content-type" => "text/plain", 2002 'Client-ID' => $headers->{'client-id'}, 2003 }, 2004 "Your POST request (sans max_messages) came from IP Address: ".$socket->peerhost."\n"); 2005 } 2006 } 2007 else { 2008 # for error, "501 Not Implemented" 2009 http_response($socket, '501 Not Implemented', {}, ''); 2010 } 2011 2012 # ok, we don't do keepalive, so get rid of the socket 2013 $select->remove($socket); 2014 $socket->close; 2015 } 2016 } 2017 2018 if ($opt_noisy) { 2019 status((exists $msgsout->{'curnum'} ? (scalar(keys %{$msgsout})-1) : 2020 scalar(keys %{$msgsout})).' messages outstanding'); 2021 } 2022 2023#print "msgs waiting: ".join(" ", keys %{$msgsout})."\n"; 2024#print "tmpfd defined? ".(defined $tmpfd ? "yes" : "no")."\n"; 2025 2026 # we're not awaiting responses and we've exhausted the input file, so 2027 # drop the server socket. :) 2028 $select->remove($serv_socket) if (!keys %{$msgsout} && !defined $tmpfd); 2029 } 2030 2031 # remove the server and client message cache temp files 2032 foreach my $tied_hash (keys %server_caches) { 2033 undef $tied_hash; 2034 } 2035 foreach my $tied_hash (keys %client_caches) { 2036 undef $tied_hash; 2037 } 2038 foreach my $tmp_file (@cache_tmp_files) { 2039 unlink $tmp_file; 2040 } 2041} 2042 2043# this is the function that implements client mode. generally, in a loop: 2044# make a request of the server for some max number of messages, and send our 2045# results back at the same time. based on the results of that request, put 2046# messages into a temp dir and process them. prep the results and loop. 2047# lather, rinse, repeat. 2048sub client_mode { 2049 $opt_cs_max ||= 1000; 2050 $opt_cs_timeout ||= 60 * 2; 2051 $opt_cs_conn_retries ||= 60; # 1 hour 2052 2053 my($host, $port, $uri); 2054 2055 if ($opt_client =~ /^http:\/\/([^\/]+)(\/.*)?/) { 2056 ($host, $uri) = ($1,$2); 2057 } else { 2058 $host = $opt_client; 2059 if ($host =~ /^:/) { 2060 $host = 'localhost'.$host; 2061 } 2062 } 2063 ($host, $port) = split(/:/, $host); 2064 2065 die "No host found in opt_client" unless $host; 2066 $uri ||= "/"; 2067 2068 # ::SSL needs resolved hostnames, at least on Solaris. note: not IPv6-aware 2069 $host = Socket::inet_ntoa(Socket::inet_aton($host)); 2070 2071 # use this to track how many messages we ought to be requesting 2072 # start at 100 to get warmed up 2073 my $msgnum = $opt_cs_max > 100 ? 100 : $opt_cs_max; 2074 2075 my $tmpdir; 2076 2077 # if we're not doing paths_only, create a temp dir where we'll put the 2078 # incoming messages to process. 2079 if (!$opt_cs_paths_only) { 2080 $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir(); 2081 die "Can't create tempdir" unless $tmpdir; 2082 } 2083 2084 my $made_conn_once = 0; 2085 2086 # keep going until something stops us. 2087 while (1) { 2088 # send cache data if this is the first connect 2089 # do this before creating the connection so we don't waste all the other 2090 # clients' time, once we've connected, by monopolizing the single server 2091 my ($gzpath, $gzfd, $action); 2092 if (!$made_conn_once && $opt_cs_schedule_cache) { 2093 # we need to find out what messages we have in our client cache so that we 2094 # can tell the server what we've got so that it can optimize our cache hit 2095 # rate by requesting us to scan messages that are already in our cache 2096 $action = 'sending cache'; 2097 $gzpath = scan_client_cache(); 2098 } 2099 else { 2100 $action = 'sending results'; 2101 # if the number of messages to request is too much, bring it down 2102 $msgnum = $opt_cs_max if ($msgnum > $opt_cs_max); 2103 2104 # prep the POST request 2105 $postdata{'max_messages'} = $msgnum; 2106 $postdata{'paths_only'} = 1 if ($opt_cs_paths_only); 2107 2108 # compress the results 2109 ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 2110 die "Can't make tempfile, exiting" unless $gzpath; 2111 $gzfd->close; 2112 $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!"; 2113 2114 # the actual POST data string 2115 send_line($gzfd, join('&', map { post_encode($_) . '=' . post_encode($postdata{$_}) } keys %postdata)) || 2116 die "mass-check: error when writing to gz temp file\n"; 2117 $gzfd->close; 2118 %postdata = (); 2119 } 2120 2121 # connect to server 2122 my $socket; 2123 if ($opt_cs_ssl) { 2124 $socket = IO::Socket::SSL->new( 2125 PeerAddr => $host, 2126 PeerPort => $port, 2127 SSL_version => "TLSv1", 2128 SSL_use_cert => 1, 2129 SSL_key_file => "spamassassin/client-key.pem", 2130 SSL_cert_file => "spamassassin/client-cert.pem", 2131 ); 2132 } else { 2133 $socket = IO::Socket::INET->new( 2134 PeerAddr => $host, 2135 PeerPort => $port 2136 ); 2137 } 2138 2139 if (!$socket) { 2140 unlink $gzpath; 2141 undef $gzfd; 2142 # if we haven't yet made a connection, keep retrying; 2143 # this is probably the server still in "scan stage" 2144 if (!$made_conn_once) { 2145 if ($opt_cs_conn_retries-- > 0) { 2146 status('failed to connect, sleeping for retry') if ($opt_noisy); 2147 sleep 60; 2148 next; 2149 } else { 2150 status('failed to connect, giving up') if ($opt_noisy); 2151 last; 2152 } 2153 } 2154 2155 # last if connection fails after scanning something 2156 last; 2157 } 2158 2159 $made_conn_once = 1; 2160 status("requesting $msgnum messages from server") if ($opt_noisy); 2161 2162 # make request, include and then drop results if there are any 2163 my $POSTDATA = ''; 2164 if ($gzpath) { 2165 if (open(RESULTS, $gzpath)) { 2166 binmode(RESULTS); 2167 { 2168 # slurp here, rather than into an anonymous variable in the 2169 # http_make_request call so that we don't end up slurping the 2170 # response from the server too 2171 local $/ = undef; # slurp 2172 $POSTDATA = scalar <RESULTS>; 2173 } 2174 close(RESULTS); 2175 unlink $gzpath; 2176 undef $gzfd; 2177 } else { 2178 die "Can't open tempfile, exiting" unless $gzpath; 2179 } 2180 } 2181 2182 my $result = http_make_request($socket, 'POST', $uri, { 2183 'Host' => $host, 2184 'Content-Type' => 'application/x-www-form-urlencoded', 2185 'Content-Encoding' => 'x-gzip', 2186 'Action' => $action, 2187 'Client-ID' => ($client_id || '') 2188 }, 2189 $POSTDATA 2190 ); 2191 undef $POSTDATA; 2192 2193 $t_last_msg = time; 2194 2195 # If we received messages to run through, go ahead and do it. 2196 # otherwise, just sleep for the timeout length and try again 2197 if (!defined $result) { 2198 # we got an error?!? abort! 2199 last; 2200 } 2201 elsif ($result eq 'finished') { 2202 # the server said that we're done 2203 status('server has no more work, exiting.') if ($opt_noisy); 2204 last; 2205 } 2206 elsif ($result eq '') { 2207 # no messages means the server may give us more work down the road. 2208 # sleep for client_timeout seconds and try the request again 2209 status("Received no messages, waiting $opt_cs_timeout seconds") if ($opt_noisy); 2210 sleep $opt_cs_timeout; 2211 } 2212 else { 2213 # we got messages, so deal with them. 2214 my $time_start = time; 2215 $t_first_msg = $time_start unless $t_first_msg; 2216 2217 # postdata will hold our results, real will hold the original message 2218 # data from the server's scan mode. 2219 %postdata = (); 2220 %real = (); 2221 $init_results = $total_count = $spam_count = $ham_count = 0; 2222 2223 # we got a result, so do things with it! 2224 my $gzfd = IO::Zlib->new($result, "rb"); 2225 die "Can't open temp result file: $!" unless $gzfd; 2226 2227 # used for the temp queue file 2228 my $tmppath; 2229 ($tmppath, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 2230 die "Can't make tempfile, exiting" unless $tmppath; 2231 unlink $tmppath; 2232 2233 # if we have a temp directory, clean it out for this run 2234 clean_dir($tmpdir) if ($tmpdir); 2235 2236 # Archive format, gzip compressed file w/ 4 parts per message: 2237 # 1- server message number in text format 2238 # 2- server index string, binary packed format 2239 # 3- a 1 if the message is included, 0 otherwise -- unless paths_only 2240 # 4- message content -- unless paths_only or #3 is 0 2241 2242 # number of messages 2243 $msgnum = $total_messages = read_line($gzfd) || die "mass-check: error reading from gzip message file\n"; 2244 2245 status("server says it gave us $total_messages messages") if ($opt_progress); 2246 my $actual_messages = 0; 2247 2248 # loop through and prep all of the messages the server sent 2249 for(my $i = 0 ; $i < $total_messages; $i++ ) { 2250 my $num = read_line($gzfd); 2251 last unless defined $num; 2252 2253 next unless $num; # a message number of 0 indicates a msg-error on the server 2254 2255 my $index = read_line($gzfd); 2256 last unless defined $index; 2257 2258 my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index); 2259 2260 # if we're doing paths_only, there'll be no message content 2261 if (!$opt_cs_paths_only) { 2262 my $msg_included = read_line($gzfd); 2263 last unless defined $msg_included; 2264 2265 my $msg; 2266 if ($msg_included == 1) { 2267 $msg = read_line($gzfd); 2268 last unless defined $msg; 2269 } 2270 2271 # permanently cache the message on the client 2272 my $cache_success = 0; 2273 if ($opt_cs_cache) { 2274 unless (-f "$opt_cs_cachedir/$d[3]") { 2275 umask 0077; # prevent others on the system from reading corpus messages 2276 # TODO: the entire mass-check script should probably use umask 0077; 2277 2278 mkdir $opt_cs_cachedir; # mkdir won't bitch if it already exists, that's ok 2279 if (-d $opt_cs_cachedir && $d[3] =~ m{^/?(.*(?![^\\]\\)/)?(.+)$}) { 2280 my ($path, $filename) = ($1, $2); 2281 my $dir = ''; 2282 $path ||= ''; 2283 while ($path =~ m#((?![^\\]\\)[^/]+/)#gc) { 2284 $dir .= $1; 2285 mkdir "$opt_cs_cachedir/$dir"; 2286 } 2287 if (-d "$opt_cs_cachedir/$dir") { 2288 if (open(OUT, ">$opt_cs_cachedir/$dir$filename")) { 2289 print OUT $msg; 2290 close $msg; 2291 2292 $cache_success = 1; 2293 } 2294 else { 2295 warn "Can't create/write message cache file $opt_cs_cachedir/$dir$filename: $!"; 2296 } 2297 } 2298 } 2299 } 2300 else { 2301 $cache_success = 1; 2302 $cache_hits++; 2303 } 2304 if ($cache_success) { 2305 # this is a little tricky -- we need to process the files in the 2306 # path and format we've created, but the original data is needed 2307 # to create a proper result later, so deal with that here. 2308 $real{"$opt_cs_cachedir/$d[3]"} = \@d; 2309 push @{$real{"$opt_cs_cachedir/$d[3]"}}, $num; 2310 send_line($tmpfd, 2311 Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$opt_cs_cachedir/$d[3]")) || 2312 die "mass-check: error writing out temp file in client mode\n"; 2313 $actual_messages++; 2314 } 2315 } 2316 2317 # if the cache failed try a temp file instead 2318 if (!$opt_cs_cache || !$cache_success) { 2319 next unless defined $msg; 2320 # it's going to be a dir of file formatted messages 2321 if (open(OUT, ">$tmpdir/$num")) { 2322 print OUT $msg; 2323 close(OUT); 2324 2325 # this is a little tricky -- we need to process the files in the 2326 # path and format we've created, but the original data is needed 2327 # to create a proper result later, so deal with that here. 2328 $real{"$tmpdir/$num"} = \@d; 2329 push @{$real{"$tmpdir/$num"}}, $num; 2330 send_line($tmpfd, 2331 Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$tmpdir/$num")) || 2332 die "mass-check: error writing out temp file in client mode\n"; 2333 $actual_messages++; 2334 } 2335 else { 2336 warn "Can't create/write message temp file $tmpdir/$num: $!"; 2337 } 2338 } 2339 } 2340 # paths_only_mode 2341 else { 2342 # permanently cache the message on the client 2343 my $cache_success = 0; 2344 if ($opt_cs_cache) { 2345 unless (-f "$opt_cs_cachedir/$d[3]") { 2346 umask 0077; # prevent others on the system from reading corpus messages 2347 # TODO: the entire mass-check script should probably use umask 0077; 2348 2349 mkdir $opt_cs_cachedir; # mkdir won't bitch if it already exists, that's ok 2350 if (-d $opt_cs_cachedir && $d[3] =~ m{^/?(.*(?![^\\]\\)/)?(.+)$}) { 2351 my ($path, $filename) = ($1, $2); 2352 my $dir = ''; 2353 $path ||= ''; 2354 while ($path =~ m#((?![^\\]\\)[^/]+/)#gc) { 2355 $dir .= $1; 2356 mkdir "$opt_cs_cachedir/$dir"; 2357 } 2358 if (-d "$opt_cs_cachedir/$dir") { 2359 # copy the file to the local cache for use next (and this) time 2360 if (copy($d[3], "$opt_cs_cachedir/$d[3]")) { 2361 $cache_success = 1; 2362 } 2363 } 2364 } 2365 } 2366 else { 2367 $cache_success = 1; 2368 $cache_hits++; 2369 } 2370 if ($cache_success) { 2371 # this is a little tricky -- we need to process the files in the 2372 # path and format we've created, but the original data is needed 2373 # to create a proper result later, so deal with that here. 2374 $real{"$opt_cs_cachedir/$d[3]"} = \@d; 2375 push @{$real{"$opt_cs_cachedir/$d[3]"}}, $num; 2376 send_line($tmpfd, 2377 Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$opt_cs_cachedir/$d[3]")) || 2378 die "mass-check: error writing out temp file in client mode\n"; 2379 $actual_messages++; 2380 } 2381 } 2382 2383 # if the messages isn't in our cache use the supplied path instead 2384 if (!$opt_cs_cache || !$cache_success) { 2385 # in paths_only mode, there's no kluging between formats since we're 2386 # reading the same corpus, however we do still need to track server 2387 # message number to message data so our results will be useable. 2388 $real{$d[3]} = \@d; 2389 push @{$real{$d[3]}}, $num; 2390 send_line($tmpfd, $index) || 2391 die "mass-check: error writing out temp file in client mode\n"; 2392 $actual_messages++; 2393 } 2394 } 2395 } 2396 2397 $gzfd->close; 2398 unlink $result; 2399 2400 # if the server tries to give us messages, but some are errored, 2401 # we need to know that so we don't try to process them all. 2402 if ($total_messages != $actual_messages) { 2403 status("server actually gave us $actual_messages messages") if ($opt_progress); 2404 $total_messages = $actual_messages; 2405 } 2406 2407 if ($opt_progress) { 2408 status('starting run stage'); 2409 } 2410 2411 # we're about to start running, so go back to the start of the file 2412 seek $tmpfd, 0, 0; 2413 2414 run_through_messages(); 2415 2416 # we're done with the temp file -- bye bye 2417 close($tmpfd); 2418 2419 # figure out new max messages, try keeping ~cs_timeout between runs 2420 my $time_end = time; 2421 2422 # if we only requested a small number of messages, it may take <1s to 2423 # run through them, so fake it and say it took 1s. 2424 if ($time_end == $time_start) { 2425 $time_end++; 2426 } 2427 2428 if ($opt_progress) { 2429 status('completed run stage'); 2430 } 2431 2432 status('completed run in '.($time_end-$time_start).' seconds') if ($opt_noisy); 2433 $msgnum = int($msgnum * $opt_cs_timeout / ($time_end-$time_start)) || 1; 2434 } 2435 } 2436 2437 # if we were using a temp dir, clean it out and then remove it 2438 if ($tmpdir) { 2439 clean_dir($tmpdir); 2440 rmdir $tmpdir; 2441 } 2442} 2443 2444# scan the client's cache and return a path to a gzip archive of AI output 2445sub scan_client_cache { 2446 status('starting cache scan stage') if ($opt_progress); 2447 2448 # make sure the cachedir exists so that AI doesn't bail out 2449 mkdir $opt_cs_cachedir unless -e $opt_cs_cachedir; 2450 2451 # Make a temp file and delete it 2452 my $tmpf; 2453 ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 2454 die 'mass-check: failed to create temp file' unless $tmpf; 2455 unlink $tmpf or die "mass-check: unlink '$tmpf': $!"; 2456 2457 my @targets = ("h:dir:$opt_cs_cachedir"); 2458 generate_queue(\@targets, $tmpfd); 2459 2460 # we now have a temporary file with the messages to process 2461 seek($tmpfd, 0, 0); 2462 # the first line is the number of messages 2463 $total_messages = read_line($tmpfd); 2464 showdots_finish(); 2465 status("completed cache scan stage, $total_messages messages") if ($opt_progress); 2466 2467 unless ($tmpfd) { 2468 status('cache scan failed'); 2469 return; 2470 } 2471 status('compressing cache data') if ($opt_progress); 2472 2473 # create a temp file for compression 2474 my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); 2475 die "Can't make tempfile, exiting" unless $gzpath; 2476 $gzfd->close; 2477 $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!"; 2478 2479 while (my $msg = read_line($tmpfd)) { 2480 $msg =~ s/\r?\n$//; 2481 my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg); 2482 $d[3] =~ s/^$opt_cs_cachedir//; 2483 2484 send_line($gzfd, Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], $d[2], $d[3])) || 2485 die "mass-check: error writing out temp file in client mode\n"; 2486 } 2487 close $tmpfd; 2488 undef $tmpfd; 2489 $gzfd->close; 2490 status('cache data compressed to '.(-s $gzpath).' bytes') if ($opt_progress); 2491 2492 return $gzpath; 2493} 2494 2495############################################################################ 2496 2497# in server mode, just return the ref to the message data 2498sub wanted_server { 2499 my ($class, $id, $time, $dataref, $format) = @_; 2500 return $dataref; 2501} 2502 2503# very similar to result() except the result has the message number at the 2504# front, so strip it off and then set the POST data appropriately. 2505sub result_client { 2506 my ($class, $result, $time) = @_; 2507 2508 # don't open results files until we get here to avoid overwriting files 2509 init_results() if !$init_results; 2510 2511 if ($class eq "s") { 2512 $spam_count++; 2513 } 2514 elsif ($class eq "h") { 2515 $ham_count++; 2516 } 2517 2518 $total_count++; 2519 2520 if ($opt_progress) { 2521 progress($time); 2522 } 2523 2524 if ($result =~ s/^(\d+)\s+//m) { 2525 $postdata{$1} = $result; 2526 } 2527 else { 2528 warn ">> WTH!? result is not in the correct format: $result\n"; 2529 # 20071114: bit of a hack 2530 # prevent malformed cs_client message result lines from preventing the 2531 # cs_server running in cs_schedule cache mode from completing 2532 # TODO: find out how result lines get malformed (malformed result lines 2533 # will still hang any cs_server not running with cs_schedule_cache) 2534 $failed_msgs++; 2535 } 2536} 2537 2538sub aidbg { 2539 if (would_log("dbg", "mass-check") == 2) { 2540 dbg (@_); 2541 } 2542} 2543 2544sub deal_with_before_after { 2545 my($which, $time) = @_; 2546 2547 if ($time && $time =~ /^-\d+$/) { 2548 $time = time + $time; 2549 } 2550 elsif ($time && $time !~ /^-?\d+$/) { 2551 if (HAS_TIME_PARSEDATE) { 2552 $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1); 2553 } 2554 else { 2555 die "You need Time::ParseDate if you use either the --before or --after option."; 2556 } 2557 } 2558 2559 if ($which eq 'before') { 2560 $opt_before = $time; 2561 } 2562 else { 2563 $opt_after = $time; 2564 } 2565 2566 if ($opt_before && $opt_after && $opt_after >= $opt_before) { 2567 die "--before ($opt_before) <= --after ($opt_after) -- conflict!"; 2568 } 2569} 2570 2571sub generate_queue { 2572 my ($targets, $tmpfd) = @_; 2573 2574 # scan the targets and get the number and list of messages 2575 $iter->_scan_targets($targets, 2576 sub { 2577 my($self, $date, $class, $format, $mail) = @_; 2578 push(@{$self->{$class}}, Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $mail)); 2579 } 2580 ); 2581 2582 # deal with opt_head and opt_tail 2583 top_and_tail_messages($iter->{h}); 2584 top_and_tail_messages($iter->{s}); 2585 2586 my $messages; 2587 if ($opt_n) { 2588 # OPT_N == 1 means don't bother sorting on message receive date 2589 2590 # for ease of memory, we'll play with pointers 2591 $messages = $iter->{s}; 2592 undef $iter->{s}; 2593 if ($iter->{h}) { 2594 push(@{$messages}, @{$iter->{h}}); 2595 undef $iter->{h}; 2596 } 2597 } 2598 else { 2599 # OPT_N == 0 means sort on message receive date 2600 2601 # Sort the spam and ham groups by date 2602 my @s = @{$iter->{s}}; 2603 undef $iter->{s}; 2604 my @h = @{$iter->{h}}; 2605 undef $iter->{h}; 2606 2607 # interleave ordered spam and ham 2608 if (@s && @h) { 2609 my $ratio = @s / @h; 2610 while (@s && @h) { 2611 push @{$messages}, (@s / @h > $ratio) ? (shift @s) : (shift @h); 2612 } 2613 } 2614 # push the rest onto the end 2615 push @{$messages}, @s, @h; 2616 } 2617 2618 # head or tail < 0 means crop the total list, negate the value appropriately 2619 if ($opt_tail < 0) { 2620 splice(@{$messages}, 0, $opt_tail); 2621 } 2622 if ($opt_head < 0) { 2623 splice(@{$messages}, -$opt_head); 2624 } 2625 2626 my $num = $Mail::SpamAssassin::ArchiveIterator::MESSAGES = scalar(@{$messages}); 2627 2628 # Dump out the number of messages and the message index info to 2629 # the temp file 2630 send_line($tmpfd, $num, @{$messages}); 2631} 2632 2633sub top_and_tail_messages { 2634 my ($ary) = @_; 2635 2636 if ($opt_n) { 2637 # OPT_N == 1 means don't bother sorting on message receive date 2638 2639 # head or tail > 0 means crop each list 2640 if ($opt_tail > 0) { 2641 splice(@{$ary}, 0, -$opt_tail); 2642 } 2643 if ($opt_head > 0) { 2644 splice(@{$ary}, min ($opt_head, scalar @{$ary})); 2645 } 2646 } 2647 else { 2648 # OPT_N == 0 means sort on message receive date 2649 2650 # Sort the spam and ham groups by date 2651 my @s = sort { $a cmp $b } @{$ary}; 2652 2653 # head or tail > 0 means crop each list 2654 if ($opt_tail > 0) { 2655 splice(@s, 0, -$opt_tail); 2656 } 2657 if ($opt_head > 0) { 2658 splice(@s, min ($opt_head, scalar @s)); 2659 } 2660 2661 @{$ary} = @s; 2662 } 2663} 2664 2665sub min { 2666 return ($_[0] < $_[1] ? $_[0] : $_[1]); 2667} 2668 2669############################################################################ 2670 2671sub run_post_scan { 2672 return unless $opt_run_post_scan; 2673 system($opt_run_post_scan); 2674 if ($? >> 8 != 0) { 2675 warn "$opt_run_post_scan failed"; 2676 } 2677} 2678 2679