1#!/usr/bin/perl -w
2use strict;
3use MIME::Base64;
4
5#
6# <@LICENSE>
7# Licensed to the Apache Software Foundation (ASF) under one or more
8# contributor license agreements.  See the NOTICE file distributed with
9# this work for additional information regarding copyright ownership.
10# The ASF licenses this file to you under the Apache License, Version 2.0
11# (the "License"); you may not use this file except in compliance with
12# the License.  You may obtain a copy of the License at:
13#
14#     http://www.apache.org/licenses/LICENSE-2.0
15#
16# Unless required by applicable law or agreed to in writing, software
17# distributed under the License is distributed on an "AS IS" BASIS,
18# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19# See the License for the specific language governing permissions and
20# limitations under the License.
21# </@LICENSE>
22
23sub aidbg;
24
25sub usage {
26  my $status = shift;
27
28  my $out = $status ? \*STDERR : \*STDOUT;
29  print $out <<EOF;
30usage: mass-check [options] target ...
31
32  -c=file       set configuration/rules directory
33  -p=dir        set user-prefs directory
34  -f=file       read list of targets from <file>
35  -j=jobs       specify the number of processes to run simultaneously
36  --net         turn on network checks
37  --mid         report Message-ID from each message
38  --debug[=LIST] report debugging information (default is all facilities, LIST
39                is a comma-separated list of facilities)
40  --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
41  --rules=RE    Only test rules matching the given regexp RE
42  --restart=N   restart all of the children after processing N messages
43  --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
44                were encapsulated by servers matching the regexp RE
45                (default = extract all SpamAssassin-encapsulated mails)
46  --lint        check rules for syntax before running
47  --cf='config line'  Additional line of configuration
48  --pre='config line' Additional line of ".pre" (prepended to configuration)
49  --run_post_scan='command'  Run the named command after the 'scan' stage,
50                before starting the 'run' stage
51
52  verbosity options
53  --progress    show progress updates during check
54  --noisy       show noisier progress updates during check
55  --showdots    print a dot for each scanned message
56
57  client/server mode options
58  --server=host:port
59                use server mode, running on the given hostname and port
60  --client=host:port
61  		use client mode, connecting to the given hostname and port
62  --cs_conn_retries=N
63		only used in client mode. set the number of times to retry
64		the initial connection to the server, while waiting 60
65		seconds between connection attempts, default is 60 retries
66  --cs_max=N
67  		at most, only ever request (client)/give out (server) a
68		maximum of N messages (defaults to 1000)
69  --cs_timeout=N
70  		in client mode, try to connect to the server every N seconds
71		defaults to 120
72		in server mode, timeout messages after N seconds
73		defaults to 300
74  --cs_paths_only
75		only used in client mode.  when making requests of the
76		server, only ask for paths to the messages and not the
77		messages themselves.  useful when the client and server
78		have the same paths to the corpus data.
79  --cs_ssl      use SSL to encrypt on-the-wire client-server traffic
80                (requires IO::Socket::SSL, see
81                http://wiki.apache.org/spamassassin/SslMassCheck for more)
82  --cs_verbose  Log network bandwidth utilization figures and other statistics
83  --cs_schedule_cache
84                Distribute messages so that they are checked on clients who
85                have the messages in their local message cache, implies
86                --cs_cache; if --cs_schedule_cache is not enabled, but
87                --cs_cache is, clients running in --cs_paths_only mode will
88                opportunistically use the messages from their local cache
89  --cs_cache	in client mode, enable the local message cache
90                in server mode, allows the clients to use cached messages
91                and/or add to their local caches
92  --cs_cachedir=dir
93                write cache info for --cs_cache in this directory tree
94  --cs_max_tries=N
95                maximum number of attempts to have a client scan a message
96                defaults to 3
97
98  log options
99  -o            write all logs to stdout
100  --loghits     log the text hit for patterns (useful for debugging)
101  --loguris	log the URIs found
102  --logmem	log the memory delta (only on Linux)
103  --hamlog=log  use <log> as ham log ('ham.log' is default)
104  --spamlog=log use <log> as spam log ('spam.log' is default)
105
106  message selection options
107  -n            no date sorting or spam/ham interleaving
108  --cache	use cache information when selecting messages
109  --cachedir=dir write cache info for --cache in this directory tree
110  --all         don't skip big messages
111
112  message selection options, can be specified for each target
113  --after=N     only test mails received after time_t N (negative values
114                are an offset from current time, e.g. -86400 = last day)
115                or after date as parsed by Time::ParseDate (e.g. '-6 months')
116  --before=N    same as --after, except received times are before time_t N
117  --scanprob=N  probability of scanning a message, range 0.0 - 1.0 (default: 1.0)
118
119  message selection options, can be specified for each target class
120  --head=N      only check first N ham and N spam (N messages if -n used)
121  --tail=N      only check last N ham and N spam (N messages if -n used)
122
123  simple target options (implies -o and no ham/spam classification)
124  --dir         subsequent targets are directories
125  --file        subsequent targets are files in RFC 822 format
126  --mbox        subsequent targets are mbox files
127  --mbx         subsequent targets are mbx files
128
129  Just left over functions we should remove at some point:
130  --bayes       report score from Bayesian classifier
131
132  options used during score generation process
133  --learn=N     learn N% of messages as spam or ham
134  --reuse       reuse network checks if X-Spam-Status: is present in messages
135                (note: both clients and servers in c/s mode need this)
136
137  non-option arguments are used as target names (mail files and folders),
138  the target format is: <class>:<format>:<location>
139  <class>       is "spam" or "ham"
140  <format>      is "dir", "file", "mbx", "mbox", or "detect"
141  <location>    is a file or directory name.  globbing of ~ and * is supported
142
143(see http://wiki.apache.org/spamassassin/MassCheck for more details.)
144
145EOF
146  exit($status);
147}
148
149###########################################################################
150
151our ($opt_c, $opt_p, $opt_f, $opt_j, $opt_n, $opt_o, $opt_all, $opt_bayes,
152     $opt_debug, $opt_format, $opt_hamlog, $opt_head, $opt_loghits,
153     $opt_mid, $opt_net, $opt_nosort, $opt_progress, $opt_showdots,
154     $opt_spamlog, $opt_tail, $opt_rules, $opt_restart, $opt_loguris,
155     $opt_logmem, $opt_after, $opt_before, $opt_rewrite, $opt_deencap,
156     $opt_learn, $opt_reuse, $opt_lint, $opt_cache, $opt_noisy, $opt_cf,
157     $total_messages, $statusevery, $opt_cachedir, $opt_scanprob,
158     $opt_client, $opt_cs_conn_retries, $opt_cs_max, $opt_cs_timeout,
159     $opt_cs_paths_only, $opt_server, %postdata, %real, $svn_revision,
160     $opt_cs_ssl, $opt_run_post_scan, $opt_cs_verbose, %client_caches,
161     %server_caches, @cache_tmp_files, %min_other_caches,
162     %unique_cache_completed, $opt_cs_schedule_cache, $opt_cs_cache,
163     $opt_cs_cachedir, $opt_cs_max_tries, $opt_pre,
164     $tmpfd);
165
166use FindBin;
167
168# use "blib" so that we can use e.g. @@LOCAL_STATE_DIR@@; "lib" doesn't
169# have that stuff substituted :(  use lib too, though, as a backup,
170# since some users might be running mass-check without "make" first
171use lib "$FindBin::Bin/../lib";
172use lib "$FindBin::Bin/../blib/lib";
173
174use IO::Select;
175use IO::Socket;
176use Socket qw();
177use Mail::SpamAssassin::ArchiveIterator;
178use Mail::SpamAssassin;
179use Mail::SpamAssassin::Logger;
180use File::Copy;
181use File::Spec;
182use Getopt::Long;
183use POSIX qw(strftime);
184use Fcntl qw(O_RDWR O_CREAT);;
185use Config;
186
187use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
188use constant HAS_IO_ZLIB => eval { require IO::Zlib; };
189use constant HAS_IO_SOCKET_SSL => eval { require IO::Socket::SSL; };
190use constant HAS_TIME_HI_RES => eval { require Time::HiRes; };
191use constant HAS_SDBM_FILE => eval { require SDBM_File; };
192
193
194# default settings
195$opt_c = "$FindBin::Bin/../rules";
196$opt_p = "$FindBin::Bin/spamassassin";
197$opt_j = 1;
198$opt_head = 0;
199$opt_tail = 0;
200$opt_net = 0;
201$opt_hamlog = "ham.log";
202$opt_spamlog = "spam.log";
203$opt_learn = 0;
204$opt_cf = [];
205$opt_pre = [];
206
207my $rcvd_bytes = 0;
208my $sent_bytes = 0;
209my $t_first_msg = 0;
210my $t_last_msg = 0;
211my $msgs_processed = 0;
212my $failed_msgs = 0;
213my $cache_hits = 0;
214my $client_id = 0;
215
216my @ORIG_ARGV = @ARGV;
217GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug:s",
218	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
219	   "progress!", "rewrite:s", "showdots", "spamlog=s", "tail=i",
220	   "rules=s", "restart=i", "loguris", "run_post_scan=s",
221	   "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache",
222           "cachedir=s", "noisy", "scanprob=f",
223	   "server=s", "cs_max=i", "cs_timeout=i", "cs_conn_retries=i",
224	   "cs_paths_only", "client=s", "cs_ssl", "cs_verbose",
225           "cs_schedule_cache", "cs_cache", "cs_cachedir=s", "cs_max_tries=i",
226	   "before=s" => \&deal_with_before_after,
227	   "after=s" => \&deal_with_before_after,
228	   "cf=s" => \@{$opt_cf},
229	   "pre=s" => \@{$opt_pre},
230	   "dir" => sub { $opt_format = "dir"; },
231	   "file" => sub { $opt_format = "file"; },
232	   "mbox" => sub { $opt_format = "mbox"; },
233	   "mbx" => sub { $opt_format = "mbx"; },
234	   "help" => sub { usage(0); },
235	   '<>' => \&target) or usage(1);
236
237# We need IO::Zlib for client-server mode!
238if ( ($opt_client || $opt_server) && ! HAS_IO_ZLIB ) {
239  die "IO::Zlib required for client/server mode!\n";
240}
241
242if ($opt_cs_ssl && ! HAS_IO_SOCKET_SSL ) {
243  die "IO::Socket::SSL required for --cs_ssl!\n";
244}
245
246if ($opt_noisy) {
247  $opt_progress = 1;        # implies --progress
248}
249
250$opt_debug ||= 'all' if defined $opt_debug;
251
252if ($opt_cs_schedule_cache) {
253  $opt_cs_cache = 1;        # implies --cs_cache
254}
255
256if ($opt_client && $opt_cs_cache && !$opt_cs_cachedir) {
257  warn "You must specify a local message cache directory with --cs_cachedir\n".
258       "when using the --cs_cache option.\n";
259  exit;
260}
261
262if ($opt_server && $opt_cs_schedule_cache && !$opt_n) {
263  warn '*'x74 ."\n";
264  warn '*'. ' 'x72 ."*\n";
265  warn "*   Corpus will be run un-sorted but with date stamp loggging which is   *\n";
266  warn "*   needed for score generation log selection but is not available when  *\n";
267  warn "*   using the -n option.  This may affect the results of any bayes and   *\n";
268  warn "*   AWL tests run during this mass-check.                                *\n";
269  warn '*'. ' 'x72 ."*\n";
270  warn '*'x74 ."\n";
271}
272
273if ($opt_cs_schedule_cache && !HAS_SDBM_FILE) {
274  warn "--cs_schedule_cache requires the Perl module SDBM_File.\n";
275  exit;
276}
277
278# --lint
279if ($opt_lint) {
280  # In theory we could probably use the same spamtest object as below,
281  # but since it's probably not expecting that, and we don't want
282  # strange things happening, create a local object.
283  my $spamlint = create_spamtest();
284  $spamlint->debug_diagnostics();
285  my $res = $spamlint->lint_rules();
286  $spamlint->finish();
287  if ($res) {
288    warn "lint: $res issues detected, ".
289        "please rerun with debug enabled for more information\n";
290    exit 1;
291  }
292}
293
294# test messages for the mass-check
295my @targets;
296if (!$opt_client) {
297  if ($opt_f) {
298    open(F, $opt_f) || die "cannot read target $opt_f: $!";
299    push(@targets, map { chomp; $_ } <F>);
300    close(F);
301  }
302  usage(1) if !@targets;
303}
304
305if ($opt_reuse) {
306  # if we have --reuse, don't bother testing DNS; we shouldn't be hitting
307  # the wire at all, and in fact we may be running without a net connection
308  push @{$opt_cf}, "dns_available yes\n";
309
310  # need to load M:SA:Plugin:Reuse
311  push @{$opt_pre}, "loadplugin Mail::SpamAssassin::Plugin::Reuse\n";
312
313}
314
315my $user_prefs = "$opt_p/user_prefs";
316
317sub create_spamtest {
318  return Mail::SpamAssassin->new({
319    'debug'              		=> $opt_debug,
320    'rules_filename'     		=> $opt_c,
321    'site_rules_filename'		=> "$opt_p/local.cf",
322    'userprefs_filename'		=> $user_prefs,
323    'userstate_dir'     		=> $opt_p,
324    'save_pattern_hits'  		=> $opt_loghits,
325    'dont_copy_prefs'   		=> 1,
326    'local_tests_only'   		=> $opt_net ? 0 : 1,
327    'only_these_rules'   		=> $opt_rules,
328    'ignore_safety_expire_timeout'	=> 1,
329    'pre_config_text'                   => join("\n", @{$opt_pre})."\n",
330    'post_config_text'                  => join("\n", @{$opt_cf})."\n",
331    PREFIX				=> '',
332    DEF_RULES_DIR        		=> $opt_c,
333    LOCAL_RULES_DIR      		=> '',
334  });
335}
336
337my $spamtest = create_spamtest();
338$spamtest->compile_now(0);      # 0 since we will be reading more configs
339$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
340$spamtest->read_scoreonly_config($user_prefs);
341$spamtest->call_plugins("prefork_init");  # since SA 3.4.0
342
343my $who = `id -un 2>/dev/null`;
344my $where = `uname -n 2>/dev/null`;
345my $when = `date -u`;
346my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || $where || `hostname` || 'localhost';
347chomp $who;
348chomp $where;
349chomp $when;
350chomp $host;
351$svn_revision = get_current_svn_revision();
352
353# when displaying the commandline, quote any arguments which have
354# "questionable" characters such as spaces, pipes, etc.
355my $cmdline = join(' ',map { m@[^A-Za-z0-9_/\\.-]@ ? qq/"$_"/ : $_ } @ORIG_ARGV); $cmdline =~ s/\s+/ /gs;
356my $isowhen = strftime("%Y%m%dT%H%M%SZ", gmtime(time)); # better
357
358my $log_header = "# mass-check results from $who\@$where, on $when\n" .
359		 "# M:SA version ".$spamtest->Version()."\n" .
360		 "# SVN revision: $svn_revision\n" .
361                 "# Date: $isowhen\n" .
362		 "# Perl version: $] on $Config{archname}\n" .
363                 "# Switches: '$cmdline'\n";
364
365my $updates = ($opt_noisy ? 100 : 10);
366my $total_count = 0;
367my $spam_count = 0;
368my $ham_count = 0;
369my $init_results = 0;
370
371my $showdots_active = ($opt_showdots || $opt_noisy);
372my $showdots_counter = 0;
373my $showdots_every = ($opt_showdots ? 1 : 20);
374
375my $AIopts = {
376	'opt_all' => $opt_all,
377        'opt_skip_empty_messages' => 1,
378      };
379
380if (!$opt_client) {
381  # Deal with --rewrite
382  if (defined $opt_rewrite) {
383    my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out");
384    open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!";
385  }
386
387  # ArchiveIterator options for non-client mode
388  $AIopts->{'opt_scanprob'} = $opt_scanprob;
389  $AIopts->{'opt_cache'} = $opt_cache;
390  $AIopts->{'opt_cachedir'} = $opt_cachedir;
391  $AIopts->{'opt_after'} = $opt_after;
392  $AIopts->{'opt_before'} = $opt_before;
393  $AIopts->{'scan_progress_sub'} = \&showdots_blip;
394  $AIopts->{'opt_want_date'} = ! $opt_n;
395
396  # ensure that scanprob stuff is predictable and reproducable
397  if (defined $opt_scanprob && $opt_scanprob < 1.0) {
398    srand(1);
399  }
400}
401else {
402  # ArchiveIterator options for client mode -- tends to be simple
403  $opt_n = 1;
404  $AIopts->{'opt_want_date'} = 0;
405}
406
407###########################################################################
408## SCAN MODE
409
410my $iter = Mail::SpamAssassin::ArchiveIterator->new($AIopts);
411
412# setup the AI functions
413if ($opt_client) {
414  $iter->set_functions(\&wanted, \&result_client);
415}
416elsif ($opt_server) {
417  $iter->set_functions(\&wanted_server, \&result);
418}
419else {
420  $iter->set_functions(\&wanted, \&result);
421}
422
423#AFTER SETTING ALL OPTS, RUN _set_default_message_selection_opts() TO GET SANE DEFAULTS
424$iter->_set_default_message_selection_opts();
425
426my $messages;
427
428# normal mode as well as a server do scan mode and get a temp file
429if (!$opt_client) {
430  status('starting scan stage') if ($opt_progress);
431
432  # Make a temp file and delete it
433  my $tmpf;
434  ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile();
435  die 'mass-check: failed to create temp file' unless $tmpf;
436  unlink $tmpf or die "mass-check: unlink '$tmpf': $!";
437
438  # having opt_j or server mode means do scan in a separate process
439  if ($opt_server || $opt_j) {
440    if ($tmpf = fork()) {
441      # parent
442      waitpid($tmpf, 0);
443    }
444    elsif (defined $tmpf) {
445      # child -- process using message_array
446      generate_queue(\@targets, $tmpfd);
447      exit;
448    }
449    else {
450      die "mass-check: cannot fork: $!";
451    }
452  }
453  else {
454    # we get here if opt_j == 0, so scan in this process
455    generate_queue(\@targets, $tmpfd);
456  }
457
458  # we now have a temporary file with the messages to process
459  seek($tmpfd, 0, 0);
460  # the first line is the number of messages
461  $total_messages = read_line($tmpfd);
462
463  if (!$total_messages) {
464    die "mass-check: no messages to process\n";
465  }
466
467  if ($opt_cs_schedule_cache) {
468    # create a tied hash database for the server to use for matching against
469    # client hashes
470
471    # while we're at it, count the total number of messages for ourself
472    # I suspect that AI gets it wrong sometimes, somehow
473    $total_messages = 0;
474
475    # get a temp file for the server database on messages to process
476    my ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile();
477    close $dbfd;
478    unlink $dbpath;
479    push @cache_tmp_files, $dbpath.'.pag';
480    push @cache_tmp_files, $dbpath.'.dir';
481
482    tie %{$server_caches{'to_process'}}, "SDBM_File", $dbpath,
483      O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!";
484
485    # get a temp file for the server DB of messages that are not cached anywhere
486    ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile();
487    close $dbfd;
488    unlink $dbpath;
489    push @cache_tmp_files, $dbpath.'.pag';
490    push @cache_tmp_files, $dbpath.'.dir';
491
492    tie %{$server_caches{'not_cached'}}, "SDBM_File", $dbpath,
493      O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!";
494
495    # get a temp file for the server database of cache hit counts per message
496    ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile();
497    close $dbfd;
498    unlink $dbpath;
499    push @cache_tmp_files, $dbpath.'.pag';
500    push @cache_tmp_files, $dbpath.'.dir';
501
502    tie %{$server_caches{'cache_count'}}, "SDBM_File", $dbpath,
503      O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!";
504
505    # dump the server's list of messages to process to the DB_HASHes
506    while (my $msg = read_line($tmpfd)) {
507      my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg);
508      $server_caches{'to_process'}{$d[3]} = $msg;
509      $server_caches{'not_cached'}{$d[3]} = undef;
510      $total_messages++;
511    }
512    # and back to the first message (although we currently never use it again)
513    seek($tmpfd, 0, 0);
514    read_line($tmpfd);
515  }
516
517  showdots_finish();
518  status("completed scan stage, $total_messages messages") if ($opt_progress);
519}
520
521###########################################################################
522## RUN MODE
523
524run_post_scan();
525
526if ($opt_client) {
527  client_mode();
528}
529else {
530  status('starting run stage') if ($opt_progress);
531
532  if ($opt_server) {
533    server_mode();
534  }
535  else {
536    $t_first_msg = time;
537    run_through_messages();
538    $t_last_msg = time;
539  }
540
541  status('completed run stage') if ($opt_progress);
542}
543
544# Even though we're about to exit, let's clean up after ourselves
545close($tmpfd) if ($tmpfd);
546showdots_finish();
547
548if (defined $opt_rewrite) {
549  close(REWRITE);
550}
551
552$spamtest->finish();
553
554if ($opt_cs_verbose) {
555  if ($opt_client || $opt_server) {
556    warn "network I/O: sent=$sent_bytes, received=$rcvd_bytes\n";
557  }
558  $t_last_msg++ if $t_last_msg == $t_first_msg;
559  warn "processed $msgs_processed messages (failed to scan $failed_msgs) in ".($t_last_msg - $t_first_msg).
560       ' seconds ('.(sprintf("%.1f", ($msgs_processed / ($t_last_msg - $t_first_msg) * 3.6))).
561       " kmsgs/hr)\n";
562  $msgs_processed ||= 1;
563  warn "cache hits: $cache_hits ("
564    .(sprintf("%.1f", ($cache_hits / $msgs_processed * 100)))."%)\n" if $opt_cs_cache;
565}
566
567# exit status: did we check at least one message correctly?
568exit(!($ham_count || $spam_count));
569
570###########################################################################
571
572sub target  {
573  my ($target) = @_;
574
575  # message-selection options; these can now be specified separately
576  # for each target
577  my %selopts = (
578    opt_scanprob => $opt_scanprob,
579    opt_after => $opt_after,
580    opt_before => $opt_before
581  );
582
583  if (!defined($opt_format)) {
584    push(@targets, { %selopts, target => $target });
585  }
586  else {
587    $opt_o = 1;
588    push(@targets, { %selopts, target => "spam:$opt_format:$target" });
589  }
590}
591
592###########################################################################
593
594sub init_results {
595  $init_results = 1;
596
597  showdots_finish();
598
599  # now, showdots only happens if --showdots was used
600  $showdots_active = $opt_showdots;
601
602  if ($opt_progress) {
603    # round up since 100% will be caught at end already
604    $statusevery = int($total_messages / $updates + 1);
605
606    # if $messages < $updates, just give a status line per msg.
607    $statusevery ||= 1;
608  }
609
610  return if $opt_client;
611
612  if ($opt_o) {
613    autoflush STDOUT 1;
614    print STDOUT $log_header;
615  }
616  else {
617    open(HAM, "> $opt_hamlog") || die "open of $opt_hamlog failed: $!";
618    open(SPAM, "> $opt_spamlog") || die "open of $opt_spamlog failed: $!";
619    autoflush HAM 1;
620    autoflush SPAM 1;
621    print HAM $log_header;
622    print SPAM $log_header;
623  }
624}
625
626sub result {
627  my ($class, $result, $time) = @_;
628
629  # don't open results files until we get here to avoid overwriting files
630  init_results() if !$init_results;
631
632  if ($class eq "s") {
633    if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
634    $spam_count++;
635  }
636  elsif ($class eq "h") {
637    if ($opt_o) { print STDOUT $result; } else { print HAM $result; }
638    $ham_count++;
639  }
640
641  $total_count++;
642
643  if ($opt_progress) {
644    progress($time);
645  }
646}
647
648sub wanted {
649  my ($class, $id, $time, $dataref, $format) = @_;
650  my $out = '';
651
652  # if origid is defined, it'll be the message number from server mode
653  my $origid;
654
655  # client mode is a little crazy because we need to kluge around the fact
656  # that the information needed to do the run is different than the
657  # information that goes into the results.
658  if ($opt_client) {
659    # change the format and id to the real version, make sure to remember
660    # the server's message number
661    # note: pop/push makes it work regardless of any changes to M::SA::AI
662    $origid = pop @{$real{$id}};
663    $format = $real{$id}->[2];
664    $id = $real{$id}->[3];
665  }
666
667  memory_track_start() if ($opt_logmem);
668  $spamtest->timer_reset;
669
670  # parse the message, and force it to complete
671  my $ma = $spamtest->parse($dataref, 1);
672
673  # get X-Spam-Status: header for rule hit resue
674  my $x_spam_status;
675  my $reusing;
676  if ($opt_reuse) {
677    $x_spam_status = $ma->get_header("X-Spam-Status");
678    $x_spam_status and $x_spam_status =~ s/,\s+/,/gs;
679  }
680
681  # remove SpamAssassin markup, if present and the mail was spam
682  my $header = $ma->get_header("Received");
683  if ($header && $header =~ /\bwith SpamAssassin\b/) {
684    if (!$opt_deencap || message_should_be_deencapped($ma)) {
685      my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
686      $ma->finish();
687      $ma = $new_ma;
688    }
689  }
690
691  if ($opt_reuse) {
692    if ($x_spam_status
693        && $x_spam_status =~ m/tests=(\S*)/
694        && $x_spam_status !~ /\bshortcircuit=(?:ham|spam|default)\b/)
695    {
696      my @previous = split(/,/, $1);
697      # Bug 7709
698      # Amavis X-Spam-Status rules include score and are enclosed in []
699      # Amavis: [RULENAME=0.01,RULENAME_2=0.01]
700      # Spamassassin: RULENAME,RULENAME_2
701      # .. also support compact RULE(hits), no need to count hits here
702      s/[\[\]]//, s/=.*//, s/\(\d+\)$// foreach (@previous);
703      $ma->{metadata}->{reuse_tests_hit} = { map {$_ => 1} @previous };
704      $reusing = 1;
705    }
706  }
707
708  # plugin hook to cause us to skip messages
709  my $skip = $spamtest->call_plugins("mass_check_skip_message", {
710        class => $class,
711        'time' => $time,
712        'id' => $id,
713        msg => $ma
714      });
715  if ($skip) {
716    $ma->finish();
717    return;
718  }
719
720  # log-uris support
721  my $status;
722  my @uris;
723  my $before;
724  my $after;
725  if ($opt_loguris) {
726    my $pms = Mail::SpamAssassin::PerMsgStatus->new($spamtest, $ma);
727    @uris = $pms->get_uri_list();
728    $pms->finish();
729
730  } else {
731    $before = time;
732    $status = $spamtest->check($ma);
733    $after = time;
734  }
735
736  my @extra;
737
738  # sample-based learning
739  if ($opt_learn > 0) {
740    my $spam;
741    # spam learned as ham = 0.05%
742    if ($class eq 's' && rand(100) < 0.05) {
743      $spam = 0;
744    }
745    # ham learned as spam = 0.01%
746    elsif ($class eq 'h' && rand(100) < 0.01) {
747      $spam = 1;
748    }
749    # spam/ham learned correctly
750    elsif (rand(100) < $opt_learn) {
751      if ($class eq 's') {
752	$spam = 1;
753      }
754      elsif ($class eq 'h') {
755	$spam = 0;
756      }
757      else {
758	die "unknown class, learning failed";
759      }
760    }
761    if (defined $spam) {
762      my $result = ($spam ? "spam" : "ham");
763      my $status = $spamtest->learn($ma, undef, $spam, 0);
764      my $learned = $status->did_learn();
765      $result = "undef" if !defined $learned;
766      push(@extra, "learn=".$result);
767    }
768  }
769
770  if (defined($time)) {
771    push(@extra, "time=".$time);
772  }
773  if ($status && defined $status->{bayes_score}) {
774    push(@extra, "bayes=".sprintf("%06f", $status->{bayes_score}));
775  }
776  if ($opt_mid) {
777    my $mid = $ma->get_header("Message-Id");
778    if ($mid) {     # message contains a Message-Id:
779      while($mid =~ s/\([^\(\)]*\)//s) {};   # remove comments and
780      $mid =~ s/^\s+|\s+$//sg;               # leading and trailing spaces
781      $mid =~ s/\s.*$//s;                    # keep only the first token
782    }
783    else {          # it doesn't have a Message-Id:
784      $mid = $id;             # so build one from the id
785      $mid =~ s,^.*/,,;       # remove the path
786      $mid = "<$mid\@$host.masses.spamassassin.org>";  # and put it together
787    }
788    $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c;     # replace dangerous chars with . (so regexp search just works)
789    push(@extra, "mid=$mid");
790  }
791  push(@extra, "scantime=" . ($after - $before));
792  push(@extra, "format=$format");
793
794  if ($opt_logmem) {
795    my $mem = memory_track_finish();
796    if ($mem) {
797      push(@extra, $mem);
798    }
799  }
800
801  push(@extra, "reuse=" . ($reusing ? "yes" : "no"));
802
803  # log the scoreset we're in
804  {
805    my $set = 0;
806    if ($opt_net) { $set |= 1; }
807    if ($status && defined $status->{bayes_score}) { $set |= 2; }
808    push(@extra, "set=".$set);
809  }
810
811  if ($opt_client) {
812    push(@extra, "host=$where");
813  }
814
815  my $yorn;
816  my $score;
817  my $tests;
818  my $extra;
819
820  if ($opt_loguris) {
821    $yorn = '.';
822    $score = 0;
823    $tests = join(" ", sort @uris);
824    $extra = '';
825  } else {
826    $yorn = $status->is_spam() ? 'Y' : '.';
827    # don't bother adjusting scores for reuse
828    $score = $status->get_score();
829    # list of tests hit
830    my %tests;
831    foreach ((
832      split(/,/, $status->get_names_of_tests_hit()),
833      split(/,/, $status->get_names_of_subtests_hit())
834    )) {
835      $tests{$_}++;
836    }
837    my @tests;
838    foreach (sort keys %tests) {
839      # Use compact RULE(hitcount) format
840      push @tests, $tests{$_} > 1 ? "$_($tests{$_})" : $_;
841    }
842    $tests = join(",", @tests);
843    $extra = join(",", @extra);
844  }
845
846  if (defined $opt_rewrite) {
847    print REWRITE $status->rewrite_mail();
848  }
849
850  $id =~ s/\s/_/g;
851
852  # if we have an origid set, it'll be the server mode's message number, so
853  # attach it to our result appropriately.
854  if (defined $origid) {
855    $out = "$origid ";
856  }
857
858  $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);
859
860  if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
861    $out .= logkilled($ma, $id, "possible virus");
862  }
863
864  if ($opt_loghits) {
865    my $log = '';
866    foreach my $t (sort keys %{$status->{pattern_hits}}) {
867      $_ = $status->{pattern_hits}->{$t};
868      $_ ||= '';
869      s/\r/\\r/gs;      # fix unprintables
870      s/\n/\\n/gs;
871      $log .= "$t=\"$_\" ";
872    }
873    if ($log) {
874      chomp $log;
875      $out .= "# $log\n";
876    }
877  }
878
879  if (defined $status) { $status->finish(); }
880  $ma->finish();
881  undef $ma;		# clean 'em up
882  undef $status;
883
884  showdots_blip();
885#  print ">>>> out = $out\n";
886  return $out;
887}
888
889sub showdots_blip {
890  return unless ($showdots_active);
891
892  $showdots_counter++;
893  if ($showdots_counter % $showdots_every == 0) {
894    print STDERR '.';
895    if ($showdots_counter % (60 * $showdots_every) == 0) {
896      print STDERR "\n";
897    }
898  }
899}
900
901sub showdots_finish {
902  print STDERR "\n" if ($showdots_active);
903  $showdots_counter = 0;
904}
905
906# ick.  We have to go grovelling through the body parts to see if a message
907# is a report_safe-marked-up message, because a local scanner will overwrite
908# any remote scanner's X-Spam-Checker-Version header.
909#
910sub message_should_be_deencapped {
911  my ($ma) = @_;
912
913  # not sure why this is undefined, but it is sometimes
914  if (defined $ma->{body_parts} && scalar @{$ma->{body_parts}} > 0) {
915    my $firstpart = $ma->{body_parts}->[0];
916    if (!$firstpart->{headers}->{'content-type'}
917        || $firstpart->{headers}->{'content-type'} ne 'text/plain')
918    {
919      return 0;     # not a 'report_safe' encapsulation
920    }
921
922    if (scalar @{$firstpart->{raw}} < 3) { return 0; } # too short to be a report
923
924    # grab first 2 lines
925    my $text = $firstpart->{raw}->[0] . $firstpart->{raw}->[1];
926    $text =~ s/\s+/ /gs;
927    if ($text =~ /^Spam detection software, running on the system \"(\S+)\"/) {
928      my $hname = $1;
929      if ($hname =~ /$opt_deencap/io) {
930        return 1;
931      }
932    }
933  }
934
935  return 0;     # a different host marked it up.  pass it through!
936}
937
938sub logkilled {
939  my ($ma, $id, $reason) = @_;
940
941  my $from = $ma->get_header("From")       || 'undef';
942  my $to   = $ma->get_header("To")         || 'undef';
943  my $subj = $ma->get_header("Subject")    || 'undef';
944  my $mid  = $ma->get_header("Message-Id") || 'undef';
945  chomp ($from);
946  chomp ($to);
947  chomp ($subj);
948  chomp ($mid);
949  return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n";
950}
951
952sub progress {
953  my ($time) = @_;
954  $time ||= 0;
955
956  # Are we at the end or otherwise at a point we should print status?  Then do it.
957  if ($total_messages == $total_count || $total_count % $statusevery == 0) {
958    my $time = strftime("%Y-%m-%d", localtime($time));
959    status(sprintf("%3d%% ham: %-6d spam: %-6d date: %s",
960	int(($total_count / $total_messages) * 100), $ham_count, $spam_count, $time));
961  }
962}
963
964sub status {
965  my($str) = @_;
966  my $now = strftime("%Y-%m-%d %X", localtime(time));
967  printf STDERR "status: %-48s now: %s\n", $str, $now;
968}
969
970###########################################################################
971
972our ($mem_size, $mem_rss, $mem_shared);
973
974sub memory_track_start {
975  if ($^O =~ /linux/i) {
976    if (open (IN, "</proc/$$/statm")) {
977      my $statm = <IN>;
978      close IN;
979      if ($statm =~ /^(\d+) (\d+) (\d+) /) {
980        $mem_size = $1;
981        $mem_rss = $2;
982        $mem_shared = $3;
983      }
984    }
985  }
986}
987
988sub memory_track_finish {
989  my $str = '';
990
991  if ($^O =~ /linux/i) {
992    if (open (IN, "</proc/$$/statm")) {
993      my $statm = <IN>;
994      close IN;
995      if ($statm =~ /^(\d+) (\d+) (\d+) /) {
996        my $size = $1;
997        my $rss = $2;
998        my $shared = $3;
999
1000        $str = sprintf ("memsz=%d,memrss=%d,memshr=%d",
1001                ($size - $mem_size),
1002                ($rss - $mem_rss),
1003                ($shared - $mem_shared));
1004      }
1005    }
1006  }
1007  return $str;
1008}
1009
1010sub get_current_svn_revision {
1011  my $revision;
1012
1013  # this is usually "${TOPDIR}/masses"
1014  my $dir = $FindBin::Bin || ".";
1015
1016  if (-d "$dir/../.svn" || -f "$dir/svninfo.tmp") {
1017    if (-f "$dir/svninfo.tmp") {
1018      # created by build/automc/buildbot_ready for chrooted mass-checks
1019      open (SVNINFO, "< $dir/svninfo.tmp");
1020    }
1021    else {
1022      # note, ".." since we want to pick up changes outside 'masses'
1023      # too!
1024      open (SVNINFO, "( svn info --non-interactive $dir/.. || svn info $dir/.. ) 2>&1 |");
1025    }
1026
1027    while (<SVNINFO>) {
1028      # Revision: 383822
1029      next unless /^Revision: (\d+)/;
1030      $revision = $1;
1031      last;
1032    }
1033    close SVNINFO;
1034    return $revision if $revision;
1035  }
1036
1037  # this probably will never work due to Rules Project changes TODO
1038  if (open(TESTING, "$opt_c/70_testing.cf")) {
1039    chomp($revision = <TESTING>);
1040    $revision =~ s/.*\$Rev:\s*(\S+).*/$1/;
1041    close(TESTING);
1042    return $revision if $revision;
1043  }
1044
1045  return $revision || "unknown";
1046}
1047
1048############################################################################
1049
1050## children processors, start and process, used when opt_j > 1
1051
1052sub start_children {
1053  my ($count, $child, $pid, $socket) = @_;
1054
1055  my $io = IO::Socket->new();
1056  my $parent;
1057
1058  # create children
1059  for (my $i = 0; $i < $count; $i++) {
1060    ($child->[$i],$parent) = $io->socketpair(AF_UNIX,SOCK_STREAM,PF_UNSPEC)
1061	or die "mass-check: socketpair failed: $!";
1062    if ($pid->[$i] = fork) {
1063      close $parent;
1064
1065      # disable caching for parent<->child relations
1066      my ($old) = select($child->[$i]);
1067      $|++;
1068      select($old);
1069
1070      $socket->add($child->[$i]);
1071      aidbg "mass-check: starting new child $i (pid ".$pid->[$i].")\n";
1072      next;
1073    }
1074    elsif (defined $pid->[$i]) {
1075      my $result;
1076      my $line;
1077
1078      close $tmpfd if defined $tmpfd;
1079
1080      close $child->[$i];
1081      select($parent);
1082      $| = 1;	# print to parent by default, turn off buffering
1083      send_line($parent,"START");
1084      while ($line = read_line($parent)) {
1085	if ($line eq "exit") {
1086	  close $parent;
1087	  exit;
1088	}
1089
1090	my($class, $format, $date, $where, $result) = $iter->_run_message($line);
1091	$result ||= '';
1092
1093	# If determine_receive_date is not set, the original input date
1094	# wasn't calculated, but run_message would have done so, so reset
1095	# the packed version if possible ...  use defined for date since
1096	# it could == 0.
1097        if (!$iter->{determine_receive_date} && $class && $format && defined $date && $where) {
1098	  $line = Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $where);
1099        }
1100
1101	$result = encode_base64($result);
1102	send_line($parent,"$result\0RESULT $line");
1103      }
1104      exit;
1105    }
1106    else {
1107      die "mass-check: cannot fork: $!";
1108    }
1109  }
1110}
1111
1112## handling killing off the children
1113
1114sub reap_children {
1115  my ($count, $socket, $pid) = @_;
1116
1117  # If the child died, sending it the exit will generate a SIGPIPE, but we
1118  # don't really care since the readline will go undef (which is fine),
1119  # then we do the waitpid which will finish it off.  So we end up in the
1120  # right state, in theory.
1121  local $SIG{'PIPE'} = 'IGNORE';
1122
1123  for (my $i = 0; $i < $count; $i++) {
1124    aidbg "mass-check: killing child $i (pid ",$pid->[$i],")\n";
1125    send_line($socket->[$i],"exit"); # tell the child to die.
1126    close $socket->[$i];
1127    waitpid($pid->[$i], 0); # wait for the signal ...
1128  }
1129}
1130
1131# in server mode, this gets called to read in the HTTP request from a given
1132# socket, then return the information the client sent to us.
1133sub handle_http_request {
1134  my $socket = shift;
1135
1136  my $headers = {};
1137  my $postdata = {};
1138
1139  # read in the request (POST / HTTP/1.0)
1140  my $line = $socket->getline();
1141  $line ||= '';
1142  $rcvd_bytes += length($line);
1143  $line =~ s/\r\n$//;
1144
1145  my ($type, $URI, $VERS) = $line =~ /^([a-zA-Z]+)\s+(\S+)(?:\s*(\S+))/;
1146  unless ($type && $URI && $VERS) {
1147    $type ||= '';
1148    $URI  ||= '';
1149
1150    return ($type, $URI, $headers, $postdata);
1151  }
1152
1153  $type = uc $type;
1154
1155  # read in headers, "key: value" up to a blank line
1156  do {
1157    $line = $socket->getline();
1158    last unless defined $line;
1159    $rcvd_bytes += length($line);
1160    $line =~ s/\r\n$//;
1161
1162    if ($line) {
1163      my ($k,$v) = split(/:\s*/, $line, 2);
1164      $headers->{lc $k} = $v;
1165    }
1166  } while ($line !~ /^$/);
1167
1168  # if this is a POST request w/ content-length, there'll be a payload, deal
1169  # with it.  we only support compressed payloads.
1170  my $postheader;
1171  if ($type eq 'POST' && $headers->{'content-length'}) {
1172    $rcvd_bytes += $headers->{'content-length'};
1173    my $pd = '';
1174    if ($headers->{'content-encoding'} eq 'x-gzip') {
1175      # assign an id to the client if it doesn't already have one
1176      $headers->{'client-id'} ||= ++$client_id;
1177
1178      my ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
1179      die "Can't make tempfile, exiting" unless $gzpath;
1180
1181      # TODO: don't read in the entire thing at once to avoid memory bloat
1182      my $rd;
1183      $socket->read($rd, $headers->{'content-length'}) || die "mass-check: error reading in data from client\n";
1184      print $gzfd $rd;
1185      $gzfd->close;
1186
1187      my $fd = IO::Zlib->new($gzpath, "rb");
1188      die "Can't open temp result file: $!" unless $fd;
1189
1190      if ($headers->{'action'}) {
1191        # different types of POST contents are packed different ways
1192        if ($headers->{'action'} eq 'sending cache') {
1193
1194          if ($opt_cs_schedule_cache) {
1195            # save the client cache to a tied hash
1196
1197            # get a temp file for the client cache database
1198            my ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile();
1199            close $dbfd;
1200            unlink $dbpath;
1201            push @cache_tmp_files, $dbpath.'.pag';
1202            push @cache_tmp_files, $dbpath.'.dir';
1203
1204            tie %{$client_caches{$headers->{'client-id'}}}, "SDBM_File", $dbpath,
1205              O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!\n";
1206
1207            # dump the client's list of cached message paths to the DB_HASH
1208            my $client_cache = 0;
1209            my $client_cache_hits = 0;
1210            while (my $msg = read_line($fd)) {
1211              my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg);
1212              $client_cache++;
1213              if (exists $server_caches{'to_process'}{$d[3]}) {
1214                $client_caches{$headers->{'client-id'}}{$d[3]} = undef;
1215                $server_caches{'cache_count'}{$d[3]}++;
1216                delete $server_caches{'not_cached'}{$d[3]};
1217                # log this client's cache hit rate
1218                $client_cache_hits++;
1219              }
1220            }
1221            status("client $headers->{'client-id'} has $client_cache msgs cached ($client_cache_hits usable)");
1222            status("client $headers->{'client-id'} has ".(sprintf("%.1f", ($client_cache_hits / $total_messages * 100))).'% of required messages');
1223          }
1224        }
1225        elsif ($headers->{'action'} eq 'sending results') {
1226          # process the results
1227          $pd = read_line($fd);
1228
1229          # key1=value1&key2=value2...
1230          %{$postdata} = map {
1231            my($k,$v) = split(/=/, $_, 2);
1232
1233            # we need to decode the key and value
1234            $k =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg;
1235            $v =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg;
1236
1237            $k => $v;
1238          } split(/\&/, $pd);
1239
1240        }
1241      }
1242
1243      $fd->close;
1244      undef $fd;
1245      $gzfd->close;
1246      unlink $gzpath;
1247    }
1248  }
1249  return($type, $URI, $headers, $postdata);
1250}
1251
1252# in server mode, generate a gzip compressed data stream with the messages and
1253# return the path to the compressed file which the server will read and pass
1254# to the client.
1255#
1256# Input:
1257#  - Number of messages to generate (scalar)
1258#  - Hash of Arrays of outstanding requests (reference to hash of array refs)
1259#     timestamp# -> [ num1, num2, ... ]
1260#     Used to quickly find outstanding/timed out messages to send to client.
1261#  - Hash of outstanding messages and associated data (ref to hash of hash refs)
1262#     num1 -> { data => 'binary data from scan mode', timestamp => timestamp# }
1263#     Used later on to specify the timestamp entry to remove the entry from.
1264#  - Paths only?  If true, just include the original message data in the gzip
1265#    file.  Otherwise, include the message data.  Useful if the client has the
1266#    corpus available via the same paths as originally specified.
1267#
1268# Returns: scalar path to gzip file
1269#
1270sub generate_messages {
1271  my($msgs, $timestamps, $msgsout, $paths_only, $client_id) = @_;
1272
1273  # Hold the message numbers we'll be sending out
1274  my @tosend = ();
1275  my @sent = ();
1276
1277  # Find out if any of the messages we sent out before need to be sent out
1278  # again because we haven't seen a response within the timeout.
1279  my $tooold = time - $opt_cs_timeout;
1280  foreach (sort { $a <=> $b } keys %{$timestamps}) {
1281    # since we're going in numeric order, if the current entry is newer than
1282    # the timeout value, the rest will be too, so stop looking.
1283    last if ($_ > $tooold);
1284
1285    # messages that might be eligible for retry
1286    my @toretry = ();
1287
1288    # how many messages do we still need to fulfill the request?
1289    my $wanted = $msgs - @tosend;
1290
1291    if (@{$timestamps->{$_}} > $wanted) {
1292      # there are more entries in the timestamp list than we want, so just
1293      # grab that many off the list.
1294      push(@toretry, splice @{$timestamps->{$_}}, 0, $wanted);
1295    }
1296    else {
1297      # there are just enough, or not enough entries on the timestamp list to
1298      # satisfy our request, so take them all and we'll loop around.
1299      push(@toretry, @{$timestamps->{$_}});
1300      delete $timestamps->{$_};
1301    }
1302
1303    # limit retries
1304    foreach my $num (@toretry) {
1305      if ($msgsout->{$num}->{'count'} < $opt_cs_max_tries) {
1306        $msgsout->{$num}->{'count'}++;
1307        push @tosend, $num;
1308      } else {
1309        warn "skipping message num $num after $opt_cs_max_tries attempts, index: ".
1310          (Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$num}->{'data'}))[3]."\n";
1311        delete $msgsout->{$num};
1312        $failed_msgs++;
1313      }
1314    }
1315
1316    # Ok, we have enough messages so we can stop now.
1317    last if (@tosend == $msgs);
1318  }
1319
1320  if (!$opt_cs_schedule_cache) {
1321    # if we still have the temp file with the input messages open, we'll fillup
1322    # out message output queue with messages from there.
1323    if ($tmpfd) {
1324      while (@tosend < $msgs) {
1325        my $msg = read_line($tmpfd);
1326
1327        # no more messages from the temp file, close it out
1328        unless ($msg) {
1329          delete $msgsout->{'curnum'};
1330          close $tmpfd;
1331          undef $tmpfd;
1332          last;
1333        }
1334
1335        # we got a result, so assign it a number (curnum) and store the data
1336        # appropriately, then add the new number to the queue.
1337        my $num = ++$msgsout->{'curnum'};
1338        $msgsout->{$num}->{'data'} = $msg;
1339        $msgsout->{$num}->{'count'}++;
1340        push(@tosend, $num);
1341      }
1342    }
1343  }
1344  else {
1345    if (($msgs_processed + $failed_msgs) != $total_messages) {
1346      # select messages based on what the client*s* have cached
1347
1348      # if the client hasn't sent cache data, fake it
1349      unless (exists $client_caches{$client_id}) {
1350        %{$client_caches{$client_id}} = ();
1351        $unique_cache_completed{$client_id} = 1;
1352      }
1353
1354      # first: select messages that only the current client has cached
1355      MESSAGE: while (!$unique_cache_completed{$client_id} &&
1356                 @tosend < $msgs &&
1357                 (my($path, undef) = each %{$client_caches{$client_id}})) {
1358        # check that no other clients have it cached
1359        next MESSAGE if $server_caches{'cache_count'}{$path} > 1;
1360
1361        # we got a result, so assign it a number (curnum) and store the data
1362        # appropriately, then add the new number to the queue.
1363        my $num = ++$msgsout->{'curnum'};
1364        $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path};
1365        $msgsout->{$num}->{'count'}++;
1366        push(@tosend, [ $num, 1 ]);
1367        delete $server_caches{'to_process'}{$path};
1368        delete $server_caches{'cache_count'}{$path};
1369        delete $client_caches{$client_id}{$path};
1370        $cache_hits++;
1371      }
1372      if (@tosend < $msgs) {
1373        $unique_cache_completed{$client_id} = 1;
1374      }
1375
1376      # second: hand out messages that no clients have cached
1377      while (@tosend < $msgs &&
1378             (my($path, undef) = each %{$server_caches{'not_cached'}})) {
1379        # we got a result, so assign it a number (curnum) and store the data
1380        # appropriately, then add the new number to the queue.
1381        my $num = ++$msgsout->{'curnum'};
1382        $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path};
1383        $msgsout->{$num}->{'count'}++;
1384        push(@tosend, [ $num, 0 ]);
1385        delete $server_caches{'to_process'}{$path};
1386        delete $server_caches{'cache_count'}{$path};
1387        delete $server_caches{'not_cached'}{$path};
1388      }
1389
1390      # third: hand out messages in the client's cache regardless of how many
1391      #        other clients have them cached; smart scheduling takes too long
1392      while (@tosend < $msgs &&
1393             (my($path, undef) = each %{$client_caches{$client_id}})) {
1394        # we got a result, so assign it a number (curnum) and store the data
1395        # appropriately, then add the new number to the queue.
1396        my $num = ++$msgsout->{'curnum'};
1397        $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path};
1398        $msgsout->{$num}->{'count'}++;
1399        push(@tosend, [ $num, 1 ]);
1400        delete $server_caches{'to_process'}{$path};
1401        delete $server_caches{'cache_count'}{$path};
1402        $cache_hits++;
1403
1404        # testing with 5 clients, this seems to be faster (in msgs/hr) than
1405        # deleting from each other clients' cache when we go to get their
1406        # messages; of course it's more I/O on the server this way
1407        my $cache_count = $server_caches{'cache_count'}{$path};
1408        foreach my $cc (keys %client_caches) {
1409          delete $client_caches{$cc}{$path};
1410          last unless --$cache_count;
1411        }
1412      }
1413
1414      # fourth: hand out messages that other clients have cached without
1415      #         regard for how many clients have cached them; smart
1416      #         scheduling takes too long (at least when using DBM)
1417      while (@tosend < $msgs &&
1418             (my($path, $msg) = each %{$server_caches{'to_process'}})) {
1419        # we got a result, so assign it a number (curnum) and store the data
1420        # appropriately, then add the new number to the queue.
1421        my $num = ++$msgsout->{'curnum'};
1422        $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path};
1423        $msgsout->{$num}->{'count'}++;
1424        push(@tosend, [ $num, 0 ]);
1425        delete $server_caches{'to_process'}{$path};
1426        delete $server_caches{'cache_count'}{$path};
1427
1428        my $cache_count = $server_caches{'cache_count'}{$path};
1429        foreach my $cc (keys %client_caches) {
1430          delete $client_caches{$cc}{$path};
1431          last unless --$cache_count;
1432        }
1433      }
1434    }
1435    else {
1436      # close the tmpfd, etc so that the main loop knows we're done
1437      delete $msgsout->{'curnum'};
1438      close $tmpfd;
1439      undef $tmpfd;
1440    }
1441  }
1442
1443  # ok, at this point, @tosend ought to have a list of numbers, pointers into
1444  # %{$msgsout}.  turn that into a tar file.
1445  return '' unless @tosend;
1446
1447  my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
1448  die "Can't make tempfile, exiting" unless $gzpath;
1449  close($gzfd);
1450
1451  $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!";
1452
1453  # first line is the number of messages included in the file
1454  send_line($gzfd, scalar @tosend) || die "mass-check: error when writing to gz temp file\n";
1455
1456  # Generate an archive in the temp file
1457  foreach my $num (@tosend) {
1458    my $in_cache = 0;
1459    if (ref($num) eq 'ARRAY') {
1460      $in_cache = $num->[1];
1461      $num = $num->[0];
1462    }
1463    # Archive format, gzip compressed file w/ 4 parts per message:
1464    # 1- server message number in text format
1465    # 2- server index string, binary packed format
1466    # 3- a 1 if the message is included, 0 otherwise -- unless paths_only
1467    # 4- message content -- unless paths_only or #3 is 0
1468    my $data = $msgsout->{$num}->{'data'};
1469    if (!$paths_only) {
1470      unless ($in_cache) {
1471        my $msg = ($iter->_run_message($data))[4];
1472        if ($msg) {
1473          send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
1474          send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
1475          send_line($gzfd, '1') || die "mass-check: error when writing to gz temp file\n";
1476          send_line($gzfd, (defined $msg ? join('', @{$msg}) : '')) ||
1477            die "mass-check: error when writing to gz temp file\n";
1478          push @sent, $num;
1479        } else {
1480          # if the message has an error (probably due to it being removed sometime during the
1481          # run) we send a message number of 0 to indicate to the client that the server had an
1482          # error retrieving the message; we need to do this since we have already added the
1483          # line telling the client how many messages to expect
1484          my $filename = (Mail::SpamAssassin::ArchiveIterator::_index_unpack($data))[3];
1485          warn qq/mass-check: ArchiveIterator returned error for "$filename"\n/;
1486          send_line($gzfd, 0) || die "mass-check: error when writing to gz temp file\n";
1487          delete $msgsout->{$num};
1488          $failed_msgs++;
1489        }
1490      } else {
1491        send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
1492        send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
1493        send_line($gzfd, '0') || die "mass-check: error when writing to gz temp file\n";
1494        push @sent, $num;
1495      }
1496    } else {
1497      send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
1498      send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
1499      # the client deals with missing messages on its own (sort of)
1500      push @sent, $num;
1501    }
1502  }
1503
1504  $gzfd->close;
1505
1506  # update timestamp entries
1507  my $ts = time;
1508
1509  # make sure the timestamp is unique!  without Time::HiRes it is trivial and
1510  # common to have two reissueings of timedout messages in the same second
1511  # with Time::HiRes we'll check anyway since we'll waste less time checking
1512  # for uniqueness than one of us will waste debugging a report of mass-check
1513  # not completing due to someone's wacky clock (some (non-para-)virtualized
1514  # servers will have their clocks go forward and backward)
1515  while (exists $timestamps->{$ts}) {
1516    if (HAS_TIME_HI_RES) {
1517      $ts = Time::HiRes::time();
1518    } else {
1519      $ts += 0.01;
1520    }
1521  }
1522
1523  foreach (@sent) {
1524    $msgsout->{$_}->{'timestamp'} = $ts;
1525  }
1526
1527  # conveniently, this list should be the only thing sent out w/ this
1528  # timestamp, so just set the reference appropriately. :)
1529  $timestamps->{$ts} = \@sent;
1530
1531  if ($opt_noisy) {
1532    status('sent '.scalar(@sent).' of '.scalar(@tosend).' intended messages');
1533  }
1534
1535  return $gzpath;
1536}
1537
1538# we've gotten results posted, so clean up msgsout and timestamp hashes and
1539# process result...
1540sub handle_post_results {
1541  my($postdata, $timestamps, $msgsout) = @_;
1542
1543  # local version to batch the removals
1544  my %timestamps = ();
1545
1546  # $msgsout->{num}->{data|timestamp}
1547  # $timestamp{num} = [ msgout_nums ... ]
1548  # $postdata{num} = result_string
1549
1550  while( my($k,$v) = each %{$postdata} ) {
1551    # message run results will be \d+ => log entry
1552    next if ($k !~ /^\d+$/);
1553
1554    # if we've been waiting for this result, process it, otherwise throw it on
1555    # the ground.  multiple clients could have been given the same messages to
1556    # process, and we take whatever the first responder sends us.
1557    if (exists $msgsout->{$k}) {
1558      # the result_sub will need parts of the message data, so get it ready
1559      my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$k}->{'data'});
1560
1561      # go ahead and do the result
1562      &{$iter->{result_sub}}($d[1], $v, $d[0]);
1563
1564      # prep to get rid of the cached entries
1565      $timestamps{$msgsout->{$k}->{'timestamp'}}->{$k} = 1;
1566      delete $msgsout->{$k};
1567
1568      $msgs_processed++;
1569    }
1570  }
1571
1572  # if we got any results, clean out the results from the timestamp arrays
1573  while ( my($k,$v) = each %timestamps ) {
1574    # trim out the result list from the timestamp sent list
1575    my @temp = grep(!exists $v->{$_}, @{$timestamps->{$k}});
1576
1577    # if there are results left for a specific timestamp, update the array
1578    # pointer.  otherwise, delete the timestamp entry since it's empty.
1579    if (@temp) {
1580      $timestamps->{$k} = \@temp;
1581    }
1582    else {
1583      delete $timestamps->{$k};
1584    }
1585  }
1586}
1587
1588# This function reads from $tmpfd and processes the message as appropriate wrt
1589# $opt_j, $opt_restart, etc.
1590#
1591sub run_through_messages {
1592  # do everything in one process
1593  if ($opt_j <= 1 && !defined $opt_restart) {
1594    my $message;
1595    my $messages;
1596    my $total_count = 0;
1597
1598    while (($total_messages > $total_count) && ($message = read_line($tmpfd))) {
1599      my($class, undef, $date, undef, $result) = $iter->_run_message($message);
1600      if ($result) {
1601        &{$iter->{result_sub}}($class, $result, $date);
1602      }
1603      $total_count++;
1604    }
1605    $msgs_processed += $total_count;
1606  }
1607  # more than one process or one process with restarts
1608  else {
1609    my $select = IO::Select->new();
1610
1611    my $total_count = 0;
1612    my $needs_restart = 0;
1613    my @child = ();
1614    my @pid = ();
1615    my $messages;
1616
1617    # start children processes
1618    start_children($opt_j, \@child, \@pid, $select);
1619
1620    # feed childen, make them work for it, repeat
1621    while ($select->count()) {
1622      foreach my $socket ($select->can_read()) {
1623        my $line = read_line($socket);
1624
1625        # some error happened during the read!
1626        if (!defined $line) {
1627          $needs_restart = 1;
1628          warn "mass-check: readline failed, attempting to recover\n";
1629          $select->remove($socket);
1630        }
1631        elsif ($line =~ /^([^\0]*)\0RESULT (.+)$/s) {
1632	  my $result = decode_base64($1);
1633	  my ($date,$class,$type) = Mail::SpamAssassin::ArchiveIterator::_index_unpack($2);
1634	  aidbg "mass-check: $class, $type, $date\n";
1635
1636	  if (defined $opt_restart && ($total_count % $opt_restart) == 0) {
1637	    $needs_restart = 1;
1638	  }
1639
1640	  # if messages remain, and we don't need to restart, send message
1641	  if (($total_messages > $total_count) && !$needs_restart) {
1642	    my $line = read_line($tmpfd);
1643	    unless ($line) {
1644	      warn "mass-check: found short message list ($total_messages, $total_count)\n";
1645	      $select->remove($socket);
1646	      next;
1647	    }
1648
1649            send_line($socket, $line);
1650	    $total_count++;
1651	    aidbg "mass-check: $total_messages $total_count\n";
1652	  }
1653	  else {
1654	    # stop listening on this child since we're done with it
1655	    aidbg "mass-check: $needs_restart $total_messages $total_count\n";
1656	    $select->remove($socket);
1657	  }
1658
1659	  # deal with the result we received
1660	  if ($result) {
1661	    &{$iter->{result_sub}}($class, $result, $date);
1662	  }
1663        }
1664        elsif ($line eq "START") {
1665	  if ($total_messages > $total_count) {
1666	    # we still have messages, send one to child
1667	    send_line($socket, read_line($tmpfd));
1668	    $total_count++;
1669	    aidbg "mass-check: $total_messages $total_count\n";
1670	  }
1671	  else {
1672	    # no more messages, so stop listening on this child
1673	    aidbg "mass-check: $needs_restart $total_messages $total_count\n";
1674	    $select->remove($socket);
1675	  }
1676        }
1677        else {
1678          $needs_restart = 1;
1679          warn "mass-check: bad line from readline: $line\n";
1680          $select->remove($socket);
1681        }
1682      }
1683
1684      aidbg "mass-check: out of loop, $total_messages $total_count $needs_restart ".$select->count()."\n";
1685
1686      # If there are still messages to process, and we need to restart
1687      # the children, and all of the children are idle, let's go ahead.
1688      if ($needs_restart && $select->count == 0 && $total_messages > $total_count) {
1689        $needs_restart = 0;
1690
1691        aidbg "mass-check: needs restart, $total_messages total, $total_count done\n";
1692        reap_children($opt_j, \@child, \@pid);
1693        @child=();
1694        @pid=();
1695        start_children($opt_j, \@child, \@pid, $select);
1696      }
1697    }
1698    $msgs_processed += $total_count;
1699
1700    # reap children
1701    reap_children($opt_j, \@child, \@pid);
1702  }
1703}
1704
1705# send an HTTP response to a socket based on the input result, headers, and
1706# data values.
1707sub http_response {
1708  my($socket, $result, $headers, $data) = @_;
1709
1710  $headers->{'Content-Length'} = length $data;
1711  $headers->{'Cache-Usage'} = $opt_cs_cache ? 'allowed' : 'disallowed';
1712
1713  print $socket
1714    "HTTP/1.0 $result\r\n",
1715    "Pragma: no-cache\r\n",
1716    "Server: mass-check/$svn_revision\r\n",
1717    (map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}), "\r\n";
1718  print $socket $data;
1719
1720  if ($opt_cs_verbose) {
1721    $sent_bytes += length(
1722      "HTTP/1.0 $result\r\n".
1723      "Pragma: no-cache\r\n".
1724      "Server: mass-check/$svn_revision\r\n");
1725    # length of a map output isn't what the other end claims to see
1726    while (my($f,$v) = each %{$headers}) {
1727      $sent_bytes += length($f) + length($v) + 4;
1728    }
1729    $sent_bytes += 2 + length($data);
1730  }
1731}
1732
1733# the client needs to make a request to the server on a given socket.
1734sub http_make_request {
1735  my($socket, $type, $uri, $headers, $data) = @_;
1736
1737  $headers->{'Content-Length'} = length $data;
1738
1739  print $socket
1740    "$type $uri HTTP/1.0\r\n",
1741    "User-Agent: mass-check/$svn_revision\r\n",
1742    (map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}), "\r\n";
1743  print $socket $data;
1744
1745  if ($opt_cs_verbose) {
1746    $sent_bytes += length(
1747      "$type $uri HTTP/1.0\r\n".
1748      "User-Agent: mass-check/$svn_revision\r\n");
1749    # length of a map output isn't what the other end claims to see
1750    while (my($f,$v) = each %{$headers}) {
1751      $sent_bytes += length($f) + length($v) + 4;
1752    }
1753    $sent_bytes += 2 + length($data);
1754  }
1755
1756  # parse the response that the server sends us
1757  my $line = $socket->getline() || '';
1758  $rcvd_bytes += length($line);
1759  my(undef, $code, $string) = split(/\s+/, $line, 3);
1760  return unless $code == 200;
1761
1762  my %headers = ();
1763  do {
1764    $line = $socket->getline();
1765    last unless defined $line;
1766    $rcvd_bytes += length($line);
1767    $line =~ s/\r\n$//;
1768
1769    if ($line) {
1770      my ($k,$v) = split(/:\s*/, $line, 2);
1771      $headers{lc $k} = $v;
1772    }
1773  } while ($line !~ /^$/);
1774  $rcvd_bytes += $headers{'content-length'} if $headers{'content-length'};
1775  $client_id ||= $headers{'client-id'} if $headers{'client-id'};
1776
1777  # disable cache-usage if the server disallows cache usage
1778  $opt_cs_cache = 0 if $headers{'cache-usage'} eq 'disallowed';
1779
1780  # the server has sent us notification that it's going to exit, so let's
1781  # follow suit.
1782  return 'finished' if ($headers{'finished'});
1783
1784  my $gzpath = '';
1785  if ($headers{'content-type'} eq 'application/x-gzip') {
1786    my $gzfd;
1787    ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
1788    die "Can't make tempfile, exiting" unless $gzpath;
1789
1790    my $rd;
1791    $socket->read($rd, $headers{'content-length'}) || die "mass-check: error reading in data from server\n";
1792    print $gzfd $rd;
1793    close $gzfd;
1794  }
1795
1796  $socket->close();
1797  return $gzpath;
1798}
1799
1800# Be conservative -- encode most things.
1801# we could encode spaces to plusses, then decode that later, but...
1802sub post_encode {
1803  my $string = shift;
1804  $string =~  s/([^a-zA-Z0-9_,.\/\\-])/sprintf "%%%02x",unpack("C",$1)/egx;
1805  return $string;
1806}
1807
1808# remove all of the files in a given directory, non-recursive
1809sub clean_dir {
1810  my $dir = shift;
1811
1812  unless (opendir(DIR, $dir)) {
1813    warn "error: can't opendir $dir: $!\n";
1814    return;
1815  }
1816  while(my $file = readdir(DIR)) {
1817    $file =~ /^(.+)$/;       # untaint
1818    $file = $1;
1819
1820    my $path = File::Spec->catfile($dir, $file);
1821    next unless (-f $path);
1822
1823    if (!unlink $path) {
1824      warn "error: can't remove file $path: $!\n";
1825      closedir(DIR);
1826      return;
1827    }
1828  }
1829  closedir(DIR);
1830  return 1;
1831}
1832
1833############################################################################
1834
1835# four bytes in network/vax format (little endian) as length of message
1836# the rest is the actual message
1837
1838sub read_line {
1839  my $fd = shift;
1840  my($length,$msg);
1841
1842  # read in the 4 byte length and unpack
1843  $fd->read($length, 4) || return;
1844
1845  $length = unpack("V", $length);
1846  return unless $length;
1847
1848  # read in the rest of the single message
1849  $fd->read($msg, $length) || return;
1850
1851  return $msg;
1852}
1853
1854sub send_line {
1855  my $fd = shift;
1856  foreach ( @_ ) {
1857    my $length = pack("V", length $_);
1858    $fd->print($length.$_) || return 0;
1859  }
1860
1861  return 1;
1862}
1863
1864############################################################################
1865
1866# this is the function that implemented server mode.  basically, sit and wait
1867# for connections to come in.  when a client sends in a request, deal with any
1868# results that the client sent, then generate a response and send it back,
1869# and then go back to waiting.  lather, rinse, repeat.
1870sub server_mode {
1871  $opt_cs_max ||= 1000;
1872  $opt_cs_timeout ||= 60 * 5;
1873  $opt_cs_max_tries ||= 3;
1874
1875  # IO::Socket::SSL isn't that smart
1876  $opt_server =~ s/:(\d+)$//;
1877  my $port = $1;
1878
1879  # ::SSL needs resolved hostnames, at least on Solaris. note: not IPv6-aware
1880  my $localaddr = (gethostbyname($opt_server))[4] || $opt_server;
1881  $localaddr = Socket::inet_ntoa($localaddr);
1882
1883  my $serv_socket;
1884  if ($opt_cs_ssl) {
1885    $serv_socket = IO::Socket::SSL->new(
1886            LocalAddr => $localaddr,
1887            LocalPort => $port,
1888            ReuseAddr => 1,
1889            Listen => 5,
1890            SSL_verify_mode => 0x02,
1891            SSL_use_cert => 1,
1892            SSL_version => "TLSv1",
1893            SSL_key_file => "spamassassin/server-key.pem",
1894            SSL_cert_file => "spamassassin/server-cert.pem",
1895          );
1896  }
1897  else {
1898    $serv_socket = IO::Socket::INET->new(
1899            LocalAddr => $localaddr,
1900            LocalPort => $port,
1901            ReuseAddr => 1,
1902            Listen => 5,
1903          );
1904  }
1905
1906  die "Could not create socket: $!\n" unless $serv_socket;
1907
1908  if ($opt_progress) {
1909    status('server ready for connections');
1910  }
1911
1912  # Setup out "what messages have been sent out" hashes
1913  my $timestamps = {};
1914  my $msgsout = { 'curnum' => 0 };
1915
1916  # Generate an IO::Select object and put the server socket on the queue
1917  my $select = IO::Select->new( $serv_socket );
1918
1919  # We'll keep looping while there's something to pay attention to
1920  while ($select->count()) {
1921    # Sit and block until there's something for us to read from
1922    foreach my $socket ($select->can_read()) {
1923      if ($socket == $serv_socket) {
1924        # it's the server socket, go ahead and accept the connection and add
1925	# it to the queue.
1926        $select->add($serv_socket->accept);
1927      }
1928      else {
1929	# it's some client, so deal with the request
1930	my($type, $URI, $headers, $postdata) = handle_http_request($socket);
1931
1932	# we don't do GET, so just send something back
1933	if ($type eq 'GET') {
1934	  if ($opt_noisy) {
1935	    status('GET request from '.$socket->peerhost);
1936	  }
1937
1938	  http_response($socket, "200 OK", {
1939	      'Content-type' => 'text/plain',
1940	    },
1941	    "Your GET request came from IP Address: ".$socket->peerhost."\n");
1942	}
1943        elsif ($type eq 'POST') {
1944	  # ooh, POST.  deal with any results that the client sent
1945	  handle_post_results($postdata, $timestamps, $msgsout);
1946
1947	  if ($opt_noisy) {
1948	    status('POST request from '.$socket->peerhost);
1949	  }
1950
1951          # based on the number of messages that the client requested,
1952	  # generate a gzip file with the appropriate data in it
1953	  my $messages = '';
1954	  if ($postdata->{'max_messages'}) {
1955	    my $msgnum = $postdata->{'max_messages'};
1956	    if ($msgnum > $opt_cs_max || $msgnum < 1) {
1957	      $msgnum = $opt_cs_max;
1958	    }
1959
1960	    if ($opt_noisy) {
1961	      status('client requested '.$postdata->{'max_messages'}.' messages');
1962	    }
1963
1964	    $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'}, $headers->{'client-id'});
1965	  }
1966
1967          # $messages will contain the path to the gzip file if there are
1968	  # messages to send out.
1969          if ($messages && open(MSG, $messages)) {
1970	    $t_first_msg = time unless $t_first_msg;
1971	    binmode(MSG);
1972	    local $/ = undef;  # go go slurp mode
1973
1974	    # send the response
1975	    http_response($socket, "200 OK", {
1976	      'Content-Type' => 'application/x-gzip',
1977	      'Content-Encoding' => 'x-gzip',
1978              'Client-ID'    => $headers->{'client-id'},
1979	      },
1980	      scalar <MSG>);
1981
1982	    close(MSG);
1983
1984	    # we don't need the file anymore, so get rid of it
1985	    unlink $messages;
1986          }
1987	  elsif (!keys %{$msgsout} && !defined $tmpfd) {
1988	    # we have no more outstanding messages and our original queue of
1989	    # messages to process is empty, so tell the client to exit.
1990	    http_response($socket, "200 OK", {
1991              "Content-type" => "text/plain",
1992              'Client-ID'    => $headers->{'client-id'},
1993	      "Finished" => 1,
1994	      },
1995	      'We are all done');
1996            $t_last_msg = time;
1997	  }
1998	  else {
1999	    # when in doubt, treat this like a GET
2000	    http_response($socket, "200 OK", {
2001              "Content-type" => "text/plain",
2002              'Client-ID'    => $headers->{'client-id'},
2003	      },
2004              "Your POST request (sans max_messages) came from IP Address: ".$socket->peerhost."\n");
2005	  }
2006	}
2007	else {
2008          # for error, "501 Not Implemented"
2009	  http_response($socket, '501 Not Implemented', {}, '');
2010	}
2011
2012	# ok, we don't do keepalive, so get rid of the socket
2013        $select->remove($socket);
2014	$socket->close;
2015      }
2016    }
2017
2018    if ($opt_noisy) {
2019      status((exists $msgsout->{'curnum'} ? (scalar(keys %{$msgsout})-1) :
2020              scalar(keys %{$msgsout})).' messages outstanding');
2021    }
2022
2023#print "msgs waiting: ".join(" ", keys %{$msgsout})."\n";
2024#print "tmpfd defined? ".(defined $tmpfd ? "yes" : "no")."\n";
2025
2026    # we're not awaiting responses and we've exhausted the input file, so
2027    # drop the server socket. :)
2028    $select->remove($serv_socket) if (!keys %{$msgsout} && !defined $tmpfd);
2029  }
2030
2031  # remove the server and client message cache temp files
2032  foreach my $tied_hash (keys %server_caches) {
2033    undef $tied_hash;
2034  }
2035  foreach my $tied_hash (keys %client_caches) {
2036    undef $tied_hash;
2037  }
2038  foreach my $tmp_file (@cache_tmp_files) {
2039    unlink $tmp_file;
2040  }
2041}
2042
2043# this is the function that implements client mode.  generally, in a loop:
2044#  make a request of the server for some max number of messages, and send our
2045#  results back at the same time.  based on the results of that request, put
2046#  messages into a temp dir and process them.  prep the results and loop.
2047#  lather, rinse, repeat.
2048sub client_mode {
2049  $opt_cs_max ||= 1000;
2050  $opt_cs_timeout ||= 60 * 2;
2051  $opt_cs_conn_retries ||= 60;            # 1 hour
2052
2053  my($host, $port, $uri);
2054
2055  if ($opt_client =~ /^http:\/\/([^\/]+)(\/.*)?/) {
2056    ($host, $uri) = ($1,$2);
2057  } else {
2058    $host = $opt_client;
2059    if ($host =~ /^:/) {
2060      $host = 'localhost'.$host;
2061    }
2062  }
2063  ($host, $port) = split(/:/, $host);
2064
2065  die "No host found in opt_client" unless $host;
2066  $uri ||= "/";
2067
2068  # ::SSL needs resolved hostnames, at least on Solaris. note: not IPv6-aware
2069  $host = Socket::inet_ntoa(Socket::inet_aton($host));
2070
2071  # use this to track how many messages we ought to be requesting
2072  # start at 100 to get warmed up
2073  my $msgnum = $opt_cs_max > 100 ? 100 : $opt_cs_max;
2074
2075  my $tmpdir;
2076
2077  # if we're not doing paths_only, create a temp dir where we'll put the
2078  # incoming messages to process.
2079  if (!$opt_cs_paths_only) {
2080    $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir();
2081    die "Can't create tempdir" unless $tmpdir;
2082  }
2083
2084  my $made_conn_once = 0;
2085
2086  # keep going until something stops us.
2087  while (1) {
2088    # send cache data if this is the first connect
2089    # do this before creating the connection so we don't waste all the other
2090    # clients' time, once we've connected, by monopolizing the single server
2091    my ($gzpath, $gzfd, $action);
2092    if (!$made_conn_once && $opt_cs_schedule_cache) {
2093      # we need to find out what messages we have in our client cache so that we
2094      # can tell the server what we've got so that it can optimize our cache hit
2095      # rate by requesting us to scan messages that are already in our cache
2096      $action = 'sending cache';
2097      $gzpath = scan_client_cache();
2098    }
2099    else {
2100      $action = 'sending results';
2101      # if the number of messages to request is too much, bring it down
2102      $msgnum = $opt_cs_max if ($msgnum > $opt_cs_max);
2103
2104      # prep the POST request
2105      $postdata{'max_messages'} = $msgnum;
2106      $postdata{'paths_only'} = 1 if ($opt_cs_paths_only);
2107
2108      # compress the results
2109      ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
2110      die "Can't make tempfile, exiting" unless $gzpath;
2111      $gzfd->close;
2112      $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!";
2113
2114      # the actual POST data string
2115      send_line($gzfd, join('&', map { post_encode($_) . '=' . post_encode($postdata{$_}) } keys %postdata)) ||
2116        die "mass-check: error when writing to gz temp file\n";
2117      $gzfd->close;
2118      %postdata = ();
2119    }
2120
2121    # connect to server
2122    my $socket;
2123    if ($opt_cs_ssl) {
2124      $socket = IO::Socket::SSL->new(
2125		PeerAddr => $host,
2126		PeerPort => $port,
2127		SSL_version => "TLSv1",
2128		SSL_use_cert => 1,
2129		SSL_key_file => "spamassassin/client-key.pem",
2130		SSL_cert_file => "spamassassin/client-cert.pem",
2131	      );
2132    } else {
2133      $socket = IO::Socket::INET->new(
2134		PeerAddr => $host,
2135		PeerPort => $port
2136              );
2137    }
2138
2139    if (!$socket) {
2140      unlink $gzpath;
2141      undef $gzfd;
2142      # if we haven't yet made a connection, keep retrying;
2143      # this is probably the server still in "scan stage"
2144      if (!$made_conn_once) {
2145        if ($opt_cs_conn_retries-- > 0) {
2146          status('failed to connect, sleeping for retry') if ($opt_noisy);
2147          sleep 60;
2148          next;
2149        } else {
2150          status('failed to connect, giving up') if ($opt_noisy);
2151          last;
2152        }
2153      }
2154
2155      # last if connection fails after scanning something
2156      last;
2157    }
2158
2159    $made_conn_once = 1;
2160    status("requesting $msgnum messages from server") if ($opt_noisy);
2161
2162    # make request, include and then drop results if there are any
2163    my $POSTDATA = '';
2164    if ($gzpath) {
2165      if (open(RESULTS, $gzpath)) {
2166        binmode(RESULTS);
2167        {
2168          # slurp here, rather than into an anonymous variable in the
2169          # http_make_request call so that we don't end up slurping the
2170          # response from the server too
2171          local $/ = undef; # slurp
2172          $POSTDATA = scalar <RESULTS>;
2173        }
2174        close(RESULTS);
2175        unlink $gzpath;
2176        undef $gzfd;
2177      } else {
2178        die "Can't open tempfile, exiting" unless $gzpath;
2179      }
2180    }
2181
2182    my $result = http_make_request($socket, 'POST', $uri, {
2183      'Host'		=> $host,
2184      'Content-Type'	=> 'application/x-www-form-urlencoded',
2185      'Content-Encoding' => 'x-gzip',
2186      'Action'		=> $action,
2187      'Client-ID'	=> ($client_id || '')
2188      },
2189      $POSTDATA
2190    );
2191    undef $POSTDATA;
2192
2193    $t_last_msg = time;
2194
2195    # If we received messages to run through, go ahead and do it.
2196    # otherwise, just sleep for the timeout length and try again
2197    if (!defined $result) {
2198      # we got an error?!?  abort!
2199      last;
2200    }
2201    elsif ($result eq 'finished') {
2202      # the server said that we're done
2203      status('server has no more work, exiting.') if ($opt_noisy);
2204      last;
2205    }
2206    elsif ($result eq '') {
2207      # no messages means the server may give us more work down the road.
2208      # sleep for client_timeout seconds and try the request again
2209      status("Received no messages, waiting $opt_cs_timeout seconds") if ($opt_noisy);
2210      sleep $opt_cs_timeout;
2211    }
2212    else {
2213      # we got messages, so deal with them.
2214      my $time_start = time;
2215      $t_first_msg = $time_start unless $t_first_msg;
2216
2217      # postdata will hold our results, real will hold the original message
2218      # data from the server's scan mode.
2219      %postdata = ();
2220      %real = ();
2221      $init_results = $total_count = $spam_count = $ham_count = 0;
2222
2223      # we got a result, so do things with it!
2224      my $gzfd = IO::Zlib->new($result, "rb");
2225      die "Can't open temp result file: $!" unless $gzfd;
2226
2227      # used for the temp queue file
2228      my $tmppath;
2229      ($tmppath, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile();
2230      die "Can't make tempfile, exiting" unless $tmppath;
2231      unlink $tmppath;
2232
2233      # if we have a temp directory, clean it out for this run
2234      clean_dir($tmpdir) if ($tmpdir);
2235
2236      # Archive format, gzip compressed file w/ 4 parts per message:
2237      # 1- server message number in text format
2238      # 2- server index string, binary packed format
2239      # 3- a 1 if the message is included, 0 otherwise -- unless paths_only
2240      # 4- message content -- unless paths_only or #3 is 0
2241
2242      # number of messages
2243      $msgnum = $total_messages = read_line($gzfd) || die "mass-check: error reading from gzip message file\n";
2244
2245      status("server says it gave us $total_messages messages") if ($opt_progress);
2246      my $actual_messages = 0;
2247
2248      # loop through and prep all of the messages the server sent
2249      for(my $i = 0 ; $i < $total_messages; $i++ ) {
2250        my $num = read_line($gzfd);
2251	last unless defined $num;
2252
2253        next unless $num; # a message number of 0 indicates a msg-error on the server
2254
2255        my $index = read_line($gzfd);
2256	last unless defined $index;
2257
2258	my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index);
2259
2260	# if we're doing paths_only, there'll be no message content
2261	if (!$opt_cs_paths_only) {
2262          my $msg_included = read_line($gzfd);
2263          last unless defined $msg_included;
2264
2265	  my $msg;
2266          if ($msg_included == 1) {
2267            $msg = read_line($gzfd);
2268            last unless defined $msg;
2269          }
2270
2271          # permanently cache the message on the client
2272          my $cache_success = 0;
2273          if ($opt_cs_cache) {
2274            unless (-f "$opt_cs_cachedir/$d[3]") {
2275              umask 0077; # prevent others on the system from reading corpus messages
2276                          # TODO: the entire mass-check script should probably use umask 0077;
2277
2278              mkdir $opt_cs_cachedir; # mkdir won't bitch if it already exists, that's ok
2279              if (-d $opt_cs_cachedir && $d[3] =~ m{^/?(.*(?![^\\]\\)/)?(.+)$}) {
2280                my ($path, $filename) = ($1, $2);
2281                my $dir = '';
2282                $path ||= '';
2283                while ($path =~ m#((?![^\\]\\)[^/]+/)#gc) {
2284                  $dir .= $1;
2285                  mkdir "$opt_cs_cachedir/$dir";
2286                }
2287                if (-d "$opt_cs_cachedir/$dir") {
2288                  if (open(OUT, ">$opt_cs_cachedir/$dir$filename")) {
2289                    print OUT $msg;
2290                    close $msg;
2291
2292                    $cache_success = 1;
2293                  }
2294                  else {
2295                    warn "Can't create/write message cache file $opt_cs_cachedir/$dir$filename: $!";
2296                  }
2297                }
2298              }
2299            }
2300            else {
2301              $cache_success = 1;
2302              $cache_hits++;
2303            }
2304            if ($cache_success) {
2305              # this is a little tricky -- we need to process the files in the
2306              # path and format we've created, but the original data is needed
2307              # to create a proper result later, so deal with that here.
2308              $real{"$opt_cs_cachedir/$d[3]"} = \@d;
2309              push @{$real{"$opt_cs_cachedir/$d[3]"}}, $num;
2310              send_line($tmpfd,
2311                Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$opt_cs_cachedir/$d[3]")) ||
2312                  die "mass-check: error writing out temp file in client mode\n";
2313              $actual_messages++;
2314            }
2315          }
2316
2317          # if the cache failed try a temp file instead
2318          if (!$opt_cs_cache || !$cache_success) {
2319            next unless defined $msg;
2320 	    # it's going to be a dir of file formatted messages
2321	    if (open(OUT, ">$tmpdir/$num")) {
2322	      print OUT $msg;
2323	      close(OUT);
2324
2325              # this is a little tricky -- we need to process the files in the
2326              # path and format we've created, but the original data is needed
2327              # to create a proper result later, so deal with that here.
2328              $real{"$tmpdir/$num"} = \@d;
2329              push @{$real{"$tmpdir/$num"}}, $num;
2330              send_line($tmpfd,
2331                Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$tmpdir/$num")) ||
2332                  die "mass-check: error writing out temp file in client mode\n";
2333              $actual_messages++;
2334	    }
2335	    else {
2336	      warn "Can't create/write message temp file $tmpdir/$num: $!";
2337	    }
2338          }
2339	}
2340        # paths_only_mode
2341	else {
2342          # permanently cache the message on the client
2343          my $cache_success = 0;
2344          if ($opt_cs_cache) {
2345            unless (-f "$opt_cs_cachedir/$d[3]") {
2346              umask 0077; # prevent others on the system from reading corpus messages
2347                          # TODO: the entire mass-check script should probably use umask 0077;
2348
2349              mkdir $opt_cs_cachedir; # mkdir won't bitch if it already exists, that's ok
2350              if (-d $opt_cs_cachedir && $d[3] =~ m{^/?(.*(?![^\\]\\)/)?(.+)$}) {
2351                my ($path, $filename) = ($1, $2);
2352                my $dir = '';
2353                $path ||= '';
2354                while ($path =~ m#((?![^\\]\\)[^/]+/)#gc) {
2355                  $dir .= $1;
2356                  mkdir "$opt_cs_cachedir/$dir";
2357                }
2358                if (-d "$opt_cs_cachedir/$dir") {
2359                  # copy the file to the local cache for use next (and this) time
2360                  if (copy($d[3], "$opt_cs_cachedir/$d[3]")) {
2361                    $cache_success = 1;
2362                  }
2363                }
2364              }
2365            }
2366            else {
2367              $cache_success = 1;
2368              $cache_hits++;
2369            }
2370            if ($cache_success) {
2371              # this is a little tricky -- we need to process the files in the
2372              # path and format we've created, but the original data is needed
2373              # to create a proper result later, so deal with that here.
2374              $real{"$opt_cs_cachedir/$d[3]"} = \@d;
2375              push @{$real{"$opt_cs_cachedir/$d[3]"}}, $num;
2376              send_line($tmpfd,
2377                Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$opt_cs_cachedir/$d[3]")) ||
2378                  die "mass-check: error writing out temp file in client mode\n";
2379              $actual_messages++;
2380            }
2381          }
2382
2383          # if the messages isn't in our cache use the supplied path instead
2384          if (!$opt_cs_cache || !$cache_success) {
2385	    # in paths_only mode, there's no kluging between formats since we're
2386	    # reading the same corpus, however we do still need to track server
2387	    # message number to message data so our results will be useable.
2388            $real{$d[3]} = \@d;
2389            push @{$real{$d[3]}}, $num;
2390	    send_line($tmpfd, $index) ||
2391	        die "mass-check: error writing out temp file in client mode\n";
2392            $actual_messages++;
2393          }
2394	}
2395      }
2396
2397      $gzfd->close;
2398      unlink $result;
2399
2400      # if the server tries to give us messages, but some are errored,
2401      # we need to know that so we don't try to process them all.
2402      if ($total_messages != $actual_messages) {
2403        status("server actually gave us $actual_messages messages") if ($opt_progress);
2404        $total_messages = $actual_messages;
2405      }
2406
2407      if ($opt_progress) {
2408        status('starting run stage');
2409      }
2410
2411      # we're about to start running, so go back to the start of the file
2412      seek $tmpfd, 0, 0;
2413
2414      run_through_messages();
2415
2416      # we're done with the temp file -- bye bye
2417      close($tmpfd);
2418
2419      # figure out new max messages, try keeping ~cs_timeout between runs
2420      my $time_end = time;
2421
2422      # if we only requested a small number of messages, it may take <1s to
2423      # run through them, so fake it and say it took 1s.
2424      if ($time_end == $time_start) {
2425        $time_end++;
2426      }
2427
2428      if ($opt_progress) {
2429        status('completed run stage');
2430      }
2431
2432      status('completed run in '.($time_end-$time_start).' seconds') if ($opt_noisy);
2433      $msgnum = int($msgnum * $opt_cs_timeout / ($time_end-$time_start)) || 1;
2434    }
2435  }
2436
2437  # if we were using a temp dir, clean it out and then remove it
2438  if ($tmpdir) {
2439    clean_dir($tmpdir);
2440    rmdir $tmpdir;
2441  }
2442}
2443
2444# scan the client's cache and return a path to a gzip archive of AI output
2445sub scan_client_cache {
2446  status('starting cache scan stage') if ($opt_progress);
2447
2448  # make sure the cachedir exists so that AI doesn't bail out
2449  mkdir $opt_cs_cachedir unless -e $opt_cs_cachedir;
2450
2451  # Make a temp file and delete it
2452  my $tmpf;
2453  ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile();
2454  die 'mass-check: failed to create temp file' unless $tmpf;
2455  unlink $tmpf or die "mass-check: unlink '$tmpf': $!";
2456
2457  my @targets = ("h:dir:$opt_cs_cachedir");
2458  generate_queue(\@targets, $tmpfd);
2459
2460  # we now have a temporary file with the messages to process
2461  seek($tmpfd, 0, 0);
2462  # the first line is the number of messages
2463  $total_messages = read_line($tmpfd);
2464  showdots_finish();
2465  status("completed cache scan stage, $total_messages messages") if ($opt_progress);
2466
2467  unless ($tmpfd) {
2468    status('cache scan failed');
2469    return;
2470  }
2471  status('compressing cache data') if ($opt_progress);
2472
2473  # create a temp file for compression
2474  my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
2475  die "Can't make tempfile, exiting" unless $gzpath;
2476  $gzfd->close;
2477  $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!";
2478
2479  while (my $msg = read_line($tmpfd)) {
2480    $msg =~ s/\r?\n$//;
2481    my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg);
2482    $d[3] =~ s/^$opt_cs_cachedir//;
2483
2484    send_line($gzfd, Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], $d[2], $d[3])) ||
2485      die "mass-check: error writing out temp file in client mode\n";
2486  }
2487  close $tmpfd;
2488  undef $tmpfd;
2489  $gzfd->close;
2490  status('cache data compressed to '.(-s $gzpath).' bytes') if ($opt_progress);
2491
2492  return $gzpath;
2493}
2494
2495############################################################################
2496
2497# in server mode, just return the ref to the message data
2498sub wanted_server {
2499  my ($class, $id, $time, $dataref, $format) = @_;
2500  return $dataref;
2501}
2502
2503# very similar to result() except the result has the message number at the
2504# front, so strip it off and then set the POST data appropriately.
2505sub result_client {
2506  my ($class, $result, $time) = @_;
2507
2508  # don't open results files until we get here to avoid overwriting files
2509  init_results() if !$init_results;
2510
2511  if ($class eq "s") {
2512    $spam_count++;
2513  }
2514  elsif ($class eq "h") {
2515    $ham_count++;
2516  }
2517
2518  $total_count++;
2519
2520  if ($opt_progress) {
2521    progress($time);
2522  }
2523
2524  if ($result =~ s/^(\d+)\s+//m) {
2525    $postdata{$1} = $result;
2526  }
2527  else {
2528    warn ">> WTH!?  result is not in the correct format: $result\n";
2529    # 20071114: bit of a hack
2530    # prevent malformed cs_client message result lines from preventing the
2531    # cs_server running in cs_schedule cache mode from completing
2532    # TODO: find out how result lines get malformed (malformed result lines
2533    # will still hang any cs_server not running with cs_schedule_cache)
2534    $failed_msgs++;
2535  }
2536}
2537
2538sub aidbg {
2539  if (would_log("dbg", "mass-check") == 2) {
2540    dbg (@_);
2541  }
2542}
2543
2544sub deal_with_before_after {
2545  my($which, $time) = @_;
2546
2547  if ($time && $time =~ /^-\d+$/) {
2548    $time = time + $time;
2549  }
2550  elsif ($time && $time !~ /^-?\d+$/) {
2551    if (HAS_TIME_PARSEDATE) {
2552      $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
2553    }
2554    else {
2555      die "You need Time::ParseDate if you use either the --before or --after option.";
2556    }
2557  }
2558
2559  if ($which eq 'before') {
2560    $opt_before = $time;
2561  }
2562  else {
2563    $opt_after = $time;
2564  }
2565
2566  if ($opt_before && $opt_after && $opt_after >= $opt_before) {
2567    die "--before ($opt_before) <= --after ($opt_after) -- conflict!";
2568  }
2569}
2570
2571sub generate_queue {
2572  my ($targets, $tmpfd) = @_;
2573
2574  # scan the targets and get the number and list of messages
2575  $iter->_scan_targets($targets,
2576    sub {
2577      my($self, $date, $class, $format, $mail) = @_;
2578      push(@{$self->{$class}}, Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $mail));
2579    }
2580  );
2581
2582  # deal with opt_head and opt_tail
2583  top_and_tail_messages($iter->{h});
2584  top_and_tail_messages($iter->{s});
2585
2586  my $messages;
2587  if ($opt_n) {
2588    # OPT_N == 1 means don't bother sorting on message receive date
2589
2590    # for ease of memory, we'll play with pointers
2591    $messages = $iter->{s};
2592    undef $iter->{s};
2593    if ($iter->{h}) {
2594      push(@{$messages}, @{$iter->{h}});
2595      undef $iter->{h};
2596    }
2597  }
2598  else {
2599    # OPT_N == 0 means sort on message receive date
2600
2601    # Sort the spam and ham groups by date
2602    my @s = @{$iter->{s}};
2603    undef $iter->{s};
2604    my @h = @{$iter->{h}};
2605    undef $iter->{h};
2606
2607    # interleave ordered spam and ham
2608    if (@s && @h) {
2609      my $ratio = @s / @h;
2610      while (@s && @h) {
2611	push @{$messages}, (@s / @h > $ratio) ? (shift @s) : (shift @h);
2612      }
2613    }
2614    # push the rest onto the end
2615    push @{$messages}, @s, @h;
2616  }
2617
2618  # head or tail < 0 means crop the total list, negate the value appropriately
2619  if ($opt_tail < 0) {
2620    splice(@{$messages}, 0, $opt_tail);
2621  }
2622  if ($opt_head < 0) {
2623    splice(@{$messages}, -$opt_head);
2624  }
2625
2626  my $num = $Mail::SpamAssassin::ArchiveIterator::MESSAGES = scalar(@{$messages});
2627
2628  # Dump out the number of messages and the message index info to
2629  # the temp file
2630  send_line($tmpfd, $num, @{$messages});
2631}
2632
2633sub top_and_tail_messages {
2634  my ($ary) = @_;
2635
2636  if ($opt_n) {
2637    # OPT_N == 1 means don't bother sorting on message receive date
2638
2639    # head or tail > 0 means crop each list
2640    if ($opt_tail > 0) {
2641      splice(@{$ary}, 0, -$opt_tail);
2642    }
2643    if ($opt_head > 0) {
2644      splice(@{$ary}, min ($opt_head, scalar @{$ary}));
2645    }
2646  }
2647  else {
2648    # OPT_N == 0 means sort on message receive date
2649
2650    # Sort the spam and ham groups by date
2651    my @s = sort { $a cmp $b } @{$ary};
2652
2653    # head or tail > 0 means crop each list
2654    if ($opt_tail > 0) {
2655      splice(@s, 0, -$opt_tail);
2656    }
2657    if ($opt_head > 0) {
2658      splice(@s, min ($opt_head, scalar @s));
2659    }
2660
2661    @{$ary} = @s;
2662  }
2663}
2664
2665sub min {
2666  return ($_[0] < $_[1] ? $_[0] : $_[1]);
2667}
2668
2669############################################################################
2670
2671sub run_post_scan {
2672  return unless $opt_run_post_scan;
2673  system($opt_run_post_scan);
2674  if ($? >> 8 != 0) {
2675    warn "$opt_run_post_scan failed";
2676  }
2677}
2678
2679