1#!/usr/bin/perl -w -T
2# <@LICENSE>
3# Licensed to the Apache Software Foundation (ASF) under one or more
4# contributor license agreements.  See the NOTICE file distributed with
5# this work for additional information regarding copyright ownership.
6# The ASF licenses this file to you under the Apache License, Version 2.0
7# (the "License"); you may not use this file except in compliance with
8# the License.  You may obtain a copy of the License at:
9#
10#     http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17# </@LICENSE>
18
19use strict;
20use warnings;
21# use bytes;
22
23use Errno qw(EBADF);
24use Getopt::Long;
25use Pod::Usage;
26use File::Spec;
27use POSIX qw(locale_h setsid sigprocmask _exit);
28
# Use the "C" locale for the LC_TIME category only.
POSIX::setlocale( LC_TIME, 'C' );

# Package-wide state shared by the main program and the callbacks below.
our $spamtest;               # the Mail::SpamAssassin object
our %opt;                    # command-line options, filled in by GetOptions
our $isspam;                 # 1 after --spam, 0 after --ham, undef otherwise
our $forget;                 # set by --forget
our $messagecount;           # messages handed to the learner so far
our $learnedcount;           # messages the learner reported as learned
our $messagelimit;           # copy of --stopafter
our $progress;               # progress bar object, created for --progress
our $total_messages;         # message total used to size the progress bar
our $init_results;           # true once init_results() has run
our $start_time;             # time() taken just before the run starts
our $synconly;               # set by --sync, --rebuild and --force-expire
our $learnprob;              # copy of --learnprob
our @targets;                # "class:format:location" entries to process
our $bayes_override_path;    # value of --dbpath / --db

# The @@...@@ placeholders are substituted at 'make' time.
my $PREFIX          = '@@PREFIX@@';
my $DEF_RULES_DIR   = '@@DEF_RULES_DIR@@';
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@';
37
38use lib '@@INSTALLSITELIB@@';                # substituted at 'make' time
39
BEGIN {                          # see comments in "spamassassin.raw" for doco
  # Directory portion of the path this script was started as; the current
  # directory is used when $0 carries no directory component.
  my ( $volume, $dirs ) = File::Spec->splitpath($0);
  my $bin = $volume ? File::Spec->catpath( $volume, $dirs, '' ) : $dirs;
  $bin ||= File::Spec->curdir;

  # Search relative to the script only when a copy of the module sits next
  # to it, or when none exists in the configured site library.
  if ( -e "$bin/lib/Mail/SpamAssassin.pm"
        || !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm' )
  {
    my $searchrelative;
    $searchrelative = 1;    # disabled during "make install": REMOVEFORINST

    my $use_blib = $searchrelative
      && $bin eq '../'
      && -e '../blib/lib/Mail/SpamAssassin.pm';

    if ($use_blib) {
      unshift @INC, '../blib/lib';
    }
    else {
      # Take the first candidate directory that really holds the module.
      for my $candidate ( qw(lib ../lib/site_perl
                             ../lib/spamassassin ../share/spamassassin/lib) )
      {
        my $dir = File::Spec->catdir( $bin, split( m{/}, $candidate ) );
        next unless -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" );
        unshift @INC, $dir;
        last;
      }
    }
  }
}
64
65use Mail::SpamAssassin;
66use Mail::SpamAssassin::ArchiveIterator;
67use Mail::SpamAssassin::Message;
68use Mail::SpamAssassin::PerMsgLearner;
69use Mail::SpamAssassin::Util::Progress;
70use Mail::SpamAssassin::Logger;
71
72###########################################################################
73
# Ignore SIGPIPE.
$SIG{PIPE} = 'IGNORE';

# used to be CmdLearn::cmd_run() ...

# Options that are consulted later even when absent from the command line get
# explicit defaults; 'cf' accumulates every --cf value.
%opt = (
  'cf'           => [],
  'quiet'        => 0,
  'nosync'       => 0,
  'use-ignores'  => 0,
  'force-expire' => 0,
);

Getopt::Long::Configure(
  'bundling', 'no_getopt_compat',
  'permute',  'no_auto_abbrev', 'no_ignore_case',
);

my $options_ok = GetOptions(

  # what to do
  'forget'      => \$forget,
  'ham|nonspam' => sub { $isspam = 0; },
  'spam'        => sub { $isspam = 1; },
  'sync'        => \$synconly,
  'rebuild'     => sub {
    warn "The --rebuild option has been deprecated.  Please use --sync instead.\n";
    $synconly = 1;
  },

  # configuration sources
  'q|quiet'                                 => \$opt{'quiet'},
  'username|u=s'                            => \$opt{'username'},
  'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
  'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
  'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
  'cf=s'                                    => $opt{'cf'},

  # run-time behaviour
  'folders|f=s'          => \$opt{'folders'},
  'force-expire|expire'  => \$opt{'force-expire'},
  'local|L'              => \$opt{'local'},
  'no-sync|nosync'       => \$opt{'nosync'},
  'showdots'             => \$opt{'showdots'},
  'progress'             => \$opt{'progress'},
  'use-ignores'          => \$opt{'use-ignores'},
  'no-rebuild|norebuild' => sub {
    warn "The --no-rebuild option has been deprecated.  Please use --no-sync instead.\n";
    $opt{'nosync'} = 1;
  },

  # sampling and limits
  'learnprob=f' => \$opt{'learnprob'},
  'randseed=i'  => \$opt{'randseed'},
  'stopafter=i' => \$opt{'stopafter'},
  'max-size=i'  => \$opt{'max-size'},

  # informational
  'debug|debug-level|D:s' => \$opt{'debug'},
  'help|h|?'              => \$opt{'help'},
  'version|V'             => \$opt{'version'},

  # database maintenance
  'dump:s'    => \$opt{'dump'},
  'import'    => \$opt{'import'},
  'backup'    => \$opt{'backup'},
  'clear'     => \$opt{'clear'},
  'restore=s' => \$opt{'restore'},

  # input formats; --dir, --file and --single only record the old-style name
  'dir'    => sub { $opt{'old_format'} = 'dir'; },
  'file'   => sub { $opt{'old_format'} = 'file'; },
  'single' => sub { $opt{'old_format'} = 'single'; },
  'mbox'   => sub { $opt{'format'}     = 'mbox'; },
  'mbx'    => sub { $opt{'format'}     = 'mbx'; },

  'db|dbpath=s' => \$bayes_override_path,
  're|regexp=s' => \$opt{'regexp'},

  # every non-option argument is passed to target()
  '<>' => \&target,
);

usage( 0, "Unknown option!" ) unless $options_ok;
142
# --help and --version are answered straight away.
if ( defined $opt{'help'} ) {
  usage( 0, "For more information read the manual page" );
}
if ( defined $opt{'version'} ) {
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  exit 0;
}

# set debug areas, if any specified (only useful for command-line tools);
# -D given without a value selects every area
if (defined $opt{'debug'}) {
  $opt{'debug'} ||= 'all';
}

# --force-expire implies a sync-only run
if ( $opt{'force-expire'} ) {
  $synconly = 1;
}

# The two progress displays are mutually exclusive.  Refusing to run is a
# usage error, so the message goes to STDERR and the exit status is the same
# non-zero value usage() uses, instead of reporting success.
if ($opt{'showdots'} && $opt{'progress'}) {
  warn "--showdots and --progress may not be used together, please select just one\n";
  exit 64;
}

# At least one action has to be requested.
if ( !defined $isspam
  && !defined $synconly
  && !defined $forget
  && !defined $opt{'dump'}
  && !defined $opt{'import'}
  && !defined $opt{'clear'}
  && !defined $opt{'backup'}
  && !defined $opt{'restore'}
  && !defined $opt{'folders'} )
{
  usage( 0,
"Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}

# We need to make sure the journal syncs pre-forget...
if ( defined $forget && $opt{'nosync'} ) {
  $opt{'nosync'} = 0;
  warn
"sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}

if ( defined $opt{'old_format'} ) {

  #Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or --single.
  #Convert it to the new behavior: --single means "read one message from
  #stdin"; --dir and --file need no conversion.
  if ( $opt{'old_format'} eq 'single' ) {
    push ( @ARGV, '-' );
  }
}
195
# Configuration text handed to SpamAssassin as post_config_text, assembled
# piece by piece below.
my $post_config = '';
my @post_config_parts;

# kluge to support old check_bayes_db operation
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
# to override.  Just access the dbpath version via post_config_text.
if ( defined $bayes_override_path ) {
  # A directory gets the default file prefix appended.
  $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' )
    if -d $bayes_override_path;

  push @post_config_parts, "bayes_path $bayes_override_path\n";
}

# These options require bayes_scanner, which requires "use_bayes 1", but
# that's not necessary for these commands.
my $needs_bayes_scanner = grep { defined $opt{$_} }
  qw(dump import clear backup restore);
push @post_config_parts, "use_bayes 1\n" if $needs_bayes_scanner;

# --cf lines come last, one per line.
push @post_config_parts, join( "\n", @{ $opt{'cf'} } ) . "\n";

$post_config = join( '', @post_config_parts );

# create the tester factory
$spamtest = Mail::SpamAssassin->new(
  {
    PREFIX              => $PREFIX,
    DEF_RULES_DIR       => $DEF_RULES_DIR,
    LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
    rules_filename      => $opt{'configpath'},
    site_rules_filename => $opt{'siteconfigpath'},
    userprefs_filename  => $opt{'prefspath'},
    username            => $opt{'username'},
    debug               => $opt{'debug'},
    local_tests_only    => $opt{'local'},
    dont_copy_prefs     => 1,
    post_config_text    => $post_config,
  }
);

$spamtest->init(1);
dbg("sa-learn: spamtest initialized");

# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
# To be resolved more cleanly!!!
if ( my $scanner = $spamtest->{bayes_scanner} ) {
  for my $plugin ( @{ $spamtest->{plugins}->{plugins} } ) {
    next unless $plugin->isa('Mail::SpamAssassin::Plugin::Bayes');

    # copy plugin's "store" object ref one level up!
    $scanner->{store} = $plugin->{store};
  }
}
249
if (Mail::SpamAssassin::Util::am_running_on_windows()) {
  binmode(STDIN)  or die "cannot set binmode on STDIN: $!";  # bug 4363
  binmode(STDOUT) or die "cannot set binmode on STDOUT: $!";
}

# Successful end of a maintenance command: release the learner, close the
# standard handles and exit with $status.
sub _finish_and_exit {
  my ($status) = @_;

  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT  or die "error closing STDOUT: $!";
  close STDIN   or die "error closing STDIN: $!";
  exit $status;
}

# Failed maintenance command: release the learner, then die with $message.
sub _finish_and_die {
  my ($message) = @_;

  $spamtest->finish_learner();
  die $message;
}

# --dump [all|data|magic]
if ( defined $opt{'dump'} ) {
  # [ show magic tokens?, show data tokens? ] for each accepted argument;
  # a bare --dump arrives as the empty string and behaves like "all".
  my %dump_selection = (
    ''      => [ 1, 1 ],
    'all'   => [ 1, 1 ],
    'magic' => [ 1, 0 ],
    'data'  => [ 0, 1 ],
  );
  my $selection = $dump_selection{ $opt{'dump'} };

  if ( !$selection ) {                   # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }

  my ( $magic, $toks ) = @{$selection};
  if ( !$spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'} ) ) {
    _finish_and_die("ERROR: Bayes dump returned an error, please re-run with -D for more information\n");
  }

  _finish_and_exit(0);
}

# --import: the exit status is 0 when perform_upgrade() returns true
if ( defined $opt{'import'} ) {
  my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
  _finish_and_exit( $ret ? 0 : 1 );
}

# --clear
if (defined $opt{'clear'}) {
  unless ($spamtest->{bayes_scanner}->{store}->clear_database()) {
    _finish_and_die("ERROR: Bayes clear returned an error, please re-run with -D for more information\n");
  }

  _finish_and_exit(0);
}

# --backup
if (defined $opt{'backup'}) {
  unless ($spamtest->{bayes_scanner}->{store}->backup_database()) {
    _finish_and_die("ERROR: Bayes backup returned an error, please re-run with -D for more information\n");
  }

  _finish_and_exit(0);
}

# --restore <filename>
if (defined $opt{'restore'}) {

  my $filename = $opt{'restore'};

  # Only an empty name is rejected; a plain truth test would also refuse a
  # file that happens to be called "0".
  unless ( length $filename ) {
    _finish_and_die("ERROR: You must specify a filename to restore.\n");
  }

  unless ($spamtest->{bayes_scanner}->{store}->restore_database($filename, $opt{'showdots'})) {
    _finish_and_die("ERROR: Bayes restore returned an error, please re-run with -D for more information\n");
  }

  _finish_and_exit(0);
}
340
# Everything below is a learn or sync run, which needs Bayes enabled.
unless ( $spamtest->{conf}->{use_bayes} ) {
  warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
  exit 1;
}

$spamtest->init_learner(
  {
    caller_will_untie => 1,
    wait_for_lock     => 1,
    learn_to_journal  => $opt{'nosync'},
    force_expire      => $opt{'force-expire'},
  }
);

$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

# --sync / --force-expire: rebuild the caches and stop; nothing is learned.
if ($synconly) {
  $spamtest->rebuild_learner_caches(
    {
      showdots => $opt{'showdots'},
      verbose  => !$opt{'quiet'},
    }
  );
  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT  or die "error closing STDOUT: $!";
  close STDIN   or die "error closing STDIN: $!";
  exit 0;
}

# Limits used by wanted() for every message.
( $messagelimit, $learnprob ) = @opt{ 'stopafter', 'learnprob' };

srand( $opt{'randseed'} ) if defined $opt{'randseed'};

# sync the journal first if we're going to go r/w so we make sure to
# learn everything before doing anything else.
#
$spamtest->rebuild_learner_caches() unless $opt{nosync};
384
# what is the result of the run?  will end up being the exit code.
my $exit_status = 0;

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
  # each of these signals releases the learner and unwinds through die
  $SIG{HUP}  = \&killed;
  $SIG{INT}  = \&killed;
  $SIG{TERM} = \&killed;

  if ( $opt{folders} ) {
    # Three-argument open with a lexical handle: the name given to --folders
    # is used literally, so mode characters or surrounding whitespace in it
    # are not interpreted by open.
    open( my $folders_fh, '<', $opt{folders} )
      or die "cannot open $opt{folders}: $!";

    # $! is cleared before every read, so a non-zero value after the loop can
    # only come from the read that ended it.
    for ($!=0; <$folders_fh>; $!=0) {
      chomp;
      next if /^\s*$/;
      if (/^(ham|spam):(\w*):(.*)/) {
        # explicit "class:type:location" line; an empty type means "detect"
        my $class  = $1;
        my $format = $2 || "detect";
        my $target = $3;
        push ( @targets, "$class:$format:$target" );
      }
      else {
        # bare location: class and format come from the command line
        target($_);
      }
    }
    defined $_ || $!==0  or
      $!==EBADF ? dbg("error reading from $opt{folders}: $!")
                : die "error reading from $opt{folders}: $!";
    close($folders_fh)  or die "error closing $opt{folders}: $!";
  }

  ###########################################################################
  # Deal with the target listing, and STDIN -> tempfile

  my $tempfile; # will be defined if stdin -> tempfile
  push(@targets, @ARGV);
  @targets = ('-') unless @targets || $opt{folders};

  # Index loop on purpose: entries are spliced out of, and appended to,
  # @targets while it is being walked.
  for(my $elem = 0; $elem <= $#targets; $elem++) {
    # ArchiveIterator doesn't really like STDIN, so if "-" is specified
    # as a target, make it a temp file instead.
    if ( $targets[$elem] =~ /(?:^|:)-$/ ) {
      if (defined $tempfile) {
        # uh-oh, stdin specified multiple times?
        warn "skipping extra stdin target (".$targets[$elem].")\n";
        splice @targets, $elem, 1;
        $elem--; # go back to this element again
        next;
      }
      else {
        my $handle;
        ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
        binmode $handle  or die "cannot set binmode on file $tempfile: $!";

        # avoid slurping the whole file into memory, copy chunk by chunk
        my($inbuf,$nread);
        while ( $nread=sysread(STDIN,$inbuf,16384) )
          { print {$handle} $inbuf  or die "error writing to $tempfile: $!" }
        defined $nread  or die "error reading from STDIN: $!";
        close $handle  or die "error closing $tempfile: $!";

        # re-aim the targets at the tempfile instead of STDIN
        $targets[$elem] =~ s/-$/$tempfile/;
      }
    }

    # make sure the target list is in the normal AI format
    if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) {
      my $item = splice @targets, $elem, 1;
      target($item); # add back to the list
      $elem--; # go back to this element again
      next;
    }
  }

  ###########################################################################

  my $iter = Mail::SpamAssassin::ArchiveIterator->new(
    {
        # skip messages larger than max-size bytes,
        # 0 for no limit, undef defaults to 500 KB
      'opt_max_size' => $opt{'max-size'},
      'opt_want_date' => 0,
      'opt_from_regex' => $spamtest->{conf}->{mbox_format_from_regex},
    }
  );

  $iter->set_functions(\&wanted, \&result);
  $messagecount = 0;
  $learnedcount = 0;

  $init_results = 0;
  $start_time = time;

  # if exit_status isn't already set to non-zero, set it to the reverse of the
  # run result (0 is bad, 1+ is good -- the opposite of exit status codes)
  my $run_ok = eval { $exit_status ||= ! $iter->run(@targets); 1 };

  # Keep the failure reason now: $@ is a global, and the reporting and
  # cleanup code below may run evals of its own before it is examined.
  my $run_error = $run_ok ? '' : $@;

  print STDERR "\n" if ($opt{showdots});
  $progress->final() if ($opt{progress} && $progress);

  my $phrase = defined $forget ? "Forgot" : "Learned";
  print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n"
    if !$opt{'quiet'};

  # If we needed to make a tempfile, go delete it.
  if (defined $tempfile) {
    unlink $tempfile  or die "cannot unlink temporary file $tempfile: $!";
    undef $tempfile;
  }

  # wanted() dies with HITLIMIT to stop the run early; that is not an error.
  if (!$run_ok && $run_error !~ /HITLIMIT/) { die $run_error }
  1;
} or do {
  my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
  $spamtest->finish_learner();
  die $eval_stat;
};

$spamtest->finish_learner();
# make sure we notice any write errors while flushing output buffer
close STDOUT  or die "error closing STDOUT: $!";
close STDIN   or die "error closing STDIN: $!";
exit $exit_status;
509
510###########################################################################
511
# Handler installed for HUP, INT and TERM: release the learner first, then
# abandon the run by dying.
sub killed {
  $spamtest->finish_learner();

  die "interrupted";
}
516
# Append one location to @targets as "class:format:location".  The class is
# "spam" when $isspam is true and "ham" otherwise; the format is the one
# chosen with --mbox/--mbx, or "detect" when none was given.
sub target {
  my ($target) = @_;

  my $class = "ham";
  $class = "spam" if $isspam;

  my $format = "detect";
  $format = $opt{'format'} if defined $opt{'format'};

  push( @targets, "$class:$format:$target" );
}
525
526###########################################################################
527
# One-time setup performed when the first result arrives: mark the setup as
# done and, for --progress, create the progress bar sized to the number of
# messages reported by ArchiveIterator.
sub init_results {
  $init_results = 1;

  if ( !$opt{'progress'} ) {
    return;
  }

  $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;

  $progress = Mail::SpamAssassin::Util::Progress->new(
    {
      total => $total_messages,
    }
  );
}
537
538###########################################################################
539
# Result callback registered with the ArchiveIterator.  Its arguments are not
# used; it runs the one-time setup on the first call and then moves the
# progress bar, when there is one, to the current message count.
sub result {
  my ($class, $result, $time) = @_;

  # don't open results files until we get here to avoid overwriting files
  init_results() unless $init_results;

  $progress->update($messagecount) if ($opt{progress} && $progress);
}
548
549###########################################################################
550
# Per-message callback registered with the ArchiveIterator: learn (or forget)
# a single message.
#
#   $class   - "s" marks the message as spam, anything else as ham
#   $id      - not used here
#   $time    - not used here
#   $dataref - message data, handed to $spamtest->parse()
#
# Returns 1.  Dies with 'HITLIMIT' once --stopafter messages have been
# learned, and with an ERROR message when the learner is unavailable.
sub wanted {
  my ( $class, $id, $time, $dataref ) = @_;

  my $spam = $class eq "s" ? 1 : 0;

  if ( defined($learnprob) ) {
    # --learnprob samples the input at random.  A probability of zero or less
    # never selects a message; testing for it first also keeps 1/$learnprob
    # from dividing by zero.
    if ( $learnprob <= 0 || int( rand( 1 / $learnprob ) ) != 0 ) {
      print STDERR '_' if ( $opt{showdots} );
      return 1;
    }
  }

  # Checked before the message is processed, so the run stops as soon as
  # exactly $messagelimit messages have been learned.
  if ( defined($messagelimit) && $learnedcount >= $messagelimit ) {
    $progress->final() if ($opt{progress} && $progress);
    die 'HITLIMIT';
  }

  $messagecount++;
  my $ma = $spamtest->parse($dataref);

  # A message already carrying SpamAssassin's header is re-parsed from its
  # markup-free form.
  if ( $ma->get_header("X-Spam-Checker-Version") ) {
    my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
    $ma->finish();
    $ma = $new_ma;
  }

  my $status = $spamtest->learn( $ma, undef, $spam, $forget );
  my $learned = $status->did_learn();

  if ( !defined $learned ) {    # undef=learning unavailable
    die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
  }
  elsif ( $learned == 1 ) {   # 1=message was learned.  0=message wasn't learned
    $learnedcount++;
  }

  # Do cleanup ...
  $status->finish();
  undef $status;

  $ma->finish();
  undef $ma;

  print STDERR '.' if ( $opt{showdots} );
  return 1;
}
597
598###########################################################################
599
# Print the SpamAssassin version, then let pod2usage show $message and the
# usage text at the requested verbosity and exit with status 64.
sub usage {
  my ( $verbose, $message ) = @_;

  print "SpamAssassin version ", Mail::SpamAssassin::Version(), "\n";

  pod2usage(
    -message => $message,
    -verbose => $verbose,
    -exitval => 64,
  );
}
606
607# ---------------------------------------------------------------------------
608
609=head1 NAME
610
611sa-learn - train SpamAssassin's Bayesian classifier
612
613=head1 SYNOPSIS
614
615B<sa-learn> [options] [file]...
616
617B<sa-learn> [options] --dump [ all | data | magic ]
618
619Options:
620
621 --ham                 Learn messages as ham (non-spam)
622 --spam                Learn messages as spam
623 --forget              Forget a message
624 --use-ignores         Use bayes_ignore_from and bayes_ignore_to
625 --sync                Synchronize the database and the journal if needed
626 --force-expire        Force a database sync and expiry run
627 --dbpath <path>       Allows commandline override (in bayes_path form)
628                       for where to read the Bayes DB from
629 --dump [all|data|magic]  Display the contents of the Bayes database
630                       Takes optional argument for what to display
631  --regexp <re>        For dump only, specifies which tokens to
632                       dump based on a regular expression.
633 -f file, --folders=file  Read list of files/directories from file
634 --dir                 Ignored; historical compatibility
635 --file                Ignored; historical compatibility
636 --mbox                Input sources are in mbox format
637 --mbx                 Input sources are in mbx format
638 --max-size <b>        Skip messages larger than b bytes;
639                       defaults to 500 KB, 0 implies no limit
640 --showdots            Show progress using dots
641 --progress            Show progress using progress bar
642 --no-sync             Skip synchronizing the database and journal
643                       after learning
644 -L, --local           Operate locally, no network accesses. Use
645                       of this is recommended, see documentation.
646 --import              Migrate data from older version/non DB_File
647                       based databases
648 --clear               Wipe out existing database
649 --backup              Backup, to STDOUT, existing database
650 --restore <filename>  Restore a database from filename
651 -u username, --username=username
652                       Override username taken from the runtime
653                       environment, used with SQL
654 -C path, --configpath=path, --config-file=path
655                       Path to standard configuration dir
656 -p prefs, --prefspath=file, --prefs-file=file
657                       Set user preferences file
658 --siteconfigpath=path Path for site configs
659                       (default:  @@PREFIX@@/etc/mail/spamassassin)
660 --cf='config line'    Additional line of configuration
661 -D, --debug [area=n,...]  Print debugging messages
662 -V, --version         Print version
663 -h, --help            Print usage message
664
665=head1 DESCRIPTION
666
667Given a typical selection of your incoming mail classified as spam or ham
668(non-spam), this tool will feed each mail to SpamAssassin, allowing it
669to 'learn' what signs are likely to mean spam, and which are likely to
670mean ham.
671
Simply run this command once for each of your mail folders, and it will
'learn' from the mail therein.
674
675Note that csh-style I<globbing> in the mail folder names is supported;
676in other words, listing a folder name as C<*> will scan every folder
677that matches.  See C<Mail::SpamAssassin::ArchiveIterator> for more details.
678
If you are using mailboxes in a format other than maildir, you should use
the B<--mbox> or B<--mbx> parameters.
681
682SpamAssassin remembers which mail messages it has learnt already, and will not
683re-learn those messages again, unless you use the B<--forget> option. Messages
684learnt as spam will have SpamAssassin markup removed, on the fly.
685
686If you make a mistake and scan a mail as ham when it is spam, or vice
687versa, simply rerun this command with the correct classification, and the
688mistake will be corrected.  SpamAssassin will automatically 'forget' the
689previous indications.
690
691Users of C<spamd> who wish to perform training remotely, over a network,
692should investigate the C<spamc -L> switch.
693
694=head1 OPTIONS
695
696=over 4
697
698=item B<--ham>
699
700Learn the input message(s) as ham.   If you have previously learnt any of the
701messages as spam, SpamAssassin will forget them first, then re-learn them as
702ham.  Alternatively, if you have previously learnt them as ham, it'll skip them
703this time around.  If the messages have already been filtered through
704SpamAssassin, the learner will ignore any modifications SpamAssassin may have
705made.
706
707=item B<--spam>
708
709Learn the input message(s) as spam.   If you have previously learnt any of the
710messages as ham, SpamAssassin will forget them first, then re-learn them as
711spam.  Alternatively, if you have previously learnt them as spam, it'll skip
712them this time around.  If the messages have already been filtered through
713SpamAssassin, the learner will ignore any modifications SpamAssassin may have
714made.
715
716=item B<--folders>=I<filename>, B<-f> I<filename>
717
718sa-learn will read in the list of folders from the specified file, one folder
719per line in the file.  If the folder is prefixed with C<ham:type:> or C<spam:type:>,
720sa-learn will learn that folder appropriately, otherwise the folders will be
721assumed to be of the type specified by B<--ham> or B<--spam>.
722
723C<type> above is optional, but is the same as the standard for
724ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not
725specified).
726
727=item B<--mbox>
728
729sa-learn will read in the file(s) containing the emails to be learned,
730and will process them in mbox format (one or more emails per file).
731
732=item B<--mbx>
733
734sa-learn will read in the file(s) containing the emails to be learned,
735and will process them in mbx format (one or more emails per file).
736
737=item B<--use-ignores>
738
739Don't learn the message if a from address matches configuration file
740item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
741The option might be used when learning from a large file of messages
742from which the hammy spam messages or spammy ham messages have not
743been removed.
744
745=item B<--sync>
746
747Synchronize the journal and databases.  Upon successfully syncing the
748database with the entries in the journal, the journal file is removed.
749
750=item B<--force-expire>
751
752Forces an expiry attempt, regardless of whether it may be necessary
753or not.  Note: This doesn't mean any tokens will actually expire.
754Please see the EXPIRATION section below.
755
756Note: C<--force-expire> also causes the journal data to be synchronized
757into the Bayes databases.
758
759=item B<--forget>
760
761Forget a given message previously learnt.
762
763=item B<--dbpath>
764
765Allows a commandline override of the I<bayes_path> configuration option.
766
767=item B<--dump> I<option>
768
769Display the contents of the Bayes database.  Without an option or with
770the I<all> option, all magic tokens and data tokens will be displayed.
771I<magic> will only display magic tokens, and I<data> will only display
772the data tokens.
773
774Can also use the B<--regexp> I<RE> option to specify which tokens to
775display based on a regular expression.
776
777=item B<--clear>
778
779Clear an existing Bayes database by removing all traces of the database.
780
781WARNING: This is destructive and should be used with care.
782
783=item B<--backup>
784
785Performs a dump of the Bayes database in machine/human readable format.
786
787The dump will include token and seen data.  It is suitable for input back
788into the --restore command.
789
790=item B<--restore>=I<filename>
791
792Performs a restore of the Bayes database defined by I<filename>.
793
794WARNING: This is a destructive operation, previous Bayes data will be wiped out.
795
796=item B<-h>, B<--help>
797
798Print help message and exit.
799
800=item B<-u> I<username>, B<--username>=I<username>
801
802If specified this username will override the username taken from the runtime
803environment.  You can use this option to specify users in a virtual user
804configuration when using SQL as the Bayes backend.
805
NOTE: This option does not switch to the given I<username>; sa-learn only
attempts to act on behalf of that user.  Because of this, you will need proper
permissions to change files owned by I<username>.  In the case of SQL
this is generally not a problem.
810
811=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
812
813Use the specified path for locating the distributed configuration files.
814Ignore the default directories (usually C</usr/share/spamassassin> or similar).
815
816=item B<--siteconfigpath>=I<path>
817
818Use the specified path for locating site-specific configuration files.  Ignore
819the default directories (usually C</etc/mail/spamassassin> or similar).
820
821=item B<--cf='config line'>
822
823Add additional lines of configuration directly from the command-line, parsed
824after the configuration files are read.   Multiple B<--cf> arguments can be
825used, and each will be considered a separate line of configuration.
826
827=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
828
829Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>).
830
831=item B<--progress>
832
833Prints a progress bar (to STDERR) showing the current progress.  In the case
834where no valid terminal is found this option will behave very much like the
835--showdots option.
836
837=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
838
839Produce debugging output. If no areas are listed, all debugging information is
840printed. Diagnostic output can also be enabled for each area individually;
841I<area> is the area of the code to instrument. For example, to produce
842diagnostic output on bayes, learn, and dns, use:
843
        sa-learn -D bayes,learn,dns
845
846For more information about which areas (also known as channels) are available,
847please see the documentation at:
848
        http://wiki.apache.org/spamassassin/DebugChannels
850
851Higher priority informational messages that are suitable for logging in normal
852circumstances are available with an area of "info".
853
854=item B<--no-sync>
855
856Skip the slow synchronization step which normally takes place after
857changing database entries.  If you plan to learn from many folders in
858a batch, or to learn many individual messages one-by-one, it is faster
859to use this switch and run C<sa-learn --sync> once all the folders have
860been scanned.
861
862Clarification: The state of I<--no-sync> overrides the
863I<bayes_learn_to_journal> configuration option.  If not specified,
864sa-learn will learn to the database directly.  If specified, sa-learn
865will learn to the journal file.
866
867Note: I<--sync> and I<--no-sync> can be specified on the same commandline,
868which is slightly confusing.  In this case, the I<--no-sync> option is
869ignored since there is no learn operation.
870
871=item B<-L>, B<--local>
872
Do not perform any network accesses while learning details about the mail
messages.  This should normally be used, as there really isn't anything
Bayes can learn from network lookup results.  Official SpamAssassin plugins
do not currently do any network lookups when learning, but it's possible
that third-party ones might.
878
879=item B<--import>
880
881If you previously used SpamAssassin's Bayesian learner without the C<DB_File>
882module installed, it will have created files in other formats, such as
883C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>.  This switch allows you to migrate
884that old data into the C<DB_File> format.  It will overwrite any data currently
885in the C<DB_File>.
886
887Can also be used with the B<--dbpath> I<path> option to specify the location of
888the Bayes files to use.
889
890=back
891
892=head1 MIGRATION
893
894There are now multiple backend storage modules available for storing
895users' Bayesian data. As such you might want to migrate from one
896backend to another. Here is a simple procedure for migrating from one
897backend to another.
898
899Note that if you have individual user databases you will have to
900perform a similar procedure for each one of them.
901
902=over 4
903
904=item sa-learn --sync
905
906This will sync any outstanding journal entries.
907
908=item sa-learn --backup > backup.txt
909
910This will save all your Bayes data to a plain text file.
911
912=item sa-learn --clear
913
914This is optional, but good to do to clear out the old database.
915
916=item Repeat!
917
918At this point, if you have multiple databases, you should perform the
919procedure above for each of them. (i.e. each user's database needs to
920be backed up before continuing.)
921
922=item Switch backends
923
924Once you have backed up all databases you can update your
925configuration for the new database backend. This will involve at least
926the bayes_store_module config option and may involve some additional
927config options depending on what is required by the module. (For
928example, you may need to configure an SQL database.)
929
930=item sa-learn --restore backup.txt
931
932Again, you need to do this for every database.
933
934=back
935
936If you are migrating to SQL you can make use of the -u <username>
937option in sa-learn to populate each user's database. Otherwise, you
938must run sa-learn as the user whose database you are restoring.
939
940
941=head1 INTRODUCTION TO BAYESIAN FILTERING
942
943(Thanks to Michael Bell for this section!)
944
945For a more lengthy description of how this works, go to
946http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably
947readable, even if statistics make me break out in hives.
948
949The short semi-inaccurate version: Given training, a spam heuristics engine
950can take the most "spammy" and "hammy" words and apply probabilistic
951analysis. Furthermore, once given a basis for the analysis, the engine can
952continue to learn iteratively by applying both the non-Bayesian and Bayesian
953rulesets together to create evolving "intelligence".
954
955SpamAssassin 2.50 and later supports Bayesian spam analysis, in
956the form of the BAYES rules. This is a new feature, quite powerful,
957and is disabled until enough messages have been learnt.
958
959The pros of Bayesian spam analysis:
960
961=over 4
962
963=item Can greatly reduce false positives and false negatives.
964
965It learns from your mail, so it is tailored to your unique e-mail flow.
966
967=item Once it starts learning, it can continue to learn from SpamAssassin
968and improve over time.
969
970=back
971
972And the cons:
973
974=over 4
975
976=item A decent number of messages are required before results are useful
977for ham/spam determination.
978
979=item It's hard to explain why a message is or isn't marked as spam.
980
981i.e.: a straightforward rule, that matches, say, "VIAGRA" is
982easy to understand. If it generates a false positive or false negative,
983it is fairly easy to understand why.
984
985With Bayesian analysis, it's all probabilities - "because the past says
986it is likely as this falls into a probabilistic distribution common to past
987spam in your systems". Tell that to your users!  Tell that to the client
988when he asks "what can I do to change this". (By the way, the answer in
989this case is "use whitelisting".)
990
991=item It will take disk space and memory.
992
993The databases it maintains take quite a lot of resources to store and use.
994
995=back
996
997=head1 GETTING STARTED
998
999Still interested? Ok, here are the guidelines for getting this working.
1000
1001First a high-level overview:
1002
1003=over 4
1004
1005=item Build a significant sample of both ham and spam.
1006
1007I suggest several thousand of each, placed in SPAM and HAM directories or
1008mailboxes.  Yes, you MUST hand-sort this - otherwise the results won't be much
1009better than SpamAssassin on its own. Verify the spamminess/hamminess of EVERY
1010message.  You're urged to avoid using a publicly available corpus (sample) -
1011this must be taken from YOUR mail server, if it is to be statistically useful.
1012Otherwise, the results may be pretty skewed.
1013
1014=item Use this tool to teach SpamAssassin about these samples, like so:
1015
1016	sa-learn --spam /path/to/spam/folder
1017	sa-learn --ham /path/to/ham/folder
1018	...
1019
1020Let SpamAssassin proceed, learning stuff. When it finds ham and spam
1021it will add the "interesting tokens" to the database.
1022
1023=item If you need SpamAssassin to forget about specific messages, use
1024the B<--forget> option.
1025
1026This can be applied to either ham or spam that has run through the
1027B<sa-learn> processes. It's a bit of a hammer, really, lowering the
1028weighting of the specific tokens in that message (only if that message has
1029been processed before).
1030
1031=item Learning from single messages uses a command like this:
1032
1033	sa-learn --ham --no-sync mailmessage
1034
1035This is handy for binding to a key in your mail user agent.  It's very fast, as
1036all the time-consuming stuff is deferred until you run with the C<--sync>
1037option.
1038
1039=item Autolearning is enabled by default
1040
1041If you don't have a corpus of mail saved to learn, you can let
1042SpamAssassin automatically learn the mail that you receive.  If you are
1043autolearning from scratch, the amount of mail you receive will determine
1044how long until the BAYES_* rules are activated.
1045
1046=back
1047
1048=head1 EFFECTIVE TRAINING
1049
1050Learning filters require training to be effective.  If you don't train
1051them, they won't work.  In addition, you need to train them with new
1052messages regularly to keep them up-to-date, or their data will become
1053stale and impact accuracy.
1054
1055You need to train with both spam I<and> ham mails.  One type of mail
1056alone will not have any effect.
1057
1058Note that if your mail folders contain things like forwarded spam,
1059discussions of spam-catching rules, etc., this will cause trouble.  You
1060should avoid scanning those messages if possible.  (An easy way to do this
1061is to move them aside, into a folder which is not scanned.)
1062
1063If the messages you are learning from have already been filtered through
1064SpamAssassin, the learner will compensate for this.  In effect, it learns what
1065each message would look like if you had run C<spamassassin -d> over it in
1066advance.
1067
1068Another thing to be aware of is that typically you should aim to train
1069with at least 1000 messages of spam, and 1000 ham messages, if
1070possible.  More is better, but anything over about 5000 messages does not
1071improve accuracy significantly in our tests.
1072
1073Be careful that you train from the same source -- for example, if you train
1074on old spam, but new ham mail, then the classifier will think that
1075a mail with an old date stamp is likely to be spam.
1076
1077It's also worth noting that training with a very small quantity of
1078ham will produce atrocious results.  You should aim to train with at
1079least the same amount (or more if possible!) of ham data as spam.
1080
1081On an on-going basis, it is best to keep training the filter to make
1082sure it has fresh data to work from.  There are various ways to do
1083this:
1084
1085=over 4
1086
1087=item 1. Supervised learning
1088
1089This means keeping a copy of all or most of your mail, separated into spam
1090and ham piles, and periodically re-training using those.  It produces
1091the best results, but requires more work from you, the user.
1092
1093(An easy way to do this, by the way, is to create a new folder for
1094'deleted' messages, and instead of deleting them from other folders,
1095simply move them in there instead.  Then keep all spam in a separate
1096folder and never delete it.  As long as you remember to move misclassified
1097mails into the correct folder set, it is easy enough to keep up to date.)
1098
1099=item 2. Unsupervised learning from Bayesian classification
1100
1101Another way to train is to chain the results of the Bayesian classifier
1102back into the training, so it reinforces its own decisions.  This is only
1103safe if you then retrain it based on any errors you discover.
1104
1105SpamAssassin does not support this method, due to experimental results
1106which strongly indicate that it does not work well, and since Bayes is
1107only one part of the resulting score presented to the user (while Bayes
1108may have made the wrong decision about a mail, it may have been overridden
1109by another system).
1110
1111=item 3. Unsupervised learning from SpamAssassin rules
1112
1113Also called 'auto-learning' in SpamAssassin.  Based on statistical
1114analysis of the SpamAssassin success rates, we can automatically train the
1115Bayesian database with a certain degree of confidence that our training
1116data is accurate.
1117
1118It should be supplemented with some supervised training in addition, if
1119possible.
1120
1121This is the default, but can be turned off by setting the SpamAssassin
1122configuration parameter C<bayes_auto_learn> to 0.
1123
1124=item 4. Mistake-based training
1125
1126This means training on a small number of mails, then only training on
1127messages that SpamAssassin classifies incorrectly.  This works, but it
1128takes longer to get it right than a full training session would.
1129
1130=back
1131
1132=head1 FILES
1133
1134B<sa-learn> and the other parts of SpamAssassin's Bayesian learner,
1135use a set of persistent database files to store the learnt tokens, as follows.
1136
1137=over 4
1138
1139=item bayes_toks
1140
1141The database of tokens, containing the tokens learnt, their count of
1142occurrences in ham and spam, and the timestamp when the token was last
1143seen in a message.
1144
1145This database also contains some 'magic' tokens, as follows: the version
1146number of the database, the number of ham and spam messages learnt, the
1147number of tokens in the database, and timestamps of: the last journal
1148sync, the last expiry run, the last expiry token reduction count, the
1149last expiry timestamp delta, the oldest token timestamp in the database,
1150and the newest token timestamp in the database.
1151
1152This is a database file, using C<DB_File>.  The database 'version
1153number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x
1154development releases, 2 for 2.6x, and 3 for 3.0 and later releases.
1155
1156=item bayes_seen
1157
1158A map of Message-Id and some data from headers and body to what that
1159message was learnt as. This is used so that SpamAssassin can avoid
1160re-learning a message it has already seen, and so it can reverse the
1161training if you later decide that message was learnt incorrectly.
1162
1163This is a database file, using C<DB_File>.
1164
1165=item bayes_journal
1166
1167While SpamAssassin is scanning mails, it needs to track which tokens
1168it uses in its calculations.  To avoid the contention of having each
1169SpamAssassin process attempting to gain write access to the Bayes DB,
1170the token timestamps are written to a 'journal' file which will later
1171(either automatically or via C<sa-learn --sync>) be used to synchronize
1172the Bayes DB.
1173
1174Also, through the use of C<bayes_learn_to_journal>, or when using the
1175C<--no-sync> option with sa-learn, the actual learning data will
1176be placed into the journal for later synchronization.  This is typically
1177useful for high-traffic sites to avoid the same contention as stated
1178above.
1179
1180=back
1181
1182=head1 EXPIRATION
1183
1184Since SpamAssassin can auto-learn messages, the Bayes database files
1185could increase perpetually until they fill your disk.  To control this,
1186SpamAssassin performs journal synchronization and bayes expiration
1187periodically when certain criteria (listed below) are met.
1188
1189SpamAssassin can sync the journal and expire the DB tokens either
1190manually or opportunistically.  A journal sync is due if I<--sync>
1191is passed to sa-learn (manual), or if the following is true
1192(opportunistic):
1193
1194=over 4
1195
1196=item - bayes_journal_max_size does not equal 0 (a value of 0 means don't sync)
1197
1198=item - the journal file exists
1199
1200=back
1201
1202and either:
1203
1204=over 4
1205
1206=item - the journal file has a size greater than bayes_journal_max_size
1207
1208=back
1209
1210or
1211
1212=over 4
1213
1214=item - a journal sync has previously occurred, and at least 1 day has
1215passed since that sync
1216
1217=back
1218
1219Expiry is due if I<--force-expire> is passed to sa-learn (manual),
1220or if all of the following are true (opportunistic):
1221
1222=over 4
1223
1224=item - the last expire was attempted at least 12hrs ago
1225
1226=item - bayes_auto_expire does not equal 0
1227
1228=item - the number of tokens in the DB is > 100,000
1229
1230=item - the number of tokens in the DB is > bayes_expiry_max_db_size
1231
1232=item - there is at least a 12 hr difference between the oldest and newest token atimes
1233
1234=back
1235
1236=head2 EXPIRE LOGIC
1237
1238If either the manual or opportunistic method causes an expire run
1239to start, here is the logic that is used:
1240
1241=over 4
1242
1243=item - figure out how many tokens to keep.  take the larger of
1244either bayes_expiry_max_db_size * 75% or 100,000 tokens.  therefore, the goal
1245reduction is number of tokens - number of tokens to keep.
1246
1247=item - if the reduction number is < 1000 tokens, abort (not worth the effort).
1248
1249=item - if an expire has been done before, guesstimate the new
1250atime delta based on the old atime delta.  (new_atime_delta =
1251old_atime_delta * old_reduction_count / goal)
1252
1253=item - if no expire has been done before, or the last expire looks
1254"weird", do an estimation pass.  The definition of "weird" is:
1255
1256=over 8
1257
1258=item - last expire over 30 days ago
1259
1260=item - last atime delta was < 12 hrs
1261
1262=item - last reduction count was < 1000 tokens
1263
1264=item - estimated new atime delta is < 12 hrs
1265
1266=item - the difference between the last reduction count and the goal reduction count is > 50%
1267
1268=back
1269
1270=back
1271
1272=head2 ESTIMATION PASS LOGIC
1273
1274Go through each of the DB's tokens.  Starting at 12hrs, calculate
1275whether or not the token would be expired (based on the difference
1276between the token's atime and the db's newest token atime) and keep
1277the count.  Work out from 12hrs exponentially by powers of 2.  ie:
127812hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs
1279* 512 (6144hrs, or 256 days).
1280
1281The larger the delta, the smaller the number of tokens that will
1282be expired.  Conversely, the number of tokens goes up as the delta
1283gets smaller.  So starting at the largest atime delta, figure out
1284which delta will expire the most tokens without going above the
1285goal expiration count.  Use this to choose the atime delta to use,
1286unless one of the following occurs:
1287
1288=over 8
1289
1290=item - the largest atime delta (smallest reduction count) would expire
1291too many tokens.  This means the learned tokens are mostly old and
1292new tokens need to be learned before an expire can
1293occur.
1294
1295=item - all of the atime choices result in 0 tokens being removed.
1296this means the tokens are all newer than 12 hours and there needs
1297to be new tokens learned before an expire can occur.
1298
1299=item - the number of tokens that would be removed is < 1000.  the
1300benefit isn't worth the effort.  more tokens need to be learned.
1301
1302=back
1303
1304If the expire run gets past this point, it will continue to the end.
1305A new DB is created since the majority of DB libraries don't shrink the
1306DB file when tokens are removed.  So we do the "create new, migrate old
1307to new, remove old, rename new" shuffle.
1308
1309=head2 EXPIRY RELATED CONFIGURATION SETTINGS
1310
1311=over 4
1312
1313=item C<bayes_auto_expire> is used to specify whether or not SpamAssassin
1314ought to opportunistically attempt to expire the Bayes database.
1315The default is 1 (yes).
1316
1317=item C<bayes_expiry_max_db_size> specifies both the auto-expire token
1318count point, as well as the resulting number of tokens after expiry
1319as described above.  The default value is 150,000, which is roughly
1320equivalent to a 6MB database file if you're using DB_File.
1321
1322=item C<bayes_journal_max_size> specifies how large the Bayes
1323journal will grow before it is opportunistically synced.  The
1324default value is 102400.
1325
1326=back
1327
1328=head1 INSTALLATION
1329
1330The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module.
1331Install this as a normal Perl module, using C<perl -MCPAN -e shell>,
1332or by hand.
1333
1334=head1 SEE ALSO
1335
1336spamassassin(1)
1337spamc(1)
1338Mail::SpamAssassin(3)
1339Mail::SpamAssassin::ArchiveIterator(3)
1340
1341E<lt>http://www.paulgraham.com/E<gt>
1342Paul Graham's "A Plan For Spam" paper
1343
1344E<lt>http://www.linuxjournal.com/article/6467E<gt>
1345Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin
1346
1347E<lt>http://www.bgl.nu/~glouis/bogofilter/E<gt>
1348'Training on error' page.  A discussion of various Bayes training regimes,
1349including 'train on error' and unsupervised training.
1350
1351=head1 PREREQUISITES
1352
1353C<Mail::SpamAssassin>
1354
1355=head1 AUTHORS
1356
1357The SpamAssassin(tm) Project E<lt>https://spamassassin.apache.org/E<gt>
1358
1359=cut
1360
1361