#!/usr/bin/perl -w -T
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;
use warnings;
# use bytes;

use Errno qw(EBADF);
use Getopt::Long;
use Pod::Usage;
use File::Spec;
use POSIX qw(locale_h setsid sigprocmask _exit);

POSIX::setlocale(LC_TIME,'C');

# Script-wide state.  These are package variables because the subs at the
# bottom of the file (target, wanted, result, init_results, killed) read and
# update them.
our ( $spamtest, %opt, $isspam, $forget, $messagecount, $learnedcount, $messagelimit, $progress,
  $total_messages, $init_results, $start_time, $synconly, $learnprob, @targets, $bayes_override_path );

my $PREFIX          = '@@PREFIX@@';             # substituted at 'make' time
my $DEF_RULES_DIR   = '@@DEF_RULES_DIR@@';      # substituted at 'make' time
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@';    # substituted at 'make' time

use lib '@@INSTALLSITELIB@@';                   # substituted at 'make' time

# Locate Mail::SpamAssassin relative to the script when it is run from a
# source or build tree, or when it is not present in the installed site lib.
BEGIN {                          # see comments in "spamassassin.raw" for doco
  my @bin = File::Spec->splitpath($0);
  my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1], '') : $bin[1])
            || File::Spec->curdir;

  if (-e $bin.'/lib/Mail/SpamAssassin.pm'
        || !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm' )
  {
    my $searchrelative;
    $searchrelative = 1;    # disabled during "make install": REMOVEFORINST
    if ($searchrelative && $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm')
    {
      unshift ( @INC, '../blib/lib' );
    } else {
      foreach ( qw(lib ../lib/site_perl
                ../lib/spamassassin ../share/spamassassin/lib))
      {
        my $dir = File::Spec->catdir( $bin, split ( '/', $_ ) );
        if ( -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" ) )
        { unshift ( @INC, $dir ); last; }
      }
    }
  }
}

use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin::Message;
use Mail::SpamAssassin::PerMsgLearner;
use Mail::SpamAssassin::Util::Progress;
use Mail::SpamAssassin::Logger;

###########################################################################

$SIG{PIPE} = 'IGNORE';

# used to be CmdLearn::cmd_run() ...

# Option defaults; everything else in %opt starts out undefined.
%opt = (
  'force-expire' => 0,
  'use-ignores'  => 0,
  'nosync'       => 0,
  'quiet'        => 0,
  'cf'           => []
);

Getopt::Long::Configure(
  qw(bundling no_getopt_compat
    permute no_auto_abbrev no_ignore_case)
);

# Non-option arguments are routed through target() (the '<>' handler), which
# turns each one into a "class:format:location" entry in @targets.
GetOptions(
  'forget'      => \$forget,
  'ham|nonspam' => sub { $isspam = 0; },
  'spam'        => sub { $isspam = 1; },
  'sync'        => \$synconly,
  'rebuild'     => sub { $synconly = 1; warn "The --rebuild option has been deprecated.  Please use --sync instead.\n" },

  'q|quiet'     => \$opt{'quiet'},
  'username|u=s'                            => \$opt{'username'},
  'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
  'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
  'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
  'cf=s'                                    => \@{$opt{'cf'}},

  'folders|f=s'          => \$opt{'folders'},
  'force-expire|expire'  => \$opt{'force-expire'},
  'local|L'              => \$opt{'local'},
  'no-sync|nosync'       => \$opt{'nosync'},
  'showdots'             => \$opt{'showdots'},
  'progress'             => \$opt{'progress'},
  'use-ignores'          => \$opt{'use-ignores'},
  'no-rebuild|norebuild' => sub { $opt{'nosync'} = 1; warn "The --no-rebuild option has been deprecated.  Please use --no-sync instead.\n" },

  'learnprob=f' => \$opt{'learnprob'},
  'randseed=i'  => \$opt{'randseed'},
  'stopafter=i' => \$opt{'stopafter'},
  'max-size=i'  => \$opt{'max-size'},

  'debug|debug-level|D:s' => \$opt{'debug'},
  'help|h|?'              => \$opt{'help'},
  'version|V'             => \$opt{'version'},

  'dump:s' => \$opt{'dump'},
  'import' => \$opt{'import'},

  'backup'    => \$opt{'backup'},
  'clear'     => \$opt{'clear'},
  'restore=s' => \$opt{'restore'},

  'dir'    => sub { $opt{'old_format'} = 'dir'; },
  'file'   => sub { $opt{'old_format'} = 'file'; },
  'mbox'   => sub { $opt{'format'}     = 'mbox'; },
  'mbx'    => sub { $opt{'format'}     = 'mbx'; },
  'single' => sub { $opt{'old_format'} = 'single'; },

  'db|dbpath=s' => \$bayes_override_path,
  're|regexp=s' => \$opt{'regexp'},

  '<>' => \&target,
  )
  or usage( 0, "Unknown option!" );

if ( defined $opt{'help'} ) {
  usage( 0, "For more information read the manual page" );
}
if ( defined $opt{'version'} ) {
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  exit 0;
}

# set debug areas, if any specified (only useful for command-line tools)
# (-D takes an optional value; when it is given without one, enable "all")
if (defined $opt{'debug'}) {
  $opt{'debug'} ||= 'all';
}

# --force-expire is carried out by the sync-only code path
if ( $opt{'force-expire'} ) {
  $synconly = 1;
}

if ($opt{'showdots'} && $opt{'progress'}) {
  # This is a usage error: report it on STDERR (STDOUT may be carrying
  # --backup or --dump output) and exit with the same non-zero status that
  # usage() uses, so that callers do not mistake it for success.
  warn "--showdots and --progress may not be used together, please select just one\n";
  exit 64;
}

# At least one action has to be requested.
if ( !defined $isspam
  && !defined $synconly
  && !defined $forget
  && !defined $opt{'dump'}
  && !defined $opt{'import'}
  && !defined $opt{'clear'}
  && !defined $opt{'backup'}
  && !defined $opt{'restore'}
  && !defined $opt{'folders'} )
{
  usage( 0,
"Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}

# We need to make sure the journal syncs pre-forget...
if ( defined $forget && $opt{'nosync'} ) {
  $opt{'nosync'} = 0;
  warn
"sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}

if ( defined $opt{'old_format'} ) {

  #Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or --single.
  #Convert it to the new behavior:
  # (only --single needs any action: it means "read one message from stdin";
  # --dir and --file are accepted and otherwise ignored)
  if ( $opt{'old_format'} eq 'single' ) {
    push ( @ARGV, '-' );
  }
}

# Extra configuration text, handed to Mail::SpamAssassin as post_config_text.
my $post_config = '';

# kluge to support old check_bayes_db operation
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
# to override.  Just access the dbpath version via post_config_text.
if ( defined $bayes_override_path ) {
  # Add a default prefix if the path is a directory
  if ( -d $bayes_override_path ) {
    $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' );
  }

  $post_config .= "bayes_path $bayes_override_path\n";
}

# These options require bayes_scanner, which requires "use_bayes 1", but
# that's not necessary for these commands.
# (the maintenance commands below all go through bayes_scanner, so switch
# Bayes on for them regardless of the configuration)
if ( grep { defined $opt{$_} } qw(dump import clear backup restore) ) {
  $post_config .= "use_bayes 1\n";
}

# every --cf argument becomes one line of extra configuration
$post_config .= join( "\n", @{ $opt{'cf'} } ) . "\n";

# create the tester factory
$spamtest = Mail::SpamAssassin->new(
  {
    rules_filename      => $opt{'configpath'},
    site_rules_filename => $opt{'siteconfigpath'},
    userprefs_filename  => $opt{'prefspath'},
    username            => $opt{'username'},
    debug               => $opt{'debug'},
    local_tests_only    => $opt{'local'},
    dont_copy_prefs     => 1,
    PREFIX              => $PREFIX,
    DEF_RULES_DIR       => $DEF_RULES_DIR,
    LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
    post_config_text    => $post_config,
  }
);

$spamtest->init(1);
dbg("sa-learn: spamtest initialized");

# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
# To be resolved more cleanly!!!
if ( $spamtest->{bayes_scanner} ) {
  for my $loaded_plugin ( @{ $spamtest->{plugins}->{plugins} } ) {
    next unless $loaded_plugin->isa('Mail::SpamAssassin::Plugin::Bayes');

    # copy plugin's "store" object ref one level up!
    $spamtest->{bayes_scanner}->{store} = $loaded_plugin->{store};
  }
}

if ( Mail::SpamAssassin::Util::am_running_on_windows() ) {
  binmode(STDIN)  or die "cannot set binmode on STDIN: $!";    # bug 4363
  binmode(STDOUT) or die "cannot set binmode on STDOUT: $!";
}

# Normal way out for the one-shot commands: release the learner, then close
# the standard handles so that a failed flush is reported, then exit.
my $finish_and_exit = sub {
  my ($status) = @_;
  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit $status;
};

# Failure way out for the one-shot commands: release the learner, then die
# with the given message.
my $finish_and_die = sub {
  my ($message) = @_;
  $spamtest->finish_learner();
  die $message;
};

# --dump [all|magic|data]
if ( defined $opt{'dump'} ) {
  # what to show for each accepted argument: [ magic tokens, data tokens ]
  my %dump_selection = (
    ''      => [ 1, 1 ],    # no argument: show us all tokens!
    'all'   => [ 1, 1 ],
    'magic' => [ 1, 0 ],    # show us magic tokens only
    'data'  => [ 0, 1 ],    # show us data tokens only
  );

  my $selection = $dump_selection{ $opt{'dump'} };
  if ( !$selection ) {      # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }

  my ( $magic, $toks ) = @{$selection};
  $spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'} )
    or $finish_and_die->("ERROR: Bayes dump returned an error, please re-run with -D for more information\n");

  $finish_and_exit->(0);
}

# --import: the exit status is the negation of perform_upgrade()'s result
if ( defined $opt{'import'} ) {
  my $upgraded = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
  $finish_and_exit->( !$upgraded );
}

# --clear
if ( defined $opt{'clear'} ) {
  $spamtest->{bayes_scanner}->{store}->clear_database()
    or $finish_and_die->("ERROR: Bayes clear returned an error, please re-run with -D for more information\n");

  $finish_and_exit->(0);
}

# --backup
if ( defined $opt{'backup'} ) {
  $spamtest->{bayes_scanner}->{store}->backup_database()
    or $finish_and_die->("ERROR: Bayes backup returned an error, please re-run with -D for more information\n");

  $finish_and_exit->(0);
}

# --restore <filename>
if ( defined $opt{'restore'} ) {
  my $restore_from = $opt{'restore'};

  $finish_and_die->("ERROR: You must specify a filename to restore.\n")
    if !$restore_from;

  $spamtest->{bayes_scanner}->{store}->restore_database( $restore_from, $opt{'showdots'} )
    or $finish_and_die->("ERROR: Bayes restore returned an error, please re-run with -D for more information\n");

  $finish_and_exit->(0);
}

# Everything from here on is learning or syncing, which needs Bayes enabled.
if ( !$spamtest->{conf}->{use_bayes} ) {
  warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
  exit 1;
}

$spamtest->init_learner(
  {
    force_expire      => $opt{'force-expire'},
    learn_to_journal  => $opt{'nosync'},
    wait_for_lock     => 1,
    caller_will_untie => 1
  }
);

$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

# --sync / --force-expire: sync the journal into the database and stop
if ($synconly) {
  $spamtest->rebuild_learner_caches(
    {
      verbose  => !$opt{'quiet'},
      showdots => $opt{'showdots'}
    }
  );
  $finish_and_exit->(0);
}

$messagelimit = $opt{'stopafter'};
$learnprob    = $opt{'learnprob'};

srand( $opt{'randseed'} ) if defined $opt{'randseed'};

# sync the journal first if we're going to go r/w so we make sure to
# learn everything before doing anything else.
#
$spamtest->rebuild_learner_caches() unless $opt{nosync};

# what is the result of the run?  will end up being the exit code.
my $exit_status = 0;

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
  $SIG{HUP}  = \&killed;
  $SIG{INT}  = \&killed;
  $SIG{TERM} = \&killed;

  if ( $opt{folders} ) {
    # Read the list of targets from the --folders file.  A three-argument
    # open on a lexical handle is used so that a file name starting with
    # characters such as '>' or ending in '|' can never be taken as an open
    # mode.  The one special name the old two-argument open understood that
    # is worth keeping is "-", meaning "read the list from STDIN".
    my $folders_fh;
    if ( $opt{folders} eq '-' ) {
      open( $folders_fh, '<&', \*STDIN )
        or die "cannot open $opt{folders}: $!";
    }
    else {
      open( $folders_fh, '<', $opt{folders} )
        or die "cannot open $opt{folders}: $!";
    }

    # $! is reset before every read so that, once the loop ends, a non-zero
    # $! can be told apart from a plain end-of-file.
    for ($!=0; <$folders_fh>; $!=0) {
      chomp;
      next if /^\s*$/;
      if (/^(ham|spam):(\w*):(.*)/) {
        # line already carries its class and (optionally) its format
        my $class  = $1;
        my $format = $2 || "detect";
        my $target = $3;
        push ( @targets, "$class:$format:$target" );
      }
      else {
        # plain location: class and format come from the command line
        target($_);
      }
    }
    defined $_ || $!==0  or
      $!==EBADF ? dbg("error reading from $opt{folders}: $!")
                : die "error reading from $opt{folders}: $!";
    close($folders_fh)  or die "error closing $opt{folders}: $!";
  }

  ###########################################################################
  # Deal with the target listing, and STDIN -> tempfile

  my $tempfile;    # will be defined if stdin -> tempfile
  push(@targets, @ARGV);
  @targets = ('-') unless @targets || $opt{folders};

  # An index loop is needed here: entries are spliced out of, and appended
  # to, @targets while it is being walked.
  for(my $elem = 0; $elem <= $#targets; $elem++) {
    # ArchiveIterator doesn't really like STDIN, so if "-" is specified
    # as a target, make it a temp file instead.
    if ( $targets[$elem] =~ /(?:^|:)-$/ ) {
      if (defined $tempfile) {
        # uh-oh, stdin specified multiple times?
        warn "skipping extra stdin target (".$targets[$elem].")\n";
        splice @targets, $elem, 1;
        $elem--; # go back to this element again
        next;
      }
      else {
        my $handle;
        ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
        binmode $handle  or die "cannot set binmode on file $tempfile: $!";

        # avoid slurping the whole file into memory, copy chunk by chunk
        my($inbuf,$nread);
        while ( $nread=sysread(STDIN,$inbuf,16384) )
          { print {$handle} $inbuf  or die "error writing to $tempfile: $!" }
        defined $nread  or die "error reading from STDIN: $!";
        close $handle  or die "error closing $tempfile: $!";

        # re-aim the targets at the tempfile instead of STDIN
        $targets[$elem] =~ s/-$/$tempfile/;
      }
    }

    # make sure the target list is in the normal AI format
    if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) {
      my $item = splice @targets, $elem, 1;
      target($item);    # add back to the list
      $elem--; # go back to this element again
      next;
    }
  }

  ###########################################################################

  my $iter = Mail::SpamAssassin::ArchiveIterator->new(
    {
      # skip messages larger than max-size bytes,
      # 0 for no limit, undef defaults to 500 KB
      'opt_max_size' => $opt{'max-size'},
      'opt_want_date' => 0,
      'opt_from_regex' => $spamtest->{conf}->{mbox_format_from_regex},
    }
  );

  $iter->set_functions(\&wanted, \&result);
  $messagecount = 0;
  $learnedcount = 0;

  $init_results = 0;
  $start_time = time;

  # if exit_status isn't already set to non-zero, set it to the reverse of the
  # run result (0 is bad, 1+ is good -- the opposite of exit status codes)
  my $run_ok = eval { $exit_status ||= ! $iter->run(@targets); 1 };

  # Remember the error right away: the output and cleanup steps below may
  # run evals of their own, which would overwrite $@ before it is examined.
  my $run_error = $run_ok ? '' : $@;

  print STDERR "\n" if ($opt{showdots});
  $progress->final() if ($opt{progress} && $progress);

  my $phrase = defined $forget ? "Forgot" : "Learned";
  print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n"
    if !$opt{'quiet'};

  # If we needed to make a tempfile, go delete it.
  if (defined $tempfile) {
    unlink $tempfile  or die "cannot unlink temporary file $tempfile: $!";
    undef $tempfile;
  }

  # HITLIMIT is how wanted() stops the run once --stopafter is reached; it is
  # not a failure.  Anything else is passed on to the handler below.
  if (!$run_ok && $run_error !~ /HITLIMIT/) { die $run_error }
  1;
} or do {
  my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
  $spamtest->finish_learner();
  die $eval_stat;
};

$spamtest->finish_learner();
# make sure we notice any write errors while flushing output buffer
close STDOUT  or die "error closing STDOUT: $!";
close STDIN   or die "error closing STDIN: $!";
exit $exit_status;

###########################################################################

# Signal handler for HUP, INT and TERM: release the learner and abort the
# run by dying (caught by the eval around the main run when raised there).
sub killed {
  $spamtest->finish_learner();
  die "interrupted";
}

# Append one location to @targets in "class:format:location" form.
# The class is "spam" if --spam was the most recent class option, otherwise
# "ham"; the format is the one given with --mbox/--mbx, otherwise "detect".
# Also used as the Getopt::Long handler for non-option arguments.
sub target {
  my ($target) = @_;

  my $class = ( $isspam ? "spam" : "ham" );
  my $format = ( defined( $opt{'format'} ) ? $opt{'format'} : "detect" );

  push ( @targets, "$class:$format:$target" );
}

###########################################################################

# One-time setup done when the first result arrives: creates the progress
# bar (only with --progress), sized from ArchiveIterator's message count.
sub init_results {
  $init_results = 1;

  return unless $opt{'progress'};

  $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;

  $progress = Mail::SpamAssassin::Util::Progress->new({total => $total_messages,});
}

###########################################################################

# Result callback registered with the ArchiveIterator.  Its arguments are
# not used; it only performs the lazy setup and advances the progress bar.
sub result {
  my ($class, $result, $time) = @_;

  # don't open results files until we get here to avoid overwriting files
  # (called with explicit parentheses so the current @_ is not passed along)
  init_results() if !$init_results;

  $progress->update($messagecount) if ($opt{progress} && $progress);
}

###########################################################################

# Per-message callback registered with the ArchiveIterator: learns (or, with
# --forget, forgets) one message.
#   $class   - "s" for a spam target, anything else is treated as ham
#   $id      - not used here
#   $time    - not used here
#   $dataref - message data, handed to $spamtest->parse()
# Returns 1.  Dies with 'HITLIMIT' once --stopafter messages have been
# learned, and with an error message if learning is unavailable.
sub wanted {
  my ( $class, $id, $time, $dataref ) = @_;

  my $spam = $class eq "s" ? 1 : 0;

  if ( defined($learnprob) ) {
    # Randomly skip messages so that roughly a $learnprob fraction is
    # learned.  A probability of zero or less means "learn nothing"; testing
    # for it first also avoids a division by zero.
    if ( $learnprob <= 0 || int( rand( 1 / $learnprob ) ) != 0 ) {
      print STDERR '_' if ( $opt{showdots} );
      return 1;
    }
  }

  # Stop once exactly --stopafter messages have been learned.  (This test
  # runs before the next message is learned, so it has to be ">=": with ">"
  # one message too many was learned.)
  if ( defined($messagelimit) && $learnedcount >= $messagelimit ) {
    $progress->final() if ($opt{progress} && $progress);
    die 'HITLIMIT';
  }

  $messagecount++;
  my $ma = $spamtest->parse($dataref);

  # A message that already went through SpamAssassin is re-parsed with the
  # SpamAssassin markup removed before it is learned.
  if ( $ma->get_header("X-Spam-Checker-Version") ) {
    my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
    $ma->finish();
    $ma = $new_ma;
  }

  my $status = $spamtest->learn( $ma, undef, $spam, $forget );
  my $learned = $status->did_learn();

  if ( !defined $learned ) {    # undef=learning unavailable
    die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
  }
  elsif ( $learned == 1 ) {     # 1=message was learned.  0=message wasn't learned
    $learnedcount++;
  }

  # Do cleanup ...
  $status->finish();
  undef $status;

  $ma->finish();
  undef $ma;

  print STDERR '.' if ( $opt{showdots} );
  return 1;
}

###########################################################################

# Print the version banner, then the usage text via pod2usage, and exit
# with status 64.
#   $verbose - passed to pod2usage as -verbose
#   $message - passed to pod2usage as -message
sub usage {
  my ( $verbose, $message ) = @_;
  my $ver = Mail::SpamAssassin::Version();
  print "SpamAssassin version $ver\n";
  pod2usage( -verbose => $verbose, -message => $message, -exitval => 64 );
}

# ---------------------------------------------------------------------------

=head1 NAME

sa-learn - train SpamAssassin's Bayesian classifier

=head1 SYNOPSIS

B<sa-learn> [options] [file]...


B<sa-learn> [options] --dump [ all | data | magic ]

Options:

 --ham                     Learn messages as ham (non-spam)
 --spam                    Learn messages as spam
 --forget                  Forget a message
 --use-ignores             Use bayes_ignore_from and bayes_ignore_to
 --sync                    Synchronize the database and the journal if needed
 --force-expire            Force a database sync and expiry run
 --dbpath <path>           Allows commandline override (in bayes_path form)
                           for where to read the Bayes DB from
 --dump [all|data|magic]   Display the contents of the Bayes database
                           Takes optional argument for what to display
 --regexp <re>             For dump only, specifies which tokens to
                           dump based on a regular expression.
 -f file, --folders=file   Read list of files/directories from file
 --dir                     Ignored; historical compatibility
 --file                    Ignored; historical compatibility
 --mbox                    Input sources are in mbox format
 --mbx                     Input sources are in mbx format
 --max-size <b>            Skip messages larger than b bytes;
                           defaults to 500 KB, 0 implies no limit
 --showdots                Show progress using dots
 --progress                Show progress using progress bar
 --no-sync                 Skip synchronizing the database and journal
                           after learning
 -L, --local               Operate locally, no network accesses. Use
                           of this is recommended, see documentation.
 --import                  Migrate data from older version/non DB_File
                           based databases
 --clear                   Wipe out existing database
 --backup                  Backup, to STDOUT, existing database
 --restore <filename>      Restore a database from filename
 -u username, --username=username
                           Override username taken from the runtime
                           environment, used with SQL
 -C path, --configpath=path, --config-file=path
                           Path to standard configuration dir
 -p prefs, --prefspath=file, --prefs-file=file
                           Set user preferences file
 --siteconfigpath=path     Path for site configs
                           (default: @@PREFIX@@/etc/mail/spamassassin)
 --cf='config line'        Additional line of configuration
 -D, --debug [area=n,...]  Print debugging messages
 -V, --version             Print version
 -h, --help                Print usage message

=head1 DESCRIPTION

Given a typical selection of your incoming mail classified as spam or ham
(non-spam), this tool will feed each mail to SpamAssassin, allowing it
to 'learn' what signs are likely to mean spam, and which are likely to
mean ham.

Simply run this command once for each of your mail folders, and it will
'learn' from the mail therein.

Note that csh-style I<globbing> in the mail folder names is supported;
in other words, listing a folder name as C<*> will scan every folder
that matches.  See C<Mail::SpamAssassin::ArchiveIterator> for more details.

If you are using mailboxes in a format other than maildir you should use
the B<--mbox> or B<--mbx> parameters.

SpamAssassin remembers which mail messages it has learnt already, and will not
re-learn those messages again, unless you use the B<--forget> option. Messages
learnt as spam will have SpamAssassin markup removed, on the fly.

If you make a mistake and scan a mail as ham when it is spam, or vice
versa, simply rerun this command with the correct classification, and the
mistake will be corrected.  SpamAssassin will automatically 'forget' the
previous indications.

Users of C<spamd> who wish to perform training remotely, over a network,
should investigate the C<spamc -L> switch.

=head1 OPTIONS

=over 4

=item B<--ham>

Learn the input message(s) as ham.   If you have previously learnt any of the
messages as spam, SpamAssassin will forget them first, then re-learn them as
ham.  Alternatively, if you have previously learnt them as ham, it'll skip them
this time around.  If the messages have already been filtered through
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
made.

=item B<--spam>

Learn the input message(s) as spam.   If you have previously learnt any of the
messages as ham, SpamAssassin will forget them first, then re-learn them as
spam.  Alternatively, if you have previously learnt them as spam, it'll skip
them this time around.  If the messages have already been filtered through
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
made.

=item B<--folders>=I<filename>, B<-f> I<filename>

sa-learn will read in the list of folders from the specified file, one folder
per line in the file.  If the folder is prefixed with C<ham:type:> or C<spam:type:>,
sa-learn will learn that folder appropriately, otherwise the folders will be
assumed to be of the type specified by B<--ham> or B<--spam>.

C<type> above is optional, but is the same as the standard for
ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not
specified).

=item B<--mbox>

sa-learn will read in the file(s) containing the emails to be learned,
and will process them in mbox format (one or more emails per file).

=item B<--mbx>

sa-learn will read in the file(s) containing the emails to be learned,
and will process them in mbx format (one or more emails per file).

=item B<--use-ignores>

Don't learn the message if a from address matches configuration file
item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
The option might be used when learning from a large file of messages
from which the hammy spam messages or spammy ham messages have not
been removed.

=item B<--sync>

Synchronize the journal and databases.  Upon successfully syncing the
database with the entries in the journal, the journal file is removed.

=item B<--force-expire>

Forces an expiry attempt, regardless of whether it may be necessary
or not.  Note: This doesn't mean any tokens will actually expire.
Please see the EXPIRATION section below.

Note: C<--force-expire> also causes the journal data to be synchronized
into the Bayes databases.

=item B<--forget>

Forget a given message previously learnt.

=item B<--dbpath>

Allows a commandline override of the I<bayes_path> configuration option.

=item B<--dump> I<option>

Display the contents of the Bayes database.  Without an option or with
the I<all> option, all magic tokens and data tokens will be displayed.
I<magic> will only display magic tokens, and I<data> will only display
the data tokens.

Can also use the B<--regexp> I<RE> option to specify which tokens to
display based on a regular expression.

=item B<--clear>

Clear an existing Bayes database by removing all traces of the database.

WARNING: This is destructive and should be used with care.

=item B<--backup>

Performs a dump of the Bayes database in machine/human readable format.

The dump will include token and seen data.  It is suitable for input back
into the --restore command.

=item B<--restore>=I<filename>

Performs a restore of the Bayes database defined by I<filename>.

WARNING: This is a destructive operation, previous Bayes data will be wiped out.

=item B<-h>, B<--help>

Print help message and exit.

=item B<-u> I<username>, B<--username>=I<username>

If specified this username will override the username taken from the runtime
environment.  You can use this option to specify users in a virtual user
configuration when using SQL as the Bayes backend.

NOTE: This option will not change to the given I<username>, it will only attempt
to act on behalf of that user.  Because of this you will need to have proper
permissions to be able to change files owned by I<username>.  In the case of SQL
this generally is not a problem.

=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>

Use the specified path for locating the distributed configuration files.
Ignore the default directories (usually C</usr/share/spamassassin> or similar).

=item B<--siteconfigpath>=I<path>

Use the specified path for locating site-specific configuration files.  Ignore
the default directories (usually C</etc/mail/spamassassin> or similar).

=item B<--cf='config line'>

Add additional lines of configuration directly from the command-line, parsed
after the configuration files are read.   Multiple B<--cf> arguments can be
used, and each will be considered a separate line of configuration.

=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>

Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>).

=item B<--progress>

Prints a progress bar (to STDERR) showing the current progress.  In the case
where no valid terminal is found this option will behave very much like the
--showdots option.

=item B<-D> [I<area,...>], B<--debug> [I<area,...>]

Produce debugging output. If no areas are listed, all debugging information is
printed. Diagnostic output can also be enabled for each area individually;
I<area> is the area of the code to instrument. For example, to produce
diagnostic output on bayes, learn, and dns, use:

        spamassassin -D bayes,learn,dns

For more information about which areas (also known as channels) are available,
please see the documentation at:

        C<http://wiki.apache.org/spamassassin/DebugChannels>

Higher priority informational messages that are suitable for logging in normal
circumstances are available with an area of "info".

=item B<--no-sync>

Skip the slow synchronization step which normally takes place after
changing database entries.  If you plan to learn from many folders in
a batch, or to learn many individual messages one-by-one, it is faster
to use this switch and run C<sa-learn --sync> once all the folders have
been scanned.

Clarification: The state of I<--no-sync> overrides the
I<bayes_learn_to_journal> configuration option.  If not specified,
sa-learn will learn to the database directly.  If specified, sa-learn
will learn to the journal file.

Note: I<--sync> and I<--no-sync> can be specified on the same commandline,
which is slightly confusing.  In this case, the I<--no-sync> option is
ignored since there is no learn operation.

=item B<-L>, B<--local>

Do not perform any network accesses while learning details about the mail
messages.  This should normally be used, as there really isn't anything
Bayes can learn from network lookup results.  Official SpamAssassin plugins
do not currently do any network lookups when learning, but it's possible
that third party ones might.

=item B<--import>

If you previously used SpamAssassin's Bayesian learner without the C<DB_File>
module installed, it will have created files in other formats, such as
C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>.  This switch allows you to migrate
that old data into the C<DB_File> format.  It will overwrite any data currently
in the C<DB_File>.

Can also be used with the B<--dbpath> I<path> option to specify the location of
the Bayes files to use.

=back

=head1 MIGRATION

There are now multiple backend storage modules available for storing
user's bayesian data. As such you might want to migrate from one
backend to another. Here is a simple procedure for migrating from one
backend to another.

Note that if you have individual user databases you will have to
perform a similar procedure for each one of them.
901 902=over 4 903 904=item sa-learn --sync 905 906This will sync any outstanding journal entries 907 908=item sa-learn --backup > backup.txt 909 910This will save all your Bayes data to a plain text file. 911 912=item sa-learn --clear 913 914This is optional, but good to do to clear out the old database. 915 916=item Repeat! 917 918At this point, if you have multiple databases, you should perform the 919procedure above for each of them. (i.e. each user's database needs to 920be backed up before continuing.) 921 922=item Switch backends 923 924Once you have backed up all databases you can update your 925configuration for the new database backend. This will involve at least 926the bayes_store_module config option and may involve some additional 927config options depending on what is required by the module. (For 928example, you may need to configure an SQL database.) 929 930=item sa-learn --restore backup.txt 931 932Again, you need to do this for every database. 933 934=back 935 936If you are migrating to SQL you can make use of the -u <username> 937option in sa-learn to populate each user's database. Otherwise, you 938must run sa-learn as the user who database you are restoring. 939 940 941=head1 INTRODUCTION TO BAYESIAN FILTERING 942 943(Thanks to Michael Bell for this section!) 944 945For a more lengthy description of how this works, go to 946http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably 947readable, even if statistics make me break out in hives. 948 949The short semi-inaccurate version: Given training, a spam heuristics engine 950can take the most "spammy" and "hammy" words and apply probabilistic 951analysis. Furthermore, once given a basis for the analysis, the engine can 952continue to learn iteratively by applying both the non-Bayesian and Bayesian 953rulesets together to create evolving "intelligence". 954 955SpamAssassin 2.50 and later supports Bayesian spam analysis, in 956the form of the BAYES rules. 
This is a new feature, quite powerful, 957and is disabled until enough messages have been learnt. 958 959The pros of Bayesian spam analysis: 960 961=over 4 962 963=item Can greatly reduce false positives and false negatives. 964 965It learns from your mail, so it is tailored to your unique e-mail flow. 966 967=item Once it starts learning, it can continue to learn from SpamAssassin 968and improve over time. 969 970=back 971 972And the cons: 973 974=over 4 975 976=item A decent number of messages are required before results are useful 977for ham/spam determination. 978 979=item It's hard to explain why a message is or isn't marked as spam. 980 981i.e.: a straightforward rule, that matches, say, "VIAGRA" is 982easy to understand. If it generates a false positive or false negative, 983it is fairly easy to understand why. 984 985With Bayesian analysis, it's all probabilities - "because the past says 986it is likely as this falls into a probabilistic distribution common to past 987spam in your systems". Tell that to your users! Tell that to the client 988when he asks "what can I do to change this". (By the way, the answer in 989this case is "use whitelisting".) 990 991=item It will take disk space and memory. 992 993The databases it maintains take quite a lot of resources to store and use. 994 995=back 996 997=head1 GETTING STARTED 998 999Still interested? Ok, here's the guidelines for getting this working. 1000 1001First a high-level overview: 1002 1003=over 4 1004 1005=item Build a significant sample of both ham and spam. 1006 1007I suggest several thousand of each, placed in SPAM and HAM directories or 1008mailboxes. Yes, you MUST hand-sort this - otherwise the results won't be much 1009better than SpamAssassin on its own. Verify the spamminess/haminess of EVERY 1010message. You're urged to avoid using a publicly available corpus (sample) - 1011this must be taken from YOUR mail server, if it is to be statistically useful. 
1012Otherwise, the results may be pretty skewed. 1013 1014=item Use this tool to teach SpamAssassin about these samples, like so: 1015 1016 sa-learn --spam /path/to/spam/folder 1017 sa-learn --ham /path/to/ham/folder 1018 ... 1019 1020Let SpamAssassin proceed, learning stuff. When it finds ham and spam 1021it will add the "interesting tokens" to the database. 1022 1023=item If you need SpamAssassin to forget about specific messages, use 1024the B<--forget> option. 1025 1026This can be applied to either ham or spam that has run through the 1027B<sa-learn> processes. It's a bit of a hammer, really, lowering the 1028weighting of the specific tokens in that message (only if that message has 1029been processed before). 1030 1031=item Learning from single messages uses a command like this: 1032 1033 sa-learn --ham --no-sync mailmessage 1034 1035This is handy for binding to a key in your mail user agent. It's very fast, as 1036all the time-consuming stuff is deferred until you run with the C<--sync> 1037option. 1038 1039=item Autolearning is enabled by default 1040 1041If you don't have a corpus of mail saved to learn, you can let 1042SpamAssassin automatically learn the mail that you receive. If you are 1043autolearning from scratch, the amount of mail you receive will determine 1044how long until the BAYES_* rules are activated. 1045 1046=back 1047 1048=head1 EFFECTIVE TRAINING 1049 1050Learning filters require training to be effective. If you don't train 1051them, they won't work. In addition, you need to train them with new 1052messages regularly to keep them up-to-date, or their data will become 1053stale and impact accuracy. 1054 1055You need to train with both spam I<and> ham mails. One type of mail 1056alone will not have any effect. 1057 1058Note that if your mail folders contain things like forwarded spam, 1059discussions of spam-catching rules, etc., this will cause trouble. You 1060should avoid scanning those messages if possible. 
(An easy way to do this 1061is to move them aside, into a folder which is not scanned.) 1062 1063If the messages you are learning from have already been filtered through 1064SpamAssassin, the learner will compensate for this. In effect, it learns what 1065each message would look like if you had run C<spamassassin -d> over it in 1066advance. 1067 1068Another thing to be aware of, is that typically you should aim to train 1069with at least 1000 messages of spam, and 1000 ham messages, if 1070possible. More is better, but anything over about 5000 messages does not 1071improve accuracy significantly in our tests. 1072 1073Be careful that you train from the same source -- for example, if you train 1074on old spam, but new ham mail, then the classifier will think that 1075a mail with an old date stamp is likely to be spam. 1076 1077It's also worth noting that training with a very small quantity of 1078ham, will produce atrocious results. You should aim to train with at 1079least the same amount (or more if possible!) of ham data than spam. 1080 1081On an on-going basis, it is best to keep training the filter to make 1082sure it has fresh data to work from. There are various ways to do 1083this: 1084 1085=over 4 1086 1087=item 1. Supervised learning 1088 1089This means keeping a copy of all or most of your mail, separated into spam 1090and ham piles, and periodically re-training using those. It produces 1091the best results, but requires more work from you, the user. 1092 1093(An easy way to do this, by the way, is to create a new folder for 1094'deleted' messages, and instead of deleting them from other folders, 1095simply move them in there instead. Then keep all spam in a separate 1096folder and never delete it. As long as you remember to move misclassified 1097mails into the correct folder set, it is easy enough to keep up to date.) 1098 1099=item 2. 
Unsupervised learning from Bayesian classification 1100 1101Another way to train is to chain the results of the Bayesian classifier 1102back into the training, so it reinforces its own decisions. This is only 1103safe if you then retrain it based on any errors you discover. 1104 1105SpamAssassin does not support this method, due to experimental results 1106which strongly indicate that it does not work well, and since Bayes is 1107only one part of the resulting score presented to the user (while Bayes 1108may have made the wrong decision about a mail, it may have been overridden 1109by another system). 1110 1111=item 3. Unsupervised learning from SpamAssassin rules 1112 1113Also called 'auto-learning' in SpamAssassin. Based on statistical 1114analysis of the SpamAssassin success rates, we can automatically train the 1115Bayesian database with a certain degree of confidence that our training 1116data is accurate. 1117 1118It should be supplemented with some supervised training in addition, if 1119possible. 1120 1121This is the default, but can be turned off by setting the SpamAssassin 1122configuration parameter C<bayes_auto_learn> to 0. 1123 1124=item 4. Mistake-based training 1125 1126This means training on a small number of mails, then only training on 1127messages that SpamAssassin classifies incorrectly. This works, but it 1128takes longer to get it right than a full training session would. 1129 1130=back 1131 1132=head1 FILES 1133 1134B<sa-learn> and the other parts of SpamAssassin's Bayesian learner, 1135use a set of persistent database files to store the learnt tokens, as follows. 1136 1137=over 4 1138 1139=item bayes_toks 1140 1141The database of tokens, containing the tokens learnt, their count of 1142occurrences in ham and spam, and the timestamp when the token was last 1143seen in a message. 
1144 1145This database also contains some 'magic' tokens, as follows: the version 1146number of the database, the number of ham and spam messages learnt, the 1147number of tokens in the database, and timestamps of: the last journal 1148sync, the last expiry run, the last expiry token reduction count, the 1149last expiry timestamp delta, the oldest token timestamp in the database, 1150and the newest token timestamp in the database. 1151 1152This is a database file, using C<DB_File>. The database 'version 1153number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x 1154development releases, 2 for 2.6x, and 3 for 3.0 and later releases. 1155 1156=item bayes_seen 1157 1158A map of Message-Id and some data from headers and body to what that 1159message was learnt as. This is used so that SpamAssassin can avoid 1160re-learning a message it has already seen, and so it can reverse the 1161training if you later decide that message was learnt incorrectly. 1162 1163This is a database file, using C<DB_File>. 1164 1165=item bayes_journal 1166 1167While SpamAssassin is scanning mails, it needs to track which tokens 1168it uses in its calculations. To avoid the contention of having each 1169SpamAssassin process attempting to gain write access to the Bayes DB, 1170the token timestamps are written to a 'journal' file which will later 1171(either automatically or via C<sa-learn --sync>) be used to synchronize 1172the Bayes DB. 1173 1174Also, through the use of C<bayes_learn_to_journal>, or when using the 1175C<--no-sync> option with sa-learn, the actual learning data will take 1176be placed into the journal for later synchronization. This is typically 1177useful for high-traffic sites to avoid the same contention as stated 1178above. 1179 1180=back 1181 1182=head1 EXPIRATION 1183 1184Since SpamAssassin can auto-learn messages, the Bayes database files 1185could increase perpetually until they fill your disk. 
To control this, 1186SpamAssassin performs journal synchronization and bayes expiration 1187periodically when certain criteria (listed below) are met. 1188 1189SpamAssassin can sync the journal and expire the DB tokens either 1190manually or opportunistically. A journal sync is due if I<--sync> 1191is passed to sa-learn (manual), or if the following is true 1192(opportunistic): 1193 1194=over 4 1195 1196=item - bayes_journal_max_size does not equal 0 (means don't sync) 1197 1198=item - the journal file exists 1199 1200=back 1201 1202and either: 1203 1204=over 4 1205 1206=item - the journal file has a size greater than bayes_journal_max_size 1207 1208=back 1209 1210or 1211 1212=over 4 1213 1214=item - a journal sync has previously occurred, and at least 1 day has 1215passed since that sync 1216 1217=back 1218 1219Expiry is due if I<--force-expire> is passed to sa-learn (manual), 1220or if all of the following are true (opportunistic): 1221 1222=over 4 1223 1224=item - the last expire was attempted at least 12hrs ago 1225 1226=item - bayes_auto_expire does not equal 0 1227 1228=item - the number of tokens in the DB is > 100,000 1229 1230=item - the number of tokens in the DB is > bayes_expiry_max_db_size 1231 1232=item - there is at least a 12 hr difference between the oldest and newest token atimes 1233 1234=back 1235 1236=head2 EXPIRE LOGIC 1237 1238If either the manual or opportunistic method causes an expire run 1239to start, here is the logic that is used: 1240 1241=over 4 1242 1243=item - figure out how many tokens to keep. take the larger of 1244either bayes_expiry_max_db_size * 75% or 100,000 tokens. therefore, the goal 1245reduction is number of tokens - number of tokens to keep. 1246 1247=item - if the reduction number is < 1000 tokens, abort (not worth the effort). 1248 1249=item - if an expire has been done before, guesstimate the new 1250atime delta based on the old atime delta. 
(new_atime_delta = 1251old_atime_delta * old_reduction_count / goal) 1252 1253=item - if no expire has been done before, or the last expire looks 1254"weird", do an estimation pass. The definition of "weird" is: 1255 1256=over 8 1257 1258=item - last expire over 30 days ago 1259 1260=item - last atime delta was < 12 hrs 1261 1262=item - last reduction count was < 1000 tokens 1263 1264=item - estimated new atime delta is < 12 hrs 1265 1266=item - the difference between the last reduction count and the goal reduction count is > 50% 1267 1268=back 1269 1270=back 1271 1272=head2 ESTIMATION PASS LOGIC 1273 1274Go through each of the DB's tokens. Starting at 12hrs, calculate 1275whether or not the token would be expired (based on the difference 1276between the token's atime and the db's newest token atime) and keep 1277the count. Work out from 12hrs exponentially by powers of 2. ie: 127812hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs 1279* 512 (6144hrs, or 256 days). 1280 1281The larger the delta, the smaller the number of tokens that will 1282be expired. Conversely, the number of tokens goes up as the delta 1283gets smaller. So starting at the largest atime delta, figure out 1284which delta will expire the most tokens without going above the 1285goal expiration count. Use this to choose the atime delta to use, 1286unless one of the following occurs: 1287 1288=over 8 1289 1290=item - the largest atime (smallest reduction count) would expire 1291too many tokens. this means the learned tokens are mostly old and 1292there needs to be new tokens learned before an expire can 1293occur. 1294 1295=item - all of the atime choices result in 0 tokens being removed. 1296this means the tokens are all newer than 12 hours and there needs 1297to be new tokens learned before an expire can occur. 1298 1299=item - the number of tokens that would be removed is < 1000. the 1300benefit isn't worth the effort. more tokens need to be learned. 
1301 1302=back 1303 1304If the expire run gets past this point, it will continue to the end. 1305A new DB is created since the majority of DB libraries don't shrink the 1306DB file when tokens are removed. So we do the "create new, migrate old 1307to new, remove old, rename new" shuffle. 1308 1309=head2 EXPIRY RELATED CONFIGURATION SETTINGS 1310 1311=over 4 1312 1313=item C<bayes_auto_expire> is used to specify whether or not SpamAssassin 1314ought to opportunistically attempt to expire the Bayes database. 1315The default is 1 (yes). 1316 1317=item C<bayes_expiry_max_db_size> specifies both the auto-expire token 1318count point, as well as the resulting number of tokens after expiry 1319as described above. The default value is 150,000, which is roughly 1320equivalent to a 6Mb database file if you're using DB_File. 1321 1322=item C<bayes_journal_max_size> specifies how large the Bayes 1323journal will grow before it is opportunistically synced. The 1324default value is 102400. 1325 1326=back 1327 1328=head1 INSTALLATION 1329 1330The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module. 1331Install this as a normal Perl module, using C<perl -MCPAN -e shell>, 1332or by hand. 1333 1334=head1 SEE ALSO 1335 1336spamassassin(1) 1337spamc(1) 1338Mail::SpamAssassin(3) 1339Mail::SpamAssassin::ArchiveIterator(3) 1340 1341E<lt>http://www.paulgraham.com/E<gt> 1342Paul Graham's "A Plan For Spam" paper 1343 1344E<lt>http://www.linuxjournal.com/article/6467E<gt> 1345Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin 1346 1347E<lt>http://www.bgl.nu/~glouis/bogofilter/E<gt> 1348'Training on error' page. A discussion of various Bayes training regimes, 1349including 'train on error' and unsupervised training. 1350 1351=head1 PREREQUISITES 1352 1353C<Mail::SpamAssassin> 1354 1355=head1 AUTHORS 1356 1357The SpamAssassin(tm) Project E<lt>https://spamassassin.apache.org/E<gt> 1358 1359=cut 1360 1361