1#!/usr/bin/env perl
2
3use warnings;
4use strict;
5use Pod::Usage;
6use Getopt::Long;
7use Time::HiRes qw(gettimeofday tv_interval);
8use JSON::XS;
9use String::ShellQuote;
10use FileHandle;
11use IPC::Open2;
12use Data::Dumper;
13
14my $spam_dir;
15my $ham_dir;
16my $parallel            = 1;
17my $classifier          = "bayes";
18my $spam_symbol         = "BAYES_SPAM";
19my $ham_symbol          = "BAYES_HAM";
20my $timeout             = 10;
21my $rspamc              = $ENV{'RSPAMC'} || "rspamc";
22my $bogofilter          = $ENV{'BOGOFILTER'} || "bogofilter";
23my $dspam               = $ENV{'DSPAM'} || "dspam";
24my $train_fraction      = 0.5;
25my $use_bogofilter      = 0;
26my $use_dspam           = 0;
27my $check_only          = 0;
28my $rspamc_prob_trigger = 95;
29my $man;
30my $help;
31
32GetOptions(
33    "spam|s=s"           => \$spam_dir,
34    "ham|h=s"            => \$ham_dir,
35    "spam-symbol=s"      => \$spam_symbol,
36    "ham-symbol=s"       => \$ham_symbol,
37    "classifier|c=s"     => \$classifier,
38    "timeout|t=f"        => \$timeout,
39    "parallel|p=i"       => \$parallel,
40    "train-fraction|t=f" => \$train_fraction,
41    "bogofilter|b"       => \$use_bogofilter,
42    "dspam|d"            => \$use_dspam,
43    "check-only"         => \$check_only,
44    "help|?"             => \$help,
45    "man"                => \$man
46) or pod2usage(2);
47
48pod2usage(1) if $help;
49pod2usage( -exitval => 0, -verbose => 2 ) if $man;
50
51sub read_dir_files {
52    my ( $dir, $target ) = @_;
53    opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
54    while ( my $file = readdir $dh ) {
55        if ( -f "$dir/$file" ) {
56            push @{$target}, "$dir/$file";
57        }
58    }
59}
60
61sub shuffle_array {
62    my ($ar) = @_;
63
64    for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
65        if ( $i > 1 ) {
66            my $sel = int( rand( $i - 1 ) );
67            ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
68        }
69    }
70}
71
72sub learn_rspamc {
73    my ( $files, $spam ) = @_;
74    my $processed = 0;
75
76    my $cmd         = $spam ? "learn_spam" : "learn_ham";
77    my $args_quoted = shell_quote @{$files};
78    open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" )
79      or die "cannot spawn $rspamc: $!";
80
81    while (<$p>) {
82        my $res = eval('decode_json($_)');
83        if ( $res && $res->{'success'} ) {
84            $processed++;
85        }
86    }
87
88    return $processed;
89}
90
91sub learn_bogofilter {
92    my ( $files, $spam ) = @_;
93    my $processed = 0;
94
95    foreach my $f ( @{$files} ) {
96        my $args_quoted = shell_quote $f;
97        my $fl          = $spam ? "-s" : "-n";
98        `$bogofilter  -I $args_quoted $fl`;
99        if ( $? == 0 ) {
100            $processed++;
101        }
102    }
103
104    return $processed;
105}
106
107sub learn_dspam {
108    my ( $files, $spam ) = @_;
109    my $processed = 0;
110
111    foreach my $f ( @{$files} ) {
112        my $args_quoted = shell_quote $f;
113        my $fl          = $spam ? "--class=spam" : "--class=innocent";
114        open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
115          or die "cannot run $dspam: $!";
116
117        open( my $inp, "< $f" );
118        while (<$inp>) {
119            print $p $_;
120        }
121    }
122
123    return $processed;
124}
125
126sub learn_samples {
127    my ( $ar_ham, $ar_spam ) = @_;
128    my $len;
129    my $processed = 0;
130    my $total     = 0;
131    my $learn_func;
132
133    my @files_spam;
134    my @files_ham;
135
136    if ($use_dspam) {
137        $learn_func = \&learn_dspam;
138    }
139    elsif ($use_bogofilter) {
140        $learn_func = \&learn_bogofilter;
141    }
142    else {
143        $learn_func = \&learn_rspamc;
144    }
145
146    $len = int( scalar @{$ar_ham} * $train_fraction );
147    my @cur_vec;
148
149    # Shuffle spam and ham samples
150    for ( my $i = 0 ; $i < $len ; $i++ ) {
151        if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
152            push @cur_vec, @{$ar_ham}[$i];
153            push @files_ham, [@cur_vec];
154            @cur_vec = ();
155            $total++;
156        }
157        else {
158            push @cur_vec, @{$ar_ham}[$i];
159        }
160    }
161
162    $len     = int( scalar @{$ar_spam} * $train_fraction );
163    @cur_vec = ();
164    for ( my $i = 0 ; $i < $len ; $i++ ) {
165        if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
166            push @cur_vec, @{$ar_spam}[$i];
167            push @files_spam, [@cur_vec];
168            @cur_vec = ();
169            $total++;
170        }
171        else {
172            push @cur_vec, @{$ar_spam}[$i];
173        }
174    }
175
176    for ( my $i = 0 ; $i < $total ; $i++ ) {
177        my $args;
178        my $spam;
179
180        if ( $i % 2 == 0 ) {
181            $args = pop @files_spam;
182
183            if ( !$args ) {
184                $args = pop @files_ham;
185                $spam = 0;
186            }
187            else {
188                $spam = 1;
189            }
190        }
191        else {
192            $args = pop @files_ham;
193            if ( !$args ) {
194                $args = pop @files_spam;
195                $spam = 1;
196            }
197            else {
198                $spam = 0;
199            }
200        }
201
202        my $r = $learn_func->( $args, $spam );
203        if ($r) {
204            $processed += $r;
205        }
206    }
207
208    return $processed;
209}
210
211sub check_rspamc {
212    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
213
214    my $args_quoted = shell_quote @{$files};
215    my $processed   = 0;
216
217    open(
218        my $p,
219"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
220    ) or die "cannot spawn $rspamc: $!";
221
222    while (<$p>) {
223        my $res = eval('decode_json($_)');
224        if ( $res && $res->{'default'} ) {
225            $processed++;
226
227            if ($spam) {
228                if ( $res->{'default'}->{$ham_symbol} ) {
229                    my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
230                    if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
231                        my $percentage = int($1);
232                        if ( $percentage >= $rspamc_prob_trigger ) {
233                            $$fp_cnt++;
234                        }
235                    }
236                    else {
237                        $$fp_cnt++;
238                    }
239                }
240                elsif ( !$res->{'default'}->{$spam_symbol} ) {
241                    $$fn_cnt++;
242                }
243                else {
244                    $$detected_cnt++;
245                }
246            }
247            else {
248                if ( $res->{'default'}->{$spam_symbol} ) {
249                    my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
250                    if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
251
252                        my $percentage = int($1);
253                        if ( $percentage >= $rspamc_prob_trigger ) {
254                            $$fp_cnt++;
255                        }
256                    }
257                    else {
258                        $$fp_cnt++;
259                    }
260                }
261                elsif ( !$res->{'default'}->{$ham_symbol} ) {
262                    $$fn_cnt++;
263                }
264                else {
265                    $$detected_cnt++;
266                }
267            }
268        }
269    }
270
271    return $processed;
272}
273
274sub check_bogofilter {
275    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
276    my $processed = 0;
277
278    foreach my $f ( @{$files} ) {
279        my $args_quoted = shell_quote $f;
280
281        open( my $p, "$bogofilter -t -I $args_quoted |" )
282          or die "cannot spawn $bogofilter: $!";
283
284        while (<$p>) {
285            if ( $_ =~ /^([SHU])\s+.*$/ ) {
286                $processed++;
287
288                if ($spam) {
289                    if ( $1 eq 'H' ) {
290                        $$fp_cnt++;
291                    }
292                    elsif ( $1 eq 'U' ) {
293                        $$fn_cnt++;
294                    }
295                    else {
296                        $$detected_cnt++;
297                    }
298                }
299                else {
300                    if ( $1 eq 'S' ) {
301                        $$fp_cnt++;
302                    }
303                    elsif ( $1 eq 'U' ) {
304                        $$fn_cnt++;
305                    }
306                    else {
307                        $$detected_cnt++;
308                    }
309                }
310            }
311        }
312    }
313
314    return $processed;
315}
316
317sub check_dspam {
318    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
319    my $processed = 0;
320
321    foreach my $f ( @{$files} ) {
322        my $args_quoted = shell_quote $f;
323
324        my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" );
325        open( my $inp, "< $f" );
326        while (<$inp>) {
327            print Writer $_;
328        }
329        close Writer;
330
331        while (<Reader>) {
332            if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) {
333                $processed++;
334                my $percentage = int( $2 * 100.0 );
335
336                if ($spam) {
337                    if ( $1 eq 'Innocent' ) {
338                        if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) {
339                            $$fp_cnt++;
340                        }
341                    }
342                    elsif ( $1 ne 'Spam' ) {
343                        $$fn_cnt++;
344                    }
345                    else {
346                        $$detected_cnt++;
347                    }
348                }
349                else {
350                    if ( $1 eq 'Spam' ) {
351                        if ( $percentage >= $rspamc_prob_trigger ) {
352                            $$fp_cnt++;
353                        }
354                    }
355                    elsif ( $1 ne 'Innocent' ) {
356                        $$fn_cnt++;
357                    }
358                    else {
359                        $$detected_cnt++;
360                    }
361                }
362            }
363        }
364        close Reader;
365        waitpid( $pid, 0 );
366    }
367
368    return $processed;
369}
370
371sub cross_validate {
372    my ($hr)          = @_;
373    my $args          = "";
374    my $processed     = 0;
375    my $fp_spam       = 0;
376    my $fn_spam       = 0;
377    my $fp_ham        = 0;
378    my $fn_ham        = 0;
379    my $total_spam    = 0;
380    my $total_ham     = 0;
381    my $detected_spam = 0;
382    my $detected_ham  = 0;
383    my $i             = 0;
384    my $len           = scalar keys %{$hr};
385    my @files_spam;
386    my @files_ham;
387    my @cur_spam;
388    my @cur_ham;
389    my $check_func;
390
391    if ($use_dspam) {
392        $check_func = \&check_dspam;
393    }
394    elsif ($use_bogofilter) {
395        $check_func = \&check_bogofilter;
396    }
397    else {
398        $check_func = \&check_rspamc;
399    }
400
401    while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
402        if ($spam) {
403            if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
404                push @cur_spam, $fn;
405                push @files_spam, [@cur_spam];
406                @cur_spam = ();
407            }
408            else {
409                push @cur_spam, $fn;
410            }
411        }
412        else {
413            if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
414                push @cur_ham, $fn;
415                push @files_ham, [@cur_ham];
416                @cur_ham = ();
417            }
418            else {
419                push @cur_ham, $fn;
420            }
421        }
422    }
423
424    shuffle_array( \@files_spam );
425
426    foreach my $fn (@files_spam) {
427        my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
428        $total_spam += $r;
429        $processed  += $r;
430    }
431
432    shuffle_array( \@files_ham );
433
434    foreach my $fn (@files_ham) {
435        my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
436        $total_ham += $r;
437        $processed += $r;
438    }
439
440    printf "Scanned %d messages
441%d spam messages (%d detected)
442%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;
443
444    printf "\nHam FP rate: %.2f%% (%d messages)
445Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham;
446
447    printf "\nSpam FP rate: %.2f%% (%d messages)
448Spam FN rate: %.2f%% (%d messages)\n",
449      $fp_spam / $total_spam * 100.0, $fp_spam,
450      $fn_spam / $total_spam * 100.0, $fn_spam;
451}
452
453if ( !$spam_dir || !$ham_dir ) {
454    die "spam or/and ham directories are not specified";
455}
456
457my @spam_samples;
458my @ham_samples;
459
460read_dir_files( $spam_dir, \@spam_samples );
461read_dir_files( $ham_dir,  \@ham_samples );
462shuffle_array( \@spam_samples );
463shuffle_array( \@ham_samples );
464
465if ( !$check_only ) {
466    my $learned = 0;
467    my $t0      = [gettimeofday];
468    $learned = learn_samples( \@ham_samples, \@spam_samples );
469    my $t1 = [gettimeofday];
470
471    printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 );
472}
473
474my %validation_set;
475my $len = int( scalar @spam_samples * $train_fraction );
476for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
477    $validation_set{ $spam_samples[$i] } = 1;
478}
479
480$len = int( scalar @ham_samples * $train_fraction );
481for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
482    $validation_set{ $ham_samples[$i] } = 0;
483}
484
485cross_validate( \%validation_set );
486
487__END__
488
489=head1 NAME
490
491classifier_test.pl - test various parameters for a classifier
492
493=head1 SYNOPSIS
494
495classifier_test.pl [options]
496
497 Options:
498   --spam                 Directory with spam files
499   --ham                  Directory with ham files
500   --spam-symbol          Symbol for spam (default: BAYES_SPAM)
501   --ham-symbol           Symbol for ham (default: BAYES_HAM)
502   --classifier           Classifier to test (default: bayes)
503   --timeout              Timeout for rspamc (default: 10)
504   --parallel             Parallel execution (default: 1)
505   --help                 Brief help message
506   --man                  Full documentation
507
508=head1 OPTIONS
509
510=over 8
511
512=item B<--spam>
513
514Directory with spam files.
515
516=item B<--ham>
517
518Directory with ham files.
519
520=item B<--classifier>
521
522Specifies classifier name to test.
523
524=item B<--help>
525
526Print a brief help message and exits.
527
528=item B<--man>
529
530Prints the manual page and exits.
531
532=back
533
534=head1 DESCRIPTION
535
536B<classifier_test.pl> is intended to test Rspamd classifier for false positives, false negatives and other parameters.
537It uses half of the corpus for training and half for cross-validation.
538
539=cut
540