1#!/usr/bin/perl -w
2#
3# <@LICENSE>
4# Licensed to the Apache Software Foundation (ASF) under one or more
5# contributor license agreements.  See the NOTICE file distributed with
6# this work for additional information regarding copyright ownership.
7# The ASF licenses this file to you under the Apache License, Version 2.0
8# (the "License"); you may not use this file except in compliance with
9# the License.  You may obtain a copy of the License at:
10#
11#     http://www.apache.org/licenses/LICENSE-2.0
12#
13# Unless required by applicable law or agreed to in writing, software
14# distributed under the License is distributed on an "AS IS" BASIS,
15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16# See the License for the specific language governing permissions and
17# limitations under the License.
18# </@LICENSE>
19
20=head1 NAME
21
22logs-to-c - Convert a mass-check log into perceptron format
23
24=head1 SYNOPSIS
25
26logs-to-c [options]
27
28 Options:
29    -c,--cffile=path	  Use path as the rules directory
30    -s,--scoreset=n	  Use scoreset n
31    --spam=file           Location of spam mass-check log
32    --ham=file            Location of ham mass-check log
33
34=head1 DESCRIPTION
35
B<logs-to-c> reads the mass-check logs F<spam.log> and F<ham.log>, or
the files given by the B<--spam> and B<--ham> options, and converts them
into the format needed by the perceptron. This format is simple for the
perceptron to parse, but is not very readable to humans.
41
42=head1 BUGS
43
44Please report bugs to http://bugzilla.spamassassin.org/
45
46=head1 SEE ALSO
47
48L<mass-check(1)>, L<perceptron(1)>
49
50=cut
51
52use Getopt::Long qw(:config auto_help bundling);
53use strict;
54
# Defaults for the command-line options.  GetOptions() is called without
# explicit destinations, so each value is stored in the package variable
# $opt_<name>, named after the option's primary (first) name.
our $opt_cffile = "../rules";
our $opt_spam = 'spam.log';
our $opt_ham = 'ham.log';
our $opt_scoreset = 0;

# The "|c" and "|s" aliases provide the -c and -s short forms listed in
# the SYNOPSIS.  Getopt::Long is loaded with "bundling", under which a
# single-dash option is only matched against declared one-letter names,
# so without the aliases "-c path" and "-s n" are rejected as unknown.
GetOptions("cffile|c=s", "spam=s", "ham=s", "scoreset|s=i");
61
# ---- state shared by the subs below -------------------------------------

# Bit vector with one bit per accepted log line (1 = spam, 0 = ham); bit N
# belongs to the same message as $tests_hit[N].
my $is_spam = '';

# One packed string of rule hits per accepted log line.
my @tests_hit = ();

# Rule name => 1 if its score may be changed, 0 if it is fixed.
my %mutable_tests = ();

# Rule name => position in the sorted rule list built by writescores_c().
my %rule_to_index;

# Per-rule ignore flag and score range, filled in by read_ranges().
my (%ignored_rule, %range_lo, %range_hi);

# Message counters, set by readlogs().
my ($num_tests, $num_spam, $num_ham);

# Package globals: %allrules is copied from %rules in readscores().
our (%rules, %allrules, %scores);

# ---- main program -------------------------------------------------------

readscores();

print "Reading per-message hit stat logs and scores...\n";
read_ranges();
readlogs();

print "Writing logs and current scores as C code...\n";
writescores_c();

# To show memory usage before exiting, enable the following:
#   print "Running \"ps aux\"...\n";
#   open(PS, "ps aux|");
#   while(<PS>) {
#     print if $. == 1 || /\b$$\b/;
#   }
#   close(PS);

# The script ends here: top-level statements further down the file are
# never executed, only the sub definitions are used.
exit 0;
91
# Code to freeze/thaw test lines in as little space as possible.
# This could be faster, but improves memory usage by a phenomenal
# amount over arrayrefs or strings of comma-separated values.
#
# Every rule name is mapped to a small positive integer:
#   %long_to_short   rule name => integer
#   @short_to_long   integer   => rule name
my $short_index;
my %long_to_short;
my @short_to_long;

# The counter is initialised at compile time on purpose.  The main program
# calls exit before control reaches this point in the file, so a run-time
# initialiser placed here would never execute.  Starting from 0 means the
# first index handed out is 1; index 0 must never be used because callers
# test "$long_to_short{$name} || new_short($name)".
BEGIN { $short_index = 0; }

# Allocate the next integer for a rule name that has none yet, record the
# mapping in both directions, and return the new integer.
sub new_short {
  my ($name) = @_;

  $short_index++;
  $long_to_short{$name} = $short_index;
  $short_to_long[$short_index] = $name;
  return $short_index;
}
105
# Pack a list of rule names (passed as an array reference) into a compact
# string of BER-compressed integers.  This takes less than half the memory
# of a join on ',' and does even better compared to Storable::freeze.
# Names seen for the first time are given an integer via new_short().
sub freeze_tests {
  my @names = @{$_[0]};
  my @ids;

  foreach my $name (@names) {
    my $id = $long_to_short{$name};
    $id = new_short($name) unless $id;
    push @ids, $id;
  }

  return pack("w*", @ids);
}
114
# Reverse of the packing step: unpack a string of BER-compressed integers
# and return the rule names they stand for, in the same order.
sub thaw_tests {
  my @names;

  foreach my $id (unpack("w*", $_[0])) {
    push @names, $short_to_long[$id];
  }

  return @names;
}
118
# Read the spam log and then the ham log.  For every accepted message line
# this records one bit in $is_spam and one packed hit list in @tests_hit
# (same index in both), and sets $num_spam, $num_ham and $num_tests.
#
# Dies if either log cannot be opened.
sub readlogs {
  my $count = 0;
  $num_spam = $num_ham = 0;

  # Each log is paired with its class explicitly.  Deriving the class from
  # a filename comparison would count both passes as spam whenever --spam
  # and --ham name the same file.
  foreach my $log ([$opt_spam, 1], [$opt_ham, 0]) {
    my ($file, $isspam) = @{$log};

    # Three-argument open: the filename is never interpreted as a mode.
    open(my $in, '<', $file) or die "Could not open file '$file': $!";

    while (defined(my $msgline = <$in>)) {
      # faster log-reading code from hit-frequencies.
      # the additional split() is for this case:
      # ".  -20 /path  time=1112116980,scantime=0,format=f,reuse=no"
      # in other words, no hits.  split(' ') cannot deal with this
      # correctly, seeing (".", "-20", "/path", "time=...etc").  Work
      # around this by using a literal / / regexp split to discard
      # the csv stuff we don't want out of the rest of the line.

      # $caught is the 1st field of the log line; a blank line leaves it
      # undefined, hence the defined() guard.
      my ($caught, undef, $restofline) = split(' ', $msgline, 3);
      next unless (defined $caught && $caught =~ /^[Y\.]$/ && $restofline);

      # $rules is the 4th field of the log line; it is missing when
      # nothing follows the path.
      my (undef, $rules) = split(/ /, $restofline, 3);
      $rules = '' unless defined $rules;

      # get tests, but ignore unknown tests and subrules
      my @tests;
      foreach my $r (split(/,/, $rules)) {
        my $hits = 1;
        # Support compacted RULE(hitcount) format
        if ($r =~ s/\((\d+)\)$//) {
          $hits = $1;
        }
        next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
        push @tests, ($r) x $hits;
      }

      vec($is_spam, $count, 1) = $isspam;
      if ($isspam) {
        $num_spam++;
      }
      else {
        $num_ham++;
      }

      # inlined for speed.
      # ORIGINAL: $tests_hit[$count] = freeze_tests(\@tests);
      $tests_hit[$count] = pack("w*", map
                  {
                    $long_to_short{$_} || new_short($_);
                  } @tests);

      # TODO: benchmark using foreach(), map() is often slower

      $count++;                  # one more accepted line
    }
    close $in;
  }
  $num_tests = $count;
}
182
# Run ../build/parse-rules-for-masses on the rules directory for the
# selected scoreset, load the Perl file it writes, and copy %rules into
# %allrules.  The loaded file is presumably what fills %rules and %scores;
# verify against parse-rules-for-masses.
#
# Dies if the helper cannot be run or exits non-zero, or if the generated
# file fails to load.
sub readscores {
  print "Reading scores from \"$opt_cffile\"...\n";
  my $tmpf = "./tmp/rules$$.pl";

  # List form of system(): no shell is involved, so a rules path with
  # spaces, quotes or other shell metacharacters is passed through intact.
  my @cmd = ('../build/parse-rules-for-masses',
             '-d', $opt_cffile, '-s', $opt_scoreset, '-o', $tmpf);
  system(@cmd) == 0
    or die "parse-rules-for-masses failed (wait status $?)";

  # Remove the temporary file whether or not loading it succeeds.
  my $loaded = eval { require $tmpf; 1 };
  my $err = $@;
  unlink $tmpf;
  die $err unless $loaded;

  %allrules = %rules;           # ensure it stays global
}
192
# Write the per-rule score table to tmp/scores.data and the matching C
# declarations to tmp/scores.h, then call writetests_c() to write the
# per-message data.  Also fills %rule_to_index.
#
# Dies if either output file cannot be opened or closed cleanly.
sub writescores_c {
  my $output = '';
  my $size = 0;
  my $mutable = 0;

  # jm: now, score-ranges-from-freqs has tflags to work from, so
  # it will always list all mutable tests.

  # Sort order: non-ignored rules before ignored ones, then mutable rules
  # before immutable ones, then by name.  Mutable rules therefore get the
  # lowest indexes.
  my @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
                            ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
                            ($a cmp $b)} (keys %scores);

  # Largest number of non-ignored mutable hits on any one message, plus one.
  my $max_hits_per_msg = 0;
  for (my $file = 0; $file < $num_tests; $file++) {
    my $hits = grep {(! $ignored_rule{$_}) && $mutable_tests{$_}}
                    (thaw_tests($tests_hit[$file]));
    if (($hits + 1) > $max_hits_per_msg) {
      $max_hits_per_msg = $hits + 1;
    }
  }

  foreach my $i (0 .. $#index_to_rule) {
    my $name = $index_to_rule[$i];
    $rule_to_index{$name} = $i;

    # Ignored rules keep their index but are not written out.
    next if $ignored_rule{$name};

    if ($mutable_tests{$name} == 0) {
      # A fixed rule has a zero-width range at its current score.
      $range_lo{$name} = $range_hi{$name} = $scores{$name};
    } else {
      $mutable++;
      if ($range_lo{$name} > $range_hi{$name}) {
        ($range_lo{$name}, $range_hi{$name}) =
          ($range_hi{$name}, $range_lo{$name});
      }
      #$range_lo{$name} ||= 0.1;
      #$range_hi{$name} ||= 1.5;

      # No default score found: start from the middle of the range.
      if ($allrules{$name}->{no_score_found}) {
        $scores{$name} = ($range_hi{$name} + $range_lo{$name}) / 2.0;
      }
    }

    # One record per rule; each line starts with a one-letter tag.
    $output .= ".".$i."\n".
                "n".$name."\n".
                "b".$scores{$name}."\n".
                "m".$mutable_tests{$name}."\n".
                "l".$range_lo{$name}."\n".
                "h".$range_hi{$name}."\n";
    $size++;
  }

  open(my $dat, '>', 'tmp/scores.data')
    or die "Could not write 'tmp/scores.data': $!";
  print {$dat} "N$size\n", "M$mutable\n", # informational only
   $output;
  close($dat) or die "Could not close 'tmp/scores.data': $!";

  open(my $hdr, '>', 'tmp/scores.h')
    or die "Could not write 'tmp/scores.h': $!";
  print {$hdr} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_scores = $size;
int num_mutable = $mutable;
unsigned char is_mutable[$size];
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
char *score_names[$size];
double tmp_scores[$size][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];

double lookup[$mutable];

/* readscores() is defined in tests.h */

";
  close($hdr) or die "Could not close 'tmp/scores.h': $!";

  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}
279
# Write tmp/tests.h (C declarations followed by the C code stored in this
# script's DATA section) and tmp/tests.data (one record per distinct
# message signature).
#
# Argument: $max_hits_per_msg, used as the second dimension of the C
# tests_hit array.
#
# Dies if an output file cannot be opened or closed cleanly, or if a
# message has as many mutable hits as $max_hits_per_msg.
sub writetests_c {
  my $max_hits_per_msg = $_[0];

  my %uniq_files = ();   # first message with a signature => sequence number
  my %count_keys = ();   # signature => number of messages sharing it
  my %file_key = ();     # first message with a signature => the signature

  # Messages with the same class and the same sorted list of rule indexes
  # share a signature and are written only once, with a count.
  for (my $file = 0; $file < $num_tests; $file++)
  {
    my $uniq_key = vec($is_spam, $file, 1) . " ";

    my @good_tests =
      grep {length($_) && (! $ignored_rule{$_}) &&
            (defined($rule_to_index{$_}))} (thaw_tests($tests_hit[$file]));

    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));

    $uniq_key .= join(" ", @good_tests);

    if (exists($count_keys{$uniq_key})) {
      $count_keys{$uniq_key}++;
    } else {
      $count_keys{$uniq_key} = 1;
      $file_key{$file} = $uniq_key;
      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
    }
  }

  my $num_nondup = scalar(keys(%uniq_files));

  open(my $top, '>', 'tmp/tests.h')
    or die "Could not write 'tmp/tests.h': $!";
  print {$top} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_ham = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];
unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];

";
  # Append the C code from the DATA section.  A lexical is used so the
  # global $_ is left alone.
  my $c_code = join('', <DATA>);
  print {$top} $c_code;
  close($top) or die "Could not close 'tmp/tests.h': $!";

  open(my $dat, '>', 'tmp/tests.data')
    or die "Could not write 'tmp/tests.data': $!";

  foreach my $file (sort {$a <=> $b} (keys %uniq_files)) {
    print {$dat} ".".$uniq_files{$file}."\n";

    my $out = '';
    $out .= "s".vec($is_spam, $file, 1)."\n";

    my $base_score = 0;
    my $num_tests_hit = 0;
    foreach my $test (thaw_tests($tests_hit[$file])) {
      if ($test eq '') { next; }

      if ($ignored_rule{$test}) {
        # this is not a log-worthy event anymore, since we have a lot
        # of T_ test rules that are ignored during perceptron runs
        # warn "ignored rule $test got a hit in $file!\n";
        next;
      }

      if (!defined $rule_to_index{$test}) {
        warn "test with no C index: $test\n";
        next;
      }

      if ($mutable_tests{$test}) {
        # Mutable hits are listed individually.
        $num_tests_hit++;
        $out .= "t".$rule_to_index{$test}."\n";

        if ($num_tests_hit >= $max_hits_per_msg) {
          die "Need to increase \$max_hits_per_msg";
        }
      } else {
        # Fixed-score hits are folded into a single base score.
        $base_score += $scores{$test};
      }
    }

    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
    $out .= "c" . $count_keys{$file_key{$file}} . "\n";

    print {$dat} "n".$num_tests_hit."\n".$out;
  }
  close($dat) or die "Could not close 'tmp/tests.data': $!";
}
379
# True (1) when the tflags recorded for $rule in %allrules contain $flag as
# a whole word, case-insensitively; false (0) otherwise, including when the
# rule has no tflags at all.
sub _has_tflag {
  my ($rule, $flag) = @_;
  my $tflags = $allrules{$rule}->{tflags};
  return 0 unless defined $tflags;
  return ($tflags =~ m/\b\Q$flag\E\b/i) ? 1 : 0;
}

# For a rule flagged "nice": turn a score of exactly 1 into -1, or, for a
# T_ rule, a score of exactly 0.01 into -0.01.  Other rules are untouched.
sub _flip_nice_default {
  my ($t) = @_;
  return unless _has_tflag($t, 'nice');

  if ($scores{$t} == 1) {
    $scores{$t} = -1;
  } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/)) {
    $scores{$t} = -0.01;
  }
}

# Read tmp/ranges.data and fill %range_lo, %range_hi, %mutable_tests and
# %ignored_rule; then classify the rules in %allrules that the file did
# not mention; finally pull every non-ignored score into its range.
#
# Each accepted line of ranges.data has four space-separated fields:
# low, high, mutable flag (digits), rule name.  Other lines are skipped.
#
# Dies if tmp/ranges.data is missing or cannot be opened.
sub read_ranges {
  if (!-f 'tmp/ranges.data') {
    die "need to make 'tmp/ranges.data' first";
  }

  # read ranges, and mutableness, from ranges.data.
  open(my $in, '<', 'tmp/ranges.data')
    or die "need to run score-ranges-from-freqs first! (tmp/ranges.data: $!)";

  while (my $line = <$in>) {
    $line =~ /^(\S+) (\S+) (\d+) (\S+)$/ or next;
    my ($lo, $hi, $mut, $t) = ($1 + 0, $2 + 0, $3 + 0, $4);
    $range_lo{$t} = $lo;
    $range_hi{$t} = $hi;

    if ($allrules{$t}->{issubrule}) {
      # warn "$t: ignoring, is sub-rule\n";    # no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    if ($t =~ /^T_/) {
      # warn "$t: ignoring, is T_ test rule\n";    # no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      $range_lo{$t} = 0.01;    # clamp to insignificant range
      $range_hi{$t} = 0.01;
      next;
    }
    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
      warn "$t: ignoring, score and range == 0\n";
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }

    $ignored_rule{$t} = 0;

    # Mutable only if the file says so, the range has non-zero width, and
    # the rule is not flagged userconf.
    if ($mut && ($range_lo{$t} != $range_hi{$t})
        && !_has_tflag($t, 'userconf')) {
      $mutable_tests{$t} = 1;
    } else {
      $mutable_tests{$t} = 0;
    }

    unless ($mutable_tests{$t} || $scores{$t}) {
      warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
      $ignored_rule{$t} = 1;
    }
  }
  close $in;

  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
  foreach my $t (sort keys %allrules) {
    next if ($t eq '_scoreset');
    next if (exists($range_lo{$t}));

    if ($allrules{$t}->{issubrule}) {
      if (!$ignored_rule{$t}) {
        # warn "$t: ignoring, is sub-rule\n";  # no need to warn here
        $ignored_rule{$t} = 1;
      }
      $mutable_tests{$t} = 0;
      next;
    }
    if ($t =~ /^T_/) {
      if (!$ignored_rule{$t}) {
        # warn "$t: ignoring, is T_ test rule\n";  # no need to warn here
        $ignored_rule{$t} = 1;
        $range_lo{$t} = 0.01;    # clamp to insignificant range
        $range_hi{$t} = 0.01;
      }
      $mutable_tests{$t} = 0;
      next;
    }
    $ignored_rule{$t} = 0;
    unless (exists($mutable_tests{$t}) && !_has_tflag($t, 'userconf')) {
      $mutable_tests{$t} = 0;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      if (!$ignored_rule{$t}) {
        warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
        $ignored_rule{$t} = 1;
      }
    }
  }

  foreach my $t (keys %range_lo) {
    next if ($ignored_rule{$t});

    if ($mutable_tests{$t}) {
      _flip_nice_default($t);

      # A mutable score is kept strictly inside its range.
      if ($scores{$t} >= $range_hi{$t}) {
        $scores{$t} = $range_hi{$t} - 0.001;
      } elsif ($scores{$t} <= $range_lo{$t}) {
        $scores{$t} = $range_lo{$t} + 0.001;
      }
      next;
    }

    # Immutable rules: userconf scores are left exactly as they are.
    next if _has_tflag($t, 'userconf');

    # A zero-width range dictates the score.
    if ($range_lo{$t} == $range_hi{$t}) {
      $scores{$t} = $range_lo{$t};
      next;
    }

    _flip_nice_default($t);

    # An immutable score may sit on the edge of its range.
    if ($scores{$t} > $range_hi{$t}) {
      $scores{$t} = $range_hi{$t};
    } elsif ($scores{$t} < $range_lo{$t}) {
      $scores{$t} = $range_lo{$t};
    }
  }
}
506
507
508__DATA__
509
510void loadtests (void) {
511  FILE *fin = fopen ("tmp/tests.data", "r");
512  char buf[256];
513  int file = 0;
514  int tnum = 0;
515
516  while (fgets (buf, 255, fin) != NULL) {
517    char cmd;
518    long arg;
519    float argd;
520
521    cmd = (char) *buf;
522    arg = strtol (buf+1, NULL, 10);
523    argd = (float)strtod (buf+1, NULL);
524
525    if (cmd == '.') {
526      file = arg;
527
528    } else if (cmd == 'n') {
529      tnum = 0;
530      num_tests_hit[file] = arg;
531
532    } else if (cmd == 's') {
533      is_spam[file] = arg;
534
535    } else if (cmd == 'b') {
536      scores[file] = argd;
537
538    } else if (cmd == 't') {
539      tests_hit[file][tnum] = arg; tnum++;
540
541    } else if (cmd == 'c') {
542      tests_count[file] = arg;
543
544    }
545  }
546  fclose(fin);
547
548  printf ("Read test results for %d messages (%d total).\n", file+1,
549	  num_tests);
550}
551
552void loadscores (void) {
553  FILE *fin = fopen ("tmp/scores.data", "r");
554  char buf[256];
555  int snum = 0;
556
557  while (fgets (buf, 255, fin) != NULL) {
558    char cmd;
559    long arg;
560    float argd;
561    char *str, *white;
562
563    cmd = (char) *buf;
564    arg = strtol (buf+1, NULL, 10);
565    argd = (float)strtod (buf+1, NULL);
566    str = buf+1;
567
568    while ((white = strchr (str, '\n')) != NULL) {
569      *white = '\0';
570    }
571
572    if (cmd == '.') {
573      snum = arg;
574
575    } else if (cmd == 'b') {
576      bestscores[snum] = argd;
577
578    } else if (cmd == 'l') {
579      range_lo[snum] = argd;
580
581    } else if (cmd == 'h') {
582      range_hi[snum] = argd;
583
584    } else if (cmd == 'n') {
585      score_names[snum] = strdup (str);	/* leaky leak ;) */
586
587    } else if (cmd == 'm') {
588      is_mutable[snum] = arg;
589    }
590  }
591  fclose(fin);
592
593  printf ("Read scores for %d tests.\n", num_scores);
594}
595