#!/usr/bin/perl -w
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

logs-to-c - Convert a mass-check log into perceptron format

=head1 SYNOPSIS

logs-to-c [options]

 Options:
    -c,--cffile=path      Use path as the rules directory
    -s,--scoreset=n       Use scoreset n
    --spam=file           Location of spam mass-check log
    --ham=file            Location of ham mass-check log

=head1 DESCRIPTION

B<logs-to-c> will read the mass-check logs F<spam.log> and F<ham.log>
or as specified by the B<--spam> and B<--ham> options, and convert it
into the format needed by the perceptron.  This is a format that is
simple for the perceptron to parse, but is not very readable to
humans.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<perceptron(1)>

=cut

use Getopt::Long qw(:config auto_help bundling);
use strict;
use warnings;

# Option defaults.  GetOptions() is called without explicit destinations,
# so it stores each value in the package variable $opt_<primary name>.
our $opt_cffile = "../rules";
our $opt_spam = 'spam.log';
our $opt_ham = 'ham.log';
our $opt_scoreset = 0;

# "c" and "s" are declared as aliases so that the short options shown in
# the SYNOPSIS are accepted; with "bundling" a single-dash option is only
# recognised when it is declared explicitly.
GetOptions("cffile|c=s", "spam=s", "ham=s", "scoreset|s=i")
  or die "logs-to-c: invalid command line options, try --help\n";

my $is_spam = '';               # vec aligned with @tests_hit
my @tests_hit = ();             # one packed list of rule hits per message
my %mutable_tests = ();         # rule name => 1 if its score may be tuned

# %rules and %scores are package variables so that the rules file loaded
# by readscores() can fill them in; %allrules is our own copy of %rules.
our (%rules, %allrules, %scores);

my (%ignored_rule, %range_lo, %range_hi);
my %rule_to_index;

# State for freezing/thawing test lines in as little space as possible.
# This could be faster, but improves memory usage by a phenomenal
# amount over arrayrefs or strings of comma-separated-values.
#
# These must be initialised before the main flow below runs (anything
# placed after "exit 0" is never executed).  The counter is incremented
# before use, so the first short index handed out is 1; index 0 is never
# used, which the "|| new_short()" lookups rely on.
my $short_index = 0;
my %long_to_short;
my @short_to_long;

my ($num_tests, $num_spam, $num_ham);

readscores();

print "Reading per-message hit stat logs and scores...\n";

read_ranges();
readlogs();

print "Writing logs and current scores as C code...\n";
writescores_c();

# show memory usage before we exit
# print "Running \"ps aux\"...\n";
# open(PS, "ps aux|");
# while(<PS>) {
#   print if $. == 1 || /\b$$\b/;
# }
# close(PS);

exit 0;

# Allocate the next short integer index for the rule name in $_[0],
# record the mapping in both directions and return the index.
sub new_short {
  $short_index++;
  $long_to_short{$_[0]} = $short_index;
  $short_to_long[$short_index] = $_[0];
  return $short_index;
}

# Pack the rule names in the array ref $_[0] into a compact string of
# BER-compressed short indexes.
# uses less than half the memory of join on ',' and even better
# compared to Storable::freeze
sub freeze_tests {
  return pack("w*", map
	      {
		$long_to_short{$_} || new_short($_);
	      } @{$_[0]})
}

# Inverse of freeze_tests(): returns the list of rule names stored in the
# packed string $_[0].
sub thaw_tests {
  return map { $short_to_long[$_] } unpack("w*", $_[0]);
}

# Read the spam log and then the ham log.  For every message line, store
# the packed list of scored, non-subrule hits in @tests_hit and the class
# bit in $is_spam; sets $num_spam, $num_ham and $num_tests.
sub readlogs {
  my $count = 0;
  $num_spam = $num_ham = 0;

  foreach my $file ($opt_spam, $opt_ham) {
    open (my $in, '<', $file) or die "Could not open file '$file': $!";

    my $isspam = ($file eq $opt_spam);

    while (defined(my $msgline = <$in>)) {
      # faster log-reading code from hit-frequencies.
      # the additional split() is for this case:
      # ".  -20 /path  time=1112116980,scantime=0,format=f,reuse=no"
      # in other words, no hits.  split(' ') cannot deal with this
      # correctly, seeing (".", "-20", "/path", "time=...etc").  Work
      # around this by using a literal / / regexp split to discard
      # the csv stuff we don't want out of the rest of the line.

      # $caught is the 1st parameter of the log line, $rules the 4th
      my ($caught, undef, $restofline) = split(' ', $msgline, 3);
      next unless (defined $caught && $caught =~ /^[Y\.]$/ && $restofline);
      my (undef, $rules) = split(/ /, $restofline, 3);
      $rules = '' unless defined $rules;   # line ended after the path

      # get tests, but ignore unknown tests and subrules
      my @tests;
      foreach my $r (split(/,/, $rules)) {
	my $hits = 1;
	# Support compacted RULE(hitcount) format
	if ($r =~ s/\((\d+)\)$//) {
	  $hits = $1;
	}
	next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
	push @tests, $r for (1 .. $hits);
      }

      if ($isspam) {
	$num_spam++;
	vec($is_spam, $count, 1) = 1;
      }
      else {
	$num_ham++;
	vec($is_spam, $count, 1) = 0;
      }

      # inlined for speed.
      # ORIGINAL: $tests_hit[$count] = freeze_tests(\@tests);
      $tests_hit[$count] = pack("w*", map
				{
				  $long_to_short{$_} || new_short($_);
				} @tests);

      # TODO: benchmark using foreach(), map() is often slower

      $count++;			# increment line
    }
    close $in;
  }
  $num_tests = $count;
}

# Run parse-rules-for-masses on the rules directory to generate a
# temporary Perl file, load that file (it is expected to fill in %rules
# and %scores) and keep a copy of %rules in %allrules.
sub readscores {
  print "Reading scores from \"$opt_cffile\"...\n";
  my $tmpf = "./tmp/rules$$.pl";

  # list form: no shell is involved, so any character is safe in the path
  my @cmd = ("../build/parse-rules-for-masses",
	     "-d", $opt_cffile, "-s", $opt_scoreset, "-o", $tmpf);
  system(@cmd) == 0
    or die "'@cmd' failed (wait status $?)\n";

  # remove the temporary file whether or not loading it worked
  my $loaded = eval { require $tmpf; 1 };
  my $err = $@;
  unlink $tmpf;
  die $err unless $loaded;

  %allrules = %rules;           # ensure it stays global
}

# Assign a C index to every rule in %scores, write tmp/scores.data and
# tmp/scores.h for the rules that are not ignored, then hand over to
# writetests_c().
sub writescores_c {
  my $output = '';
  my $size = 0;
  my $mutable = 0;

  # jm: now, score-ranges-from-freqs has tflags to work from, so
  # it will always list all mutable tests.

  # Non-ignored rules sort first and, among those, mutable rules first,
  # so the rules written below get the indexes 0 .. $size-1 with the
  # mutable ones in 0 .. $mutable-1.  Rules that were never classified
  # count as not ignored and not mutable.
  my @index_to_rule = sort {
      (($ignored_rule{$a} || 0) <=> ($ignored_rule{$b} || 0)) ||
      (($mutable_tests{$b} || 0) <=> ($mutable_tests{$a} || 0)) ||
      ($a cmp $b)
    } (keys %scores);

  # one more than the largest number of mutable hits on any one message
  my $max_hits_per_msg = 0;
  for my $file (0 .. $num_tests - 1) {
    my @hits =
     grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (thaw_tests($tests_hit[$file]));
    my $needed = scalar(@hits) + 1;
    if ($needed > $max_hits_per_msg) {
      $max_hits_per_msg = $needed;
    }
  }

  for my $i (0 .. $#index_to_rule) {
    my $name = $index_to_rule[$i];
    $rule_to_index{$name} = $i;

    if ($ignored_rule{$name}) { next; }

    if (!$mutable_tests{$name}) {
      # immutable: the range collapses to the current score
      $range_lo{$name} = $range_hi{$name} = $scores{$name};
    } else {
      $mutable++;
      if ($range_lo{$name} > $range_hi{$name}) {
	($range_lo{$name},$range_hi{$name}) =
	 ($range_hi{$name},$range_lo{$name});
      }
      #$range_lo{$name} ||= 0.1;
      #$range_hi{$name} ||= 1.5;

      # no default score found?  start from the middle of the range and
      # let the optimiser adjust it.  this seems to help avoid a load of
      # really good rules getting 1.0 scores
      if ($allrules{$name}->{no_score_found}) {
	$scores{$name} = ($range_hi{$name} + $range_lo{$name}) / 2.0;
      }
    }

    $output .= ".".$i."\n".
                "n".$name."\n".
                "b".$scores{$name}."\n".
                "m".($mutable_tests{$name} ? 1 : 0)."\n".
                "l".$range_lo{$name}."\n".
                "h".$range_hi{$name}."\n";
    $size++;
  }

  open (my $dat, '>', 'tmp/scores.data')
    or die "Could not write 'tmp/scores.data': $!";
  print {$dat} "N$size\n", "M$mutable\n", # informational only
   $output;
  close ($dat) or die "Could not close 'tmp/scores.data': $!";

  open (my $out, '>', 'tmp/scores.h')
    or die "Could not write 'tmp/scores.h': $!";
  print {$out} <<"END_SCORES_H";

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_scores = $size;
int num_mutable = $mutable;
unsigned char is_mutable[$size];
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
char *score_names[$size];
double tmp_scores[$size][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];

double lookup[$mutable];

/* readscores() is defined in tests.h */

END_SCORES_H
  close ($out) or die "Could not close 'tmp/scores.h': $!";

  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}

# Write tmp/tests.h (array declarations followed by the C code from the
# DATA section) and tmp/tests.data.  Messages with the same class and the
# same set of hits are written once, with a repeat count.
#   $_[0]: value for max_hits_per_msg, as computed by writescores_c()
sub writetests_c {
  my $max_hits_per_msg = $_[0];

  my(%uniq_files) = ();   # first message of each group => group number
  my(%count_keys) = ();   # group key => number of messages in the group
  my(%file_key) = ();     # first message of each group => group key

  for my $file (0 .. $num_tests - 1)
  {
    my $uniq_key = vec($is_spam, $file, 1) . " ";

    my (@good_tests) =
     grep {length($_) && (! $ignored_rule{$_}) &&
	    (defined($rule_to_index{$_}))} (thaw_tests($tests_hit[$file]));

    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));

    $uniq_key .= join(" ",@good_tests);

    if (exists($count_keys{$uniq_key})) {
      $count_keys{$uniq_key}++;
    } else {
      $count_keys{$uniq_key} = 1;
      $file_key{$file} = $uniq_key;
      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
    }
  }

  my $num_nondup = scalar(keys(%uniq_files));

  open (my $top, '>', 'tmp/tests.h')
    or die "Could not write 'tmp/tests.h': $!";
  print {$top} <<"END_TESTS_H";

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_ham = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];
unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];

END_TESTS_H
  # append the C loader functions stored after __DATA__
  print {$top} <DATA>;
  close ($top) or die "Could not close 'tmp/tests.h': $!";

  open (my $dat, '>', 'tmp/tests.data')
    or die "Could not write 'tmp/tests.data': $!";

  foreach my $file (sort {$a <=> $b} (keys %uniq_files)) {
    print {$dat} ".".$uniq_files{$file}."\n";

    my $out = '';
    $out .= "s".vec($is_spam, $file, 1)."\n";

    my $base_score = 0;
    my $num_tests_hit = 0;
    foreach my $test (thaw_tests($tests_hit[$file])) {
      if ($test eq '') { next; }

      if ($ignored_rule{$test}) {
        # this is not a log-worthy event anymore, since we have a lot
        # of T_ test rules that are ignored during perceptron runs
        # warn "ignored rule $test got a hit in $file!\n";
        next;
      }

      if (!defined $rule_to_index{$test}) {
	warn "test with no C index: $test\n";
	next;
      }

      if ($mutable_tests{$test}) {
      $num_tests_hit++;
      $out .= "t".$rule_to_index{$test}."\n";

	if ($num_tests_hit >= $max_hits_per_msg) {
	  die "Need to increase \$max_hits_per_msg";
	}
      } else {
	$base_score += $scores{$test};
      }
    }

    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
    $out .= "c" . $count_keys{$file_key{$file}} . "\n";

    # the "n" line goes first: the C loader resets its hit position on it
    print {$dat} "n".$num_tests_hit."\n".$out;
  }
  close ($dat) or die "Could not close 'tmp/tests.data': $!";
}

# Return the tflags string recorded for rule $_[0], or '' if there is
# none.  Does not create an %allrules entry for an unknown rule.
sub rule_tflags {
  my $info = $allrules{$_[0]};
  return '' unless ($info && defined $info->{tflags});
  return $info->{tflags};
}

# True if rule $_[0] is marked as a sub-rule in %allrules.
sub is_subrule {
  my $info = $allrules{$_[0]};
  return ($info && $info->{issubrule}) ? 1 : 0;
}

# A "nice" rule whose score is exactly 1 (or exactly 0.01 for a T_ rule)
# has that score negated.
sub negate_default_nice_score {
  my ($t) = @_;
  return unless (rule_tflags($t) =~ m/\bnice\b/i);

  if ($scores{$t} == 1) {
    $scores{$t} = -1;
  } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/)) {
    $scores{$t} = -0.01;
  }
}

# Read tmp/ranges.data and decide, for every rule, whether it is ignored
# and whether it is mutable (%ignored_rule, %mutable_tests), record its
# score range (%range_lo, %range_hi) and pull the scores of the rules
# that have a range into that range.
sub read_ranges {
  if (!-f 'tmp/ranges.data') {
    die "need to make 'tmp/ranges.data' first";
  }

  # read ranges, and mutableness, from ranges.data.
  open (my $in, '<', 'tmp/ranges.data')
   or die "need to run score-ranges-from-freqs first!";

  while (my $line = <$in>) {
    # fields: low high mutable-flag rulename
    $line =~ /^(\S+) (\S+) (\d+) (\S+)$/ or next;
    my ($lo, $hi, $mut, $t) = ($1+0, $2+0, $3+0, $4);
    $range_lo{$t} = $lo;
    $range_hi{$t} = $hi;

    if (is_subrule($t)) {
      # warn "$t: ignoring, is sub-rule\n";	# no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    if ($t =~ /^T_/) {
      # warn "$t: ignoring, is T_ test rule\n";	# no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      $range_lo{$t} = 0.01;     # clamp to insignificant range
      $range_hi{$t} = 0.01;
      next;
    }
    if (($lo == $hi) && (! $lo)) {
      warn "$t: ignoring, score and range == 0\n";
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }

    $ignored_rule{$t} = 0;

    # mutable only if flagged so, the range is not a single point and
    # the rule is not a userconf rule
    if ($mut && ($lo != $hi) && (rule_tflags($t) !~ m/\buserconf\b/i)) {
      $mutable_tests{$t} = 1;
    } else {
      $mutable_tests{$t} = 0;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
      $ignored_rule{$t} = 1;
    }
  }
  close $in;

  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
  foreach my $t (sort keys %allrules) {
    next if ($t eq '_scoreset');
    next if (exists($range_lo{$t}));

    if (is_subrule($t)) {
      # warn "$t: ignoring, is sub-rule\n";   # no need to warn here
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    if ($t =~ /^T_/) {
      if (!$ignored_rule{$t}) {
        # warn "$t: ignoring, is T_ test rule\n";       # no need to warn here
        $ignored_rule{$t} = 1;
        $range_lo{$t} = 0.01;   # clamp to insignificant range
        $range_hi{$t} = 0.01;
      }
      $mutable_tests{$t} = 0;
      next;
    }
    $ignored_rule{$t} = 0;
    unless (exists($mutable_tests{$t}) &&
	    (rule_tflags($t) !~ m/\buserconf\b/i)) {
      $mutable_tests{$t} = 0;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
      $ignored_rule{$t} = 1;
    }
  }

  # adjust the current scores of the rules that have a range
  foreach my $t (keys %range_lo) {
    next if ($ignored_rule{$t});
    if ($mutable_tests{$t}) {
      negate_default_nice_score($t);
      # keep a mutable score strictly inside its range
      if ($scores{$t} >= $range_hi{$t}) {
	$scores{$t} = $range_hi{$t} - 0.001;
      } elsif ($scores{$t} <= $range_lo{$t}) {
	$scores{$t} = $range_lo{$t} + 0.001;
      }
    } else {
      if (rule_tflags($t) =~ m/\buserconf\b/i) {
	next;
      } elsif ($range_lo{$t} == $range_hi{$t}) {
	$scores{$t} = $range_lo{$t};
	next;
      }
      negate_default_nice_score($t);
      # an immutable score may sit on the edge of its range
      if ($scores{$t} > $range_hi{$t}) {
	$scores{$t} = $range_hi{$t};
      } elsif ($scores{$t} < $range_lo{$t}) {
	$scores{$t} = $range_lo{$t};
      }
    }
  }
}


__DATA__

void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;

  if (fin == NULL) {
    perror ("tmp/tests.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);

    if (cmd == '.') {
      file = arg;

    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;

    } else if (cmd == 's') {
      is_spam[file] = arg;

    } else if (cmd == 'b') {
      scores[file] = argd;

    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg; tnum++;

    } else if (cmd == 'c') {
      tests_count[file] = arg;

    }
  }
  fclose(fin);

  printf ("Read test results for %d messages (%d total).\n", file+1,
	  num_tests);
}

void loadscores (void) {
  FILE *fin = fopen ("tmp/scores.data", "r");
  char buf[256];
  int snum = 0;

  if (fin == NULL) {
    perror ("tmp/scores.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;
    char *str, *white;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);
    str = buf+1;

    while ((white = strchr (str, '\n')) != NULL) {
      *white = '\0';
    }

    if (cmd == '.') {
      snum = arg;

    } else if (cmd == 'b') {
      bestscores[snum] = argd;

    } else if (cmd == 'l') {
      range_lo[snum] = argd;

    } else if (cmd == 'h') {
      range_hi[snum] = argd;

    } else if (cmd == 'n') {
      score_names[snum] = strdup (str);	/* leaky leak ;) */

    } else if (cmd == 'm') {
      is_mutable[snum] = arg;
    }
  }
  fclose(fin);

  printf ("Read scores for %d tests.\n", num_scores);
}