1#!/usr/bin/perl
2my $automcdir = "/usr/local/spamassassin/automc/svn/masses/rule-qa/automc";
3
4###!/usr/bin/perl
5##my $automcdir = "/home/jm/ftp/spamassassin/masses/rule-qa/automc";
6
7use strict;
8use warnings;
9
10my $PERL_INTERP = $^X;
11
12our %FREQS_FILENAMES = (
13    'DETAILS.age' => 'set 0, broken down by message age in weeks',
14    'DETAILS.all' => 'set 0, broken down by contributor',
15    'DETAILS.new' => 'set 0, in aggregate',
16    'NET.age' => 'set 1 (network), by message age in weeks',
17    'NET.all' => 'set 1 (network), by contributor',
18    'NET.new' => 'set 1 (network), in aggregate',
19    'SCOREMAP.new' => 'set 0, score-map',
20    'OVERLAP.new' => 'set 0, overlaps between rules',
21);
22
23my $refresh_cache = ($ARGV[0] and $ARGV[0] eq '-refresh');
24
25my $self = Mail::SpamAssassin::CGI::RuleQaApp->new();
26$self->ui_parse_url_base();
27$self->ui_get_url_switches();
28$self->ui_get_daterev();
29$self->ui_get_rules();
30$self->show_view();
31exit;
32
33# ---------------------------------------------------------------------------
34
35package Mail::SpamAssassin::CGI::RuleQaApp;
36use CGI;
37use CGI::Carp 'fatalsToBrowser';
38use Date::Manip;
39use URI::Escape;
40use Time::Local;
41use POSIX qw();
42use Storable qw(nfreeze thaw);
43use Compress::LZ4 qw(compress decompress);
44
45# daterevs -- e.g. "20060429/r239832-r" -- are aligned to just before
46# the time of day when the mass-check tagging occurs; that's 0850 GMT,
47# so align the daterev to 0800 GMT.
48#
49use constant DATEREV_ADJ => - (8 * 60 * 60);
50
51my $FREQS_LINE_TEMPLATE;
52my $FREQS_LINE_TEXT_TEMPLATE;
53my $FREQS_EXTRA_TEMPLATE;
54our %AUTOMC_CONF;
55
56our @ISA = qw();
57
58sub new {
59  my $class = shift;
60  $class = ref($class) || $class;
61  my $self = { };
62
63  $self->{q} = CGI->new();
64
65  $self->{id_counter} = 0;
66  $self->{include_embedded_freqs_xml} = 1;
67  $self->{cgi_param_order} = [ ];
68  $self->{cgi_params} = { };
69  $self->{now} = time();
70
71  bless ($self, $class);
72
73  # some global configuration
74  $self->set_freqs_templates();
75  $self->read_automc_global_conf();
76
77  die "no directory set in automc config for 'html'" unless $AUTOMC_CONF{html};
78  $self->{cachefile} = "$AUTOMC_CONF{html}/ruleqa.scache";
79
80  $self->{scache_keep_time} = defined $AUTOMC_CONF{scache_keep_time} ?
81    $AUTOMC_CONF{scache_keep_time} : 60*60*24*14; # default 2 weeks
82
83  if ($refresh_cache) {
84    $self->refresh_cache();
85    exit;
86  }
87
88  $self->read_cache();
89  $self->precache_params();
90  return $self;
91}
92
93# ---------------------------------------------------------------------------
94
95sub read_automc_global_conf {
96  my ($self) = @_;
97
98  open (CF, "<$automcdir/config") or return;
99  while(<CF>) { /^(?!#)(\S+)=(\S+)/ and $AUTOMC_CONF{$1} = $2; }
100  close CF;
101}
102
103# ---------------------------------------------------------------------------
104
105sub ui_parse_url_base {
106  my ($self) = @_;
107
108# Allow path info to become CGI-ish parameters.
109# the two parts of path info double as (a) daterev, (b) rulename,
110# (c) "s_detail=1".
111# CGI parameters "daterev", "rule", "s_detail" override them though
112#
113  $self->{url_abs} = $self->{q}->url(-absolute=>1);
114  $self->{url_with_path} = $self->{q}->url(-absolute=>1, -path_info=>1);
115
116# if we have a valid, full URL (non-cgi use), use that instead of
117# the "path_info" one, since CGI.pm will unhelpfully remove duplicate
118# slashes.  this screws up "/FOO" rule grep searches.   Also,
119# fix $self->{url_abs} to be correct for the "entire website is web app" case,
120# as CGI.pm gets that wrong, too!
121
122  if ($self->{url_abs} =~ m,^/(?:20\d|last-net|last-preflight|last-night|\d+-days-ago|today),) {
123    $self->{url_with_path} = $self->{url_abs};
124    $self->{url_abs} = "/";
125  } else {
126    $self->{url_with_path} =~ s,^\Q$self->{url_abs}\E,,;
127  }
128
129  if ($self->{url_with_path} =~ s,^/*([^/]+),,) { $self->add_cgi_path_param("daterev", $1); }
130  if ($self->{url_with_path} =~ s,^/(/?[^/]+),,) { $self->add_cgi_path_param("rule", $1); }
131  if ($self->{url_with_path} =~ s,^/detail,,) { $self->add_cgi_path_param("s_detail", "1"); }
132
133# cgi_url: used in hrefs from the generated document
134  $self->{cgi_url} = $self->{url_abs};
135  $self->{cgi_url} =~ s,/ruleqa/ruleqa$,/ruleqa,s;
136  $self->{cgi_url} ||= '/';
137}
138
139# ---------------------------------------------------------------------------
140
141sub ui_get_url_switches {
142  my ($self) = @_;
143
144  $self->{s} = { };
145
146# selection of what will be displayed.
147  $self->{s}{detail} = $self->get_url_switch('s_detail', 0);
148  $self->{s}{g_over_time} = $self->get_url_switch('s_g_over_time', 0);
149  $self->{s}{corpus} = $self->get_url_switch('s_corpus', 0);
150
151  # "?q=FOO" is a shortcut for "?rule=FOO&s_detail=1"; good for shortcuts
152  my $q = $self->{q}->param("q");
153  if ($q) {
154    $self->add_cgi_param("rule", $q);
155    $self->add_cgi_param("s_detail", 1);
156    $self->{s}{detail} = 1;
157  }
158
159  $self->{s}{xml} = $self->get_url_switch('xml', 0);
160  $self->{include_embedded_freqs_xml} = $self->{s}{xml};
161
162# note: age, new, overlap are all now synonyms for detail ;)
163  if ($self->{s}{age} || $self->{s}{overlap} || $self->{s}{detail}) {
164    $self->{s}{age} = 1;
165    $self->{s}{all} = 1;
166    $self->{s}{new} = 1;
167    $self->{s}{overlap} = 1;
168    $self->{s}{scoremap} = 1;
169  }
170
171  # always show "new" set, though
172  $self->{s}{new} = 1;
173}
174
175sub get_url_switch {
176  my ($self, $name, $defval) = @_;
177  my $val = $self->{q}->param($name);
178  if (!defined $val) { return $defval; }
179  return ($val) ? 1 : 0;
180}
181
182# ---------------------------------------------------------------------------
183
184sub ui_get_daterev {
185  my ($self) = @_;
186
187  # when and what
188  $self->{daterev} = $self->{q}->param('daterev') || '';
189
190  $self->{daterevs} = $self->{cached}->{daterevs};
191
192  # sanitise daterev string
193  if (defined $self->{daterev}) {
194
195    # all of these ignore "b" preflight mass-checks, btw
196    if ($self->{daterev} eq 'last-night') {
197      $self->{daterev} = $self->get_daterev_for_days_ago(1);
198      $self->{q}->param('daterev', $self->{daterev});  # make it absolute
199    }
200    elsif ($self->{daterev} =~ /^(\d+)-days-ago$/) {
201      $self->{daterev} = $self->get_daterev_for_days_ago($1);
202      $self->{q}->param('daterev', $self->{daterev});
203    }
204    elsif ($self->{daterev} eq 'last-preflight') {
205      $self->{daterev} = undef;
206    }
207    elsif ($self->{daterev} eq 'today') {
208      $self->{daterev} = $self->get_daterev_by_date(
209            POSIX::strftime "%Y%m%d", gmtime (($self->{now} + DATEREV_ADJ)));
210      $self->{q}->param('daterev', $self->{daterev});
211    }
212    elsif ($self->{daterev} eq 'last-net') {
213      $self->{daterev} = $self->get_last_net_daterev();
214      $self->{q}->param('daterev', $self->{daterev});
215    }
216    elsif ($self->{daterev} =~ /^(20\d\d[01]\d\d\d)$/) {
217      # a date
218      $self->{daterev} = $self->get_daterev_by_date($1);
219      $self->{q}->param('daterev', $self->{daterev});
220    }
221    elsif ($self->{daterev} =~ /(\d+)[\/-](r\d+)-(\S+)/ && $2) {
222      $self->{daterev} = "$1-$2-$3";
223    } else {
224      # default: last-night's
225      $self->{daterev} = $self->get_daterev_for_days_ago(1);
226    }
227  }
228
229  # turn possibly-empty $self->{daterev} into a real date/rev combo (that exists)
230  $self->{daterev} = $self->date_in_direction($self->{daterev}, 0);
231
232  $self->{daterev_md} = $self->get_daterev_metadata($self->{daterev});
233}
234
235# ---------------------------------------------------------------------------
236
237sub ui_get_rules {
238  my ($self) = @_;
239
240  # which rules?
241  $self->{rule} = $self->{q}->param('rule') || '';
242  $self->{rule} =~ s/[^_0-9a-zA-Z\/]//gs; # Sanitize
243  $self->{rules_all} = 0;
244  $self->{rules_grep} = 0;
245  $self->{nicerule} = $self->{rule};
246  if (!$self->{nicerule}) {
247    $self->{rules_all}++; $self->{nicerule} = 'all rules';
248  }
249  if ($self->{rule} =~ /^\//) {
250    $self->{rules_grep}++; $self->{nicerule} = 'regexp '.$self->{rule};
251  }
252
253  $self->{srcpath} = $self->{q}->param('srcpath') || '';
254  $self->{srcpath} =~ s/[^.,_0-9a-zA-Z\/-]//gs; # Sanitize
255  $self->{mtime} = $self->{q}->param('mtime') || '';
256  $self->{mtime} =~ s/[^0-9]//gs; # Sanitize
257
258  $self->{freqs}{head} = { };
259  $self->{freqs}{data} = { };
260  $self->{freqs}{ordr} = { };
261  $self->{line_counter} = 0;
262}
263
264# ---------------------------------------------------------------------------
265# supported views
266
267sub show_view {
268  my ($self) = @_;
269
270  if ($self->{q}->param('mclog')) {
271    $self->show_mclog($self->{q}->param('mclog'));
272  }
273
274  my $graph = $self->{q}->param('graph');
275  if ($graph) {
276    if ($graph eq 'over_time') { $self->graph_over_time(); }
277    else { die "graph '$graph' unknown"; }
278  }
279  elsif ($self->{q}->param('longdatelist')) {
280    print $self->{q}->header();
281    $self->show_daterev_selector_page();
282  }
283  elsif ($self->{q}->param('shortdatelist')) {
284    $self->{s_shortdatelist} = 1;
285    print $self->{q}->header();
286    $self->show_default_view();
287  }
288  else {
289    print $self->{q}->header();
290    $self->show_default_view();
291  }
292}
293
294# ---------------------------------------------------------------------------
295
296sub show_default_header {
297  my ($self, $title) = @_;
298
299  # replaced with use of main, off-zone host:
300  # <!-- <link href="/ruleqa.css" rel="stylesheet" type="text/css"> <script src="https://ruleqa.spamassassin.org/sorttable.js"></script> -->
301
302  my $hdr = q{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
303                    "https://www.w3.org/TR/html4/strict.dtd">
304  <html xmlns="https://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
305  <head><meta http-equiv="Content-type" content="text/html; charset=utf-8">
306  <link rel="icon" href="https://spamassassin.apache.org/images/favicon.ico">
307  <title>}.$title.q{: SpamAssassin Rule QA</title>
308
309  <link href="https://ruleqa.spamassassin.org/ruleqa.css" rel="stylesheet" type="text/css">
310  <script src="https://ruleqa.spamassassin.org/sorttable.js"></script>
311
312  <script type="text/javascript"><!--
313
314    function hide_header(id) {document.getElementById(id).style.display="none";}
315    function show_header(id) {document.getElementById(id).style.display="block";}
316
317    //-->
318  </script>
319
320  </head><body>
321
322        <table width="100%"> <tr> <td valign=top>
323          <h1>SpamAssassin Rule QA</h1>
324        </td> <td valign=top>
325          <p align="right">
326            <a href="https://wiki.apache.org/spamassassin/RuleQaApp">help</a>
327          </p>
328        </td> </tr> </table>
329
330  };
331  #<br> <a href="https://bbmass.spamassassin.org:8011/">preflight mass-check progress</a>
332  return $hdr;
333}
334
335sub show_default_view {
336  my ($self) = @_;
337
338  my $title;
339  if ($self->{s}{detail}) {
340    $title = "Details for $self->{nicerule} in mass-check $self->{daterev}";
341  } else {
342    $title = "Overview of all rules in mass-check $self->{daterev}";
343  }
344  print $self->show_default_header($title);
345
346  my $tmpl = q{
347
348  <div class='updateform'>
349
350  <form action="!THISURL!" method="GET">
351    <table style="padding-left: 0px" class='datetable'>
352
353        <tr>
354        <th> Commit </th>
355        <th> Preflight Mass-Checks </th>
356        <th> Nightly Mass-Checks </th>
357        <th> Network Mass-Checks </th>
358        </tr>
359
360        <tr>
361        <td colspan="4">
362        <div class='ui_label'>
363          List <a href="/">just current daterev</a> /
364          <a href="!shortdatelist!">all daterevs within 2 days</a> /
365          <a href="!longdatelist!">most recent 1000</a> /
366          <a href="!fulldatelist!">full list</a>
367        </div>
368        </td>
369        </tr>
370
371        !daylinkstable!
372
373    </table>
374
375  <table width='100%'>
376  <tr>
377  <td width='100%'>
378  <div class='ui_label'>
379    Or, <a href="https://wiki.apache.org/spamassassin/DateRev">DateRev</a>
380    to display: <input type='textfield' name='daterev' value="!daterev!">
381  </div>
382  <div class='ui_label'>
383    Or, select a recent nightly mass-check by date by entering
384    'YYYYMMDD' in the DateRev text field for a specific date,
385    or <a href='!daterev=last-night!'>last night's nightly run</a>,
386    <a href='!daterev=today!'>today's nightly run</a>,
387    <a href='!daterev=last-net!'>the most recent --net run</a>, or
388    <a href='!daterev=last-preflight!'>the most recent 'preflight' mass-check</a>.
389  </div>
390  </td>
391  </tr>
392  </table>
393
394    <br/>
395
396    <h4> Which Rules?</h4>
397  <div class='ui_label'>
398    Show only these rules (space separated, or regexp with '/' prefix):<br/>
399  </div>
400    <input type='textfield' size='60' name='rule' value="!rule!"><br/>
401    <br/>
402  <div class='ui_label'>
403    Show only rules from source files whose paths contain this string:<br/>
404  </div>
405    <input type='textfield' size='60' name='srcpath' value="!srcpath!"><br/>
406    <br/>
407
408    <!-- <input type='checkbox' name='s_detail' id='s_detail' !s_detail!><label
409        for='s_detail' class='ui_label'>Display full details: message age in weeks, by contributor, as score-map, overlaps with other rules, freshness graphs
410        </label><br/>
411    <br/> -->
412
413<p>
414  <div class='ui_label'>
415    Show only rules from files modified in the
416    <a href='!mtime=1!'>last day</a>,
417    <a href='!mtime=2!'>2</a>,
418    <a href='!mtime=3!'>3</a>,
419    <a href='!mtime=7!'>last week</a>
420  </div>
421</p>
422
423    <div align='right'><input type='submit' name='g' value="Change"></div>
424  </form>
425  </div>
426
427  };
428
429  my @drs = ();
430  {
431    my $origdr = $self->{daterev} || $self->{daterevs}->[-1];
432    $origdr =~ /^(\d+)[\/-](\S+)[\/-]/;
433    my $date = $1;
434    my $rev = $2;
435
436    my $dr_after = date_offset($date, -2);
437    my $dr_before = date_offset($date, 2);
438
439    my $origidx;
440    foreach my $dr (@{$self->{daterevs}}) {
441      next unless ($dr =~ /^(\d+)[\/-]/);
442      my $date = $1;
443
444      next unless ($date >= $dr_after);
445      next unless ($date <= $dr_before);
446      push @drs, $dr;
447
448      if ($dr eq $origdr) {
449        $origidx = scalar @drs - 1;
450      }
451    }
452
453    # if we're doing the default UI -- ie. looking at a mass-check --
454    # cut it down to just a couple around it, for brevity
455    if (!$self->{s_shortdatelist} && defined($origidx)) {
456      my $i = $origidx;
457      while ($i < @drs-1 && $drs[$i] =~ /^${date}-${rev}-/) { $i++; }
458      my $nextrev = $drs[$i]; $nextrev =~ s/-[a-z]$//;
459      while ($i < @drs-1 && $drs[$i] =~ /^${nextrev}-/) { $i++; }
460      if ($i < @drs-1) { splice @drs, $i; }
461
462      $i = $origidx;
463      while ($i > 0 && $drs[$i] =~ /^${date}-${rev}-/) { $i--; }
464      my $prevrev = $drs[$i]; $prevrev =~ s/-[a-z]$//;
465      while ($i > 0 && $drs[$i] =~ /^${prevrev}-/) { $i--; }
466      if ($i > 0) { splice @drs, 0, $i+1; }
467    }
468  }
469
470  $tmpl =~ s{!daylinkstable!}{
471          $self->get_daterev_html_table(\@drs, 0, 0);
472        }ges;
473
474  my $dranchor = "r".$self->{daterev}; $dranchor =~ s/[^A-Za-z0-9]/_/gs;
475  my $sdlurl = $self->gen_toplevel_url("shortdatelist", 1)."#".$dranchor;
476  my $ldlurl = $self->gen_toplevel_url("longdatelist", 1)."#".$dranchor;
477  my $fdlurl = $self->gen_toplevel_url("longdatelist", 1).'&perpage=999999#'.$dranchor;
478
479  $tmpl =~ s/!longdatelist!/$ldlurl/gs;
480  $tmpl =~ s/!fulldatelist!/$fdlurl/gs;
481  $tmpl =~ s/!shortdatelist!/$sdlurl/gs;
482  $tmpl =~ s/!THISURL!/$self->{cgi_url}/gs;
483  $tmpl =~ s/!daterev!/$self->{daterev}/gs;
484  $tmpl =~ s/!mtime=(.*?)!/
485               $self->gen_switch_url("mtime", $1);
486       /eg;
487  $tmpl =~ s/!daterev=(.*?)!/
488               $self->gen_switch_url("daterev", $1);
489       /eg;
490  $tmpl =~ s/!rule!/$self->{rule}/gs;
491  $tmpl =~ s/!srcpath!/$self->{srcpath}/gs;
492  foreach my $opt (keys %{$self->{s}}) {
493    if ($self->{s}{$opt}) {
494      $tmpl =~ s/!s_$opt!/checked /gs;
495    } else {
496      $tmpl =~ s/!s_$opt!/ /gs;
497    }
498  }
499
500  print $tmpl;
501
502  if (!$self->{s}{detail}) {
503
504    print qq{
505
506      <p class='intro'> <strong>Instructions</strong>: click
507      the rule name to view details of a particular rule. </p>
508
509    };
510  }
511
512  # debug: log the chosen sets parameters etc.
513  if (0) {
514    print "<!-- ",
515               "{s}{new} = $self->{s}{new}\n",
516               "{s}{age} = $self->{s}{age}\n",
517               "{s}{all} = $self->{s}{all}\n",
518               "{s}{overlap} = $self->{s}{overlap}\n",
519               "{s}{scoremap} = $self->{s}{scoremap}\n",
520               "{s}{xml} = $self->{s}{xml}\n",
521       "-->\n";
522  }
523
524  $|=1;                # turn off buffering from now on
525
526  my $single_rule_displayed = ($self->{s}{detail} && !($self->{rules_all} || $self->{rules_grep}));
527
528  # only display code if it's a single rule page
529  if ($single_rule_displayed) {
530    my $rev = $self->get_rev_for_daterev($self->{daterev});
531    my $md = $self->get_rule_metadata($rev);
532    my $src = eval { $md->{rulemds}->{$self->{rule}}->{src} } || '(not found)';
533    my $srchref = "https://svn.apache.org/viewvc/spamassassin/trunk/$src?revision=$rev\&view=markup";
534
535    my $lastmod = '(unknown)';
536    if (defined $md->{rulemds}->{$self->{rule}}->{srcmtime}) {
537      $lastmod = eval {
538        POSIX::strftime "%Y-%m-%d %H:%M:%S UTC", gmtime $md->{rulemds}->{$self->{rule}}->{srcmtime}
539      } || '(unknown)';
540    }
541
542    my $tflags = eval {
543          $md->{rulemds}->{$self->{rule}}->{tf}
544        } || '';
545
546    # a missing string is now represented as {}, annoyingly
547    if (ref $tflags =~ /HASH/ || $tflags =~ /^HASH/) { $tflags = ''; }
548
549    $tflags = ($tflags =~ /\S/) ? ", tflags $tflags" : "";
550
551    my $plinkhref = $self->gen_this_url()."#rulemetadata";
552
553    print qq{
554      <p class="srcinfo">
555        Detailed results for rule
556        <a id="rulemetadata"></a><a href="$plinkhref"><b>$self->{rule}</b></a>,
557        from source file <a href="$srchref">$src</a>$tflags.
558        Source file was last modified on $lastmod.
559      </p>
560    };
561  }
562
563  $self->show_all_sets_for_daterev($self->{daterev}, $self->{daterev});
564
565  # don't show "graph" link unless only a single rule is being displayed
566  if ($single_rule_displayed) {
567    my $graph_on = qq{
568
569      <p><a id="over_time_anchor"></a><a id="overtime"
570        href="}.$self->gen_switch_url("s_g_over_time", "0").qq{#overtime"
571        >Hide graph</a></p>
572      <img src="}.$self->gen_switch_url("graph", "over_time").qq{"
573        width='800' height='815' />
574
575    };
576
577    my $graph_off = qq{
578
579      <p><a id="over_time_anchor"></a><a id="overtime"
580        href="}.$self->gen_switch_url("s_g_over_time", "1").qq{#overtime"
581        >Show graph</a></p>
582
583    };
584
585    print qq{
586
587      <h3 class='graph_title'>Graph, hit-rate over time</h3>
588      }.($self->{s}{g_over_time} ? $graph_on : $graph_off).qq{
589
590      </ul>
591
592    };
593    my $corpus_on = qq{
594
595      <p><a id="corpus_anchor"></a><a id="corpus"
596        href="}.$self->gen_switch_url("s_corpus", "0").qq{#corpus"
597        >Hide report</a></p>
598	<table>
599	  <tr class='freqsextra'>
600	    <td><pre class='perruleextra'>}.read_corpus_file().qq{</pre></td>
601	  </tr>
602	<table>
603
604    };
605
606    my $corpus_off = qq{
607
608      <p><a id="corpus_anchor"></a><a id="corpus"
609        href="}.$self->gen_switch_url("s_corpus", "1").qq{#corpus"
610        >Show report</a></p>
611
612    };
613
614    print qq{
615
616      <h3 class='corpus_title'>Corpus quality</h3>
617      }.($self->{s}{corpus} ? $corpus_on : $corpus_off).qq{
618
619      </ul>
620
621    };
622
623    my @parms = $self->get_params_except(qw(
624            rule s_age s_overlap s_all s_detail
625          ));
626    my $url_back = $self->assemble_url(@parms);
627
628    print qq{
629
630      <div class='ui_label'>
631      <p><a href="$url_back">&lt; Back</a> to overview.</p>
632      </div>
633
634    };
635  }
636
637  print qq{
638
639      <div class='ui_label'>
640      <p>Note: the freqs tables are sortable.  Click on the headers to resort them
641      by that column.  <a
642      href="https://www.kryogenix.org/code/browser/sorttable/">(thanks!)</a></p>
643      </div>
644
645  </body></html>
646
647  };
648
649}
650
651sub date_offset {
652  my ($yyyymmdd, $offset_days) = @_;
653  $yyyymmdd =~ /^(....)(..)(..)$/;
654  my $time = timegm(0,0,0,$3,$2-1,$1);
655  $time += (24 * 60 * 60) * $offset_days;
656  return POSIX::strftime "%Y%m%d", gmtime $time;
657}
658
659sub get_all_daterevs {
660  my ($self) = @_;
661
662  die "no directory set in automc config for 'html'" unless $AUTOMC_CONF{html};
663
664  return sort map {
665      s/^.*\/(\d+)\/(r\d+-\S+)$/$1-$2/; $_;
666    } grep { /\/\d+\/r\d+-\S+$/ && -d $_ } (<$AUTOMC_CONF{html}/2*/r*>);
667}
668
669sub date_in_direction {
670  my ($self, $origdaterev, $dir) = @_;
671
672  my $orig;
673  if ($origdaterev && $origdaterev =~ /^(\d+)[\/-](r\d+-\S+)$/) {
674    $orig = "$1-$2";
675  } else {
676    $orig = $self->{daterevs}->[-1];      # the most recent
677  }
678
679  if (!$orig) {
680    die "no daterev found for $origdaterev, with these options: ".
681               join(' ', @{$self->{daterevs}});
682  }
683
684  my $cur;
685  for my $i (0 .. scalar(@{$self->{daterevs}})) {
686    if (defined $self->{daterevs}->[$i] && $self->{daterevs}->[$i] eq $orig) {
687      $cur = $i; last;
688    }
689  }
690
691  # if it's not in the list, $cur should be the last entry
692  if (!defined $cur) { $cur = scalar(@{$self->{daterevs}})-1; }
693
694  my $new;
695  if ($dir < 0) {
696    if ($cur+$dir >= 0) {
697      $new = $self->{daterevs}->[$cur+$dir];
698    }
699  }
700  elsif ($dir == 0) {
701    $new = $self->{daterevs}->[$cur];
702  }
703  else {
704    if ($cur+$dir <= scalar(@{$self->{daterevs}})-1) {
705      $new = $self->{daterevs}->[$cur+$dir];
706    }
707  }
708
709  if ($new && -d $self->get_datadir_for_daterev($new)) {
710    return $new;
711  }
712
713  return undef;       # couldn't find one
714}
715
716sub get_daterev_for_days_ago {
717  my ($self, $days) = @_;
718
719  # don't use a daterev after (now - 12 hours); that's too recent
720  # to be "last night", for purposes of rule-update generation.
721
722  my $notafter = POSIX::strftime "%Y%m%d",
723        gmtime ((($self->{now} + DATEREV_ADJ) + (12*60*60)) - (24*60*60*$days));
724  return $self->get_daterev_by_date($notafter);
725}
726
727sub get_daterev_by_date {
728  my ($self, $notafter) = @_;
729
730  foreach my $dr (reverse @{$self->{daterevs}}) {
731    my $t = $self->get_daterev_metadata($dr);
732    next unless $t;
733
734    next if ($t->{date} + 0 > $notafter);
735    return $dr if ($t->{tag} eq 'n');
736  }
737  return undef;
738}
739
740sub get_last_net_daterev {
741  my ($self) = @_;
742
743  foreach my $dr (reverse @{$self->{daterevs}}) {
744    my $t = $self->get_daterev_metadata($dr);
745    next unless $t;
746    return $dr if ($t->{includes_net});
747  }
748  return undef;
749}
750
751sub show_all_sets_for_daterev {
752  my ($self, $path, $strdate) = @_;
753
754  $strdate = "mass-check date/rev: $path";
755
756  $self->{datadir} = $self->get_datadir_for_daterev($path);
757
758  $self->showfreqset('DETAILS', $strdate);
759
760  # special case: we only build this for one set, as it's quite slow
761  # to generate
762  $self->{s}{scoremap} and $self->showfreqsubset("SCOREMAP.new", $strdate);
763  $self->{s}{overlap} and $self->showfreqsubset("OVERLAP.new", $strdate);
764}
765
766###########################################################################
767
768sub graph_over_time {
769  my ($self) = @_;
770
771  $self->{datadir} = $self->get_datadir_for_daterev($self->{daterev});
772
773  # logs are named e.g.
774  # /home/automc/corpus/html/20051028/r328993/LOGS.all-ham-mc-fast.log.gz
775
776  # untaint
777  $self->{rule} =~ /([_0-9a-zA-Z]+)/; my $saferule = $1;
778  $self->{datadir} =~ /([-\.\,_0-9a-zA-Z\/]+)/; my $safedatadir = $1;
779
780  # outright block possibly-hostile stuff here:
781  # no "../" path traversal
782  die "forbidden: $safedatadir .." if ($safedatadir =~ /\.\./);
783
784  exec ("$PERL_INTERP $automcdir/../rule-hits-over-time ".
785        "--cgi --scale_period=200 --rule='$saferule' ".
786        "--ignore_older=180 ".
787        "$safedatadir/LOGS.*.log.gz")
788    or die "exec failed";
789}
790
791###########################################################################
792
793sub show_mclog {
794  my ($self, $name) = @_;
795
796  print "Content-Type: text/plain\r\n\r\n";
797
798  $self->{datadir} = $self->get_datadir_for_daterev($self->{daterev});
799
800  # logs are named e.g.
801  # .../20051028/r328993-n/LOGS.all-ham-mc-fast-20051028-r328993-n.log.gz
802
803  # untaint
804  $name =~ /^([-\.a-zA-Z0-9]+)/; my $safename = $1;
805  $self->{rule} =~ /([_0-9a-zA-Z]+)/; my $saferule = $1;
806  $self->{datadir} =~ /([-\.\,_0-9a-zA-Z\/]+)/; my $safedatadir = $1;
807
808  # logs now include the daterev, too
809  $self->{daterev} =~ /([-\.\,_0-9a-zA-Z\/]+)/; my $safedaterev = $1;
810  $safedaterev =~ s/\//-/gs;
811  $safedaterev =~ s/^\d+-//; # no date in logfile
812  $safedaterev =~ s/-n$//;
813
814  # outright block possibly-hostile stuff here:
815  # no "../" path traversal
816  die "forbidden: $safedatadir .." if ($safedatadir =~ /\.\./);
817  die "forbidden: $safedaterev .." if ($safedaterev =~ /\.\./);
818  die "forbidden: $safename .." if ($safename =~ /\.\./);
819
820  my $gzfile = "$safedatadir/LOGS.all-$safename.$safedaterev.log.gz";
821  if (!-f $gzfile) {
822    print "cannot open $gzfile\n";
823    die "cannot open $gzfile";
824  }
825
826  my $lines = 0;
827  open (GZ, "pigz -cd < $gzfile | grep -F '$saferule' |") or die "cannot gunzip '$gzfile'";
828  while (<GZ>) {
829    /^[\.Y]\s+\S+\s+\S+\s+(?:\S*,|)\Q$saferule\E[, ]/ or next;
830
831    # sanitise privacy-relevant stuff
832    s/,mid=<.*>,/,mid=<REMOVED_BY_RULEQA>,/gs;
833
834    print;
835    last if ++$lines >= 100;
836  }
837
838  close GZ;
839  exit;
840}
841
842###########################################################################
843
844sub read_corpus_file {
845  return ''; # THERE IS NO CORPUS.all FILE GENERATED ATM
846
847  $self->{datadir} = $self->get_datadir_for_daterev($self->{daterev});
848  $self->{datadir} =~ /([-\.\,_0-9a-zA-Z\/]+)/; my $safedatadir = $1;
849
850  # outright block possibly-hostile stuff here:
851  # no "../" path traversal
852  die "forbidden: $safedatadir .." if ($safedatadir =~ /\.\./);
853
854  open IN, "<$safedatadir/CORPUS.all" or warn "cannot read $safedatadir/CORPUS.all";
855  my $text = join('', <IN>);
856  close IN;
857  return $text;
858}
859
860###########################################################################
861
862sub showfreqset {
863  my ($self, $type, $strdate) = @_;
864  $self->{s}{new} and $self->showfreqsubset("$type.new", $strdate);
865  $self->{s}{all} and $self->showfreqsubset("$type.all", $strdate);
866  $self->{s}{age} and $self->showfreqsubset("$type.age", $strdate);
867}
868
869sub showfreqsubset {
870  my ($self, $filename, $strdate) = @_;
871  $self->read_freqs_file($filename);
872
873  if ($filename eq 'DETAILS.new') {
874    # report which sets we used
875    $self->summarise_head($self->{freqs}{head}{$filename},
876                    $filename, $strdate, $self->{rule});
877  }
878
879  $self->get_freqs_for_rule($filename, $strdate, $self->{rule});
880}
881
882sub summarise_head {
883  my ($self, $head, $filename, $strdate, $rule) = @_;
884
885  my @mcfiles = ();
886  if ($head =~ /^# ham results used for \S+ \S+ \S+: (.*)$/m) {
887    @mcfiles = split(' ', $1);
888  }
889
890  map {
891    s/^ham-//; s/\.r[0-9]+\.log$//;
892  } @mcfiles;
893
894  my $who = join(', ', @mcfiles);
895
896  print qq{
897
898    <!-- <em>(Using mass-check data from: $who)</em> -->
899
900  };
901}
902
903sub read_freqs_file {
904  my ($self, $key, $refresh) = @_;
905
906  $refresh ||= 0;
907  my $file = $self->{datadir}.$key;
908
909  # storable cache file
910  my $scache = "$file.scache";
911
912  if (!-f $file) {
913    # try gz if not found
914    if (-f "$file.gz") {
915      $file = "$file.gz";
916    } else {
917      warn "missing file $file";
918    }
919  }
920
921  if (-f $scache) {
922    # is fresh?
923    if (mtime($scache) >= mtime($file)) {
924      return if $refresh; # just -refresh
925      eval {
926        $self->{freqs} = thaw(decompress(readfile($scache)));
927      };
928      if ($@ || !defined $self->{freqs}) {
929        warn "cache retrieve failed $scache: $@ $!";
930        # remove bad file
931        unlink($scache);
932      }
933      else {
934        return;
935      }
936    }
937    else {
938      # remove stale cache
939      unlink($scache);
940    }
941  }
942
943  if ($file =~ /\.gz$/) {
944    $file =~ s/'//gs;
945    if (!open (IN, "pigz -cd < '$file' |")) {
946      warn "cannot read $file";
947      return;
948    }
949  }
950  elsif (!open (IN, "<$file")) {
951    warn "cannot read $file";
952  }
953
954  $self->{freqs}{head}{$key}=<IN>;
955  $self->{freqs}{data}{$key} = { };
956  $self->{freqs}{ordr}{$key} = [ ];
957  my $lastrule;
958
959  my $subset_is_user = 0;
960  my $subset_is_age = 0;
961  if ($file =~ /\.age/) { $subset_is_age = 1; }
962  if ($file =~ /\.all/) { $subset_is_user = 1; }
963
964  while (<IN>) {
965    if (/^#/ || / \(all messages/ || /OVERALL%/) {
966      $self->{freqs}{head}{$key} .= $_;
967    }
968    elsif (/^\s*MSEC/) {
969      next;	# just ignored for now
970    }
971    elsif (/^\s*scoremap (.*)$/) {
972      $self->{freqs}{data}{$key}{$lastrule}{scoremap} .= $_;
973    }
974    elsif (/^\s*overlap (.*)$/) {
975      $self->{freqs}{data}{$key}{$lastrule}{overlap} .= $_;
976    }
977    elsif (/ (?:([\+\-])\s+)?(\S+?)(\:\S+)?\s*$/) {
978      my $promochar = $1;
979      $lastrule = $2;
980      my $subset = $3;
981      if ($subset) { $subset =~ s/^://; }
982
983      my $is_testing = ($lastrule =~ /^T_/);
984      my $is_subrule = ($lastrule =~ /^__/);
985
986      # assume a default based on rule name; turn off explicitly
987      # the rules that are not hitting qual thresholds.  list
988      # both testing and core rules.
989      my $promo = (!$is_subrule);
990      if ($promochar eq '-') {
991        $promo = 0;
992      }
993
994      my @vals = split;
995      if (!exists $self->{freqs}{data}{$key}{$lastrule}) {
996        push (@{$self->{freqs}{ordr}{$key}}, $lastrule);
997        $self->{freqs}{data}{$key}{$lastrule} = {
998          lines => [ ]
999        };
1000      }
1001
1002      my $line = {
1003        name => $lastrule,
1004        msecs => $vals[0],
1005        spampc => $vals[1],
1006        hampc => $vals[2],
1007        so => $vals[3],
1008        rank => $vals[4],
1009        score => $vals[5],
1010        username => ($subset_is_user ? $subset : undef),
1011        age => ($subset_is_age ? $subset : undef),
1012        promotable => $promo ? '1' : '0',
1013      };
1014      push @{$self->{freqs}{data}{$key}{$lastrule}{lines}}, $line;
1015    }
1016    elsif (!/\S/) {
1017      # silently ignore empty lines
1018    }
1019    else {
1020      warn "warning: unknown freqs line in $file: '$_'";
1021    }
1022  }
1023  close IN;
1024
1025  if ($refresh && !-f $scache) {
1026    eval {
1027      open (OUT, ">$scache.$$") or die "open failed: $@";
1028      print OUT compress(nfreeze(\%{$self->{freqs}}));
1029      close OUT;
1030    };
1031    if ($@ || !rename("$scache.$$", $scache)) {
1032      warn "cache store failed $scache: $@";
1033      unlink("$scache.$$");
1034    }
1035  }
1036}
1037
1038sub get_freqs_for_rule {
1039  my ($self, $key, $strdate, $ruleslist) = @_;
1040
1041  my $desc = $FREQS_FILENAMES{$key};
1042  my $file = $self->{datadir}.$key;
1043
1044  my $titleplinkold = "$key.$strdate";
1045  $titleplinkold =~ s/[^A-Za-z0-9]+/_/gs;
1046
1047  my $titleplinknew = "t".$key;
1048  $titleplinknew =~ s/[^A-Za-z0-9]+/_/gs;
1049  $titleplinknew =~ s/^tDETAILS_//;
1050
1051  my $titleplinkhref = $self->{q}->url(-base=>1).$self->gen_this_url()."#".$titleplinknew;
1052
1053  my $comment = qq{
1054
1055    <!-- freqs start $key -->
1056    <h3 class='freqs_title'>$desc</h3>
1057    <!-- <h4>$strdate</h4> -->
1058
1059  };
1060
1061  my $heads = $self->sub_freqs_head_line($self->{freqs}{head}{$key});
1062  my $header_context = $self->extract_freqs_head_info($self->{freqs}{head}{$key});
1063
1064  my $headers_id = $key; $headers_id =~ s/[^A-Za-z0-9]/_/gs;
1065
1066  $comment .= qq{
1067
1068    <div id="$headers_id" class='headdiv' style='display: none'>
1069    <p class='headclosep' align='right'><a
1070          href="javascript:hide_header('$headers_id')">[close]</a></p>
1071    <pre class='head'>$heads</pre>
1072    </div>
1073
1074    <div id="txt_$headers_id" class='headdiv' style='display: none'>
1075    <p class='headclosep' align='right'><a
1076          href="javascript:hide_header('txt_$headers_id')">[close]</a></p>
1077    <pre class='head'><<<TEXTS>>></pre>
1078    </div>
1079
1080    <br clear="all"/>
1081    <p class='showfreqslink'><a
1082      href="javascript:show_header('txt_$headers_id')">(pasteable)</a> <a
1083      href="javascript:show_header('$headers_id')">(source details)</a>
1084      <a name='$titleplinknew' href='$titleplinkhref' class='title_permalink'>(#)</a>
1085      <a name='$titleplinkold'><!-- backwards compat --></a>
1086    </p>
1087
1088    <table class='sortable' id='freqs_${headers_id}' class='freqs'>
1089      <tr class='freqshead'>
1090      <th>MSECS</th>
1091      <th>SPAM%</th>
1092      <th>HAM%</th>
1093      <th>S/O</th>
1094      <th>RANK</th>
1095      <th>SCORE</th>
1096      <th>NAME</th>
1097      <th>WHO/AGE</th>
1098    </tr>
1099
1100  };
1101
1102  $ruleslist ||= '';
1103  my @rules = split (' ', $ruleslist);
1104
1105  if (ref $self->{freqs}{ordr}{$key} ne 'ARRAY') {
1106    print qq(
1107      <h3 class='freqs_title'>$desc</h3>
1108      <table><p><i>('$key' not yet available)</i></p></table>
1109    );
1110    return;
1111  }
1112
1113  if ($self->{rules_all}) {
1114    push @rules, @{$self->{freqs}{ordr}{$key}};
1115  }
1116  elsif ($self->{rules_grep} && $ruleslist =~ /^\/(.*)$/) {
1117    my $regexp = $1;
1118    foreach my $r (@{$self->{freqs}{ordr}{$key}}) {
1119      next unless ($r =~/${regexp}/i);
1120      push @rules, $r;
1121    }
1122  }
1123
1124  my $srcpath = $self->{srcpath};
1125  my $mtime = $self->{mtime};
1126  my $no_net_rules = (!$self->{daterev_md}->{includes_net});
1127
1128  if ($srcpath || $mtime) {
1129    my $rev = $self->get_rev_for_daterev($self->{daterev});
1130    my $md = $self->get_rule_metadata($rev);
1131    $md = $md->{rulemds};
1132
1133    # use Data::Dumper; print Dumper $md;
1134
1135    if ($srcpath) {    # bug 4984
1136      @rules = grep {
1137          $md->{$_}->{src} and
1138             ($md->{$_}->{src} =~ /\Q$srcpath\E/);
1139         } @rules;
1140    }
1141
1142    if ($mtime) {      # bug 4985
1143      my $target = $self->{now} - ($mtime * 24 * 60 * 60);
1144      @rules = grep {
1145          $md->{$_}->{srcmtime} and
1146             ($md->{$_}->{srcmtime} >= $target);
1147         } @rules;
1148    }
1149
1150    if ($no_net_rules) {    # bug 5047
1151      @rules = grep {
1152          !$md->{$_}->{tf} or
1153             ($md->{$_}->{tf} !~ /\bnet\b/);
1154         } @rules;
1155    }
1156  }
1157
1158  if ($self->{include_embedded_freqs_xml} == 0) {
1159    $FREQS_LINE_TEMPLATE =~ s/<!--\s+<rule>.*?-->//gs;
1160  }
1161
1162  my $texts = $titleplinkhref." :\n\n".
1163  	      "  MSECS    SPAM%     HAM%     S/O    RANK   SCORE  NAME   WHO/AGE\n";
1164             #       0   0.0216   0.0763   0.221    0.52    2.84  X_IP
1165
1166  foreach my $rule (@rules) {
1167    if ($rule && defined $self->{freqs}{data}{$key}{$rule}) {
1168      $comment .= $self->rule_anchor($key,$rule);
1169      $comment .= $self->output_freqs_data_line($self->{freqs}{data}{$key}{$rule},
1170                \$FREQS_LINE_TEMPLATE,
1171                $header_context);
1172      $texts .= $self->output_freqs_data_line($self->{freqs}{data}{$key}{$rule},
1173                \$FREQS_LINE_TEXT_TEMPLATE,
1174                $header_context);
1175    }
1176    else {
1177      $comment .= $self->rule_anchor($key,$rule);
1178      $comment .= "
1179      <tr><td colspan=8>
1180        (no data found)
1181      </td></tr>
1182      ";
1183      $texts .= "(no data found)\n";
1184    }
1185  }
1186
1187  # insert the text into that template
1188  $comment =~ s/<<<TEXTS>>>/$texts/gs;
1189
1190  print $comment;
1191  print "</table>";
1192}
1193
1194sub rule_anchor {
1195  my ($self, $key, $rule) = @_;
1196  return "<a name='".uri_escape($key."_".$rule)."'></a>".
1197            "<a name='$rule'></a>";
1198}
1199
1200sub sub_freqs_head_line {
1201  my ($self, $str) = @_;
1202  $str = "<em><tt>".($str || '')."</tt></em><br/>";
1203  return $str;
1204}
1205
1206sub set_freqs_templates {
1207  my ($self) = @_;
1208
1209  $FREQS_LINE_TEMPLATE = qq{
1210
1211  <tr class='freqsline_promo[% PROMO %]'>
1212    <td>[% MSECS %]</td>
1213    <td><a class='ftd' [% SPAMLOGHREF %]>[% SPAMPC %]<span>[% SPAMPCDETAIL %]</span></a>
1214    <td><a class='ftd' [% HAMLOGHREF %]>[% HAMPC %]<span>[% HAMPCDETAIL %]</span></a>
1215    <td>[% SO %]</td>
1216    <td>[% RANK %]</td>
1217    <td>[% SCORE %]</td>
1218    <td style='text-align: left'><a href="[% NAMEREF %]">[% NAME %]</a></td>
1219    <td>[% USERNAME %][% AGE %][% CORPUSAHREF %]</td>
1220    <!--
1221      <rule><test>[% NAME %]</test><promo>[% PROMO %]</promo> <spc>[% SPAMPC %]</spc><hpc>[% HAMPC %]</hpc><so>[% SO %]</so> <detailhref esc='1'>[% NAMEREFENCD %]</detailhref></rule>
1222    -->
1223  </tr>
1224
1225  };
1226
1227  $FREQS_LINE_TEXT_TEMPLATE =
1228       qq{[% MSECS %]  [% SPAMPC %]  [% HAMPC %]  }.
1229       qq{[% SO %]  [% RANK %]  [% SCORE %]  }.
1230       qq{[% NAME %] [% USERNAME %][% AGE %] }.
1231       "\n";
1232
1233  $FREQS_EXTRA_TEMPLATE = qq{
1234
1235  <tr class='freqsextra'>
1236    <td colspan=7><pre class='perruleextra'>[% EXTRA %]</pre></td>
1237  </tr>
1238
1239  };
1240
1241  $FREQS_LINE_TEMPLATE =~ s/^\s+//gm;
1242  $FREQS_EXTRA_TEMPLATE =~ s/^\s+//gm;
1243
1244  $FREQS_LINE_TEMPLATE =~ s/\s+/ /gs;       # no <pre> stuff in this, shrink it
1245}
1246
1247sub extract_freqs_head_info {
1248  my ($self, $headstr) = @_;
1249  my $ctx = { };
1250
1251  # extract the "real" numbers of mails for particular classes, for
1252  # some of the report types:
1253  #   0     1000     1000    0.500   0.00    0.00  (all messages):mc-fast
1254  #   0     4983     4995    0.499   0.00    0.00  (all messages):mc-med
1255  #   0     9974     9995    0.499   0.00    0.00  (all messages):mc-slow
1256  #   0    19972    19994    0.500   0.00    0.00  (all messages):mc-slower
1257  # or just:
1258  #   0    35929    35984    0.500   0.00    0.00  (all messages)
1259  while ($headstr =~ m/^
1260        \s+\d+\s+(\d+)\s+(\d+)\s+\S+\s+\S+\s+\S+\s+\(all\smessages\)(|:\S+)\s*
1261        $/gmx)
1262  {
1263    $ctx->{'message_count'.$3} = {
1264          nspam => $1,
1265          nham => $2
1266        };
1267  }
1268
1269  return $ctx;
1270}
1271
1272sub create_spampc_detail {
1273  my ($self, $percent, $isspam, $ctx, $line) = @_;
1274
1275  # optimization: no need to look anything up if it's 0.0000%
1276  # disabled; this info may be pretty useful after all
1277  ## if ($percent == 0.0) { return qq{ 0\&nbsp;messages }; }
1278
1279  my $who = $line->{username} || $line->{age};
1280  my $obj;
1281  if ($who) {
1282    $obj = $ctx->{'message_count:'.$who};
1283  } else {
1284    $obj = $ctx->{'message_count'};
1285  }
1286
1287  if (!$obj) {
1288    return "???";      # no data found for that submitter, stop here!
1289  }
1290
1291  my $outof = ($isspam ? $obj->{nspam} : $obj->{nham});
1292  my $count = int ((($percent/100.0) * $outof) + 0.5); # round to nearest int
1293  return qq{
1294    $count\&nbsp;of\&nbsp;$outof\&nbsp;messages
1295  };
1296}
1297
1298sub create_mclog_href {
1299  my ($self, $percent, $isspam, $ctx, $line) = @_;
1300
1301  # optimization: no need to look anything up if it's 0.0000%
1302  return '' if ($percent == 0.0);
1303
1304  # also, does nothing unless there's a username
1305  my $who = $line->{username};
1306  return '' unless $who;
1307
1308  #my $net = ($self->{daterev_md}->{includes_net}) ? '-net' : '';
1309
1310  my $href = $self->assemble_url(
1311            "mclog=".(($isspam ? "spam" : "ham")."-$who"),
1312            "rule=".$line->{name},
1313           "daterev=".$self->{daterev},
1314            $self->get_params_except(qw( mclog rule s_detail )));
1315
1316  return qq{
1317	href='$href'
1318  };
1319}
1320
1321sub output_freqs_data_line {
1322  my ($self, $obj, $template, $header_context) = @_;
1323
1324  # normal freqs lines, with optional subselector after rule name
1325  my $out = '';
1326  foreach my $line (@{$obj->{lines}}) {
1327
1328    my $detailurl = '';
1329    if (!$self->{s}{detail}) {	# not already in "detail" mode
1330      $detailurl = $self->create_detail_url($line->{name});
1331    }
1332
1333    my $score = $line->{score};
1334    if ($line->{name} =~ /^__/) {
1335      $score = '(n/a)';
1336    }
1337
1338    my $SPAMPCDETAIL = $self->create_spampc_detail(
1339                        $line->{spampc}, 1, $header_context, $line);
1340    my $HAMPCDETAIL = $self->create_spampc_detail(
1341                        $line->{hampc}, 0, $header_context, $line);
1342    my $SPAMLOGHREF = $self->create_mclog_href(
1343                        $line->{spampc}, 1, $header_context, $line);
1344    my $HAMLOGHREF = $self->create_mclog_href(
1345                        $line->{hampc}, 0, $header_context, $line);
1346
1347    $self->process_template($template, {
1348        RULEDETAIL => $detailurl,
1349        MSECS =>  $line->{msecs}+0  ? sprintf("%7s", $line->{msecs})  : "      0",
1350        SPAMPC => $line->{spampc}+0 ? sprintf("%7s", $line->{spampc}) : "      0",
1351        HAMPC =>  $line->{hampc}+0  ? sprintf("%7s", $line->{hampc})  : "      0",
1352        SPAMPCDETAIL => $SPAMPCDETAIL,
1353        HAMPCDETAIL => $HAMPCDETAIL,
1354        SPAMLOGHREF => $SPAMLOGHREF,
1355        HAMLOGHREF => $HAMLOGHREF,
1356        SO => sprintf("%6s", $line->{so}),
1357        RANK => sprintf("%6s", $line->{rank}),
1358        SCORE => sprintf("%6s", $score),
1359        NAME => $line->{name},
1360        NAMEREF => $self->create_detail_url($line->{name}),
1361        NAMEREFENCD => uri_escape($self->create_detail_url($line->{name})),
1362        USERNAME => $line->{username} || '',
1363        CORPUSAHREF => $self->create_corpus_href($line->{name}, $line->{username}),
1364        AGE => $line->{age} || '',
1365        PROMO => $line->{promotable},
1366    }, \$out);
1367
1368    $self->{line_counter}++;
1369  }
1370
1371  # add scoremap using the FREQS_EXTRA_TEMPLATE if it's present
1372  if ($obj->{scoremap}) {
1373    my $smap = $obj->{scoremap} || '';
1374    #   scoremap spam: 16  12.11%  777 ****
1375
1376    $self->process_template(\$FREQS_EXTRA_TEMPLATE, {
1377        EXTRA => $smap,
1378    }, \$out);
1379
1380    $self->generate_scoremap_chart($smap, \$out);
1381  }
1382
1383  # add overlap using the FREQS_EXTRA_TEMPLATE if it's present
1384  if ($obj->{overlap}) {
1385    $self->process_template(\$FREQS_EXTRA_TEMPLATE, {
1386        EXTRA => $self->format_overlap($obj->{overlap} || '')
1387    }, \$out);
1388  }
1389
1390  return $out;
1391}
1392
1393sub generate_scoremap_chart {
1394  my ($self, $smap, $outref) = @_;
1395
1396  my %chart;
1397  foreach my $l (split (/^/m, $smap)) {
1398    #   scoremap spam: 16  12.11%  777 ****
1399    $l =~ /^\s*scoremap\s+(\S+):\s+(\S+)\s+(\S+)\%\s+\d+/
1400            or $$outref .= "chart: failed to parse scoremap line: $l<br>";
1401
1402    my ($type, $idx, $pc) = ($1,$2,$3);
1403    next unless $type;
1404
1405    $chart{$type}{$idx} = $pc;
1406  }
1407
1408  my %uniq=();
1409  my $max_x = 0;
1410  my $max_y = 0;
1411  for my $i (keys %{$chart{'spam'}}, keys %{$chart{'ham'}}) {
1412    next if exists $uniq{$i}; undef $uniq{$i};
1413    if (($chart{'spam'}{$i}||0) > $max_y) { $max_y = $chart{'spam'}{$i}; }
1414    if (($chart{'ham'}{$i}||0)  > $max_y) { $max_y = $chart{'ham'}{$i}; }
1415    if ($i > $max_x) { $max_x = $i; }
1416  }
1417  $max_y ||= 0.001;
1418
1419  # ensure 0 .. $max_x are always set
1420  foreach my $i (0 .. $max_x) { $uniq{$i} = undef; }
1421
1422  my @idxes = sort { $a <=> $b } keys %uniq;
1423  if (!scalar @idxes) {
1424    $max_x = 1; @idxes = ( 0 );
1425  }
1426  my $min_x = $idxes[0];
1427
1428  # normalize to [0,100] and set default to 0
1429  my @ycoords_s = map { sprintf "%.2f", (100/$max_y) * ($chart{'spam'}{$_}||0) } @idxes;
1430  my @ycoords_h = map { sprintf "%.2f", (100/$max_y) * ($chart{'ham'}{$_}||0) } @idxes;
1431  my @xcoords   = map { sprintf "%.2f", (100/($max_x||0.0001)) * $_ } @idxes;
1432
1433  my $xgrid = (100/($max_x||0.0001)) * 5;
1434  my $ygrid = (100/($max_y||0.0001)) * 10;
1435
1436  # https://code.google.com/apis/chart/ , woo
1437  my $chartsetup =
1438      "cht=lxy"             # line chart with x- and y-axis coords
1439      ."\&amp;chs=400x200"
1440      ."\&amp;chd=t:".join(",", @xcoords)."|".join(",", @ycoords_h)
1441                 ."|".join(",", @xcoords)."|".join(",", @ycoords_s)
1442      ."\&amp;chts=ff0000,18"
1443      ."\&amp;chdl=Ham|Spam"
1444      ."\&amp;chco=ff0000,0000ff,00ff00"
1445      ."\&amp;chg=$xgrid,$ygrid"
1446      ."\&amp;chxl=0:|$min_x+points|$max_x+points|1:|0\%|$max_y\%"
1447      ."\&amp;chxt=x,y";
1448
1449  $$outref .= "<div class='scoremap_chart'>
1450       <img src='https://chart.apis.google.com/chart?$chartsetup'
1451         class='scoremap_chart' width='400' height='200' align='right'
1452       /></div>\n";
1453}
1454
1455sub format_overlap {
1456  my ($self, $ovl) = @_;
1457
1458  # list the subrules last; they're noisy and typically nonuseful
1459  my $out_fullrules = '';
1460  my $out_subrules = '';
1461
1462  foreach my $line (split(/^/m, $ovl)) {
1463    my $issubrule = ($line =~ /\d+\%\s+of __/
1464                    || $line =~ /\(meta rule and subrule\)/);
1465
1466    $line =~ s{^(\s+overlap\s+(?:ham|spam):\s+\d+% )(\S.+?)$}{
1467        my $str = "$1";
1468        foreach my $rule (split(' ', $2)) {
1469          if ($rule =~ /^(?:[(]?[a-z]{1,6}[)]?|\d+\%[)]?)$/) {    # "of", "hits" etc.
1470            $str .= $rule." ";
1471          } else {
1472            my $post = '';
1473            $rule =~ s/(\;\s*)$// and $post = $1;
1474            $str .= $self->gen_rule_link($rule,$rule).$post." ";
1475          }
1476        }
1477        $str;
1478      }gem;
1479
1480    if ($issubrule) {
1481      $out_subrules .= $line;
1482    } else {
1483      $out_fullrules .= $line;
1484    }
1485  }
1486
1487  return "OVERLAP WITH FULL RULES:\n".$out_fullrules."\n".
1488        "OVERLAP WITH SUBRULES:\n".$out_subrules;
1489}
1490
1491# get rid of slow, overengineered Template::Toolkit.  This replacement
1492# is extremely simple-minded, but doesn't call time() on every invocation,
1493# which makes things just a little bit faster
1494sub process_template {
1495  my ($self, $tmplref, $keys, $outref) = @_;
1496  my $buf = $$tmplref;
1497  foreach my $k (keys %{$keys}) {
1498    $buf =~ s/\[\% \Q$k\E \%\]/$keys->{$k}/gs;
1499  }
1500  $$outref .= $buf;
1501}
1502
1503sub create_detail_url {
1504  my ($self, $rulename) = @_;
1505
1506  if (!$self->{create_detail_url_template}) {
1507    my @parms = (
1508          $self->get_params_except(qw(
1509           rule s_age s_overlap s_all s_detail daterev
1510         )),
1511         "daterev=".$self->{daterev},
1512         "s_detail=1",
1513         "rule=__create_detail_url_template__",
1514       );
1515    $self->{create_detail_url_template} = $self->assemble_url(@parms);
1516  }
1517
1518  my $ret = $self->{create_detail_url_template};
1519  $rulename = uri_escape($rulename);
1520  $ret =~ s/__create_detail_url_template__/${rulename}/gs;
1521  return $ret;
1522}
1523
1524sub create_corpus_href {
1525  my ($self, $rulename, $username) = @_;
1526
1527  if (!$self->{s}{detail} || !$username) {	# not already in "detail" mode
1528    return '';
1529  }
1530  my $url = $self->assemble_url(
1531	    "s_corpus=1",
1532	    "s_detail=1",
1533            "rule=".$rulename,
1534            "daterev=".$self->{daterev},
1535            $self->get_params_except(qw( mclog rule s_detail s_corpus daterev )))
1536	    ."#corpus";
1537  return "&nbsp;<a href='$url' class='mcloghref'>[corpus]</a>";
1538}
1539
1540sub gen_rule_link {
1541  my ($self, $rule, $linktext) = @_;
1542  return "<a href='".$self->create_detail_url($rule)."'>$linktext</a>";
1543}
1544
1545sub gen_switch_url {
1546  my ($self, $switch, $newval) = @_;
1547
1548  my @parms =  $self->get_params_except($switch);
1549  $newval ||= '';
1550  if (!defined $switch) { warn "switch '$switch'='$newval' undef value"; }
1551  push (@parms,
1552        $switch."=".$newval,
1553        "daterev=".$self->{daterev}
1554       );
1555  return $self->assemble_url(@parms);
1556}
1557
1558sub gen_this_url {
1559  my ($self) = @_;
1560  my @parms =  $self->get_params_except("__nonexistent__");
1561  return $self->assemble_url(@parms);
1562}
1563
1564sub gen_toplevel_url {
1565  my ($self, $switch, $newval) = @_;
1566
1567  my @parms =  $self->get_params_except($switch, qw(
1568              rule s_age s_overlap s_all s_detail daterev
1569            ));
1570  $newval ||= '';
1571  if (!defined $switch) { warn "switch '$switch'='$newval' undef value"; }
1572  push (@parms, $switch."=".$newval);
1573  return $self->assemble_url(@parms);
1574}
1575
1576sub get_rev_for_daterev {
1577  my ($self, $daterev) = @_;
1578  # '20060120-r370897-b'
1579  $daterev =~ /-r(\d+)-/ or return undef;
1580  return $1;
1581}
1582
1583sub assemble_url {
1584  my ($self, @orig) = @_;
1585
1586  # e.g. https://buildbot.spamassassin.org/ruleqa?
1587  #     daterev=20060120-r370897-b&rule=T_PH_SEC&s_detail=1
1588
1589  # we support special treatment for 'daterev' and 'rule'
1590  my %path = ();
1591  my @parms = ();
1592  $path{daterev} = '';
1593  $path{rule} = '';
1594  foreach my $p (@orig) {
1595    # some ignored parameter noise, from the form
1596    if (!$p) { next; }
1597    elsif ($p =~ /^keywords=$/) { next; }
1598    elsif ($p =~ /^g=Change$/) { next; }
1599    # default values that can be omitted
1600    elsif ($p =~ /^srcpath=$/) { next; }
1601    elsif ($p =~ /^mtime=$/) { next; }
1602    # the ones we can put in the path
1603    elsif ($p =~ /^rule=(.*)$/) { $path{rule} = $1; }
1604    elsif ($p =~ /^daterev=(.*)$/) { $path{daterev} = $1; }
1605    elsif ($p =~ /^s_detail=(?:1|on)$/) { $path{s_detail} = 1; }
1606    # and all the rest
1607    else { push (@parms, $p); }
1608  }
1609
1610  # ensure "/FOO" rule greps are encoded as "%2FFOO"
1611  $path{rule} =~ s,^/,\%2F,;
1612
1613  my $url = $self->{cgi_url}.
1614        ($path{daterev}  ? '/'.$path{daterev} : '').
1615        ($path{rule}     ? '/'.$path{rule}    : '').
1616        ($path{s_detail} ? '/detail'          : '').
1617        '?'.join('&', sort @parms);
1618
1619  # no need for a trailing ? if there were no parms
1620  $url =~ s/\?$//;
1621
1622  # ensure local URL (not starting with "//", which confuses Firefox)
1623  $url =~ s,^/+,/,;
1624
1625  # now, a much more readable
1626  # https://ruleqa.spamassassin.org/
1627  #      20060120-r370897-b/T_PH_SEC/detail
1628
1629  return $url;
1630}
1631
1632sub precache_params {
1633  my ($self) = @_;
1634
1635  @{$self->{cgi_param_order}} = $self->{q}->param();
1636  foreach my $k (@{$self->{cgi_param_order}}) {
1637    next unless defined ($k);
1638    next if ($k eq 'q');        # a shortcut, ignore for future refs
1639    my $v = $self->{q}->param($k);
1640    if (!defined $v) { $v = ''; }
1641    $k =~ s/[<>]//gs;
1642    $v =~ s/[<>]//gs;
1643    $self->{cgi_params}{$k} = uri_escape($k)."=".uri_escape($v);
1644  }
1645}
1646
1647sub add_cgi_path_param {        # assumes already escaped unless $not_escaped
1648  my ($self, $k, $v, $not_escaped) = @_;
1649  $k =~ s/[<>]//gs;
1650  $v =~ s/[<>]//gs;
1651  if (!defined $self->{cgi_params}{$k}) {
1652    push (@{$self->{cgi_param_order}}, $k);
1653  }
1654  if ($not_escaped) {
1655    $self->{cgi_params}{$k} = uri_escape($k)."=".uri_escape($v);
1656    $self->{q}->param(-name=>$k, -value=>$v);
1657  } else {
1658    $self->{cgi_params}{$k} = $k."=".$v;
1659    $self->{q}->param(-name=>$k, -value=>uri_unescape($v));
1660  }
1661}
1662
1663sub add_cgi_param {     # a variant for unescaped data
1664  my ($self, $k, $v) = @_;
1665  return $self->add_cgi_path_param($k, $v, 1);
1666}
1667
1668sub get_params_except {
1669  my ($self, @excepts) = @_;
1670
1671  my @str = ();
1672  foreach my $p (@{$self->{cgi_param_order}}) {
1673    foreach my $skip (@excepts) {
1674      next unless defined $skip && defined $self->{cgi_params}{$p};
1675      goto nextnext if
1676            ($skip eq $p || $self->{cgi_params}{$p} =~ /^\Q$skip\E=/);
1677    }
1678    push (@str, $self->{cgi_params}{$p});
1679nextnext: ;
1680  }
1681  @str;
1682}
1683
1684sub get_datadir_for_daterev {
1685  my ($self, $npath) = @_;
1686  $npath =~ s/-/\//;
1687  return $AUTOMC_CONF{html}."/".$npath."/";
1688}
1689
1690sub get_daterev_metadata {
1691  my ($self, $dr) = @_;
1692  return $self->{cached}->{daterev_metadata}->{$dr} || { };
1693}
1694
1695sub get_mds_as_text {
1696  my ($self, $mclogmds) = @_;
1697
1698  # 'mclogmd' => [
1699  #    {
1700  #      'daterev' => '20060430/r398298-n',
1701  #      'mcstartdate' => '20060430T122405Z',
1702  #      'mtime' => '1146404744',
1703  #      'rev' => '398298',
1704  #      'file' => 'ham-cthielen.log',
1705  #      'fsize' => '3036336'
1706  #    }, [...]
1707
1708  # $mds_as_text = XMLout($mclogmds);   # debug, as XML
1709
1710  # use Data::Dumper; $mds_as_text = Dumper($mclogmds); # debug, as perl data
1711
1712  my $all = '';
1713  if (ref $mclogmds && $mclogmds->{mclogmd}) {
1714    foreach my $f (@{$mclogmds->{mclogmd}}) {
1715      my $started = $f->{mcstartdate};
1716      my $subtime = POSIX::strftime "%Y%m%dT%H%M%SZ", gmtime $f->{mtime};
1717
1718      $all .= qq{
1719
1720        <p> <b>$f->{file}</b>:<br />
1721            started:&nbsp;$started;<br />
1722            submitted:&nbsp;$subtime;<br />
1723            size: $f->{fsize} bytes
1724        </p>
1725
1726      };
1727    }
1728  }
1729
1730  my $id = "mclogmds_".($self->{id_counter}++);
1731
1732  return qq{
1733
1734    <a href="javascript:show_header('$id')">[+]</a>
1735    <div id='$id' class='mclogmds' style='display: none'>
1736      <p class='headclosep' align='right'><a
1737          href="javascript:hide_header('$id')">[-]</a></p>
1738
1739      $all
1740    </div>
1741
1742  };
1743}
1744
1745sub get_daterev_code_description {
1746  my ($self, $dr) = @_;
1747  my $meta = $self->get_daterev_metadata($dr);
1748
1749  return qq{
1750
1751    <td class="daterevcommittd" width='30%'>
1752    <span class="daterev_code_description">
1753      <p>
1754	<a title="$meta->{author}: $meta->{drtitle} ($meta->{cdate})"
1755          href="!drhref!"><strong>$meta->{rev}</strong>: $meta->{cdate}</a>
1756      </p>
1757      <p><div class='commitmsgdiv'>
1758	$meta->{author}: $meta->{drtitle}
1759      </div></p>
1760    </span>
1761    </td>
1762
1763  };
1764}
1765
1766sub get_daterev_masscheck_description {
1767  my ($self, $dr) = @_;
1768  my $meta = $self->get_daterev_metadata($dr);
1769  my $net = $meta->{includes_net} ? "[net]" : "";
1770
1771  my $isvishtml = '';
1772  my $isvisclass = '';
1773  if ($self->{daterev} eq $dr) {
1774    $isvishtml = '<b>(Viewing)</b>';
1775    $isvisclass = 'mcviewing';
1776  }
1777
1778  my $mds_as_text = '';
1779  if ($meta->{mclogmds}) {
1780    $mds_as_text = $self->get_mds_as_text($meta->{mclogmds}) || '';
1781  }
1782
1783  my $submitters = $meta->{submitters};
1784  # remove daterevs, they're superfluous in this table
1785  $submitters =~ s/\.\d+-r\d+-[a-z]\b//gs;
1786
1787  return qq{
1788
1789    <td class="daterevtd $isvisclass" width='20%'>
1790    <span class="daterev_masscheck_description $isvisclass">
1791      <p>
1792        <a name="$meta->{dranchor}"
1793          href="!drhref!"><strong>
1794            <span class="dr">$dr</span>
1795          </strong></a> $isvishtml
1796      </p><p>
1797        <em><span class="mcsubmitters">$submitters</span></em>
1798        $mds_as_text</x>
1799      </p>
1800      <!-- <span class="mctype">$meta->{type}</span> -->
1801      <!-- <span class="mcwasnet">$net</span> -->
1802      <!-- <span class="mcauthor">$meta->{author}</span> -->
1803      <!-- <span class="date">$meta->{date}</span> -->
1804      <!-- tag=$meta->{tag} -->
1805    </span>
1806    </td>
1807
1808  };
1809}
1810
1811sub get_daterev_html_table {
1812  my ($self, $daterev_list, $reverse) = @_;
1813
1814  my $rows = { };
1815  foreach my $dr (@{$daterev_list}) {
1816    next unless $dr;
1817    my $meta = $self->get_daterev_metadata($dr);
1818
1819    my $colidx;
1820    my $type = $meta->{type};
1821    if ($type eq 'preflight') {
1822      $colidx = 0;
1823    } elsif ($type eq 'net') {
1824      $colidx = 2;
1825    } else {
1826      $colidx = 1;
1827    }
1828
1829    # use the daterev number as the row key
1830    $rows->{$meta->{daterev}} ||= [ ];
1831    $rows->{$meta->{daterev}}->[$colidx] = $meta;
1832  }
1833
1834  my @rowkeys = sort keys %{$rows};
1835  if ($reverse) { @rowkeys = reverse @rowkeys; }
1836
1837  my @html = ();
1838  foreach my $rowdate (@rowkeys) {
1839    my $row = $rows->{$rowdate};
1840
1841    my $meta;
1842    foreach my $col (0 .. 2) {
1843      if ($row->[$col]) {
1844	$meta = $row->[$col];
1845	last;
1846      }
1847    }
1848
1849    next unless $meta;		# no entries in the row
1850
1851    push @html, qq{
1852
1853            <tr class='daterevtr'>
1854
1855      }, $self->gen_daterev_html_commit_td($meta);
1856
1857    foreach my $col (0 .. 2) {
1858      $meta = $row->[$col];
1859      if ($meta) {
1860        push @html, $self->gen_daterev_html_table_td($meta);
1861      }
1862      else {
1863        push @html, qq{
1864
1865                <td class='daterevtdempty' width='20%'></td>
1866
1867          };
1868      }
1869    }
1870    push @html, qq{
1871
1872            </tr>
1873
1874      };
1875  }
1876
1877  return join '', @html;
1878}
1879
1880sub gen_daterev_html_commit_td {
1881  my ($self, $meta) = @_;
1882
1883  my $dr = $meta->{daterev};
1884  my @parms = $self->get_params_except(qw(
1885          daterev longdatelist shortdatelist
1886        ));
1887  my $drhref = $self->assemble_url("daterev=".$dr, @parms);
1888
1889  my $text = $self->get_daterev_code_description($dr) || '';
1890  $text =~ s/!drhref!/$drhref/gs;
1891
1892  return $text;
1893}
1894
1895sub gen_daterev_html_table_td {
1896  my ($self, $meta) = @_;
1897
1898  my $dr = $meta->{daterev};
1899  my @parms = $self->get_params_except(qw(
1900          daterev longdatelist shortdatelist
1901        ));
1902  my $drhref = $self->assemble_url("daterev=".$dr, @parms);
1903
1904  my $text = $self->get_daterev_masscheck_description($dr) || '';
1905  $text =~ s/!drhref!/$drhref/gs;
1906  return $text;
1907}
1908
1909sub show_daterev_selector_page {
1910  my ($self) = @_;
1911
1912  my $title = "Rule QA: all recent mass-check results";
1913  print $self->show_default_header($title);
1914
1915  my $max_listings = $self->{q}->param('perpage') || 1000;	# def. 1000
1916  my @drs = @{$self->{daterevs}};
1917  if ($max_listings > 0 && scalar @drs > $max_listings) {
1918    splice @drs, 0, -$max_listings;
1919  }
1920
1921  print qq{
1922
1923    <h3> All Mass-Checks </h3>
1924    <br/> <a href='#net' name='net'>#</a>
1925
1926    <div class='updateform'>
1927      <table style="padding-left: 0px" class='datetable'>
1928      <tr>
1929      <th> Commit </th>
1930      <th> Preflight Mass-Checks </th>
1931      <th> Nightly Mass-Checks </th>
1932      <th> Network Mass-Checks </th>
1933      </tr>
1934
1935  }.  $self->get_daterev_html_table(\@drs, 1, 1);
1936}
1937
1938
1939sub get_rule_metadata {
1940  my ($self, $rev) = @_;
1941
1942  if ($self->{rule_metadata}->{$rev}) {
1943    return $self->{rule_metadata}->{$rev};
1944  }
1945
1946  my $meta = $self->{rule_metadata}->{$rev} = { };
1947  $meta->{rev} = $rev;
1948
1949  my $fname = $AUTOMC_CONF{html}."/rulemetadata/$rev/rulemetadata.xml";
1950  if (-f $fname) {
1951    eval {
1952      $meta->{rulemds} = parse_rulemetadataxml($fname);
1953      #use Data::Dumper; print STDERR Dumper $meta->{rulemds};
1954
1955      # '__CTYPE_HTML' => {
1956      # 'srcmtime' => '1154348696',
1957      # 'src' => 'rulesrc/core/20_ratware.cf'
1958      # },
1959
1960    };
1961
1962    if ($@ || !defined $meta->{rulemds}) {
1963      warn "rev rulemetadata.xml read failed: $@";
1964    } else {
1965      return $meta;
1966    }
1967  }
1968
1969  # if that failed, just return empty
1970  if (1) {
1971    print "<!-- WARN: Failed to read rule metadata file: $fname -->\n";
1972  }
1973
1974  $meta->{rulemds} = {};
1975  return $meta;
1976}
1977
1978# ---------------------------------------------------------------------------
1979
1980sub read_cache {
1981  my ($self) = @_;
1982  if (!-f $self->{cachefile}) {
1983    warn "missing $self->{cachefile}, run -refresh";
1984    return;
1985  }
1986  eval {
1987    $self->{cached} = thaw(decompress(readfile($self->{cachefile})));
1988  };
1989  if ($@ || !defined $self->{cached}) {
1990    warn "cannot read $self->{cachefile}: $@ $!";
1991  }
1992}
1993
1994# ---------------------------------------------------------------------------
1995
1996sub refresh_cache {
1997  my ($self) = @_;
1998
1999  $self->{cached} = { };
2000
2001  # all known date/revision combos.
2002  @{$self->{cached}->{daterevs}} = $self->get_all_daterevs();
2003
2004  foreach my $dr (@{$self->{cached}->{daterevs}}) {
2005    $self->refresh_daterev_metadata($dr);
2006  }
2007
2008  eval {
2009    open (OUT, ">".$self->{cachefile}.".$$") or die "open failed: $@";
2010    print OUT compress(nfreeze(\%{$self->{cached}}));
2011    close OUT;
2012  };
2013  if ($@ || !rename($self->{cachefile}.".$$", $self->{cachefile})) {
2014    unlink($self->{cachefile}.".$$");
2015    die "cannot write $self->{cachefile}: $@";
2016  }
2017}
2018
2019sub refresh_daterev_metadata {
2020  my ($self, $dr) = @_;
2021
2022  my $meta = $self->{cached}->{daterev_metadata}->{$dr} = { };
2023  $meta->{daterev} = $dr;
2024
2025  my $dranchor = "r".$dr; $dranchor =~ s/[^A-Za-z0-9]/_/gs;
2026  $meta->{dranchor} = $dranchor;
2027
2028  $dr =~ /^(\d+)-r(\d+)-(\S+)$/;
2029  my $date = $1;
2030  my $rev = $2;
2031  my $tag = $3;
2032
2033  my $datadir = $self->get_datadir_for_daterev($dr);
2034  $self->{datadir} = $datadir;
2035
2036  # update scache for all freqfiles
2037  foreach my $f (keys %FREQS_FILENAMES) {
2038    my $file = -f $datadir.$f ? $datadir.$f :
2039      -f $datadir."$f.gz" ? $datadir."$f.gz" : undef;
2040    if (defined $file) {
2041      if (time - mtime($file) <= $self->{scache_keep_time}) {
2042        $self->read_freqs_file($f, 1);
2043      }
2044      else {
2045        # remove too old cachefiles
2046        $file =~ s/\.gz$//;
2047        unlink("$file.scache");
2048      }
2049    }
2050  }
2051
2052  my $fname = "$datadir/info.xml";
2053  my $fastfname = "$datadir/fastinfo.xml";
2054
2055  if (-f $fname && -f $fastfname) {
2056    eval {
2057      my $fastinfo = parse_infoxml($fastfname);
2058      $meta->{rev} = $rev;
2059      $meta->{tag} = $tag;
2060      $meta->{mclogmds} = $fastinfo->{mclogmds};
2061      $meta->{includes_net} = $fastinfo->{includes_net};
2062      $meta->{date} = $fastinfo->{date};
2063      $meta->{submitters} = $fastinfo->{submitters};
2064
2065      if ($rev ne $fastinfo->{rev}) {
2066	warn "dr and fastinfo disagree: ($rev ne $fastinfo->{rev})";
2067      }
2068
2069      my $type;
2070      if ($meta->{tag} && $meta->{tag} eq 'b') {
2071        $type = 'preflight';
2072      } elsif ($meta->{includes_net}) {
2073        $type = 'net';
2074      } else {
2075        $type = 'nightly';
2076      }
2077      $meta->{type} = $type;
2078
2079
2080      my $info = parse_infoxml($fname);
2081      # use Data::Dumper; print Dumper $info;
2082      my $cdate = $info->{checkin_date};
2083      $cdate =~ s/T(\S+)\.\d+Z$/ $1/;
2084
2085      my $drtitle = ($info->{msg} ? $info->{msg} : '');
2086      $drtitle =~ s/[\"\'\&\>\<]/ /gs;
2087      $drtitle =~ s/\s+/ /gs;
2088      $drtitle =~ s/^(.{0,160}).*$/$1/gs;
2089
2090      $meta->{cdate} = $cdate;
2091      $meta->{drtitle} = $drtitle;
2092      $meta->{author} = $info->{author};
2093    };
2094
2095    if ($@) {
2096      warn "daterev info.xml: $@";
2097    }
2098
2099    return $meta;
2100  }
2101
2102  # if that failed, just use the info that can be gleaned from the
2103  # daterev itself.
2104  my $drtitle = "(no info)";
2105
2106  {
2107      $meta->{rev} = $rev;
2108      $meta->{cdate} = $date;
2109      $meta->{drtitle} = '(no info available yet)';
2110      $meta->{includes_net} = 0;
2111      $meta->{date} = $date;
2112      $meta->{submitters} = "";
2113      $meta->{author} = "nobody";
2114      $meta->{tag} = $tag;
2115      $meta->{type} = 'preflight';  # default
2116  }
2117}
2118
2119# return file modification time
2120sub mtime {
2121  return (stat $_[0])[9];
2122}
2123
2124# slurp'a'file
2125sub readfile {
2126  my $file = shift;
2127  my $str;
2128  eval {
2129    open(IN, $file) or die $@;
2130    { local($/); $str = <IN> }
2131    close(IN);
2132  };
2133  if ($@) {
2134    warn "read failed $file: $@";
2135    return undef;
2136  }
2137  return $str;
2138}
2139
2140# fast simple xml parser, since we know what to expect
2141sub parse_rulemetadataxml {
2142  my $file = shift;
2143  my $xmlstr = readfile($file);
2144  my $md = {};
2145  while ($xmlstr =~ m!<rulemetadata>(.*?)</rulemetadata>!gs) {
2146    my $rmd = $1;
2147    my %attrs;
2148    while ($rmd =~ m!<([A-Za-z0-9_]{1,50})>(.*?)</\1>!gs) {
2149      $attrs{$1} = $2;
2150    }
2151    if (defined $attrs{name}) {
2152      foreach (keys %attrs) {
2153        next if $_ eq 'name';
2154        $md->{$attrs{name}}->{$_} = $attrs{$_};
2155      }
2156    }
2157  }
2158  if (!%$md) {
2159    warn "xml parse failed $file";
2160  }
2161  return $md;
2162}
2163
2164sub parse_infoxml {
2165  my $file = shift;
2166  my $xmlstr = readfile($file);
2167  my $opt = {};
2168  if ($xmlstr =~ m!<opt ([^>]*?)>!s) {
2169    my $optstr = $1;
2170    my %attrs;
2171    while ($optstr =~ m!\b([A-Za-z0-9_]{1,50})="([^"]*)"!gs) {
2172      $opt->{$1} = $2;
2173    }
2174  }
2175  if (!%$opt) {
2176    warn "xml parse failed $file";
2177  }
2178  return $opt;
2179}
2180
2181=cut
2182
2183to install, add this line to httpd.conf:
2184
2185  ScriptAlias /ruleqa "/path/to/spamassassin/automc/ruleqa.cgi"
2186
2187
2188