1 /*****************************************************************************
2
3 NAME:
4 bogotune.c -- determine optimal parameters for bogofilter
5
6 AUTHORS:
7 Greg Louis - perl version
8 David Relson - C version
9 Matthias Andree - options parser usability
10
11 ******************************************************************************/
12
13 /* To allow tuning of large corpora, the memory use must be minimized
14 ** for each messages. Given the wordlist in ram, foreach token of a
15 ** test message bogotune only needs to know its spam and ham counts.
16 **
17 ** 1. external wordlist ("-d") flag
18 ** a. read wordlist.db
19 ** b. read messages for test set
20 ** 1. lookup words in wordlist
21 ** 2. discard words
22 ** c. replace wordhashs with wordcnts
23 ** d. de-allocate resident wordlist
24 **
25 ** 2. internal wordlist ("-D") flag
26 ** a. read all messages
27 ** b. distribute messages
28 ** 1. proper pct to wordlist
29 ** 2. proper pct to test set
30 ** 2a. create wordprops from wordlists
31 ** c. replace wordprops with wordcnts
32 ** d. de-allocate resident wordlist
33 */
34
35 /* Limitations:
36 **
37 ** If all the input messages are in msg-count format,
38 ** bogotune will use default ROBX value since an external
39 ** wordlist is needed to compute a real robx value.
40 */
41
42 #include "common.h"
43
44 #include <math.h>
45 #include <ctype.h>
46 #include <errno.h>
47 #include <stdlib.h>
48 #include <sys/types.h>
49 #include <sys/stat.h>
50
51 #include "bogotune.h"
52
53 #include "bogoconfig.h"
54 #include "bogoreader.h"
55 #include "bool.h"
56 #include "collect.h"
57 #include "datastore.h"
58 #include "longoptions.h"
59 #include "msgcounts.h"
60 #include "mime.h"
61 #include "mxcat.h"
62 #include "paths.h"
63 #include "robx.h"
64 #include "score.h"
65 #include "token.h"
66 #include "tunelist.h"
67 #include "wordhash.h"
68 #include "wordlists.h"
69 #include "xmalloc.h"
70 #include "xstrdup.h"
71
72 #undef HAM_CUTOFF /* ignore value in score.h */
73 #undef SPAM_CUTOFF /* ignore value in score.h */
74
75 #define TEST_COUNT 500u /* minimum allowable message count */
76 #define LIST_COUNT 2000u /* minimum msg count in tunelist */
77 #define PREF_COUNT 4000u /* preferred message count */
78 #define LARGE_COUNT 40000u
79
80 #define HAM_CUTOFF 0.10
81 #define MIN_HAM_CUTOFF 0.10 /* minimum final ham_cutoff */
82 #define MAX_HAM_CUTOFF 0.45 /* maximum final ham_cutoff */
83 #define MIN_CUTOFF 0.55 /* minimum cutoff for set_thresh() */
84 #define WARN_MIN 0.50 /* warning minimum for set_thresh() */
85 #define WARN_MAX 0.99 /* warning maximum for set_thresh() */
86 #define MAX_CUTOFF 0.99 /* maximum cutoff for set_thresh() */
87 #define SPAM_CUTOFF 0.975
88 #define FP_CUTOFF 0.999
89 #define SCAN_CUTOFF 0.500 /* skip scans if cutoff is less or equal */
90
91 #define CHECK_PCT 0.01 /* for checking high scoring non-spam
92 ** and low scoring spam */
93
94 /* bogotune's default parameters */
95 #define DEFAULT_ROBS ROBS /* 0.0178 */
96 #define DEFAULT_ROBX ROBX /* 0.5200 */
97 #define DEFAULT_MIN_DEV 0.02
98
99 /* coarse scan parms */
100 #define MD_MIN_C 0.06 /* smallest min_dev to test */
101 #define MD_MAX_C 0.38 /* largest min_dev to test */
102 #define MD_DLT_C 0.08 /* increment */
103
104 /* fine scan parms */
105 #define MD_MIN_F 0.02
106 #define MD_MAX_F MD_MAX_C+MD_DLT_F
107 #define MD_DLT_F 0.014
108
109 /* robx limits */
110 #define RX_MIN 0.4
111 #define RX_MAX 0.6
112
113 enum e_verbosity {
114 SUMMARY = 1, /* summarize main loop iterations */
115 TIME = 2, /* include timing info */
116 PARMS = 3, /* print parameter sets (rs, md, rx) */
117 SCORE_SUMMARY = 5, /* verbosity level for printing scores */
118 SCORE_DETAIL /* verbosity level for printing scores */
119 };
120
121 typedef enum e_ds_loc {
122 DS_NONE = 0, /* datastore locn not specified */
123 DS_ERR = 1, /* error in datastore locn spec */
124 DS_DSK = 2, /* datastore on disk */
125 DS_RAM = 4 /* datastore in ram */
126 } ds_loc;
127
128 #define MOD(n,m) ((n) - (floor((n)/(m)))*(m))
129 #define ROUND(m,n) floor((m)*(n)+.5)/(n)
130
131 #define MIN(n) ((n)/60)
132 #define SECONDS(n) ((n) - MIN(n)*60)
133
134 #define KEY(r) ((r)->fn + (((r)->co > 0.5) ? (r)->co : 0.99))
135 #define ESF_SEL(a,b) (!esf_flag ? (a) : (b))
136
137 /* Global Variables */
138
139 const char *progname = "bogotune";
140 static char *ds_path; /* directory */
141 static ds_loc ds_flag = DS_NONE;
142 static void *env = NULL;
143
144 static bool esf_flag = true; /* test ESF factors if true */
145 static bool exit_zero = false; /* non-error exits zero */
146 static const char *bogolex_file = NULL; /* non-NULL if creating msg-count output */
147 static word_t *w_msg_count;
148
149 static uint message_count;
150
151 static wordhash_t *train;
152 static tunelist_t *ns_and_sp;
153 static tunelist_t *ns_msglists, *sp_msglists;
154
155 static flhead_t *spam_files, *ham_files;
156
157 typedef struct {
158 uint cnt;
159 double *data;
160 } data_t;
161
162 static data_t *rsval;
163 static data_t *rxval;
164 static data_t *mdval;
165 static data_t *spexp;
166 static data_t *nsexp;
167
168 static uint target;
169 static uint ns_cnt, sp_cnt;
170
171 static double spex, nsex;
172 static double check_percent; /* initial check for excessively high/low scores */
173 static double *ns_scores;
174 static double *sp_scores;
175 static double user_robx = 0.0; /* from option '-r value' */
176 static uint coerced_target = 0; /* user supplied with '-T value' */
177
178 static uint ncnt, nsum; /* neighbor count and sum - for gfn() averaging */
179
180 #undef TEST
181
182 #ifdef TEST
183 uint test = 0;
184 #endif
185
186 bool fMakeCheck = false; /* allows quick & dirty regression testing */
187 uint cMakeCheck = 50; /* ... for 50 cycles */
188
189 const char *logtag = NULL;
190
191 /* Function Declarations */
192
193 /* Function Definitions */
194
bt_trap(void)195 static void bt_trap(void) {}
196
get_bool(const char * name,const char * arg)197 static bool get_bool(const char *name, const char *arg)
198 {
199 bool b = str_to_bool(arg);
200 if (DEBUG_CONFIG(2))
201 fprintf(dbgout, "%s -> %s\n", name,
202 b ? "Yes" : "No");
203 return b;
204 }
205
get_cnt(double fst,double lst,double amt)206 static int get_cnt(double fst, double lst, double amt)
207 {
208 int cnt = (fabs(lst - fst) + EPS) / (fabs(amt) - EPS) + 1;
209 return cnt;
210 }
211
seq_by_amt(double fst,double lst,double amt)212 static data_t *seq_by_amt(double fst, double lst, double amt)
213 {
214 uint i;
215 data_t *val = (data_t *)xcalloc(1, sizeof(data_t));
216
217 val->cnt = get_cnt(fst, lst, amt);
218 val->data = (double *)xcalloc(val->cnt, sizeof(double));
219
220 for (i = 0; i < val->cnt; i += 1)
221 val->data[i] = fst + i * amt;
222
223 return val;
224 }
225
seq_canonical(double fst,double amt)226 static data_t *seq_canonical(double fst, double amt)
227 {
228 uint c;
229 float v;
230 uint i = 0;
231 data_t *val = (data_t *)xcalloc(1, sizeof(data_t));
232
233 val->data = (double *)xcalloc(5, sizeof(double));
234
235 fst = max(fst, RX_MIN);
236 fst = min(fst, RX_MAX);
237 val->data[i++] = fst;
238 for (c = 1; c <= 2; c += 1) {
239 v = fst - amt * c; if (v >= RX_MIN) val->data[i++] = v;
240 v = fst + amt * c; if (v <= RX_MAX) val->data[i++] = v;
241 }
242 val->cnt = i;
243
244 return val;
245 }
246
seq_by_pow(double fst,double lst,double amt)247 static data_t *seq_by_pow(double fst, double lst, double amt)
248 {
249 uint i;
250 data_t *val = (data_t *)xcalloc(1, sizeof(data_t));
251
252 val->cnt = get_cnt(fst, lst, amt);
253 val->data = (double *)xcalloc(val->cnt, sizeof(double));
254
255 for (i = 0; i < val->cnt; i += 1)
256 val->data[i] = pow(10, fst + i * amt);
257
258 return val;
259 }
260
data_free(data_t * val)261 static void data_free(data_t *val)
262 {
263 xfree(val->data); xfree(val);
264 }
265
init_coarse(double _rx)266 static void init_coarse(double _rx)
267 {
268 rxval = seq_canonical(_rx, 0.05);
269 rsval = seq_by_pow(0.0, -2.0, -1.0);
270 mdval = seq_by_amt(MD_MIN_C, MD_MAX_C, MD_DLT_C);
271 spexp = seq_by_amt(ESF_SEL(0.0,2.0), ESF_SEL(0.0,20.0), 3.0);
272 nsexp = seq_by_amt(ESF_SEL(0.0,2.0), ESF_SEL(0.0,20.0), 3.0);
273 }
274
init_fine(double _rs,double _md,double _rx,double _spex,double _nsex)275 static void init_fine(double _rs, double _md, double _rx,
276 double _spex, double _nsex)
277 {
278 double s0, s1;
279
280 s0 = max(log10(_rs) - 0.5, -2.0);
281 s1 = min(log10(_rs) + 0.5, 0.0);
282
283 rsval = seq_by_pow(s1, s0, -0.25);
284
285 s0 = max(_md - 0.042, MD_MIN_F);
286 s1 = min(_md + 0.042, MD_MAX_F);
287
288 mdval = seq_by_amt(s0, s1, MD_DLT_F);
289 rxval = seq_canonical(_rx, 0.013);
290
291 s0 = max(_spex - 1.5, 0.0);
292 s1 = min(_spex + 1.5, 20.0);
293
294 spexp = seq_by_amt(s0, ESF_SEL(s0,s1), 0.5);
295
296 s0 = max(_nsex - 1.5, 0.0);
297 s1 = min(_nsex + 1.5, 20.0);
298
299 nsexp = seq_by_amt(s0, ESF_SEL(s0,s1), 0.5);
300 }
301
print_parms(const char * label,const char * format,data_t * data)302 static void print_parms(const char *label, const char *format, data_t *data)
303 {
304 uint i;
305 printf(" %s: %2u ", label, data->cnt);
306 for (i = 0; i < data->cnt; i += 1) {
307 printf("%s", (i == 0) ? " (" : ", ");
308 printf(format, data->data[i]); /* RATS: ignore */
309 }
310 printf(")\n"); fflush(stdout);
311 }
312
print_all_parms(uint r_count)313 static void print_all_parms(uint r_count)
314 {
315 if (verbose >= PARMS) {
316 print_parms("rsval", "%6.4f", rsval);
317 print_parms("rxval", "%5.3f", rxval);
318 print_parms("mdval", "%5.3f", mdval);
319 print_parms("spexp", "%5.2f", spexp);
320 print_parms("nsexp", "%5.2f", nsexp);
321 }
322 if (verbose >= TIME)
323 printf("cnt: %u (rs: %u, rx: %u, md: %u spex: %u, nsex: %u)\n",
324 r_count, rsval->cnt, rxval->cnt, mdval->cnt, spexp->cnt, nsexp->cnt);
325 }
326
compare_ascending(const void * const ir1,const void * const ir2)327 static int compare_ascending(const void *const ir1, const void *const ir2)
328 {
329 const double d1 = *(const double *)ir1;
330 const double d2 = *(const double *)ir2;
331
332 if (d1 - d2 > 0.0) return 1;
333 if (d2 - d1 > 0.0) return -1;
334
335 return 0;
336 }
337
compare_descending(const void * const ir1,const void * const ir2)338 static int compare_descending(const void *const ir1, const void *const ir2)
339 {
340 const double d1 = *(const double *)ir1;
341 const double d2 = *(const double *)ir2;
342
343 if (d1 - d2 > 0.0) return -1;
344 if (d2 - d1 > 0.0) return +1;
345
346 return 0;
347 }
348
349 #define CO(r) ((r->co > 0.5) ? r->co : 0.99)
350
compare_results(const void * const ir1,const void * const ir2)351 static int compare_results(const void *const ir1, const void *const ir2)
352 {
353 result_t const *r1 = (result_t const *)ir1;
354 result_t const *r2 = (result_t const *)ir2;
355
356 if (KEY(r1) > KEY(r2)) return 1;
357 if (KEY(r2) > KEY(r1)) return -1;
358
359 /* Favor cutoffs > 0.5 */
360 if (CO(r1) > CO(r2) ) return 1;
361 if (CO(r2) > CO(r1) ) return -1;
362
363 if (r1->idx > r2->idx ) return 1;
364 if (r2->idx > r1->idx ) return -1;
365
366 return 0;
367 }
368
369 /* Score all non-spam */
370
score_ns(double * results)371 static void score_ns(double *results)
372 {
373 uint i;
374 uint count = 0;
375
376 if (verbose >= SCORE_DETAIL)
377 printf("ns:\n");
378
379 verbose = -verbose; /* disable bogofilter debug output */
380 for (i = 0; i < COUNTOF(ns_msglists->u.sets); i += 1) {
381 mlhead_t *list = ns_msglists->u.sets[i];
382 mlitem_t *item;
383 for (item = list->head; item != NULL; item = item->next) {
384 wordhash_t *wh = item->wh;
385 double score = msg_compute_spamicity(wh);
386 results[count++] = score;
387 if ( -verbose == SCORE_DETAIL ||
388 (-verbose >= SCORE_DETAIL && EPS < score && score < 1 - EPS))
389 printf("%6u %0.16f\n", count-1, score);
390 }
391 }
392 verbose = -verbose; /* enable bogofilter debug output */
393
394 qsort(results, count, sizeof(double), compare_descending);
395
396 return;
397 }
398
check_for_high_ns_scores(void)399 static bool check_for_high_ns_scores(void)
400 {
401 uint t = ceil(ns_cnt * check_percent);
402
403 score_ns(ns_scores); /* scores in descending order */
404
405 /* want at least 1 high scoring non-spam for FP determination */
406 if (ns_scores[t-1] < SPAM_CUTOFF)
407 return false;
408
409 if (!quiet) {
410 fprintf(stderr,
411 "Warning: test messages include many high scoring nonspam.\n");
412 fprintf(stderr,
413 " You may wish to reclassify them and rerun.\n");
414 }
415
416 return true;
417 }
418
419 /* Score all spam to determine false negative counts */
420
score_sp(double * results)421 static void score_sp(double *results)
422 {
423 uint i;
424 uint count = 0;
425
426 if (verbose >= SCORE_DETAIL)
427 printf("sp:\n");
428
429 verbose = -verbose; /* disable bogofilter debug output */
430 for (i = 0; i < COUNTOF(sp_msglists->u.sets); i += 1) {
431 mlhead_t *list = sp_msglists->u.sets[i];
432 mlitem_t *item;
433 for (item = list->head; item != NULL; item = item->next) {
434 wordhash_t *wh = item->wh;
435 double score = msg_compute_spamicity(wh);
436 results[count++] = score;
437 if ( -verbose == SCORE_DETAIL ||
438 (-verbose >= SCORE_DETAIL && EPS < score && score < 1 - EPS))
439 printf("%6u %0.16f\n", count-1, score);
440 }
441 }
442 verbose = -verbose; /* enable bogofilter debug output */
443
444 qsort(results, count, sizeof(double), compare_ascending);
445
446 return;
447 }
448
get_fn_count(uint count,double * results)449 static uint get_fn_count(uint count, double *results)
450 {
451 uint i;
452 uint fn = 0;
453
454 for (i = 0; i < count; i += 1) {
455 double score = results[i];
456 if (score < spam_cutoff)
457 fn += 1;
458 }
459
460 return fn;
461 }
462
check_for_low_sp_scores(void)463 static bool check_for_low_sp_scores(void)
464 {
465 uint t = ceil(sp_cnt * check_percent);
466
467 score_sp(sp_scores); /* get scores */
468
469 /* low scoring spam may cause problems ... */
470 if (sp_scores[t-1] > HAM_CUTOFF)
471 return false;
472
473 if (!quiet) {
474 fprintf(stderr,
475 "Warning: test messages include many low scoring spam.\n");
476 fprintf(stderr,
477 " You may wish to reclassify them and rerun.\n");
478 }
479
480 return true;
481 }
482
scoring_error(void)483 static void scoring_error(void)
484 {
485 int i;
486
487 if (quiet)
488 return;
489
490 printf(" high ham scores:\n");
491 for (i = 0; i < 10 && ns_scores[i] > SPAM_CUTOFF; i += 1)
492 printf(" %2d %8.6f\n", i+1, ns_scores[i]);
493
494 printf(" low spam scores:\n");
495 for (i = 0; i < 10 && sp_scores[i] < HAM_CUTOFF; i += 1)
496 printf(" %2d %8.6f\n", i+1, sp_scores[i]);
497 }
498
499 #ifdef TEST
flag(uint idx,uint cnt,uint dlt)500 static char flag(uint idx, uint cnt, uint dlt)
501 {
502 if (dlt == 0)
503 return ' ';
504 if (idx < cnt)
505 return '+';
506 else
507 return '-';
508 }
509 #endif
510
511 #ifdef TEST
print_ns_scores(uint beg,uint cnt,uint dlt)512 static void print_ns_scores(uint beg, uint cnt, uint dlt)
513 {
514 uint i, m = min(cnt + dlt, ns_cnt);
515
516 printf("ns:\n");
517 for (i = beg; i <= m; i += 1)
518 printf(" %3d %0.16f %c\n", i+1, ns_scores[i], flag(i, cnt, dlt));
519 }
520 #endif
521
522 #ifdef TEST
print_sp_scores(uint beg,uint cnt,uint dlt)523 static void print_sp_scores(uint beg, uint cnt, uint dlt)
524
525 {
526 uint i, m = min(cnt + dlt, sp_cnt);
527
528 printf("sp:\n");
529 for (i = beg; i <= m; i += 1)
530 printf(" %3d %0.16f %c\n", i+1, sp_scores[i], flag(i, cnt, dlt));
531 }
532 #endif
533
scale(uint cnt,uint lo,uint hi,double beg,double end)534 static double scale(uint cnt, uint lo, uint hi, double beg, double end)
535 {
536 double ans;
537
538 if (cnt < lo)
539 return beg;
540 if (cnt > hi)
541 return end;
542
543 ans = beg + (end - beg) * (cnt - lo ) / (hi - lo);
544
545 return ans;
546 }
547
548 /* compute scores and set global variables:
549 **
550 ** target
551 ** spam_cutoff
552 **
553 ** As count increases from 500 to 4000 ...
554 ** 1) initial target percent drops from 1% to 0.25%
555 ** 2) initial minimum target increases from 5 to 10
556 */
557
set_thresh(uint count,double * scores)558 static void set_thresh(uint count, double *scores)
559 {
560 uint ftarget = 0;
561 double cutoff, lgc;
562
563 score_ns(scores); /* get scores */
564
565 /*
566 ** Use parabolic curve to fit data
567 ** message counts: (70814, 12645, 2118, 550)
568 ** target values: ( 22, 18, 8, 1)
569 */
570 lgc = log(count);
571
572 ftarget = max(ROUND(((-0.4831 * lgc) + 12.8976) * lgc - 61.5264, 1.0), 1);
573 while (1) {
574 cutoff = ns_scores[ftarget-1];
575 if (verbose >= PARMS)
576 printf("m: cutoff %8.6f, ftarget %u\n", cutoff, ftarget);
577 if (ftarget == 1 || cutoff >= MIN_CUTOFF)
578 break;
579 ftarget -= 1;
580 }
581
582 /* ensure cutoff is below SPAM_CUTOFF */
583 if (cutoff > SPAM_CUTOFF) {
584 while (cutoff > SPAM_CUTOFF && ++ftarget < count) {
585 cutoff = scores[ftarget-1];
586 if (verbose >= PARMS)
587 printf("s: cutoff %8.6f, ftarget %u%%\n", cutoff, ftarget);
588 }
589 cutoff = SPAM_CUTOFF;
590 --ftarget;
591 if (verbose >= PARMS)
592 printf("s: cutoff %8.6f, ftarget %u%%\n", cutoff, ftarget);
593 }
594
595 if (cutoff < WARN_MIN || cutoff > WARN_MAX) {
596 if (!quiet) {
597 fprintf(stderr,
598 "%s high-scoring non-spams in this data set.\n",
599 (cutoff < WARN_MIN) ? "Too few" : "Too many");
600 fprintf(stderr,
601 "At target %u, cutoff is %8.6f.\n", ftarget, cutoff);
602 }
603 }
604
605 #ifdef TEST
606 if (verbose >= PARMS)
607 print_ns_scores(ftarget-4, ftarget+4, 0);
608 #endif
609
610 target = ftarget;
611 spam_cutoff = cutoff;
612
613 return;
614 }
615
init_count(void)616 static void init_count(void)
617 {
618 message_count = 0;
619 }
620
print_final_count(void)621 static void print_final_count(void)
622 {
623 if (verbose) {
624 if (!fMakeCheck)
625 printf("\r \r");
626 printf("%u messages\n", message_count);
627 fflush(stdout);
628 }
629 }
630
update_count(void)631 static int update_count(void)
632 {
633 message_count += 1;
634
635 if (verbose && (message_count % 100) == 0 && !fMakeCheck) {
636 if ((message_count % 1000) != 0)
637 putchar('.');
638 else
639 printf("\r \r%u ", message_count );
640 fflush(stdout);
641 }
642 return message_count;
643 }
644
calc_db_cachesize(void)645 static unsigned int calc_db_cachesize(void)
646 {
647 struct stat fst;
648 if (!stat(ds_path, &fst)) {
649 int dbc = ceil((double)fst.st_size / (3 * 1024 * 1024));
650 return ((unsigned int)dbc);
651 } else {
652 fprintf(stderr, "Unable to stat %s\n", ds_path);
653 exit (EX_ERROR);
654 }
655 }
656
load_wordlist(ds_foreach_t * hook,void * userdata)657 static void load_wordlist(ds_foreach_t *hook, void *userdata)
658 {
659 bfpath *bfp = bfpath_create(ds_path);
660
661 if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {
662 fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);
663 exit(EX_ERROR);
664 }
665
666 if (verbose) {
667 printf("Reading %s\n", ds_path);
668 fflush(stdout);
669 }
670
671 ds_oper(env, bfp, DS_READ, hook, userdata);
672
673 bfpath_free(bfp);
674
675 return;
676 }
677
load_hook(word_t * key,dsv_t * data,void * userdata)678 static ex_t load_hook(word_t *key, dsv_t *data, void *userdata)
679 {
680 wordprop_t *tokenprop = (wordprop_t *)wordhash_insert(train, key, sizeof(wordprop_t), &wordprop_init);
681
682 (void) userdata; /* quiet compiler complaint */
683
684 tokenprop->cnts.bad = data->spamcount;
685 tokenprop->cnts.good = data->goodcount;
686
687 if (word_cmps(key, ".MSG_COUNT") == 0)
688 set_msg_counts(data->goodcount, data->spamcount);
689
690 if (word_cmps(key, ".ENCODING") == 0) {
691 if (encoding == E_UNKNOWN)
692 encoding = (e_enc)data->spamcount; /* FIXME: is this cast correct? */
693 if (encoding != data->spamcount) {
694 fprintf(stderr, "Can't mix database encodings, i.e. utf-8 and any other.\n");
695 exit(EX_ERROR);
696 }
697 }
698
699 return EX_OK;
700 }
701
set_train_msg_counts(wordhash_t * wh)702 static void set_train_msg_counts(wordhash_t *wh)
703 {
704 (void)wordhash_insert(wh, w_msg_count, sizeof(wordprop_t), NULL);
705
706 if (msgs_good == 0 && msgs_bad == 0) {
707 fprintf(stderr, "Can't find '.MSG_COUNT'.\n");
708 exit(EX_ERROR);
709 }
710 }
711
712 /* write_msgcount_file()
713 **
714 ** Create a message count file from the original messages
715 */
716
print_msgcount_entry(const char * token,uint bad,uint good)717 static void print_msgcount_entry(const char *token, uint bad, uint good)
718 {
719 printf("\"%s\" %u %u\n", token, bad, good);
720 }
721
write_msgcount_file(wordhash_t * wh)722 static void write_msgcount_file(wordhash_t *wh)
723 {
724 hashnode_t *hn;
725
726 print_msgcount_entry(".MSG_COUNT", msgs_bad, msgs_good);
727
728 for (hn = (hashnode_t *)wordhash_first(wh); hn != NULL; hn = (hashnode_t *)wordhash_next(wh)) {
729 word_t *token = hn->key;
730 wordprop_t *wp = (wordprop_t *) hn->data;
731 wordcnts_t *cnts = &wp->cnts;
732
733 if (cnts->good == 0 && cnts->bad == 0) {
734 wp = (wordprop_t *)wordhash_search(train, token, 0);
735 if (wp) {
736 cnts->good = wp->cnts.good;
737 cnts->bad = wp->cnts.bad;
738 }
739 }
740
741 print_msgcount_entry((char *)token->u.text, cnts->bad, cnts->good);
742 }
743
744 return;
745 }
746
read_mailbox(const char * arg,mlhead_t * msgs)747 static uint read_mailbox(const char *arg, mlhead_t *msgs)
748 {
749 if (verbose) {
750 printf("Reading %s\n", arg);
751 fflush(stdout);
752 }
753
754 init_count();
755 mbox_mode = true;
756 bogoreader_init(1, &arg);
757 while ((*reader_more)()) {
758 wordhash_t *whp = NULL;
759 wordhash_t *whc = wordhash_new();
760
761 collect_words(whc);
762
763 if (ds_path != NULL && (msgs_good + msgs_bad) == 0)
764 set_train_msg_counts(whc);
765
766 if (whc->count == 0 && !quiet) {
767 printf("msg #%u, count is %u\n", message_count, whc->count);
768 bt_trap();
769 }
770
771 if (bogolex_file != NULL) {
772 wordhash_sort(whc);
773 lookup_words(whc);
774 write_msgcount_file(whc);
775 }
776 else if (whc->count != 0) {
777 if (!msg_count_file)
778 whp = convert_wordhash_to_propslist(whc, train);
779 else
780 whp = convert_propslist_to_countlist(whc);
781 msglist_add(msgs, whp);
782 }
783
784 update_count();
785
786 if (whc != whp)
787 wordhash_free(whc);
788 }
789
790 print_final_count();
791
792 ns_and_sp->count += message_count;
793 bogoreader_fini();
794
795 return message_count;
796 }
797
filelist_read(int mode,flhead_t * list)798 static uint filelist_read(int mode, flhead_t *list)
799 {
800 uint count = 0;
801 flitem_t *item;
802 mlhead_t *msgs = (mode == REG_GOOD) ? ns_msglists->msgs : sp_msglists->msgs;
803 run_type = (run_t)mode;
804
805 for (item = list->head; item != NULL; item = item->next) {
806 lexer = NULL;
807 msg_count_file = false;
808 count += read_mailbox(item->name, msgs);
809 }
810 return count;
811 }
812
813 /* distribute()
814 **
815 ** Proportionally distribute messages between training and scoring sets.
816 **
817 ** Method:
818 ** If only 2500 messages, use 2000 for training and 500 for scoring.
819 ** If over 4000 messages, use equal numbers for training and scoring.
820 ** In between 2500 and 4000, do a proportional distribution.
821 */
822
distribute(int mode,tunelist_t * ns_or_sp)823 static void distribute(int mode, tunelist_t *ns_or_sp)
824 {
825 int good = mode == REG_GOOD;
826 int bad = 1 - good;
827
828 bool divvy = ds_flag == DS_RAM && user_robx < EPS && !msg_count_file;
829
830 mlitem_t *item;
831 mlhead_t *msgs = ns_or_sp->msgs;
832
833 int score_count = 0;
834 int train_count = 0;
835
836 static int train_good = 0;
837 static int train_bad = 0;
838
839 double ratio = scale(msgs->count,
840 LIST_COUNT + TEST_COUNT, /* small count */
841 LIST_COUNT + LIST_COUNT, /* large count */
842 LIST_COUNT / TEST_COUNT, /* small ratio */
843 LIST_COUNT / LIST_COUNT); /* large ratio */
844
845 for (item = msgs->head; item != NULL; item = item->next) {
846 wordhash_t *wh = item->wh;
847
848 /* training set */
849 if (divvy && train_count / ratio < score_count + 1) {
850 wordhash_set_counts(wh, good, bad);
851 wordhash_add(train, wh, &wordprop_init);
852 train_count += 1;
853 wordhash_free(wh);
854 train_good += good;
855 train_bad += bad;
856 }
857 /* scoring set */
858 else {
859 uint bin = divvy ? MOD(score_count,3) : 0;
860 msglist_add(ns_or_sp->u.sets[bin], wh);
861 score_count += 1;
862 }
863 item->wh = NULL;
864 }
865
866 if (divvy) {
867 wordhash_insert(train, w_msg_count, sizeof(wordprop_t), &wordprop_init);
868 set_msg_counts(train_good, train_bad);
869 }
870
871 if (verbose > 1)
872 printf("%s: train_count = %d, score_count = %d\n",
873 good ? "ns" : "sp",
874 train_count, score_count);
875
876 return;
877 }
878
create_countlists(tunelist_t * ns_or_sp)879 static void create_countlists(tunelist_t *ns_or_sp)
880 {
881 uint i;
882 uint c = COUNTOF(ns_or_sp->u.sets);
883 for (i = 0; i < c; i += 1) {
884 mlhead_t *list = ns_or_sp->u.sets[i];
885 mlitem_t *item;
886
887 for (item = list->head; item != NULL; item = item->next) {
888 wordhash_t *who = item->wh;
889 wordhash_t *whn = convert_propslist_to_countlist(who);
890 if (whn != who) {
891 wordhash_free(who);
892 item->wh = whn;
893 }
894 }
895 }
896
897 return;
898 }
899
print_version(void)900 static void print_version(void)
901 {
902 (void)fprintf(stderr,
903 "%s version %s\n"
904 " Database: %s\n"
905 "Copyright (C) 2002-2006 Greg Louis, David Relson\n\n"
906 "%s comes with ABSOLUTELY NO WARRANTY. "
907 "This is free software, and\nyou are welcome to "
908 "redistribute it under the General Public License. "
909 "See\nthe COPYING file with the source distribution for "
910 "details.\n"
911 "\n",
912 progtype, version, ds_version_str(), PACKAGE);
913 }
914
help(void)915 static void help(void)
916 {
917 (void)fprintf(stderr,
918 "Usage: %s [options] { -c config } { -d directory } -n non-spam-file -s spam-file\n",
919 progname);
920 (void)fprintf(stderr,
921 "\t -h - print this help message.\n"
922 "\t -C - don't read standard config files.\n"
923 "\t -c file - read specified config file.\n"
924 "\t -D - don't read a wordlist file.\n"
925 "\t -d path - specify directory for wordlists.\n"
926 "\t -E - disable ESF (effective size factor) tuning.\n"
927 "\t -M file - rewrite input file in message count format.\n"
928 "\t -r num - specify robx value\n");
929 (void)fprintf(stderr,
930 "\t -T num - specify fp target value\n"
931 "\t -s file1 file2 ... - spam files\n"
932 "\t -n file1 file2 ... - non-spam files\n"
933 "\t -v - increase level of verbose messages\n"
934 "\t -q - quiet (suppress warnings)\n"
935 );
936 (void)fprintf(stderr,
937 "\n"
938 "%s (version %s) is part of the bogofilter package.\n",
939 progname, version);
940 }
941
942 static struct option longopts_bogotune[] = {
943 /* longoptions.h - common options */
944 LONGOPTIONS_COMMON
945 LONGOPTIONS_MAIN_TUNE
946 /* longoptions.h - bogofilter/-lexer options */
947 LONGOPTIONS_LEX
948 /* end of list */
949 { NULL, 0, 0, 0 }
950 };
951
process_arglist(int argc,char ** argv)952 static int process_arglist(int argc, char **argv)
953 {
954 int count = 1;
955
956 bulk_mode = B_CMDLINE;
957
958 #ifdef __EMX__
959 _response (&argc, &argv); /* expand response files (@filename) */
960 _wildcard (&argc, &argv); /* expand wildcards (*.*) */
961 #endif
962
963 #define OPTIONS ":-:c:Cd:DeEM:n:qr:s:tT:vVx:"
964
965 while (1)
966 {
967 int option;
968 int option_index = 0;
969 const char *val;
970
971 option = getopt_long_chk(argc, argv, OPTIONS,
972 longopts_bogotune, &option_index);
973
974 if (option == -1)
975 break;
976
977 val = optarg;
978 process_arg(option, NULL, val, PR_NONE, PASS_1_CLI);
979 }
980
981 if (ds_flag == DS_NONE) /* default is "wordlist on disk" */
982 ds_flag = DS_DSK;
983
984 if (ds_flag == DS_ERR) {
985 fprintf(stderr, "Only one '-d dir' or '-D' option is allowed.\n");
986 exit(EX_ERROR);
987 }
988
989 if (bogolex_file == NULL &&
990 (spam_files->count == 0 || ham_files->count == 0)) {
991 fprintf(stderr,
992 "Bogotune needs both non-spam and spam message sets for its parameter testing.\n");
993 exit(EX_ERROR);
994 }
995
996 if (!suppress_config_file)
997 process_config_files(false, longopts_bogotune);
998
999 return count;
1000 }
1001
process_arg(int option,const char * name,const char * val,priority_t precedence,arg_pass_t pass)1002 int process_arg(int option, const char *name, const char *val, priority_t precedence, arg_pass_t pass)
1003 {
1004 static int lastmode = -1;
1005
1006 (void) name; /* suppress compiler warning */
1007 (void) precedence; /* suppress compiler warning */
1008 (void) pass; /* suppress compiler warning */
1009
1010 if (option == 1) {
1011 /* If getopt's RETURN_IN_ORDER behavior */
1012 switch (lastmode) {
1013 case 'n':
1014 case 's':
1015 option = lastmode;
1016 break;
1017 default:
1018 fprintf(stderr,
1019 "File names may only be given after -n or -s options.\n");
1020 }
1021 }
1022
1023 switch (option) {
1024 case 'c':
1025 read_config_file(val, false, false, PR_CFG_USER, longopts_bogotune);
1026 /*@fallthrough@*/
1027 /* fall through to suppress reading config files */
1028
1029 case 'C':
1030 suppress_config_file = true;
1031 break;
1032
1033 case 'd':
1034 ds_path = xstrdup(val);
1035 ds_flag = (ds_flag == DS_NONE) ? DS_DSK : DS_ERR;
1036 break;
1037
1038 case 'D':
1039 ds_flag = (ds_flag == DS_NONE) ? DS_RAM : DS_ERR;
1040 break;
1041
1042 case 'e':
1043 exit_zero = true;
1044 break;
1045
1046 case 'E':
1047 esf_flag ^= true;
1048 break;
1049
1050 case 'M':
1051 bogolex_file = val;
1052 break;
1053
1054 case 'n':
1055 lastmode = 'n';
1056 filelist_add(ham_files, val);
1057 break;
1058
1059 case 'q':
1060 quiet = true;
1061 break;
1062
1063 case 'r':
1064 user_robx = atof(val);
1065 break;
1066
1067 case 's':
1068 lastmode = 's';
1069 filelist_add(spam_files, val);
1070 break;
1071
1072 #ifdef TEST
1073 case 't':
1074 test += 1;
1075 break;
1076 #endif
1077 case 'T':
1078 coerced_target = atoi(val);
1079 break;
1080
1081 case 'v':
1082 verbose += 1;
1083 break;
1084
1085 case 'V':
1086 print_version();
1087 exit(EX_OK);
1088
1089 case 'x':
1090 if (strcmp(val, "MakeCheck") == 0)
1091 fMakeCheck = true;
1092 else
1093 set_debug_mask(val);
1094 break;
1095
1096 case O_MAX_TOKEN_LEN:
1097 max_token_len = atoi(val);
1098 break;
1099
1100 case O_MIN_TOKEN_LEN:
1101 min_token_len = atoi(val);
1102 break;
1103
1104 case O_MAX_MULTI_TOKEN_LEN:
1105 max_multi_token_len=atoi(val);
1106 break;
1107
1108 case O_MULTI_TOKEN_COUNT:
1109 multi_token_count=atoi(val);
1110 break;
1111
1112 case O_BLOCK_ON_SUBNETS:
1113 block_on_subnets = get_bool(name, val);
1114 break;
1115
1116 case O_REPLACE_NONASCII_CHARACTERS:
1117 replace_nonascii_characters = get_bool(name, val);
1118 break;
1119
1120 case O_TOKEN_COUNT_FIX:
1121 token_count_fix = atoi(val);
1122 break;
1123
1124 case O_TOKEN_COUNT_MIN:
1125 token_count_min = atoi(val);
1126 break;
1127
1128 case O_TOKEN_COUNT_MAX:
1129 token_count_max = atoi(val);
1130 break;
1131
1132 default:
1133 help();
1134 exit(EX_ERROR);
1135 }
1136
1137 return 0;
1138 }
1139
get_robx(void)1140 static double get_robx(void)
1141 {
1142 double rx;
1143
1144 if (user_robx > 0.0)
1145 rx = user_robx;
1146 else if (ds_flag == DS_DSK) {
1147 printf("Calculating initial x value...\n");
1148 verbose = -verbose; /* disable bogofilter debug output */
1149 rx = compute_robinson_x();
1150 verbose = -verbose; /* enable bogofilter debug output */
1151 }
1152 else
1153 rx = ROBX;
1154
1155 if (rx > RX_MAX) rx = RX_MAX;
1156 if (rx < RX_MIN) rx = RX_MIN;
1157
1158 printf("Initial x value is %8.6f\n", rx);
1159
1160 return rx;
1161 }
1162
results_sort(uint r_count,result_t * results)1163 static result_t *results_sort(uint r_count, result_t *results)
1164 {
1165 result_t *ans = (result_t *)xcalloc(r_count, sizeof(result_t));
1166 memcpy(ans, results, r_count * sizeof(result_t));
1167 qsort(ans, r_count, sizeof(result_t), compare_results);
1168 return ans;
1169 }
1170
top_ten(result_t * sorted,uint n)1171 static void top_ten(result_t *sorted, uint n)
1172 {
1173 uint i, j;
1174 bool f;
1175
1176 printf("Top ten parameter sets from this scan:\n");
1177
1178 printf(" rs md rx spesf nsesf co fp fn fppc fnpc\n");
1179 for (f = false; !f; f = true) {
1180 for (i = j = 0; i < 10 && j < n;) {
1181 result_t *r = &sorted[j++];
1182 if (!f && r->fp != target) continue;
1183 sp_esf = ESF_SEL(sp_esf, pow(0.75, r->sp_exp));
1184 ns_esf = ESF_SEL(ns_esf, pow(0.75, r->ns_exp));
1185
1186 printf("%5u %6.4f %5.3f %5.3f %8.6f %8.6f %6.4f %3u %3u %6.4f %6.4f\n",
1187 r->idx, r->rs, r->md, r->rx, sp_esf, ns_esf, r->co,
1188 r->fp, r->fn, r->fp*100.0/ns_cnt, r->fn*100.0/sp_cnt);
1189 ++i;
1190 }
1191 if (i) break;
1192 printf("Warning: fp target not met, using original results\n");
1193 }
1194
1195 printf("\n");
1196 fflush(stdout);
1197
1198 return;
1199 }
1200
1201 /* get false negative */
1202
gfn(result_t * results,uint rsi,uint mdi,uint rxi,uint spi,uint nsi)1203 static uint gfn(result_t *results,
1204 uint rsi, uint mdi, uint rxi,
1205 uint spi, uint nsi)
1206 {
1207 uint i = (((rsi * mdval->cnt + mdi) * rxval->cnt + rxi) * spexp->cnt + spi) * nsexp->cnt + nsi;
1208 result_t *r = &results[i];
1209 uint fn = r->fn;
1210 if (r->fp != target) return INT_MAX;
1211 if (verbose > 100)
1212 printf(" %2u, %2u, %2u, %2u, %2u, %2u\n",
1213 rsi, mdi, rxi, spi, nsi, fn);
1214 ncnt += 1;
1215 nsum += fn;
1216 return fn;
1217 }
1218
count_outliers(uint r_count,result_t * sorted,result_t * unsorted)1219 static result_t *count_outliers(uint r_count, result_t *sorted, result_t *unsorted)
1220 {
1221 bool f = false;
1222 uint i, j = 0, o = 0;
1223 uint rsi, mdi, rxi, spi, nsi;
1224 uint rsc = rsval->cnt - 1;
1225 uint rxc = rxval->cnt - 1;
1226 uint mdc = mdval->cnt - 1;
1227 uint spc = spexp->cnt - 1;
1228 uint nsc = nsexp->cnt - 1;
1229
1230 result_t *r = NULL; /* quench bogus compiler warning */
1231 uint q33 = sorted[r_count * 33 / 100].fn; /* 33% quantile */
1232 uint med = sorted[r_count * 50 / 100].fn; /* median false negative */
1233
1234 if (verbose)
1235 printf("%u%% fn count was %u\n", 50u, med);
1236
1237 for (i = 0; i < r_count; i += 1) {
1238 r = &sorted[i];
1239 if (r->fp != target) continue;
1240 if (j == 0) j = i+1;
1241 if (fMakeCheck && j >= cMakeCheck) break;
1242 rsi = r->rsi; mdi = r->mdi; rxi = r->rxi; spi = r->spi; nsi = r->nsi;
1243 ncnt = nsum = 0;
1244 if (((rsi == 0 ||
1245 (gfn(unsorted, rsi-1, mdi, rxi, spi, nsi)) < med)) &&
1246 ((rsi == rsc ||
1247 (gfn(unsorted, rsi+1, mdi, rxi, spi, nsi)) < med)) &&
1248 ((mdi == 0 ||
1249 (gfn(unsorted, rsi, mdi-1, rxi, spi, nsi)) < med)) &&
1250 ((mdi == mdc ||
1251 (gfn(unsorted, rsi, mdi+1, rxi, spi, nsi)) < med)) &&
1252 ((rxi == 0 ||
1253 (gfn(unsorted, rsi, mdi, rxi-1, spi, nsi)) < med)) &&
1254 ((rxi == rxc ||
1255 (gfn(unsorted, rsi, mdi, rxi+1, spi, nsi)) < med)) &&
1256 ((spi == 0 ||
1257 (gfn(unsorted, rsi, mdi, rxi, spi-1, nsi)) < med)) &&
1258 ((spi == spc ||
1259 (gfn(unsorted, rsi, mdi, rxi, spi+1, nsi)) < med)) &&
1260 ((nsi == 0 ||
1261 (gfn(unsorted, rsi, mdi, rxi, spi, nsi-1)) < med)) &&
1262 ((nsi == nsc ||
1263 (gfn(unsorted, rsi, mdi, rxi, spi, nsi+1)) < med)) &&
1264 (ncnt != 0 && nsum / ncnt < q33))
1265 {
1266 f = true;
1267 break;
1268 }
1269 o++;
1270 }
1271
1272 if (o > 0) {
1273 printf("%u outlier%s encountered. \n",
1274 o, (o > 1) ? "s" : "");
1275 }
1276
1277 if (!f) {
1278 r = &sorted[j-1];
1279 printf("No smooth minimum encountered, using lowest fn count (an outlier). \n");
1280 }
1281
1282 return r;
1283 }
1284
progress(uint cur,uint top)1285 static void progress(uint cur, uint top)
1286 {
1287 uint i;
1288 uint ndots = ceil(70.0 * cur / top);
1289
1290 if (quiet)
1291 return;
1292
1293 if (ndots < 1)
1294 ndots = 1;
1295 printf("\r%3u [", cur);
1296 for (i = 0; i < ndots; i += 1)
1297 printf(".");
1298 for (i = ndots; i < 70; i += 1)
1299 printf(" ");
1300 printf("]");
1301 fflush(stdout);
1302 }
1303
final_warning(void)1304 static void final_warning(void)
1305 {
1306 printf(
1307 "The small number and/or relative uniformity of the test messages imply\n"
1308 "that the recommended values (above), though appropriate to the test set,\n"
1309 "may not remain valid for long. Bogotune should be run again with more\n"
1310 "messages when that becomes possible.\n"
1311 );
1312 }
1313
final_recommendations(bool skip)1314 static void final_recommendations(bool skip)
1315 {
1316 uint m;
1317 bool printed = false;
1318 uint minn[] = { 10000, 2000, 1000, 500, 1 };
1319
1320 printf("Performing final scoring:\n");
1321
1322 printf("Spam... ");
1323 score_sp(sp_scores); /* get scores (in ascending order) */
1324
1325 printf("Non-Spam...\n");
1326 score_ns(ns_scores); /* get scores (in descending order) */
1327
1328 for (m=0; m<10; ++m) printf("%8.6f %8.6f\n", sp_scores[m], ns_scores[m]);
1329
1330 if (verbose >= PARMS)
1331 printf("# ns_cnt %u, sp_cnt %u\n", ns_cnt, sp_cnt);
1332
1333 if (skip) {
1334 printf("\n");
1335 printf("### The following recommendations are provisional.\n");
1336 printf("### Run bogotune with more messages when possible.\n");
1337 printf("\n");
1338 }
1339
1340 printf("\nRecommendations:\n\n");
1341 printf("---cut---\n");
1342 printf("db_cachesize=%u\n", db_cachesize);
1343
1344 printf("robs=%6.4f\n", robs);
1345 printf("min_dev=%5.3f\n", min_dev);
1346 printf("robx=%8.6f\n", robx);
1347 printf("sp_esf=%8.6f\n", sp_esf);
1348 printf("ns_esf=%8.6f\n", ns_esf);
1349
1350 for (m=0; m < COUNTOF(minn); m += 1) {
1351 double cutoff;
1352 uint i, fp = 0, fn = 0;
1353 uint mn = minn[m];
1354 double fpp, fnp;
1355
1356 if (ns_cnt < mn)
1357 continue;
1358
1359 if (mn > 1 ) {
1360 uint t = (ns_cnt + mn - 1) / mn;
1361 cutoff = ns_scores[t-1];
1362 if (cutoff > FP_CUTOFF)
1363 continue;
1364 fp = ns_cnt / mn;
1365 fpp = 100.0 / mn;
1366 }
1367 else {
1368 cutoff = SPAM_CUTOFF;
1369 if (printed)
1370 break;
1371 for (i = 0; i < ns_cnt; i += 1) {
1372 if (ns_scores[i] >= cutoff)
1373 fp += 1;
1374 }
1375 cutoff = ns_scores[fp-1];
1376 fpp = 100.0 * fp / ns_cnt;
1377 }
1378
1379 for (i = 0; i < sp_cnt; i += 1) {
1380 if (sp_scores[i] >= cutoff) {
1381 fn = i;
1382 break;
1383 }
1384 }
1385 fnp = 100.0 * fn / sp_cnt;
1386
1387 if (printed) printf("#");
1388 printf("spam_cutoff=%8.6f\t# for %4.2f%% fp (%u); expect %4.2f%% fn (%u).\n",
1389 cutoff, fpp, fp, fnp, fn);
1390
1391 printed = true;
1392 if (skip)
1393 ham_cutoff = cutoff;
1394 }
1395
1396 if (!skip) {
1397 uint s = ceil(sp_cnt * 0.002 - 1);
1398 ham_cutoff = sp_scores[s];
1399 if (ham_cutoff < MIN_HAM_CUTOFF) ham_cutoff = MIN_HAM_CUTOFF;
1400 if (ham_cutoff > MAX_HAM_CUTOFF) ham_cutoff = MAX_HAM_CUTOFF;
1401 }
1402
1403 printf("ham_cutoff=%5.3f\t\n", ham_cutoff);
1404 printf("---cut---\n");
1405 printf("\n");
1406
1407 if (skip)
1408 final_warning();
1409
1410 printf("Tuning completed.\n");
1411 }
1412
bogotune_init(void)1413 static void bogotune_init(void)
1414 {
1415 const char *msg_count = MSG_COUNT;
1416 w_msg_count = word_news(msg_count);
1417 train = wordhash_new();
1418 ns_and_sp = tunelist_new("tr"); /* training lists */
1419 ns_msglists = tunelist_new("ns"); /* non-spam scoring lists */
1420 sp_msglists = tunelist_new("sp"); /* spam scoring lists */
1421
1422 return;
1423 }
1424
bogotune_free(void)1425 static void bogotune_free(void)
1426 {
1427 xfree(ns_scores);
1428 xfree(sp_scores);
1429
1430 filelist_free(ham_files);
1431 filelist_free(spam_files);
1432
1433 tunelist_free(ns_msglists);
1434 tunelist_free(sp_msglists);
1435 tunelist_free(ns_and_sp);
1436
1437 word_free(w_msg_count);
1438
1439 token_cleanup();
1440 mime_cleanup();
1441
1442 xfree(ds_path);
1443
1444 return;
1445 }
1446
check_msgcount_parms(void)1447 static bool check_msgcount_parms(void)
1448 {
1449 bool ok = true;
1450
1451 if (ds_flag == DS_RAM) {
1452 fprintf(stderr, "A wordlist directory must be specified for converting messages to the message count format.\n");
1453 ok = false;
1454 }
1455
1456 if (ham_files->count != 0 && spam_files->count != 0) {
1457 fprintf(stderr, "Message count files may be created from spam or non-spam inputs but not both.\n");
1458 fprintf(stderr, "Run bogotune once for the spam and again for the non-spam.\n");
1459 ok = false;
1460 }
1461
1462 return ok;
1463 }
1464
check_msg_counts(void)1465 static bool check_msg_counts(void)
1466 {
1467 bool ok = true;
1468 double ratio;
1469
1470 if (msgs_good < LIST_COUNT || msgs_bad < LIST_COUNT) {
1471 if (!quiet)
1472 fprintf(stderr,
1473 "The wordlist contains %u non-spam and %u spam messages.\n"
1474 "Bogotune must be run with at least %u of each.\n",
1475 msgs_good, msgs_bad, LIST_COUNT);
1476 ok = false;
1477 }
1478
1479 ratio = (double)msgs_good / (double)msgs_bad;
1480 fprintf(stderr, "wordlist's ham to spam ratio is %0.1f to 1.0\n", ratio );
1481 if ( ratio < 0.1 || ratio > 10.0) {
1482 if (!quiet) {
1483 fprintf(stderr,
1484 "Bogotune requires the ratio be in the range of 0.1 to 10.\n");
1485 }
1486 ok = false;
1487 }
1488
1489 if (ns_cnt < TEST_COUNT || sp_cnt < TEST_COUNT) {
1490 if (!quiet)
1491 fprintf(stderr,
1492 "The messages sets contain %u non-spam and %u spam. Bogotune "
1493 "requires at least %u non-spam and %u spam messages to run.\n",
1494 ns_cnt, sp_cnt, TEST_COUNT, TEST_COUNT);
1495 exit(EX_ERROR);
1496 }
1497
1498 return ok;
1499 }
1500
show_elapsed_time(int beg,int end,uint cnt,double val,const char * lbl1,const char * lbl2)1501 static void show_elapsed_time(int beg, int end, uint cnt, double val,
1502 const char *lbl1, const char *lbl2)
1503 {
1504 int tm = end - beg;
1505 if (!fMakeCheck)
1506 printf(" %dm:%02ds for %u %s. avg: %.1f %s\n",
1507 MIN(tm), SECONDS(tm), cnt, lbl1, val, lbl2);
1508 }
1509
bogolex(void)1510 static rc_t bogolex(void)
1511 {
1512 rc_t status = RC_OK;
1513
1514 if (!check_msgcount_parms())
1515 exit(EX_ERROR);
1516
1517 read_mailbox(bogolex_file, NULL);
1518
1519 return status;
1520 }
1521
bogotune(void)1522 static rc_t bogotune(void)
1523 {
1524 bool skip;
1525 result_t *best;
1526
1527 int beg, end;
1528 uint cnt, scan;
1529 rc_t status = RC_OK;
1530
1531 beg = time(NULL);
1532
1533 ham_cutoff = 0.0;
1534 spam_cutoff = 0.1;
1535
1536 /* Note: memory usage highest while reading messages */
1537 /* usage decreases as distribute() converts to count format */
1538
1539 /* read all messages, merge training sets, look up scoring sets */
1540 ns_cnt = filelist_read(REG_GOOD, ham_files);
1541 sp_cnt = filelist_read(REG_SPAM, spam_files);
1542 cnt = ns_cnt + sp_cnt;
1543
1544 end = time(NULL);
1545 if (verbose >= TIME) {
1546 show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");
1547 }
1548
1549 distribute(REG_GOOD, ns_msglists);
1550 distribute(REG_SPAM, sp_msglists);
1551
1552 create_countlists(ns_msglists);
1553 create_countlists(sp_msglists);
1554
1555 if (verbose >= TIME && time(NULL) - end > 2) {
1556 end = time(NULL);
1557 show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");
1558 }
1559
1560 if (verbose > PARMS+1) {
1561 tunelist_print(ns_and_sp);
1562 tunelist_print(ns_msglists);
1563 tunelist_print(sp_msglists);
1564 }
1565
1566 ns_cnt = count_messages(ns_msglists);
1567 sp_cnt = count_messages(sp_msglists);
1568
1569 if (ds_flag == DS_DSK && !check_msg_counts())
1570 exit(exit_zero ? EX_OK : EX_ERROR);
1571
1572 fflush(stdout);
1573
1574 check_percent = CHECK_PCT; /* for checking low scoring spam
1575 ** and high scoring non-spam */
1576
1577 ns_scores = (double *)xcalloc(ns_cnt, sizeof(double));
1578 sp_scores = (double *)xcalloc(sp_cnt, sizeof(double));
1579
1580 robs = DEFAULT_ROBS;
1581 robx = DEFAULT_ROBX;
1582 min_dev = DEFAULT_MIN_DEV;
1583
1584 if (check_for_high_ns_scores() | check_for_low_sp_scores())
1585 scoring_error();
1586
1587 /*
1588 ** 5. Calculate x and cache size
1589 ** Calculate x with bogoutil's -r option (a new addition).
1590 ** Bound the calculated value within [0.4, 0.6] and set the range to be
1591 ** investigated to [x-0.1, x+0.1].
1592 */
1593
1594 robx = get_robx();
1595 if (ds_flag == DS_DSK) {
1596 db_cachesize = calc_db_cachesize();
1597 printf("Recommended db cache size is %u MB\n", db_cachesize);
1598 }
1599
1600 /*
1601 ** 6. Calculate fp target
1602 ** The fp target will be derived thus: score non-spams with s and md as
1603 ** shipped, and determine the count that will result from a spam cutoff
1604 ** of 0.95; if that is < 0.25%, try 0.9375 etc.
1605 */
1606
1607 min_dev = 0.02;
1608
1609 /* set target and spam_cutoff */
1610
1611 if (coerced_target == 0)
1612 set_thresh(ns_cnt, ns_scores);
1613 else {
1614 /* if coerced target ... */
1615 target = coerced_target;
1616 spam_cutoff = ns_scores[target-1];
1617 }
1618
1619 skip = ROUND(spam_cutoff,100000) < SCAN_CUTOFF;
1620 printf("False-positive target is %u (cutoff %8.6f)\n", target, spam_cutoff);
1621
1622 #ifdef TEST
1623 if (test) {
1624 printf("m: %8.6f, s: %8.6f, x: %0.16f\n", min_dev, robs, robx);
1625 if (verbose < PARMS)
1626 print_ns_scores(target-2, target+2, 0);
1627 }
1628 #endif
1629
1630 if (!esf_flag && (sp_esf < 1.0 || ns_esf < 1.0))
1631 fprintf(stderr, "Warning: Using ESF values (sp=%8.6f, ns=%8.6f) from config file.\n", sp_esf, ns_esf);
1632
1633 /* No longer needed */
1634 wordhash_free(train);
1635 train = NULL;
1636
1637 for (scan=0; scan <= 1 && !skip; scan ++) {
1638 uint r_count;
1639 uint rsi, rxi, mdi, spi, nsi;
1640 result_t *results, *r, *sorted;
1641
1642 printf("Performing %s scan:\n", scan==0 ? "coarse" : "fine");
1643
1644 switch (scan) {
1645 case 0: /* COARSE */
1646 /*
1647 ** 7. Coarsely scan s, md and x
1648 ** The coarse s scan will range from 1 to 0.01 in half decades, and the
1649 ** coarse md scan will range from 0.05 to 0.45 in steps of 0.05. The
1650 ** coarse x scan will use steps of 0.05. The trough must be surrounded on
1651 ** six sides by values below the 33% quantile (unless bounded on one or
1652 ** more sides).
1653 */
1654 init_coarse(robx);
1655 break;
1656 case 1: /* FINE */
1657 /*
1658 ** 8. Finely scan the peak region
1659 ** The fine s scan will range over the estimated s +/- half a decade in
1660 ** steps of a quarter decade, and the fine md scan will range over the
1661 ** estimated md +/- 0.075 in steps of 0.015. The fine x scan will range
1662 ** over the estimated x +/- 0.04 in steps of 0.02. Scans of s and md
1663 ** are bounded by the limits of the coarse scan. Again, the trough must
1664 ** be surrounded on six sides by values below the 33% quantile. If no
1665 ** such trough exists, a warning is given.
1666 */
1667 init_fine(robs, min_dev, robx, spex, nsex);
1668 break;
1669 }
1670
1671 r_count = rsval->cnt * mdval->cnt * rxval->cnt * spexp->cnt * nsexp->cnt;
1672 results = (result_t *) xcalloc(r_count, sizeof(result_t));
1673
1674 print_all_parms(r_count);
1675
1676 if (verbose >= SUMMARY) {
1677 if (verbose >= SUMMARY+1)
1678 printf("%3s ", "cnt");
1679 if (verbose >= SUMMARY+2)
1680 printf(" %s %s %s ", "s", "m", "x");
1681 printf(" %4s %5s %4s %8s %8s %7s %3s %3s\n",
1682 "rs", "md", "rx", "spesf", "nsesf", "cutoff", "fp", "fn");
1683 }
1684
1685 cnt = 0;
1686 beg = time(NULL);
1687 for (rsi = 0; rsi < rsval->cnt; rsi++) {
1688 robs = rsval->data[rsi];
1689 for (mdi = 0; mdi < mdval->cnt; mdi++) {
1690 min_dev = mdval->data[mdi];
1691 for (rxi = 0; rxi < rxval->cnt; rxi++) {
1692 robx = rxval->data[rxi];
1693 for (spi = 0; spi < spexp->cnt; spi++) {
1694 spex = spexp->data[spi];
1695 sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));
1696 for (nsi = 0; nsi < nsexp->cnt; nsi++) {
1697 uint fp, fn;
1698 nsex = nsexp->data[nsi];
1699 ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));
1700
1701 /* save parms */
1702 r = &results[cnt++];
1703 r->idx = cnt;
1704 r->rsi = rsi; r->rs = robs;
1705 r->rxi = rxi; r->rx = robx;
1706 r->mdi = mdi; r->md = min_dev;
1707 r->spi = spi; r->sp_exp = spex;
1708 r->nsi = nsi; r->ns_exp = nsex;
1709
1710 if (verbose >= SUMMARY) {
1711 if (verbose >= SUMMARY+1)
1712 printf("%3u ", cnt);
1713 if (verbose >= SUMMARY+2)
1714 printf(" %u %u %u %u %u ",
1715 rsi, mdi, rxi, spi, nsi);
1716 printf("%6.4f %5.3f %5.3f %8.6f %8.6f",
1717 robs, min_dev, robx, sp_esf, ns_esf);
1718 fflush(stdout);
1719 }
1720
1721 spam_cutoff = 0.01;
1722 score_ns(ns_scores); /* scores in descending order */
1723
1724 /* Determine spam_cutoff and false_pos */
1725 for (fp = target; fp < ns_cnt; fp += 1) {
1726 spam_cutoff = ns_scores[fp-1];
1727 if (spam_cutoff < 0.999999)
1728 break;
1729 if (coerced_target != 0)
1730 break;
1731 }
1732 if (ns_cnt < fp)
1733 fprintf(stderr,
1734 "Too few false positives to determine a valid cutoff\n");
1735
1736 score_sp(sp_scores); /* scores in ascending order */
1737 fn = get_fn_count(sp_cnt, sp_scores);
1738
1739 /* save results */
1740 r->co = spam_cutoff;
1741 r->fp = fp;
1742 r->fn = fn;
1743
1744 if (verbose < SUMMARY)
1745 progress(cnt, r_count);
1746 else {
1747 printf(" %8.6f %2u %3u\n", spam_cutoff, fp, fn);
1748 fflush(stdout);
1749 }
1750
1751 #ifdef TEST
1752 if (test && spam_cutoff < 0.501) {
1753 printf("co: %0.16f\n", spam_cutoff);
1754 print_ns_scores(0, fp, 2);
1755 print_sp_scores(fn-10, fn, 10);
1756 }
1757 #endif
1758 if (fMakeCheck && cnt >= cMakeCheck)
1759 break;
1760 }
1761 if (fMakeCheck && cnt >= cMakeCheck)
1762 break;
1763 }
1764 if (fMakeCheck && cnt >= cMakeCheck)
1765 break;
1766 }
1767 if (fMakeCheck && cnt >= cMakeCheck)
1768 break;
1769 }
1770 fflush(stdout);
1771 if (fMakeCheck && cnt >= cMakeCheck)
1772 break;
1773 }
1774
1775 if (verbose >= TIME) {
1776 end = time(NULL);
1777 show_elapsed_time(beg, end, cnt, (double)(end-beg)/cnt, "iterations", "secs");
1778 }
1779
1780 printf("\n");
1781
1782 /* Scan complete, now find minima */
1783
1784 sorted = results_sort(r_count, results);
1785 top_ten(sorted, r_count);
1786
1787 best = count_outliers(r_count, sorted, results);
1788 robs = rsval->data[best->rsi];
1789 robx = rxval->data[best->rxi];
1790 min_dev = mdval->data[best->mdi];
1791
1792 spex = spexp->data[best->spi]; sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));
1793 nsex = nsexp->data[best->nsi]; ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));
1794
1795 printf(
1796 "Minimum found at s %6.4f, md %5.3f, x %5.3f, spesf %8.6f, nsesf %8.6f\n",
1797 robs, min_dev, robx, sp_esf, ns_esf);
1798 printf(" fp %u (%6.4f%%), fn %u (%6.4f%%)\n",
1799 best->fp, best->fp*100.0/ns_cnt,
1800 best->fn, best->fn*100.0/sp_cnt);
1801 printf("\n");
1802
1803 data_free(rsval);
1804 data_free(rxval);
1805 data_free(mdval);
1806 data_free(spexp);
1807 data_free(nsexp);
1808
1809 xfree(results);
1810 xfree(sorted);
1811 }
1812
1813 /*
1814 ** 9. Suggest possible spam and non-spam cutoff values
1815 ** With the final x, md and s values, score the spams and non-spams and
1816 ** sort the non-spam scores decreasing and the spam scores increasing;
1817 ** then, traverse the non-spam list until the 0.2% point; report cutoffs
1818 ** that give 0.05%, 0.1% and 0.2% fp.
1819 */
1820
1821 final_recommendations(skip);
1822
1823 return status;
1824 }
1825
main(int argc,char ** argv)1826 int main(int argc, char **argv) /*@globals errno,stderr,stdout@*/
1827 {
1828 ex_t exitcode = EX_OK;
1829
1830 fBogotune = true; /* for rob_compute_spamicity() */
1831
1832 dbgout = stderr;
1833
1834 init_globals();
1835
1836 progtype = build_progtype(progname, DB_TYPE);
1837
1838 ham_files = filelist_new("ham");
1839 spam_files = filelist_new("spam");
1840
1841 /* process args and read mailboxes */
1842 process_arglist(argc, argv);
1843
1844 /* directories from command line and config file are already handled */
1845 if (ds_flag == DS_DSK) {
1846 bfpath *bfp;
1847
1848 if (ds_path == NULL)
1849 ds_path = get_directory(PR_ENV_BOGO);
1850
1851 if (ds_path == NULL)
1852 ds_path = get_directory(PR_ENV_HOME);
1853
1854 if (ds_path == NULL) {
1855 fprintf(stderr, "Cannot derive bogofilter directory from environment, aborting.\n");
1856 exit(EX_ERROR);
1857 }
1858
1859 set_bogohome(ds_path);
1860 bfp = bfpath_create(ds_path);
1861
1862 if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {
1863 fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);
1864 xfree(ds_path);
1865 exit(EX_ERROR);
1866 }
1867
1868 if (bfp->exists && bfp->isdir) {
1869 char *tmp;
1870 bfpath_free(bfp);
1871 tmp = mxcat(ds_path, DIRSEP_S, WORDLIST, NULL);
1872 xfree(ds_path);
1873 ds_path = tmp;
1874 bfp = bfpath_create(ds_path);
1875 if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {
1876 fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);
1877 bfpath_free(bfp);
1878 xfree(ds_path);
1879 exit(EX_ERROR);
1880 }
1881 }
1882
1883 env = ds_init(bfp);
1884
1885 init_wordlist("word", ds_path, 0, WL_REGULAR);
1886 bfpath_free(bfp);
1887 }
1888
1889 bogotune_init();
1890
1891 if (ds_flag == DS_DSK)
1892 load_wordlist(load_hook, train);
1893
1894 /* if encoding not yet set, assume old style */
1895 if (encoding == E_UNKNOWN)
1896 encoding = E_RAW;
1897
1898 if (bogolex_file != NULL)
1899 bogolex();
1900 else
1901 bogotune();
1902
1903 bogotune_free();
1904
1905 if (ds_flag == DS_DSK)
1906 ds_cleanup(env);
1907
1908 exit(exitcode);
1909 }
1910
1911 /* End */
1912