1 /*****************************************************************************
2 
3 NAME:
4    bogotune.c -- determine optimal parameters for bogofilter
5 
6 AUTHORS:
7    Greg Louis - perl version
8    David Relson - C version
9    Matthias Andree - options parser usability
10 
11 ******************************************************************************/
12 
13 /* To allow tuning of large corpora, the memory use must be minimized
14 ** for each messages.  Given the wordlist in ram, foreach token of a
15 ** test message bogotune only needs to know its spam and ham counts.
16 **
17 ** 1. external wordlist ("-d") flag
18 **    a. read wordlist.db
19 **    b. read messages for test set
20 **       1. lookup words in wordlist
21 **       2. discard words
22 **    c. replace wordhashs with wordcnts
23 **    d. de-allocate resident wordlist
24 **
25 ** 2. internal wordlist ("-D") flag
26 **    a. read all messages
27 **    b. distribute messages
28 **        1. proper pct to wordlist
29 **        2. proper pct to test set
30 ** 	  2a. create wordprops from wordlists
31 **    c. replace wordprops with wordcnts
32 **    d. de-allocate resident wordlist
33 */
34 
35 /* Limitations:
36 **
37 **	If all the input messages are in msg-count format,
38 **	bogotune will use default ROBX value since an external
39 **	wordlist is needed to compute a real robx value.
40 */
41 
42 #include "common.h"
43 
44 #include <math.h>
45 #include <ctype.h>
46 #include <errno.h>
47 #include <stdlib.h>
48 #include <sys/types.h>
49 #include <sys/stat.h>
50 
51 #include "bogotune.h"
52 
53 #include "bogoconfig.h"
54 #include "bogoreader.h"
55 #include "bool.h"
56 #include "collect.h"
57 #include "datastore.h"
58 #include "longoptions.h"
59 #include "msgcounts.h"
60 #include "mime.h"
61 #include "mxcat.h"
62 #include "paths.h"
63 #include "robx.h"
64 #include "score.h"
65 #include "token.h"
66 #include "tunelist.h"
67 #include "wordhash.h"
68 #include "wordlists.h"
69 #include "xmalloc.h"
70 #include "xstrdup.h"
71 
72 #undef	HAM_CUTOFF	/* ignore value in score.h */
73 #undef	SPAM_CUTOFF	/* ignore value in score.h */
74 
75 #define	TEST_COUNT	500u	/* minimum allowable message count */
76 #define	LIST_COUNT	2000u	/* minimum msg count in tunelist   */
77 #define	PREF_COUNT	4000u	/* preferred message count         */
78 #define	LARGE_COUNT	40000u
79 
80 #define	HAM_CUTOFF	0.10
81 #define MIN_HAM_CUTOFF	0.10	/* minimum final ham_cutoff */
82 #define MAX_HAM_CUTOFF	0.45	/* maximum final ham_cutoff */
83 #define	MIN_CUTOFF	0.55	/* minimum cutoff  for set_thresh() */
84 #define	WARN_MIN	0.50	/* warning minimum for set_thresh() */
85 #define	WARN_MAX	0.99	/* warning maximum for set_thresh() */
86 #define	MAX_CUTOFF	0.99	/* maximum cutoff  for set_thresh() */
87 #define	SPAM_CUTOFF	0.975
88 #define FP_CUTOFF	0.999
89 #define	SCAN_CUTOFF	0.500	/* skip scans if cutoff is less or equal */
90 
91 #define	CHECK_PCT	0.01	/* for checking high scoring non-spam
92 				** and low scoring spam */
93 
94 /* bogotune's default parameters */
95 #define	DEFAULT_ROBS	ROBS	/* 0.0178 */
96 #define	DEFAULT_ROBX	ROBX	/* 0.5200 */
97 #define	DEFAULT_MIN_DEV	0.02
98 
99 /* coarse scan parms */
100 #define	MD_MIN_C	0.06	/* smallest min_dev to test */
101 #define	MD_MAX_C	0.38	/* largest  min_dev to test */
102 #define	MD_DLT_C	0.08	/* increment		    */
103 
104 /* fine scan parms */
105 #define	MD_MIN_F	0.02
106 #define	MD_MAX_F	MD_MAX_C+MD_DLT_F
107 #define	MD_DLT_F	0.014
108 
109 /* robx limits */
110 #define RX_MIN		0.4
111 #define RX_MAX		0.6
112 
113 enum e_verbosity {
114     SUMMARY	   = 1,	/* summarize main loop iterations	*/
115     TIME	   = 2, /* include timing info           	*/
116     PARMS	   = 3,	/* print parameter sets (rs, md, rx)	*/
117     SCORE_SUMMARY  = 5,	/* verbosity level for printing scores	*/
118     SCORE_DETAIL	/* verbosity level for printing scores	*/
119 };
120 
121 typedef enum e_ds_loc {
122     DS_NONE	   = 0,	/* datastore locn not specified */
123     DS_ERR	   = 1,	/* error in datastore locn spec */
124     DS_DSK	   = 2,	/* datastore on disk */
125     DS_RAM	   = 4 	/* datastore in ram  */
126 } ds_loc;
127 
128 #define	MOD(n,m)	((n) - (floor((n)/(m)))*(m))
129 #define	ROUND(m,n)	floor((m)*(n)+.5)/(n)
130 
131 #define	MIN(n)		((n)/60)
132 #define SECONDS(n)	((n) - MIN(n)*60)
133 
134 #define KEY(r)		((r)->fn + (((r)->co > 0.5) ? (r)->co : 0.99))
135 #define	ESF_SEL(a,b)	(!esf_flag ? (a) : (b))
136 
137 /* Global Variables */
138 
139 const char *progname = "bogotune";
140 static char *ds_path;			/* directory */
141 static ds_loc ds_flag = DS_NONE;
142 static void *env = NULL;
143 
144 static bool    esf_flag = true;		/* test ESF factors if true */
145 static bool    exit_zero = false;	/* non-error exits zero */
146 static const char *bogolex_file = NULL;	/* non-NULL if creating msg-count output */
147 static word_t *w_msg_count;
148 
149 static uint message_count;
150 
151 static wordhash_t *train;
152 static tunelist_t *ns_and_sp;
153 static tunelist_t *ns_msglists, *sp_msglists;
154 
155 static flhead_t *spam_files, *ham_files;
156 
157 typedef struct {
158     uint cnt;
159     double *data;
160 } data_t;
161 
162 static data_t *rsval;
163 static data_t *rxval;
164 static data_t *mdval;
165 static data_t *spexp;
166 static data_t *nsexp;
167 
168 static uint target;
169 static uint ns_cnt, sp_cnt;
170 
171 static double spex, nsex;
172 static double check_percent;		/* initial check for excessively high/low scores */
173 static double *ns_scores;
174 static double *sp_scores;
175 static double user_robx = 0.0;		/* from option '-r value' */
176 static uint   coerced_target = 0;	/* user supplied with '-T value' */
177 
178 static uint   ncnt, nsum;		/* neighbor count and sum - for gfn() averaging */
179 
180 #undef	TEST
181 
182 #ifdef	TEST
183 uint test = 0;
184 #endif
185 
186 bool fMakeCheck = false;	/* allows quick & dirty regression testing */
187 uint cMakeCheck =    50;	/* ... for 50 cycles */
188 
189 const char *logtag = NULL;
190 
191 /* Function Declarations */
192 
193 /* Function Definitions */
194 
bt_trap(void)195 static void bt_trap(void) {}
196 
get_bool(const char * name,const char * arg)197 static bool get_bool(const char *name, const char *arg)
198 {
199     bool b = str_to_bool(arg);
200     if (DEBUG_CONFIG(2))
201 	fprintf(dbgout, "%s -> %s\n", name,
202 		b ? "Yes" : "No");
203     return b;
204 }
205 
get_cnt(double fst,double lst,double amt)206 static int get_cnt(double fst, double lst, double amt)
207 {
208     int cnt = (fabs(lst - fst) + EPS) / (fabs(amt) - EPS) + 1;
209     return cnt;
210 }
211 
seq_by_amt(double fst,double lst,double amt)212 static data_t *seq_by_amt(double fst, double lst, double amt)
213 {
214     uint i;
215     data_t *val = (data_t *)xcalloc(1, sizeof(data_t));
216 
217     val->cnt = get_cnt(fst, lst, amt);
218     val->data = (double *)xcalloc(val->cnt, sizeof(double));
219 
220     for (i = 0; i < val->cnt; i += 1)
221 	val->data[i] = fst + i * amt;
222 
223     return val;
224 }
225 
seq_canonical(double fst,double amt)226 static data_t *seq_canonical(double fst, double amt)
227 {
228     uint c;
229     float v;
230     uint i = 0;
231     data_t *val = (data_t *)xcalloc(1, sizeof(data_t));
232 
233     val->data = (double *)xcalloc(5, sizeof(double));
234 
235     fst = max(fst, RX_MIN);
236     fst = min(fst, RX_MAX);
237     val->data[i++] = fst;
238     for (c = 1; c <= 2; c += 1) {
239 	v = fst - amt * c; if (v >= RX_MIN) val->data[i++] = v;
240 	v = fst + amt * c; if (v <= RX_MAX) val->data[i++] = v;
241     }
242     val->cnt = i;
243 
244     return val;
245 }
246 
seq_by_pow(double fst,double lst,double amt)247 static data_t *seq_by_pow(double fst, double lst, double amt)
248 {
249     uint i;
250     data_t *val = (data_t *)xcalloc(1, sizeof(data_t));
251 
252     val->cnt = get_cnt(fst, lst, amt);
253     val->data = (double *)xcalloc(val->cnt, sizeof(double));
254 
255     for (i = 0; i < val->cnt; i += 1)
256 	val->data[i] = pow(10, fst + i * amt);
257 
258     return val;
259 }
260 
data_free(data_t * val)261 static void data_free(data_t *val)
262 {
263     xfree(val->data); xfree(val);
264 }
265 
init_coarse(double _rx)266 static void init_coarse(double _rx)
267 {
268     rxval = seq_canonical(_rx, 0.05);
269     rsval = seq_by_pow(0.0, -2.0, -1.0);
270     mdval = seq_by_amt(MD_MIN_C, MD_MAX_C, MD_DLT_C);
271     spexp = seq_by_amt(ESF_SEL(0.0,2.0), ESF_SEL(0.0,20.0), 3.0);
272     nsexp = seq_by_amt(ESF_SEL(0.0,2.0), ESF_SEL(0.0,20.0), 3.0);
273 }
274 
init_fine(double _rs,double _md,double _rx,double _spex,double _nsex)275 static void init_fine(double _rs, double _md, double _rx,
276 		      double _spex, double _nsex)
277 {
278     double s0, s1;
279 
280     s0 = max(log10(_rs) - 0.5, -2.0);
281     s1 = min(log10(_rs) + 0.5,  0.0);
282 
283     rsval = seq_by_pow(s1, s0, -0.25);
284 
285     s0 = max(_md - 0.042, MD_MIN_F);
286     s1 = min(_md + 0.042, MD_MAX_F);
287 
288     mdval = seq_by_amt(s0, s1, MD_DLT_F);
289     rxval = seq_canonical(_rx, 0.013);
290 
291     s0 = max(_spex - 1.5,  0.0);
292     s1 = min(_spex + 1.5, 20.0);
293 
294     spexp = seq_by_amt(s0, ESF_SEL(s0,s1), 0.5);
295 
296     s0 = max(_nsex - 1.5,  0.0);
297     s1 = min(_nsex + 1.5, 20.0);
298 
299     nsexp = seq_by_amt(s0, ESF_SEL(s0,s1), 0.5);
300 }
301 
print_parms(const char * label,const char * format,data_t * data)302 static void print_parms(const char *label, const char *format, data_t *data)
303 {
304     uint i;
305     printf("  %s: %2u ", label, data->cnt);
306     for (i = 0; i < data->cnt; i += 1) {
307 	printf("%s", (i == 0) ? " (" : ", ");
308 	printf(format, data->data[i]); /* RATS: ignore */
309     }
310     printf(")\n"); fflush(stdout);
311 }
312 
print_all_parms(uint r_count)313 static void print_all_parms(uint r_count)
314 {
315     if (verbose >= PARMS) {
316 	print_parms("rsval", "%6.4f", rsval);
317 	print_parms("rxval", "%5.3f", rxval);
318 	print_parms("mdval", "%5.3f", mdval);
319 	print_parms("spexp", "%5.2f", spexp);
320 	print_parms("nsexp", "%5.2f", nsexp);
321     }
322     if (verbose >= TIME)
323 	printf("cnt: %u (rs: %u, rx: %u, md: %u spex: %u, nsex: %u)\n",
324 	       r_count, rsval->cnt, rxval->cnt, mdval->cnt, spexp->cnt, nsexp->cnt);
325 }
326 
compare_ascending(const void * const ir1,const void * const ir2)327 static int compare_ascending(const void *const ir1, const void *const ir2)
328 {
329     const double d1 = *(const double *)ir1;
330     const double d2 = *(const double *)ir2;
331 
332     if (d1 - d2 > 0.0) return  1;
333     if (d2 - d1 > 0.0) return -1;
334 
335     return 0;
336 }
337 
compare_descending(const void * const ir1,const void * const ir2)338 static int compare_descending(const void *const ir1, const void *const ir2)
339 {
340     const double d1 = *(const double *)ir1;
341     const double d2 = *(const double *)ir2;
342 
343     if (d1 - d2 > 0.0) return -1;
344     if (d2 - d1 > 0.0) return +1;
345 
346     return 0;
347 }
348 
349 #define CO(r) ((r->co > 0.5) ? r->co : 0.99)
350 
compare_results(const void * const ir1,const void * const ir2)351 static int compare_results(const void *const ir1, const void *const ir2)
352 {
353     result_t const *r1 = (result_t const *)ir1;
354     result_t const *r2 = (result_t const *)ir2;
355 
356     if (KEY(r1) > KEY(r2)) return  1;
357     if (KEY(r2) > KEY(r1)) return -1;
358 
359     /* Favor cutoffs > 0.5 */
360     if (CO(r1) > CO(r2) ) return  1;
361     if (CO(r2) > CO(r1) ) return -1;
362 
363     if (r1->idx > r2->idx ) return  1;
364     if (r2->idx > r1->idx ) return -1;
365 
366     return 0;
367 }
368 
369 /* Score all non-spam */
370 
score_ns(double * results)371 static void score_ns(double *results)
372 {
373     uint i;
374     uint count = 0;
375 
376     if (verbose >= SCORE_DETAIL)
377 	printf("ns:\n");
378 
379     verbose = -verbose;		/* disable bogofilter debug output */
380     for (i = 0; i < COUNTOF(ns_msglists->u.sets); i += 1) {
381 	mlhead_t *list = ns_msglists->u.sets[i];
382 	mlitem_t *item;
383 	for (item = list->head; item != NULL; item = item->next) {
384 	    wordhash_t *wh = item->wh;
385 	    double score = msg_compute_spamicity(wh);
386 	    results[count++] = score;
387 	    if ( -verbose == SCORE_DETAIL ||
388 		(-verbose >= SCORE_DETAIL && EPS < score && score < 1 - EPS))
389 		printf("%6u %0.16f\n", count-1, score);
390 	}
391     }
392     verbose = -verbose;		/* enable bogofilter debug output */
393 
394     qsort(results, count, sizeof(double), compare_descending);
395 
396     return;
397 }
398 
check_for_high_ns_scores(void)399 static bool check_for_high_ns_scores(void)
400 {
401     uint t = ceil(ns_cnt * check_percent);
402 
403     score_ns(ns_scores);	/* scores in descending order */
404 
405     /* want at least 1 high scoring non-spam for FP determination */
406     if (ns_scores[t-1] < SPAM_CUTOFF)
407 	return false;
408 
409     if (!quiet) {
410 	fprintf(stderr,
411 		"Warning: test messages include many high scoring nonspam.\n");
412 	fprintf(stderr,
413 		"         You may wish to reclassify them and rerun.\n");
414     }
415 
416     return true;
417 }
418 
419 /* Score all spam to determine false negative counts */
420 
score_sp(double * results)421 static void score_sp(double *results)
422 {
423     uint i;
424     uint count = 0;
425 
426     if (verbose >= SCORE_DETAIL)
427 	printf("sp:\n");
428 
429     verbose = -verbose;		/* disable bogofilter debug output */
430     for (i = 0; i < COUNTOF(sp_msglists->u.sets); i += 1) {
431 	mlhead_t *list = sp_msglists->u.sets[i];
432 	mlitem_t *item;
433 	for (item = list->head; item != NULL; item = item->next) {
434 	    wordhash_t *wh = item->wh;
435 	    double score = msg_compute_spamicity(wh);
436 	    results[count++] = score;
437 	    if ( -verbose == SCORE_DETAIL ||
438 		(-verbose >= SCORE_DETAIL && EPS < score && score < 1 - EPS))
439 		printf("%6u %0.16f\n", count-1, score);
440 	}
441     }
442     verbose = -verbose;		/* enable bogofilter debug output */
443 
444     qsort(results, count, sizeof(double), compare_ascending);
445 
446     return;
447 }
448 
get_fn_count(uint count,double * results)449 static uint get_fn_count(uint count, double *results)
450 {
451     uint i;
452     uint fn = 0;
453 
454     for (i = 0; i < count; i += 1) {
455 	double score = results[i];
456 	if (score < spam_cutoff)
457 	    fn += 1;
458     }
459 
460     return fn;
461 }
462 
check_for_low_sp_scores(void)463 static bool check_for_low_sp_scores(void)
464 {
465     uint t = ceil(sp_cnt * check_percent);
466 
467     score_sp(sp_scores);	/* get scores */
468 
469     /* low scoring spam may cause problems ... */
470     if (sp_scores[t-1] > HAM_CUTOFF)
471 	return false;
472 
473     if (!quiet) {
474 	fprintf(stderr,
475 		"Warning: test messages include many low scoring spam.\n");
476 	fprintf(stderr,
477 		"         You may wish to reclassify them and rerun.\n");
478     }
479 
480     return true;
481 }
482 
scoring_error(void)483 static void scoring_error(void)
484 {
485     int i;
486 
487     if (quiet)
488 	return;
489 
490     printf("    high ham scores:\n");
491     for (i = 0; i < 10 && ns_scores[i] > SPAM_CUTOFF; i += 1)
492 	printf("      %2d %8.6f\n", i+1, ns_scores[i]);
493 
494     printf("    low spam scores:\n");
495     for (i = 0; i < 10 && sp_scores[i] < HAM_CUTOFF; i += 1)
496 	printf("      %2d %8.6f\n", i+1, sp_scores[i]);
497 }
498 
499 #ifdef	TEST
flag(uint idx,uint cnt,uint dlt)500 static char flag(uint idx, uint cnt, uint dlt)
501 {
502     if (dlt == 0)
503 	return ' ';
504     if (idx < cnt)
505 	return '+';
506     else
507 	return '-';
508 }
509 #endif
510 
511 #ifdef	TEST
print_ns_scores(uint beg,uint cnt,uint dlt)512 static void print_ns_scores(uint beg, uint cnt, uint dlt)
513 {
514     uint i, m = min(cnt + dlt, ns_cnt);
515 
516     printf("ns:\n");
517     for (i = beg; i <= m; i += 1)
518 	printf("    %3d %0.16f %c\n", i+1, ns_scores[i], flag(i, cnt, dlt));
519 }
520 #endif
521 
522 #ifdef	TEST
print_sp_scores(uint beg,uint cnt,uint dlt)523 static void print_sp_scores(uint beg, uint cnt, uint dlt)
524 
525 {
526     uint i, m = min(cnt + dlt, sp_cnt);
527 
528     printf("sp:\n");
529     for (i = beg; i <= m; i += 1)
530 	printf("    %3d %0.16f %c\n", i+1, sp_scores[i], flag(i, cnt, dlt));
531 }
532 #endif
533 
scale(uint cnt,uint lo,uint hi,double beg,double end)534 static double scale(uint cnt, uint lo, uint hi, double beg, double end)
535 {
536     double ans;
537 
538     if (cnt < lo)
539 	return beg;
540     if (cnt > hi)
541 	return end;
542 
543     ans = beg + (end - beg) * (cnt - lo ) / (hi - lo);
544 
545     return ans;
546 }
547 
548 /* compute scores and set global variables:
549 **
550 **	 target
551 **	 spam_cutoff
552 **
553 ** As count increases from 500 to 4000 ...
554 **	1) initial target percent drops from 1% to 0.25%
555 **	2) initial minimum target increases from 5 to 10
556 */
557 
set_thresh(uint count,double * scores)558 static void set_thresh(uint count, double *scores)
559 {
560     uint   ftarget = 0;
561     double cutoff, lgc;
562 
563     score_ns(scores);		/* get scores */
564 
565 /*
566 **	Use parabolic curve to fit data
567 **	message counts: (70814, 12645, 2118, 550)
568 **	target values:  (   22,    18,    8,   1)
569 */
570     lgc = log(count);
571 
572     ftarget = max(ROUND(((-0.4831 * lgc) + 12.8976) * lgc - 61.5264, 1.0), 1);
573     while (1) {
574 	cutoff = ns_scores[ftarget-1];
575 	if (verbose >= PARMS)
576 	    printf("m:  cutoff %8.6f, ftarget %u\n", cutoff, ftarget);
577 	if (ftarget == 1 || cutoff >= MIN_CUTOFF)
578 	    break;
579 	ftarget -= 1;
580     }
581 
582     /* ensure cutoff is below SPAM_CUTOFF */
583     if (cutoff > SPAM_CUTOFF) {
584  	while (cutoff > SPAM_CUTOFF && ++ftarget < count) {
585  	    cutoff = scores[ftarget-1];
586 	    if (verbose >= PARMS)
587 		printf("s:  cutoff %8.6f, ftarget %u%%\n", cutoff, ftarget);
588 	}
589  	cutoff = SPAM_CUTOFF;
590 	--ftarget;
591 	if (verbose >= PARMS)
592 	    printf("s:  cutoff %8.6f, ftarget %u%%\n", cutoff, ftarget);
593     }
594 
595     if (cutoff < WARN_MIN || cutoff > WARN_MAX) {
596 	if (!quiet) {
597 	    fprintf(stderr,
598 		    "%s high-scoring non-spams in this data set.\n",
599 		    (cutoff < WARN_MIN) ? "Too few" : "Too many");
600 	    fprintf(stderr,
601 		    "At target %u, cutoff is %8.6f.\n", ftarget, cutoff);
602 	}
603     }
604 
605 #ifdef	TEST
606     if (verbose >= PARMS)
607 	print_ns_scores(ftarget-4, ftarget+4, 0);
608 #endif
609 
610     target = ftarget;
611     spam_cutoff = cutoff;
612 
613     return;
614 }
615 
init_count(void)616 static void init_count(void)
617 {
618     message_count = 0;
619 }
620 
print_final_count(void)621 static void print_final_count(void)
622 {
623     if (verbose) {
624 	if (!fMakeCheck)
625 	    printf("\r              \r");
626 	printf("%u messages\n", message_count);
627 	fflush(stdout);
628     }
629 }
630 
update_count(void)631 static int update_count(void)
632 {
633     message_count += 1;
634 
635     if (verbose && (message_count % 100) == 0 && !fMakeCheck) {
636 	if ((message_count % 1000) != 0)
637 	    putchar('.');
638 	else
639 	    printf("\r              \r%u ", message_count );
640 	fflush(stdout);
641     }
642     return message_count;
643 }
644 
calc_db_cachesize(void)645 static unsigned int calc_db_cachesize(void)
646 {
647     struct stat fst;
648     if (!stat(ds_path, &fst)) {
649 	int dbc = ceil((double)fst.st_size / (3 * 1024 * 1024));
650         return ((unsigned int)dbc);
651     } else {
652         fprintf(stderr, "Unable to stat %s\n", ds_path);
653         exit (EX_ERROR);
654     }
655 }
656 
load_wordlist(ds_foreach_t * hook,void * userdata)657 static void load_wordlist(ds_foreach_t *hook, void *userdata)
658 {
659     bfpath *bfp = bfpath_create(ds_path);
660 
661     if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {
662 	fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);
663 	exit(EX_ERROR);
664     }
665 
666     if (verbose) {
667 	printf("Reading %s\n", ds_path);
668 	fflush(stdout);
669     }
670 
671     ds_oper(env, bfp, DS_READ, hook, userdata);
672 
673     bfpath_free(bfp);
674 
675     return;
676 }
677 
load_hook(word_t * key,dsv_t * data,void * userdata)678 static ex_t load_hook(word_t *key, dsv_t *data, void *userdata)
679 {
680     wordprop_t *tokenprop = (wordprop_t *)wordhash_insert(train, key, sizeof(wordprop_t), &wordprop_init);
681 
682     (void) userdata;	/* quiet compiler complaint */
683 
684     tokenprop->cnts.bad = data->spamcount;
685     tokenprop->cnts.good = data->goodcount;
686 
687     if (word_cmps(key, ".MSG_COUNT") == 0)
688 	set_msg_counts(data->goodcount, data->spamcount);
689 
690     if (word_cmps(key, ".ENCODING") == 0) {
691 	if (encoding == E_UNKNOWN)
692 	    encoding = (e_enc)data->spamcount; /* FIXME: is this cast correct? */
693 	if (encoding != data->spamcount) {
694 	    fprintf(stderr, "Can't mix database encodings, i.e. utf-8 and any other.\n");
695 	    exit(EX_ERROR);
696 	}
697     }
698 
699     return EX_OK;
700 }
701 
set_train_msg_counts(wordhash_t * wh)702 static void set_train_msg_counts(wordhash_t *wh)
703 {
704     (void)wordhash_insert(wh, w_msg_count, sizeof(wordprop_t), NULL);
705 
706     if (msgs_good == 0 && msgs_bad == 0) {
707 	fprintf(stderr, "Can't find '.MSG_COUNT'.\n");
708 	exit(EX_ERROR);
709     }
710 }
711 
712 /* write_msgcount_file()
713 **
714 **	Create a message count file from the original messages
715 */
716 
print_msgcount_entry(const char * token,uint bad,uint good)717 static void print_msgcount_entry(const char *token, uint bad, uint good)
718 {
719     printf("\"%s\" %u %u\n", token, bad, good);
720 }
721 
write_msgcount_file(wordhash_t * wh)722 static void write_msgcount_file(wordhash_t *wh)
723 {
724     hashnode_t *hn;
725 
726     print_msgcount_entry(".MSG_COUNT", msgs_bad, msgs_good);
727 
728     for (hn = (hashnode_t *)wordhash_first(wh); hn != NULL; hn = (hashnode_t *)wordhash_next(wh)) {
729 	word_t *token = hn->key;
730 	wordprop_t *wp = (wordprop_t *) hn->data;
731 	wordcnts_t *cnts = &wp->cnts;
732 
733 	if (cnts->good == 0 && cnts->bad == 0) {
734 	    wp = (wordprop_t *)wordhash_search(train, token, 0);
735 	    if (wp) {
736 		cnts->good = wp->cnts.good;
737 		cnts->bad  = wp->cnts.bad;
738 	    }
739 	}
740 
741 	print_msgcount_entry((char *)token->u.text, cnts->bad, cnts->good);
742     }
743 
744     return;
745 }
746 
read_mailbox(const char * arg,mlhead_t * msgs)747 static uint read_mailbox(const char *arg, mlhead_t *msgs)
748 {
749     if (verbose) {
750 	printf("Reading %s\n", arg);
751 	fflush(stdout);
752     }
753 
754     init_count();
755     mbox_mode = true;
756     bogoreader_init(1, &arg);
757     while ((*reader_more)()) {
758 	wordhash_t *whp = NULL;
759 	wordhash_t *whc = wordhash_new();
760 
761 	collect_words(whc);
762 
763 	if (ds_path != NULL && (msgs_good + msgs_bad) == 0)
764 	    set_train_msg_counts(whc);
765 
766 	if (whc->count == 0 && !quiet) {
767 	    printf("msg #%u, count is %u\n", message_count, whc->count);
768 	    bt_trap();
769 	}
770 
771 	if (bogolex_file != NULL) {
772 	    wordhash_sort(whc);
773 	    lookup_words(whc);
774 	    write_msgcount_file(whc);
775 	}
776 	else if (whc->count != 0) {
777 	    if (!msg_count_file)
778 		whp = convert_wordhash_to_propslist(whc, train);
779 	    else
780 		whp = convert_propslist_to_countlist(whc);
781 	    msglist_add(msgs, whp);
782 	}
783 
784 	update_count();
785 
786 	if (whc != whp)
787 	    wordhash_free(whc);
788     }
789 
790     print_final_count();
791 
792     ns_and_sp->count += message_count;
793     bogoreader_fini();
794 
795     return message_count;
796 }
797 
filelist_read(int mode,flhead_t * list)798 static uint filelist_read(int mode, flhead_t *list)
799 {
800     uint count = 0;
801     flitem_t *item;
802     mlhead_t *msgs = (mode == REG_GOOD) ? ns_msglists->msgs : sp_msglists->msgs;
803     run_type = (run_t)mode;
804 
805     for (item = list->head; item != NULL; item = item->next) {
806 	lexer = NULL;
807 	msg_count_file = false;
808 	count += read_mailbox(item->name, msgs);
809     }
810     return count;
811 }
812 
813 /* distribute()
814 **
815 **	Proportionally distribute messages between training and scoring sets.
816 **
817 **   Method:
818 **	If only 2500 messages, use 2000 for training and 500 for scoring.
819 **	If over 4000 messages, use equal numbers for training and scoring.
820 **	In between 2500 and 4000, do a proportional distribution.
821 */
822 
distribute(int mode,tunelist_t * ns_or_sp)823 static void distribute(int mode, tunelist_t *ns_or_sp)
824 {
825     int good = mode == REG_GOOD;
826     int bad  = 1 - good;
827 
828     bool divvy = ds_flag == DS_RAM && user_robx < EPS && !msg_count_file;
829 
830     mlitem_t *item;
831     mlhead_t *msgs = ns_or_sp->msgs;
832 
833     int score_count = 0;
834     int train_count = 0;
835 
836     static int train_good = 0;
837     static int train_bad  = 0;
838 
839     double ratio = scale(msgs->count,
840 			 LIST_COUNT + TEST_COUNT,	/* small count */
841 			 LIST_COUNT + LIST_COUNT,	/* large count */
842 			 LIST_COUNT / TEST_COUNT,	/* small ratio */
843 			 LIST_COUNT / LIST_COUNT);	/* large ratio */
844 
845     for (item = msgs->head; item != NULL; item = item->next) {
846 	wordhash_t *wh = item->wh;
847 
848 	/* training set */
849 	if (divvy && train_count / ratio < score_count + 1) {
850 	    wordhash_set_counts(wh, good, bad);
851 	    wordhash_add(train, wh, &wordprop_init);
852 	    train_count += 1;
853 	    wordhash_free(wh);
854 	    train_good += good;
855 	    train_bad  += bad;
856 	}
857 	/* scoring set  */
858 	else {
859 	    uint bin = divvy ? MOD(score_count,3) : 0;
860 	    msglist_add(ns_or_sp->u.sets[bin], wh);
861 	    score_count += 1;
862 	}
863 	item->wh = NULL;
864     }
865 
866     if (divvy) {
867 	wordhash_insert(train, w_msg_count, sizeof(wordprop_t), &wordprop_init);
868 	set_msg_counts(train_good, train_bad);
869     }
870 
871     if (verbose > 1)
872 	printf("%s:  train_count = %d, score_count = %d\n",
873 	       good ? "ns" : "sp",
874 	       train_count, score_count);
875 
876     return;
877 }
878 
create_countlists(tunelist_t * ns_or_sp)879 static void create_countlists(tunelist_t *ns_or_sp)
880 {
881     uint i;
882     uint c = COUNTOF(ns_or_sp->u.sets);
883     for (i = 0; i < c; i += 1) {
884 	mlhead_t *list = ns_or_sp->u.sets[i];
885 	mlitem_t *item;
886 
887 	for (item = list->head; item != NULL; item = item->next) {
888 	    wordhash_t *who = item->wh;
889 	    wordhash_t *whn = convert_propslist_to_countlist(who);
890 	    if (whn != who) {
891 		wordhash_free(who);
892 		item->wh = whn;
893 	    }
894 	}
895     }
896 
897     return;
898 }
899 
print_version(void)900 static void print_version(void)
901 {
902     (void)fprintf(stderr,
903 		  "%s version %s\n"
904 		  "    Database: %s\n"
905 		  "Copyright (C) 2002-2006 Greg Louis, David Relson\n\n"
906 		  "%s comes with ABSOLUTELY NO WARRANTY.  "
907 		  "This is free software, and\nyou are welcome to "
908 		  "redistribute it under the General Public License.  "
909 		  "See\nthe COPYING file with the source distribution for "
910 		  "details.\n"
911 		  "\n",
912 		  progtype, version, ds_version_str(), PACKAGE);
913 }
914 
help(void)915 static void help(void)
916 {
917     (void)fprintf(stderr,
918 		  "Usage:  %s [options] { -c config } { -d directory } -n non-spam-file -s spam-file\n",
919 		  progname);
920     (void)fprintf(stderr,
921 		  "\t  -h      - print this help message.\n"
922 		  "\t  -C      - don't read standard config files.\n"
923 		  "\t  -c file - read specified config file.\n"
924 		  "\t  -D      - don't read a wordlist file.\n"
925 		  "\t  -d path - specify directory for wordlists.\n"
926 		  "\t  -E      - disable ESF (effective size factor) tuning.\n"
927 		  "\t  -M file - rewrite input file in message count format.\n"
928 		  "\t  -r num  - specify robx value\n");
929     (void)fprintf(stderr,
930 		  "\t  -T num  - specify fp target value\n"
931 		  "\t  -s file1 file2 ... - spam files\n"
932 		  "\t  -n file1 file2 ... - non-spam files\n"
933 		  "\t  -v      - increase level of verbose messages\n"
934 		  "\t  -q      - quiet (suppress warnings)\n"
935 	);
936     (void)fprintf(stderr,
937 		  "\n"
938 		  "%s (version %s) is part of the bogofilter package.\n",
939 		  progname, version);
940 }
941 
942 static struct option longopts_bogotune[] = {
943     /* longoptions.h - common options */
944     LONGOPTIONS_COMMON
945     LONGOPTIONS_MAIN_TUNE
946     /* longoptions.h - bogofilter/-lexer options */
947     LONGOPTIONS_LEX
948     /* end of list */
949     { NULL,				0, 0, 0 }
950 };
951 
process_arglist(int argc,char ** argv)952 static int process_arglist(int argc, char **argv)
953 {
954     int  count = 1;
955 
956     bulk_mode = B_CMDLINE;
957 
958 #ifdef __EMX__
959     _response (&argc, &argv);	/* expand response files (@filename) */
960     _wildcard (&argc, &argv);	/* expand wildcards (*.*) */
961 #endif
962 
963 #define	OPTIONS	":-:c:Cd:DeEM:n:qr:s:tT:vVx:"
964 
965     while (1)
966     {
967 	int option;
968 	int option_index = 0;
969 	const char *val;
970 
971 	option = getopt_long_chk(argc, argv, OPTIONS,
972 			     longopts_bogotune, &option_index);
973 
974 	if (option == -1)
975  	    break;
976 
977 	val = optarg;
978 	process_arg(option, NULL, val, PR_NONE, PASS_1_CLI);
979     }
980 
981     if (ds_flag == DS_NONE)	/* default is "wordlist on disk" */
982 	ds_flag = DS_DSK;
983 
984     if (ds_flag == DS_ERR) {
985 	fprintf(stderr, "Only one '-d dir' or '-D' option is allowed.\n");
986 	exit(EX_ERROR);
987     }
988 
989     if (bogolex_file == NULL &&
990 	(spam_files->count == 0 || ham_files->count == 0)) {
991 	fprintf(stderr,
992 		"Bogotune needs both non-spam and spam message sets for its parameter testing.\n");
993 	exit(EX_ERROR);
994     }
995 
996     if (!suppress_config_file)
997 	process_config_files(false, longopts_bogotune);
998 
999     return count;
1000 }
1001 
process_arg(int option,const char * name,const char * val,priority_t precedence,arg_pass_t pass)1002 int process_arg(int option, const char *name, const char *val, priority_t precedence, arg_pass_t pass)
1003 {
1004     static int lastmode = -1;
1005 
1006     (void) name;	/* suppress compiler warning */
1007     (void) precedence; 	/* suppress compiler warning */
1008     (void) pass; 	/* suppress compiler warning */
1009 
1010     if (option == 1) {
1011 	/* If getopt's RETURN_IN_ORDER behavior */
1012 	switch (lastmode) {
1013 	case 'n':
1014 	case 's':
1015 	    option = lastmode;
1016 	    break;
1017 	default:
1018 	    fprintf(stderr,
1019 		    "File names may only be given after -n or -s options.\n");
1020 	}
1021     }
1022 
1023     switch (option) {
1024     case 'c':
1025 	read_config_file(val, false, false, PR_CFG_USER, longopts_bogotune);
1026 	/*@fallthrough@*/
1027 	/* fall through to suppress reading config files */
1028 
1029     case 'C':
1030 	suppress_config_file = true;
1031 	break;
1032 
1033     case 'd':
1034 	ds_path = xstrdup(val);
1035 	ds_flag = (ds_flag == DS_NONE) ? DS_DSK : DS_ERR;
1036 	break;
1037 
1038     case 'D':
1039 	ds_flag = (ds_flag == DS_NONE) ? DS_RAM : DS_ERR;
1040 	break;
1041 
1042     case 'e':
1043 	exit_zero = true;
1044 	break;
1045 
1046     case 'E':
1047 	esf_flag ^= true;
1048 	break;
1049 
1050     case 'M':
1051 	bogolex_file = val;
1052 	break;
1053 
1054     case 'n':
1055 	lastmode = 'n';
1056 	filelist_add(ham_files, val);
1057 	break;
1058 
1059     case 'q':
1060 	quiet = true;
1061 	break;
1062 
1063     case 'r':
1064 	user_robx = atof(val);
1065 	break;
1066 
1067     case 's':
1068 	lastmode = 's';
1069 	filelist_add(spam_files, val);
1070 	break;
1071 
1072 #ifdef	TEST
1073     case 't':
1074 	test += 1;
1075 	break;
1076 #endif
1077     case 'T':
1078 	coerced_target = atoi(val);
1079 	break;
1080 
1081     case 'v':
1082 	verbose += 1;
1083 	break;
1084 
1085     case 'V':
1086 	print_version();
1087 	exit(EX_OK);
1088 
1089     case 'x':
1090 	if (strcmp(val, "MakeCheck") == 0)
1091 	    fMakeCheck = true;
1092 	else
1093 	    set_debug_mask(val);
1094 	break;
1095 
1096     case O_MAX_TOKEN_LEN:
1097 	max_token_len = atoi(val);
1098 	break;
1099 
1100     case O_MIN_TOKEN_LEN:
1101 	min_token_len = atoi(val);
1102 	break;
1103 
1104     case O_MAX_MULTI_TOKEN_LEN:
1105 	max_multi_token_len=atoi(val);
1106 	break;
1107 
1108     case O_MULTI_TOKEN_COUNT:
1109 	multi_token_count=atoi(val);
1110 	break;
1111 
1112     case O_BLOCK_ON_SUBNETS:
1113 	block_on_subnets = get_bool(name, val);
1114 	break;
1115 
1116     case O_REPLACE_NONASCII_CHARACTERS:
1117 	replace_nonascii_characters = get_bool(name, val);
1118 	break;
1119 
1120     case O_TOKEN_COUNT_FIX:
1121 	token_count_fix = atoi(val);
1122 	break;
1123 
1124     case O_TOKEN_COUNT_MIN:
1125 	token_count_min = atoi(val);
1126 	break;
1127 
1128     case O_TOKEN_COUNT_MAX:
1129 	token_count_max = atoi(val);
1130 	break;
1131 
1132     default:
1133 	help();
1134 	exit(EX_ERROR);
1135     }
1136 
1137     return 0;
1138 }
1139 
get_robx(void)1140 static double get_robx(void)
1141 {
1142     double rx;
1143 
1144     if (user_robx > 0.0)
1145 	rx = user_robx;
1146     else if (ds_flag == DS_DSK)	{
1147 	printf("Calculating initial x value...\n");
1148 	verbose = -verbose;		/* disable bogofilter debug output */
1149 	rx = compute_robinson_x();
1150 	verbose = -verbose;		/* enable bogofilter debug output */
1151     }
1152     else
1153 	rx = ROBX;
1154 
1155     if (rx > RX_MAX) rx = RX_MAX;
1156     if (rx < RX_MIN) rx = RX_MIN;
1157 
1158     printf("Initial x value is %8.6f\n", rx);
1159 
1160     return rx;
1161 }
1162 
results_sort(uint r_count,result_t * results)1163 static result_t *results_sort(uint r_count, result_t *results)
1164 {
1165     result_t *ans = (result_t *)xcalloc(r_count, sizeof(result_t));
1166     memcpy(ans, results, r_count * sizeof(result_t));
1167     qsort(ans, r_count, sizeof(result_t), compare_results);
1168     return ans;
1169 }
1170 
top_ten(result_t * sorted,uint n)1171 static void top_ten(result_t *sorted, uint n)
1172 {
1173     uint i, j;
1174     bool f;
1175 
1176     printf("Top ten parameter sets from this scan:\n");
1177 
1178     printf("        rs     md    rx    spesf    nsesf    co     fp  fn   fppc   fnpc\n");
1179     for (f = false; !f; f = true) {
1180       for (i = j = 0; i < 10 && j < n;) {
1181  	result_t *r = &sorted[j++];
1182  	if (!f && r->fp != target) continue;
1183 	sp_esf = ESF_SEL(sp_esf, pow(0.75, r->sp_exp));
1184 	ns_esf = ESF_SEL(ns_esf, pow(0.75, r->ns_exp));
1185 
1186 	printf("%5u %6.4f %5.3f %5.3f %8.6f %8.6f %6.4f  %3u %3u  %6.4f %6.4f\n",
1187 	       r->idx, r->rs, r->md, r->rx, sp_esf, ns_esf, r->co,
1188 	       r->fp, r->fn, r->fp*100.0/ns_cnt, r->fn*100.0/sp_cnt);
1189 	++i;
1190       }
1191       if (i) break;
1192       printf("Warning: fp target not met, using original results\n");
1193     }
1194 
1195     printf("\n");
1196     fflush(stdout);
1197 
1198     return;
1199 }
1200 
1201 /* get false negative */
1202 
gfn(result_t * results,uint rsi,uint mdi,uint rxi,uint spi,uint nsi)1203 static uint gfn(result_t *results,
1204 	       uint rsi, uint mdi, uint rxi,
1205 	       uint spi, uint nsi)
1206 {
1207     uint i = (((rsi * mdval->cnt + mdi) * rxval->cnt + rxi) * spexp->cnt + spi) * nsexp->cnt + nsi;
1208     result_t *r = &results[i];
1209     uint fn = r->fn;
1210     if (r->fp != target) return INT_MAX;
1211     if (verbose > 100)
1212 	printf("   %2u, %2u, %2u, %2u, %2u, %2u\n",
1213 	       rsi, mdi, rxi, spi, nsi, fn);
1214     ncnt += 1;
1215     nsum += fn;
1216     return fn;
1217 }
1218 
count_outliers(uint r_count,result_t * sorted,result_t * unsorted)1219 static result_t *count_outliers(uint r_count, result_t *sorted, result_t *unsorted)
1220 {
1221     bool f = false;
1222     uint i, j = 0, o = 0;
1223     uint rsi, mdi, rxi, spi, nsi;
1224     uint rsc = rsval->cnt - 1;
1225     uint rxc = rxval->cnt - 1;
1226     uint mdc = mdval->cnt - 1;
1227     uint spc = spexp->cnt - 1;
1228     uint nsc = nsexp->cnt - 1;
1229 
1230     result_t *r = NULL;					/* quench bogus compiler warning */
1231     uint q33 = sorted[r_count * 33 / 100].fn;		/* 33% quantile */
1232     uint med = sorted[r_count * 50 / 100].fn;		/* median false negative */
1233 
1234     if (verbose)
1235 	printf("%u%% fn count was %u\n", 50u, med);
1236 
1237     for (i = 0; i < r_count; i += 1) {
1238 	r = &sorted[i];
1239 	if (r->fp != target) continue;
1240 	if (j == 0) j = i+1;
1241 	if (fMakeCheck && j >= cMakeCheck) break;
1242 	rsi = r->rsi; mdi = r->mdi; rxi = r->rxi; spi = r->spi; nsi = r->nsi;
1243 	ncnt = nsum = 0;
1244 	if (((rsi == 0   ||
1245 	      (gfn(unsorted, rsi-1, mdi, rxi, spi, nsi)) < med)) &&
1246 	    ((rsi == rsc ||
1247 	      (gfn(unsorted, rsi+1, mdi, rxi, spi, nsi)) < med)) &&
1248 	    ((mdi == 0   ||
1249 	      (gfn(unsorted, rsi, mdi-1, rxi, spi, nsi)) < med)) &&
1250 	    ((mdi == mdc ||
1251 	      (gfn(unsorted, rsi, mdi+1, rxi, spi, nsi)) < med)) &&
1252 	    ((rxi == 0   ||
1253 	      (gfn(unsorted, rsi, mdi, rxi-1, spi, nsi)) < med)) &&
1254 	    ((rxi == rxc ||
1255 	      (gfn(unsorted, rsi, mdi, rxi+1, spi, nsi)) < med)) &&
1256 	    ((spi == 0   ||
1257 	      (gfn(unsorted, rsi, mdi, rxi, spi-1, nsi)) < med)) &&
1258 	    ((spi == spc ||
1259 	      (gfn(unsorted, rsi, mdi, rxi, spi+1, nsi)) < med)) &&
1260 	    ((nsi == 0   ||
1261 	      (gfn(unsorted, rsi, mdi, rxi, spi, nsi-1)) < med)) &&
1262 	    ((nsi == nsc ||
1263 	      (gfn(unsorted, rsi, mdi, rxi, spi, nsi+1)) < med)) &&
1264 	    (ncnt != 0 && nsum / ncnt <  q33))
1265 	{
1266 	    f = true;
1267 	    break;
1268 	}
1269 	o++;
1270     }
1271 
1272     if (o > 0) {
1273 	printf("%u outlier%s encountered.                                                   \n",
1274 	       o, (o > 1) ? "s" : "");
1275     }
1276 
1277     if (!f) {
1278 	r = &sorted[j-1];
1279 	printf("No smooth minimum encountered, using lowest fn count (an outlier).         \n");
1280     }
1281 
1282     return r;
1283 }
1284 
progress(uint cur,uint top)1285 static void progress(uint cur, uint top)
1286 {
1287     uint i;
1288     uint ndots = ceil(70.0 * cur / top);
1289 
1290     if (quiet)
1291 	return;
1292 
1293     if (ndots < 1)
1294 	ndots = 1;
1295      printf("\r%3u [", cur);
1296      for (i = 0; i < ndots; i += 1)
1297 	 printf(".");
1298      for (i = ndots; i < 70; i += 1)
1299 	 printf(" ");
1300      printf("]");
1301      fflush(stdout);
1302 }
1303 
final_warning(void)1304 static void final_warning(void)
1305 {
1306     printf(
1307 	"The small number and/or relative uniformity of the test messages imply\n"
1308 	"that the recommended values (above), though appropriate to the test set,\n"
1309 	"may not remain valid for long.  Bogotune should be run again with more\n"
1310 	"messages when that becomes possible.\n"
1311 	);
1312 }
1313 
final_recommendations(bool skip)1314 static void final_recommendations(bool skip)
1315 {
1316     uint m;
1317     bool printed = false;
1318     uint minn[] = { 10000, 2000, 1000, 500, 1 };
1319 
1320     printf("Performing final scoring:\n");
1321 
1322     printf("Spam...  ");
1323     score_sp(sp_scores);	/* get scores (in ascending order) */
1324 
1325     printf("Non-Spam...\n");
1326     score_ns(ns_scores);	/* get scores (in descending order) */
1327 
1328     for (m=0; m<10; ++m) printf("%8.6f %8.6f\n", sp_scores[m], ns_scores[m]);
1329 
1330     if (verbose >= PARMS)
1331 	printf("# ns_cnt %u, sp_cnt %u\n", ns_cnt, sp_cnt);
1332 
1333     if (skip) {
1334 	printf("\n");
1335 	printf("### The following recommendations are provisional.\n");
1336 	printf("### Run bogotune with more messages when possible.\n");
1337 	printf("\n");
1338     }
1339 
1340     printf("\nRecommendations:\n\n");
1341     printf("---cut---\n");
1342     printf("db_cachesize=%u\n", db_cachesize);
1343 
1344     printf("robs=%6.4f\n", robs);
1345     printf("min_dev=%5.3f\n", min_dev);
1346     printf("robx=%8.6f\n", robx);
1347     printf("sp_esf=%8.6f\n", sp_esf);
1348     printf("ns_esf=%8.6f\n", ns_esf);
1349 
1350     for (m=0; m < COUNTOF(minn); m += 1) {
1351 	double cutoff;
1352 	uint i, fp = 0, fn = 0;
1353 	uint mn = minn[m];
1354 	double fpp, fnp;
1355 
1356 	if (ns_cnt < mn)
1357 	    continue;
1358 
1359 	if (mn > 1 ) {
1360 	    uint t = (ns_cnt + mn - 1) / mn;
1361 	    cutoff = ns_scores[t-1];
1362 	    if (cutoff > FP_CUTOFF)
1363 		continue;
1364 	    fp = ns_cnt / mn;
1365 	    fpp = 100.0 / mn;
1366 	}
1367 	else {
1368 	    cutoff = SPAM_CUTOFF;
1369 	    if (printed)
1370 		break;
1371 	    for (i = 0; i < ns_cnt; i += 1) {
1372 		if (ns_scores[i] >= cutoff)
1373 		    fp += 1;
1374 	    }
1375 	    cutoff = ns_scores[fp-1];
1376 	    fpp = 100.0 * fp / ns_cnt;
1377 	}
1378 
1379 	for (i = 0; i < sp_cnt; i += 1) {
1380 	    if (sp_scores[i] >= cutoff) {
1381 		fn = i;
1382 		break;
1383 	    }
1384 	}
1385 	fnp = 100.0 * fn / sp_cnt;
1386 
1387 	if (printed)  printf("#");
1388 	printf("spam_cutoff=%8.6f\t# for %4.2f%% fp (%u); expect %4.2f%% fn (%u).\n",
1389 	       cutoff, fpp, fp, fnp, fn);
1390 
1391 	printed = true;
1392 	if (skip)
1393 	    ham_cutoff = cutoff;
1394     }
1395 
1396     if (!skip) {
1397 	uint s = ceil(sp_cnt * 0.002 - 1);
1398 	ham_cutoff = sp_scores[s];
1399 	if (ham_cutoff < MIN_HAM_CUTOFF) ham_cutoff = MIN_HAM_CUTOFF;
1400 	if (ham_cutoff > MAX_HAM_CUTOFF) ham_cutoff = MAX_HAM_CUTOFF;
1401     }
1402 
1403     printf("ham_cutoff=%5.3f\t\n", ham_cutoff);
1404     printf("---cut---\n");
1405     printf("\n");
1406 
1407     if (skip)
1408 	final_warning();
1409 
1410     printf("Tuning completed.\n");
1411 }
1412 
bogotune_init(void)1413 static void bogotune_init(void)
1414 {
1415     const char *msg_count = MSG_COUNT;
1416     w_msg_count = word_news(msg_count);
1417     train       = wordhash_new();
1418     ns_and_sp   = tunelist_new("tr");		/* training lists */
1419     ns_msglists = tunelist_new("ns");		/* non-spam scoring lists */
1420     sp_msglists = tunelist_new("sp");		/* spam     scoring lists */
1421 
1422     return;
1423 }
1424 
bogotune_free(void)1425 static void bogotune_free(void)
1426 {
1427     xfree(ns_scores);
1428     xfree(sp_scores);
1429 
1430     filelist_free(ham_files);
1431     filelist_free(spam_files);
1432 
1433     tunelist_free(ns_msglists);
1434     tunelist_free(sp_msglists);
1435     tunelist_free(ns_and_sp);
1436 
1437     word_free(w_msg_count);
1438 
1439     token_cleanup();
1440     mime_cleanup();
1441 
1442     xfree(ds_path);
1443 
1444     return;
1445 }
1446 
check_msgcount_parms(void)1447 static bool check_msgcount_parms(void)
1448 {
1449     bool ok = true;
1450 
1451     if (ds_flag == DS_RAM) {
1452 	fprintf(stderr, "A wordlist directory must be specified for converting messages to the message count format.\n");
1453 	ok = false;
1454     }
1455 
1456     if (ham_files->count != 0 && spam_files->count != 0) {
1457 	fprintf(stderr, "Message count files may be created from spam or non-spam inputs but not both.\n");
1458 	fprintf(stderr, "Run bogotune once for the spam and again for the non-spam.\n");
1459 	ok = false;
1460     }
1461 
1462     return ok;
1463 }
1464 
check_msg_counts(void)1465 static bool check_msg_counts(void)
1466 {
1467     bool ok = true;
1468     double ratio;
1469 
1470     if (msgs_good < LIST_COUNT || msgs_bad < LIST_COUNT) {
1471 	if (!quiet)
1472 	    fprintf(stderr,
1473 		    "The wordlist contains %u non-spam and %u spam messages.\n"
1474 		    "Bogotune must be run with at least %u of each.\n",
1475 		    msgs_good, msgs_bad, LIST_COUNT);
1476 	ok = false;
1477     }
1478 
1479     ratio =  (double)msgs_good / (double)msgs_bad;
1480     fprintf(stderr, "wordlist's ham to spam ratio is %0.1f to 1.0\n", ratio );
1481     if ( ratio < 0.1 || ratio > 10.0) {
1482 	if (!quiet) {
1483 	    fprintf(stderr,
1484 		    "Bogotune requires the ratio be in the range of 0.1 to 10.\n");
1485 	}
1486 	ok = false;
1487     }
1488 
1489     if (ns_cnt < TEST_COUNT || sp_cnt < TEST_COUNT) {
1490 	if (!quiet)
1491 	    fprintf(stderr,
1492 		    "The messages sets contain %u non-spam and %u spam.  Bogotune "
1493 		    "requires at least %u non-spam and %u spam messages to run.\n",
1494 		    ns_cnt, sp_cnt, TEST_COUNT, TEST_COUNT);
1495 	exit(EX_ERROR);
1496     }
1497 
1498     return ok;
1499 }
1500 
show_elapsed_time(int beg,int end,uint cnt,double val,const char * lbl1,const char * lbl2)1501 static void show_elapsed_time(int beg, int end, uint cnt, double val,
1502 			      const char *lbl1, const char *lbl2)
1503 {
1504     int tm = end - beg;
1505     if (!fMakeCheck)
1506 	printf("    %dm:%02ds for %u %s.  avg: %.1f %s\n",
1507 	       MIN(tm), SECONDS(tm), cnt, lbl1, val, lbl2);
1508 }
1509 
bogolex(void)1510 static rc_t bogolex(void)
1511 {
1512     rc_t status = RC_OK;
1513 
1514     if (!check_msgcount_parms())
1515 	exit(EX_ERROR);
1516 
1517     read_mailbox(bogolex_file, NULL);
1518 
1519     return status;
1520 }
1521 
bogotune(void)1522 static rc_t bogotune(void)
1523 {
1524     bool skip;
1525     result_t *best;
1526 
1527     int beg, end;
1528     uint cnt, scan;
1529     rc_t status = RC_OK;
1530 
1531     beg = time(NULL);
1532 
1533     ham_cutoff = 0.0;
1534     spam_cutoff = 0.1;
1535 
1536     /* Note: memory usage highest while reading messages */
1537     /* usage decreases as distribute() converts to count format */
1538 
1539     /* read all messages, merge training sets, look up scoring sets */
1540     ns_cnt = filelist_read(REG_GOOD, ham_files);
1541     sp_cnt = filelist_read(REG_SPAM, spam_files);
1542     cnt = ns_cnt + sp_cnt;
1543 
1544     end = time(NULL);
1545     if (verbose >= TIME) {
1546 	show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");
1547     }
1548 
1549     distribute(REG_GOOD, ns_msglists);
1550     distribute(REG_SPAM, sp_msglists);
1551 
1552     create_countlists(ns_msglists);
1553     create_countlists(sp_msglists);
1554 
1555     if (verbose >= TIME && time(NULL) - end > 2) {
1556 	end = time(NULL);
1557 	show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");
1558     }
1559 
1560     if (verbose > PARMS+1) {
1561 	tunelist_print(ns_and_sp);
1562 	tunelist_print(ns_msglists);
1563 	tunelist_print(sp_msglists);
1564     }
1565 
1566     ns_cnt = count_messages(ns_msglists);
1567     sp_cnt = count_messages(sp_msglists);
1568 
1569     if (ds_flag == DS_DSK && !check_msg_counts())
1570 	exit(exit_zero ? EX_OK : EX_ERROR);
1571 
1572     fflush(stdout);
1573 
1574     check_percent = CHECK_PCT;	/* for checking low scoring spam
1575 				** and high scoring non-spam */
1576 
1577     ns_scores = (double *)xcalloc(ns_cnt, sizeof(double));
1578     sp_scores = (double *)xcalloc(sp_cnt, sizeof(double));
1579 
1580     robs = DEFAULT_ROBS;
1581     robx = DEFAULT_ROBX;
1582     min_dev = DEFAULT_MIN_DEV;
1583 
1584     if (check_for_high_ns_scores() | check_for_low_sp_scores())
1585 	scoring_error();
1586 
1587     /*
1588     ** 5.  Calculate x and cache size
1589     ** Calculate x with bogoutil's -r option (a new addition).
1590     ** Bound the calculated value within [0.4, 0.6] and set the range to be
1591     ** investigated to [x-0.1, x+0.1].
1592     */
1593 
1594     robx = get_robx();
1595     if (ds_flag == DS_DSK) {
1596 	db_cachesize = calc_db_cachesize();
1597 	printf("Recommended db cache size is %u MB\n", db_cachesize);
1598     }
1599 
1600     /*
1601     ** 6.  Calculate fp target
1602     ** The fp target will be derived thus: score non-spams with s and md as
1603     ** shipped, and determine the count that will result from a spam cutoff
1604     ** of 0.95; if that is < 0.25%, try 0.9375 etc.
1605     */
1606 
1607     min_dev = 0.02;
1608 
1609     /* set target and spam_cutoff */
1610 
1611     if (coerced_target == 0)
1612 	set_thresh(ns_cnt, ns_scores);
1613     else {
1614 	/* if coerced target ... */
1615 	target = coerced_target;
1616 	spam_cutoff = ns_scores[target-1];
1617     }
1618 
1619     skip = ROUND(spam_cutoff,100000) < SCAN_CUTOFF;
1620     printf("False-positive target is %u (cutoff %8.6f)\n", target, spam_cutoff);
1621 
1622 #ifdef	TEST
1623     if (test) {
1624 	printf("m: %8.6f, s: %8.6f, x: %0.16f\n", min_dev, robs, robx);
1625 	if (verbose < PARMS)
1626 	    print_ns_scores(target-2, target+2, 0);
1627     }
1628 #endif
1629 
1630     if (!esf_flag && (sp_esf < 1.0 || ns_esf < 1.0))
1631 	fprintf(stderr, "Warning:  Using ESF values (sp=%8.6f, ns=%8.6f) from config file.\n", sp_esf, ns_esf);
1632 
1633     /* No longer needed */
1634     wordhash_free(train);
1635     train = NULL;
1636 
1637     for (scan=0; scan <= 1 && !skip; scan ++) {
1638 	uint r_count;
1639 	uint rsi, rxi, mdi, spi, nsi;
1640 	result_t *results, *r, *sorted;
1641 
1642 	printf("Performing %s scan:\n", scan==0 ? "coarse" : "fine");
1643 
1644 	switch (scan) {
1645 	case 0:		/* COARSE */
1646 	    /*
1647 	    ** 7.  Coarsely scan s, md and x
1648 	    ** The coarse s scan will range from 1 to 0.01 in half decades, and the
1649 	    ** coarse md scan will range from 0.05 to 0.45 in steps of 0.05.  The
1650 	    ** coarse x scan will use steps of 0.05. The trough must be surrounded on
1651 	    ** six sides by values below the 33% quantile (unless bounded on one or
1652 	    ** more sides).
1653 	    */
1654 	    init_coarse(robx);
1655 	    break;
1656 	case 1:		/* FINE */
1657 	    /*
1658 	    ** 8.  Finely scan the peak region
1659 	    ** The fine s scan will range over the estimated s +/- half a decade in
1660 	    ** steps of a quarter decade, and the fine md scan will range over the
1661 	    ** estimated md +/- 0.075 in steps of 0.015.  The fine x scan will range
1662 	    ** over the estimated x +/- 0.04 in steps of 0.02.  Scans of s and md
1663 	    ** are bounded by the limits of the coarse scan.  Again, the trough must
1664 	    ** be surrounded on six sides by values below the 33% quantile.  If no
1665 	    ** such trough exists, a warning is given.
1666 	    */
1667 	    init_fine(robs, min_dev, robx, spex, nsex);
1668 	    break;
1669 	}
1670 
1671 	r_count = rsval->cnt * mdval->cnt * rxval->cnt * spexp->cnt * nsexp->cnt;
1672 	results = (result_t *) xcalloc(r_count, sizeof(result_t));
1673 
1674 	print_all_parms(r_count);
1675 
1676 	if (verbose >= SUMMARY) {
1677 	    if (verbose >= SUMMARY+1)
1678 		printf("%3s ", "cnt");
1679 	    if (verbose >= SUMMARY+2)
1680 		printf(" %s %s %s      ", "s", "m", "x");
1681 	    printf(" %4s %5s   %4s %8s %8s %7s %3s %3s\n",
1682 		   "rs", "md", "rx", "spesf", "nsesf", "cutoff", "fp", "fn");
1683 	}
1684 
1685 	cnt = 0;
1686 	beg = time(NULL);
1687 	for (rsi = 0; rsi < rsval->cnt; rsi++) {
1688 	  robs = rsval->data[rsi];
1689 	  for (mdi = 0; mdi < mdval->cnt; mdi++) {
1690 	    min_dev = mdval->data[mdi];
1691 	    for (rxi = 0; rxi < rxval->cnt; rxi++) {
1692 	      robx = rxval->data[rxi];
1693 	      for (spi = 0; spi < spexp->cnt; spi++) {
1694 		spex = spexp->data[spi];
1695 		sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));
1696 		for (nsi = 0; nsi < nsexp->cnt; nsi++) {
1697 		    uint fp, fn;
1698 		    nsex = nsexp->data[nsi];
1699 		    ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));
1700 
1701 		    /* save parms */
1702 		    r = &results[cnt++];
1703 		    r->idx = cnt;
1704 		    r->rsi = rsi; r->rs = robs;
1705 		    r->rxi = rxi; r->rx = robx;
1706 		    r->mdi = mdi; r->md = min_dev;
1707 		    r->spi = spi; r->sp_exp = spex;
1708 		    r->nsi = nsi; r->ns_exp = nsex;
1709 
1710 		    if (verbose >= SUMMARY) {
1711 			if (verbose >= SUMMARY+1)
1712 			    printf("%3u ", cnt);
1713 			if (verbose >= SUMMARY+2)
1714 			    printf(" %u %u %u %u %u  ",
1715 				rsi, mdi, rxi, spi, nsi);
1716 			printf("%6.4f %5.3f %5.3f %8.6f %8.6f",
1717 			    robs, min_dev, robx, sp_esf, ns_esf);
1718 			fflush(stdout);
1719 		    }
1720 
1721 		    spam_cutoff = 0.01;
1722 		    score_ns(ns_scores);	/* scores in descending order */
1723 
1724 		    /* Determine spam_cutoff and false_pos */
1725 		    for (fp = target; fp < ns_cnt; fp += 1) {
1726 			spam_cutoff = ns_scores[fp-1];
1727 			if (spam_cutoff < 0.999999)
1728 			    break;
1729 			if (coerced_target != 0)
1730 			    break;
1731 		    }
1732 		    if (ns_cnt < fp)
1733 			fprintf(stderr,
1734 				"Too few false positives to determine a valid cutoff\n");
1735 
1736 		    score_sp(sp_scores);	/* scores in ascending order */
1737 		    fn = get_fn_count(sp_cnt, sp_scores);
1738 
1739 		    /* save results */
1740 		    r->co = spam_cutoff;
1741 		    r->fp = fp;
1742 		    r->fn = fn;
1743 
1744 		    if (verbose < SUMMARY)
1745 			progress(cnt, r_count);
1746 		    else {
1747 			printf(" %8.6f %2u %3u\n", spam_cutoff, fp, fn);
1748 			fflush(stdout);
1749 		    }
1750 
1751 #ifdef	TEST
1752 		    if (test && spam_cutoff < 0.501) {
1753 			printf("co: %0.16f\n", spam_cutoff);
1754 			print_ns_scores(0, fp, 2);
1755 			print_sp_scores(fn-10, fn, 10);
1756 		    }
1757 #endif
1758 		    if (fMakeCheck && cnt >= cMakeCheck)
1759 			break;
1760 		}
1761 		if (fMakeCheck && cnt >= cMakeCheck)
1762 		    break;
1763 	      }
1764 	      if (fMakeCheck && cnt >= cMakeCheck)
1765 		  break;
1766 	    }
1767 	    if (fMakeCheck && cnt >= cMakeCheck)
1768 		break;
1769 	  }
1770 	  fflush(stdout);
1771 	  if (fMakeCheck && cnt >= cMakeCheck)
1772 	      break;
1773 	}
1774 
1775 	if (verbose >= TIME) {
1776 	    end = time(NULL);
1777 	    show_elapsed_time(beg, end, cnt, (double)(end-beg)/cnt, "iterations", "secs");
1778 	}
1779 
1780 	printf("\n");
1781 
1782 	/* Scan complete, now find minima */
1783 
1784 	sorted = results_sort(r_count, results);
1785 	top_ten(sorted, r_count);
1786 
1787 	best = count_outliers(r_count, sorted, results);
1788 	robs = rsval->data[best->rsi];
1789 	robx = rxval->data[best->rxi];
1790 	min_dev = mdval->data[best->mdi];
1791 
1792 	spex = spexp->data[best->spi]; sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));
1793 	nsex = nsexp->data[best->nsi]; ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));
1794 
1795 	printf(
1796     "Minimum found at s %6.4f, md %5.3f, x %5.3f, spesf %8.6f, nsesf %8.6f\n",
1797     		robs, min_dev, robx, sp_esf, ns_esf);
1798 	printf("        fp %u (%6.4f%%), fn %u (%6.4f%%)\n",
1799 		best->fp, best->fp*100.0/ns_cnt,
1800 		best->fn, best->fn*100.0/sp_cnt);
1801 	printf("\n");
1802 
1803 	data_free(rsval);
1804 	data_free(rxval);
1805 	data_free(mdval);
1806 	data_free(spexp);
1807 	data_free(nsexp);
1808 
1809 	xfree(results);
1810 	xfree(sorted);
1811     }
1812 
1813     /*
1814     ** 9.  Suggest possible spam and non-spam cutoff values
1815     ** With the final x, md and s values, score the spams and non-spams and
1816     ** sort the non-spam scores decreasing and the spam scores increasing;
1817     ** then, traverse the non-spam list until the 0.2% point; report cutoffs
1818     ** that give 0.05%, 0.1% and 0.2% fp.
1819     */
1820 
1821     final_recommendations(skip);
1822 
1823     return status;
1824 }
1825 
main(int argc,char ** argv)1826 int main(int argc, char **argv) /*@globals errno,stderr,stdout@*/
1827 {
1828     ex_t exitcode = EX_OK;
1829 
1830     fBogotune = true;		/* for rob_compute_spamicity() */
1831 
1832     dbgout = stderr;
1833 
1834     init_globals();
1835 
1836     progtype = build_progtype(progname, DB_TYPE);
1837 
1838     ham_files  = filelist_new("ham");
1839     spam_files = filelist_new("spam");
1840 
1841     /* process args and read mailboxes */
1842     process_arglist(argc, argv);
1843 
1844     /* directories from command line and config file are already handled */
1845     if (ds_flag == DS_DSK) {
1846 	bfpath *bfp;
1847 
1848 	if (ds_path == NULL)
1849 	    ds_path = get_directory(PR_ENV_BOGO);
1850 
1851 	if (ds_path == NULL)
1852 	    ds_path = get_directory(PR_ENV_HOME);
1853 
1854 	if (ds_path == NULL) {
1855 	    fprintf(stderr, "Cannot derive bogofilter directory from environment, aborting.\n");
1856 	    exit(EX_ERROR);
1857 	}
1858 
1859 	set_bogohome(ds_path);
1860 	bfp = bfpath_create(ds_path);
1861 
1862 	if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {
1863 	    fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);
1864 	    xfree(ds_path);
1865 	    exit(EX_ERROR);
1866 	}
1867 
1868 	if (bfp->exists && bfp->isdir) {
1869 	    char *tmp;
1870 	    bfpath_free(bfp);
1871 	    tmp = mxcat(ds_path, DIRSEP_S, WORDLIST, NULL);
1872 	    xfree(ds_path);
1873 	    ds_path = tmp;
1874 	    bfp = bfpath_create(ds_path);
1875 	    if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {
1876 		fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);
1877 		bfpath_free(bfp);
1878 		xfree(ds_path);
1879 		exit(EX_ERROR);
1880 	    }
1881 	}
1882 
1883 	env = ds_init(bfp);
1884 
1885 	init_wordlist("word", ds_path, 0, WL_REGULAR);
1886 	bfpath_free(bfp);
1887     }
1888 
1889     bogotune_init();
1890 
1891     if (ds_flag == DS_DSK)
1892 	load_wordlist(load_hook, train);
1893 
1894     /* if encoding not yet set, assume old style */
1895     if (encoding == E_UNKNOWN)
1896 	encoding = E_RAW;
1897 
1898     if (bogolex_file != NULL)
1899 	bogolex();
1900     else
1901 	bogotune();
1902 
1903     bogotune_free();
1904 
1905     if (ds_flag == DS_DSK)
1906 	ds_cleanup(env);
1907 
1908     exit(exitcode);
1909 }
1910 
1911 /* End */
1912