1 /*****************************************************************************
2 
3 NAME:
4    score.c -- implements Fisher variant on Robinson algorithm.
5 
6 ******************************************************************************/
7 
8 #include "common.h"
9 
10 #include <math.h>
11 #include <string.h>
12 #include <stdlib.h>
13 
14 #include "bogoconfig.h"
15 #include "bogofilter.h"
16 #include "collect.h"
17 #include "datastore.h"
18 #include "listsort.h"
19 #include "msgcounts.h"
20 #include "prob.h"
21 #include "rand_sleep.h"
22 #include "rstats.h"
23 #include "score.h"
24 #include "wordhash.h"
25 #include "wordlists.h"
26 
27 #if defined(HAVE_GSL_10) && !defined(HAVE_GSL_14)
28 /* HAVE_GSL_14 implies HAVE_GSL_10
29  * if we have neither, we'll use our included GSL 1.4, which knows CDFs
30  * if we have both, we have GSL 1.4, which knows CDFs
31  *
32  * in other cases, we need to integrate the PDF to get the CDF
33  */
34 #define GSL_INTEGRATE_PDF
35 #include "gsl/gsl_randist.h"
36 #include "gsl/gsl_integration.h"
37 #include "gsl/gsl_errno.h"
38 #else
39 #include "gsl/gsl_cdf.h"
40 #endif
41 
42 /* Structure Definitions */
43 
44 typedef struct probnode_t {
45     hashnode_t * node;
46     double	 prob;
47 } probnode_t;
48 
49 /* struct for saving stats for printing. */
50 typedef struct score_s {
51     double min_dev;
52     double spamicity;
53     u_int32_t robn;
54     double p_ln;	/* Robinson P, as a log*/
55     double q_ln;	/* Robinson Q, as a log*/
56     double p_pr;	/* Robinson P */
57     double q_pr;	/* Robinson Q */
58 } score_t;
59 
60 /* Function Prototypes */
61 
62 static	double	get_spamicity(size_t robn, FLOAT P, FLOAT Q);
63 static	bool	need_scoring_boundary(size_t count);
64 static	double	find_scoring_boundary(wordhash_t *wh);
65 static	size_t	compute_count_and_scores(wordhash_t *wh);
66 static	size_t	compute_count_and_spamicity(wordhash_t *wh, FLOAT *P, FLOAT *Q, bool need_stats);
67 static	int	compare_hashnode_t(const void *const pv1, const void *const pv2);
68 
69 /* Static Variables */
70 
71 static score_t	score;
72 
73 /* Function Definitions */
74 
msg_spamicity(void)75 double msg_spamicity(void)
76 {
77     return score.spamicity;
78 }
79 
msg_status(void)80 rc_t msg_status(void)
81 {
82     if (score.spamicity >= spam_cutoff)
83 	return RC_SPAM;
84 
85     if ((ham_cutoff < EPS) ||
86 	(score.spamicity <= ham_cutoff))
87 	return RC_HAM;
88 
89     return RC_UNSURE;
90 }
91 
msg_print_stats(FILE * fp)92 void msg_print_stats(FILE *fp)
93 {
94     bool unsure = unsure_stats && (msg_status() == RC_UNSURE) && verbose;
95 
96     (void)fp;
97 
98     if (quiet)
99 	return;
100 
101     if (Rtable || unsure || verbose >= 2)
102 	rstats_print(unsure);
103 }
104 
105 /** search token in all lists according to precedence, summing up the
106  * counts (all lists at same precedence are used); if found, set cnts
107  * accordingly. */
lookup(const word_t * token,wordcnts_t * cnts)108 static int lookup(const word_t *token, wordcnts_t *cnts)
109 {
110     int override=0;
111     wordlist_t* list;
112 
113     if (fBogotune) {
114 	wordprop_t *wp = (wordprop_t *)wordhash_search_memory(token);
115 	if (wp) {
116 	    cnts->good = wp->cnts.good;
117 	    cnts->bad  = wp->cnts.bad;
118 	}
119 	return 0;
120     }
121 
122     cnts->msgs_bad = cnts->msgs_good = 0;
123 
124     for (list=word_lists; list != NULL; list=list->next)
125     {
126 	dsv_t val;
127 	int ret;
128 
129 	if (override > list->override)	/* if already found */
130 	    break;
131 
132 	ret = ds_read(list->dsh, token, &val);
133 
134 	/* check if we have the token */
135 	switch (ret) {
136 	    case 0:
137 		/* token found, pass on */
138 		break;
139 	    case 1:
140 		/* token not found, clear counts */
141 		val.count[IX_GOOD] = 0;
142 		val.count[IX_SPAM] = 0;
143 		break;
144 	    case DS_ABORT_RETRY:
145 		/* sleep, reinitialize and start over */
146 		rand_sleep(1000,1000000);
147 		begin_wordlist(list);
148 		/* FALLTHROUGH */
149 	    default:
150 		return ret;
151 	}
152 
153 	if (ret == 0 && list->type == WL_IGNORE) {	/* if found on ignore list */
154 	    cnts->good = cnts->bad = 0;
155 	    break;
156 	}
157 
158 	override=list->override;
159 
160 	if (DEBUG_ALGORITHM(2)) {
161 	    fprintf(dbgout, "%6d %5u %5u %5u %5u list=%s,%c,%d ",
162 		    ret, (uint)val.count[IX_GOOD], (uint)val.count[IX_SPAM],
163 		    (uint)list->msgcount[IX_GOOD], (uint)list->msgcount[IX_SPAM],
164 		    list->listname, list->type, list->override);
165 	    word_puts(token, 0, dbgout);
166 	    fputc('\n', dbgout);
167 	}
168 
169 	cnts->good += val.count[IX_GOOD];
170 	cnts->bad += val.count[IX_SPAM];
171 	cnts->msgs_good += list->msgcount[IX_GOOD];
172 	cnts->msgs_bad += list->msgcount[IX_SPAM];
173     }
174 
175     if (DEBUG_ALGORITHM(1)) {
176 	fprintf(dbgout, "%5u %5u ", (uint)cnts->bad, (uint)cnts->good);
177 	word_puts(token, 0, dbgout);
178 	fputc('\n', dbgout);
179     }
180 
181     return 0;
182 }
183 
184 
185 /* do wordlist lookups for the words in the wordhash
186  */
lookup_words(wordhash_t * wh)187 void lookup_words(wordhash_t *wh)
188 {
189     int ret;
190     hashnode_t *node;
191 
192     if (msg_count_file)	/* if mc file, already done */
193 	return;
194 
195 retry:
196     for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
197     {
198 	word_t *token     = node->key;
199 	wordprop_t *props = (wordprop_t *) node->data;
200 	wordcnts_t *cnts  = &props->cnts;
201 	ret = lookup(token, cnts);
202 	if (ret == DS_ABORT_RETRY)
203 	    /* start all over, the message counts may have changed
204 	     * lookup handles reinitializing the wordlist */
205 	    goto retry;
206     }
207 
208     return;
209 }
210 
211 /** selects the best spam/non-spam indicators and calculates Robinson's S,
212  * \return -1.0 for error, S otherwise */
msg_compute_spamicity(wordhash_t * wh)213 double msg_compute_spamicity(wordhash_t *wh) /*@globals errno@*/
214 {
215     FLOAT P = {1.0, 0};		/* Robinson's P */
216     FLOAT Q = {1.0, 0};		/* Robinson's Q */
217 
218     double spamicity;
219     size_t robn = 0;
220 
221     bool need_stats = !fBogotune && (Rtable || passthrough || (verbose > 0));
222 
223     if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "### msg_compute_spamicity() begins\n");
224 
225     if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "min_dev: %f, robs: %f, robx: %f\n",
226 				    min_dev, robs, robx);
227 
228     /* compute scores for the wordhash's tokens */
229     robn = compute_count_and_scores(wh);
230 
231     /* recalculate min_dev if necessary to satisfy token_count settings */
232     if (!need_scoring_boundary(robn))
233 	score.min_dev = min_dev;
234     else
235 	score.min_dev = find_scoring_boundary(wh);
236 
237     /* compute message spamicity from the wordhash's scores */
238     robn = compute_count_and_spamicity(wh, &P, &Q, need_stats);
239 
240     /* Robinson's P, Q and S
241     ** S = (P - Q) / (P + Q)			    [combined indicator]
242     */
243     spamicity = get_spamicity(robn, P, Q);
244 
245     if (need_stats && robn != 0)
246 	rstats_fini(robn, P, Q, spamicity);
247 
248     if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "### msg_compute_spamicity() ends\n");
249 
250     return spamicity;
251 }
252 
253 /*
254 ** compute_count_and_scores()
255 **	compute the token probabilities from the linked list of tokens
256 */
compute_count_and_scores(wordhash_t * wh)257 static size_t compute_count_and_scores(wordhash_t *wh)
258 {
259     size_t count = 0;
260     hashnode_t *node;
261 
262     if (fBogotune)
263 	return count;
264 
265     for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
266     {
267 	wordcnts_t *cnts;
268 	wordprop_t *props;
269 
270 	props = (wordprop_t *) node->data;
271 	cnts  = &props->cnts;
272 	props->prob = calc_prob(cnts->good, cnts->bad,
273 				cnts->msgs_good, cnts->msgs_bad);
274 	props->used = fabs(props->prob - EVEN_ODDS) > min_dev;
275 	if (props->used)
276 	    count += 1;
277     }
278 
279     return count;
280 }
281 
282 /*
283 ** compute_count_and_score()
284 **	compute the spamicity from the linked list of tokens using
285 **	min_dev to select tokens
286 */
compute_count_and_spamicity(wordhash_t * wh,FLOAT * P,FLOAT * Q,bool need_stats)287 static size_t compute_count_and_spamicity(wordhash_t *wh,
288 					  FLOAT *P, FLOAT *Q,
289 					  bool need_stats)
290 {
291     size_t count = 0;
292 
293     hashnode_t *node;
294 
295     for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
296     {
297 	bool useflag;
298 	double prob;
299 	word_t *token;
300 	wordcnts_t *cnts;
301 	wordprop_t *props;
302 
303 	if (!fBogotune) {
304 	    token = node->key;
305 	    props = (wordprop_t *) node->data;
306 	    cnts  = &props->cnts;
307 	    prob = props->prob;
308 	    useflag = props->used;
309 	} else {
310 	    token = NULL;
311 	    cnts = (wordcnts_t *) node;
312 	    prob = calc_prob(cnts->good, cnts->bad,
313 			     cnts->msgs_good, cnts->msgs_bad);
314 	    useflag = fabs(prob - EVEN_ODDS) > score.min_dev;
315 	}
316 
317 	if (need_stats)
318 	    rstats_add(token, prob, useflag, cnts);
319 
320 	/* Robinson's P and Q; accumulation step */
321 	/*
322 	 * P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n)	[spamminess]
323 	 * Q = 1 - (p1*p2*...*pn)^(1/n)			[non-spamminess]
324 	 */
325 	if (useflag ) {
326 	    int e;
327 
328 	    P->mant *= 1-prob;
329 	    if (P->mant < 1.0e-200) {
330 		P->mant = frexp(P->mant, &e);
331 		P->exp += e;
332 	    }
333 
334 	    Q->mant *= prob;
335 	    if (Q->mant < 1.0e-200) {
336 		Q->mant = frexp(Q->mant, &e);
337 		Q->exp += e;
338 	    }
339 	    count += 1;
340 	}
341 
342 	if (DEBUG_ALGORITHM(3)) {
343 	    (void)fprintf(dbgout, "%3lu %3lu %f ",
344 			  (unsigned long)count, (unsigned long)count, prob);
345 	    (void)word_puts(token, 0, dbgout);
346 	    (void)fputc('\n', dbgout);
347 	}
348     }
349 
350     return count;
351 }
352 
353 /* need_scoring_boundary( )
354 **	determine if min_dev gives a count fitting the token count limits
355 **	return True if so; False if not
356 */
need_scoring_boundary(size_t count)357 static bool need_scoring_boundary(size_t count)
358 {
359     /* Early out if no token count limits are set */
360     if (((token_count_min == 0) || (token_count_min <= count)) &&
361 	((token_count_max == 0) || (token_count_max >= count)) &&
362 	((token_count_fix == 0) || (token_count_fix == count)))
363 	return false;
364     else
365 	return true;
366 }
367 
368 /* find_scoring_boundary( )
369 **	determine the token score that gives the desired token count
370 **	for scoring the message.
371 */
find_scoring_boundary(wordhash_t * wh)372 static double find_scoring_boundary(wordhash_t *wh)
373 {
374     size_t count = 0;
375 
376     double min_prob = (token_count_max == 0.0) ? min_dev : 1.0;
377 
378     hashnode_t *node;
379 
380     /* sort by ascending score difference (from 0.5) */
381     wh->iter_head = (hashnode_t *)listsort((element *)wh->iter_head, (fcn_compare *)&compare_hashnode_t);
382 
383     count = max(token_count_fix, max(token_count_min, token_count_max));
384 
385     for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh)) {
386 	wordcnts_t *cnts;
387 	wordprop_t *props = NULL;
388 	double prob;
389 	double dev;
390 
391 	if (!fBogotune) {
392 	    props = (wordprop_t *) node->data;
393 	    cnts  = &props->cnts;
394 	} else {
395 	    cnts = (wordcnts_t *) node;
396 	}
397 	prob = calc_prob(cnts->good, cnts->bad,
398 			 cnts->msgs_good, cnts->msgs_bad);
399 	dev = fabs(prob - EVEN_ODDS);
400 
401 	if (count > 0) {
402 	    count -= 1;
403 	    if (!fBogotune)
404 		props->used = true;
405 	    min_prob = dev;
406 	}
407 	else if (dev >= min_prob) {
408 	    if (!fBogotune)
409 		props->used = true;
410 	}
411 	else {
412 	    if (!fBogotune)
413 		props->used = false;
414 	}
415     }
416 
417     return min_prob;
418 }
419 
420 /* compare_hashnode_t - sort by ascending score difference (from 0.5) */
421 
compare_hashnode_t(const void * const pv1,const void * const pv2)422 static int compare_hashnode_t(const void *const pv1, const void *const pv2)
423 {
424     double d1;
425     double d2;
426 
427     if (!fBogotune) {
428 	const hashnode_t *hn1 = (const hashnode_t *)pv1;
429 	const hashnode_t *hn2 = (const hashnode_t *)pv2;
430 	d1 = fabs(((wordprop_t *) hn1->data)->prob - EVEN_ODDS);
431 	d2 = fabs(((wordprop_t *) hn2->data)->prob - EVEN_ODDS);
432     } else {
433 	const wordcnts_t *cnts;
434 	double prob;
435 	cnts = (const wordcnts_t *) pv1;
436 	prob = calc_prob(cnts->good, cnts->bad,
437 			 cnts->msgs_good, cnts->msgs_bad);
438 	d1 = fabs(prob - EVEN_ODDS);
439 
440 	cnts = (const wordcnts_t *) pv2;
441 	prob = calc_prob(cnts->good, cnts->bad,
442 			 cnts->msgs_good, cnts->msgs_bad);
443 	d2 = fabs(prob - EVEN_ODDS);
444     }
445 
446     if (d1 < d2)
447 	return +1;
448     if (d1 > d2)
449 	return -1;
450 
451     return 0;
452 }
453 
score_initialize(void)454 void score_initialize(void)
455 {
456     word_t *word_robx = word_news(ROBX_W);
457 
458     wordlist_t *list = get_default_wordlist(word_lists);
459 
460     if (fabs(min_dev) < EPS)
461 	min_dev = MIN_DEV;
462     if (spam_cutoff < EPS)
463 	spam_cutoff = SPAM_CUTOFF;
464 
465     /*
466     ** If we're classifying messages, we need to compute the scalefactor
467     ** (from the .MSG_COUNT values)
468     ** If we're registering tokens, we needn't get .MSG_COUNT
469     */
470 
471     if (fabs(robs) < EPS)
472 	robs = ROBS;
473 
474     if (fabs(robx) < EPS)
475     {
476 	/* Assign default value in case there's no wordlist
477 	 * or no wordlist entry */
478 	robx = ROBX;
479 	if (list->dsh != NULL)
480 	{
481 	    int ret;
482 	    dsv_t val;
483 
484 	    /* Note: .ROBX is scaled by 1000000 in the wordlist */
485 	    ret = ds_read(list->dsh, word_robx, &val);
486 	    if (ret != 0)
487 		robx = ROBX;
488 	    else {
489 		/* If found, unscale; else use predefined value */
490 		uint l_robx = val.count[IX_SPAM];
491 		robx = l_robx ? (double)l_robx / 1000000 : ROBX;
492 	    }
493 	}
494     }
495 
496     if (robx < 0.0 || 1.0 < robx) {
497 	fprintf(stderr, "Invalid robx value (%f).  Must be between 0.0 and 1.0\n", robx);
498 	exit(EX_ERROR);
499     }
500 
501     word_free(word_robx);
502 
503     return;
504 }
505 
score_cleanup(void)506 void score_cleanup(void)
507 {
508 /*    rstats_cleanup(); */
509 }
510 
511 #ifdef GSL_INTEGRATE_PDF
chisq(double x,void * p)512 static double chisq(double x, void *p) {
513      return(gsl_ran_chisq_pdf(x, *(double *)p));
514 }
515 
prbf(double x,double df)516 inline static double prbf(double x, double df) {
517     gsl_function chi;
518     int status;
519     double p, abserr;
520     const int intervals = 15;
521     const double eps = 1000 * DBL_EPSILON;
522 
523     gsl_integration_workspace *w;
524     chi.function = chisq;
525     chi.params = &df;
526     gsl_set_error_handler_off();
527     w = gsl_integration_workspace_alloc(intervals);
528     if (!w) {
529 	fprintf(stderr, "Out of memory! %s:%d\n", __FILE__, __LINE__);
530 	exit(EX_ERROR);
531     }
532     status = gsl_integration_qag(&chi, 0, x, eps, eps,
533 	    intervals, GSL_INTEG_GAUSS41, w, &p, &abserr);
534     if (status && status != GSL_EMAXITER) {
535 	fprintf(stderr, "Integration error: %s\n", gsl_strerror(status));
536 	exit(EX_ERROR);
537     }
538     gsl_integration_workspace_free(w);
539     p = max(0.0, 1.0 - p);
540     return(min(1.0, p));
541 }
542 #else
prbf(double x,double df)543 inline static double prbf(double x, double df)
544 {
545     double r = gsl_cdf_chisq_Q(x, df);
546     return (r < DBL_EPSILON) ? 0.0 : r;
547 }
548 #endif
549 
get_spamicity(size_t robn,FLOAT P,FLOAT Q)550 static double get_spamicity(size_t robn, FLOAT P, FLOAT Q)
551 {
552     if (robn == 0)
553     {
554 	score.spamicity = robx;
555     }
556     else
557     {
558 	double sp_df = 2.0 * robn * sp_esf;
559 	double ns_df = 2.0 * robn * ns_esf;
560 	double ln2 = log(2.0);					/* ln(2) */
561 
562 	score.robn = robn;
563 
564 	/* convert to natural logs */
565 	score.p_ln = (log(P.mant) + P.exp * ln2) * sp_esf;	/* invlogsum */
566 	score.q_ln = (log(Q.mant) + Q.exp * ln2) * ns_esf;	/* logsum */
567 
568 	score.p_pr = prbf(-2.0 * score.p_ln, sp_df);		/* compute P */
569 	score.q_pr = prbf(-2.0 * score.q_ln, ns_df);		/* compute Q */
570 
571 	if (!fBogotune && sp_esf >= 1.0 && ns_esf >= 1.0) {
572 	    score.spamicity = (1.0 + score.q_pr - score.p_pr) / 2.0;
573 	} else if (score.q_pr < DBL_EPSILON && score.p_pr < DBL_EPSILON) {
574 	    score.spamicity = 0.5;
575 	} else {
576 	    score.spamicity = score.q_pr / (score.q_pr + score.p_pr);
577 	}
578     }
579 
580     return score.spamicity;
581 }
582 
msg_print_summary(const char * pfx)583 void msg_print_summary(const char *pfx)
584 {
585     if (!Rtable) {
586 	(void)fprintf(fpo, "%s%-*s %6lu %9.6f %9.6f %9.6f\n",
587 		      pfx, max_token_len+2, "N_P_Q_S_s_x_md", (unsigned long)score.robn,
588 		      score.p_pr, score.q_pr, score.spamicity);
589 	(void)fprintf(fpo, "%s%-*s  %9.6f %9.6f %9.6f\n",
590 		      pfx, max_token_len+2+6, " ", robs, robx, score.min_dev);
591     }
592     else {
593 	/* Trim token to 22 characters to accomodate R's default line length of 80 */
594 	(void)fprintf(fpo, "%s%-24s %6lu %9.2e %9.2e %9.2e %9.2e %9.2e %5.3f\n",
595 		      pfx, "N_P_Q_S_s_x_md", (unsigned long)score.robn,
596 		      score.p_pr, score.q_pr, score.spamicity, robs, robx, score.min_dev);
597     }
598 }
599 
600 /* Done */
601