1 /*****************************************************************************
2
3 NAME:
4 score.c -- implements Fisher variant on Robinson algorithm.
5
6 ******************************************************************************/
7
8 #include "common.h"
9
10 #include <math.h>
11 #include <string.h>
12 #include <stdlib.h>
13
14 #include "bogoconfig.h"
15 #include "bogofilter.h"
16 #include "collect.h"
17 #include "datastore.h"
18 #include "listsort.h"
19 #include "msgcounts.h"
20 #include "prob.h"
21 #include "rand_sleep.h"
22 #include "rstats.h"
23 #include "score.h"
24 #include "wordhash.h"
25 #include "wordlists.h"
26
27 #if defined(HAVE_GSL_10) && !defined(HAVE_GSL_14)
28 /* HAVE_GSL_14 implies HAVE_GSL_10
29 * if we have neither, we'll use our included GSL 1.4, which knows CDFs
30 * if we have both, we have GSL 1.4, which knows CDFs
31 *
32 * in other cases, we need to integrate the PDF to get the CDF
33 */
34 #define GSL_INTEGRATE_PDF
35 #include "gsl/gsl_randist.h"
36 #include "gsl/gsl_integration.h"
37 #include "gsl/gsl_errno.h"
38 #else
39 #include "gsl/gsl_cdf.h"
40 #endif
41
42 /* Structure Definitions */
43
44 typedef struct probnode_t {
45 hashnode_t * node;
46 double prob;
47 } probnode_t;
48
49 /* struct for saving stats for printing. */
50 typedef struct score_s {
51 double min_dev;
52 double spamicity;
53 u_int32_t robn;
54 double p_ln; /* Robinson P, as a log*/
55 double q_ln; /* Robinson Q, as a log*/
56 double p_pr; /* Robinson P */
57 double q_pr; /* Robinson Q */
58 } score_t;
59
60 /* Function Prototypes */
61
62 static double get_spamicity(size_t robn, FLOAT P, FLOAT Q);
63 static bool need_scoring_boundary(size_t count);
64 static double find_scoring_boundary(wordhash_t *wh);
65 static size_t compute_count_and_scores(wordhash_t *wh);
66 static size_t compute_count_and_spamicity(wordhash_t *wh, FLOAT *P, FLOAT *Q, bool need_stats);
67 static int compare_hashnode_t(const void *const pv1, const void *const pv2);
68
69 /* Static Variables */
70
71 static score_t score;
72
73 /* Function Definitions */
74
msg_spamicity(void)75 double msg_spamicity(void)
76 {
77 return score.spamicity;
78 }
79
msg_status(void)80 rc_t msg_status(void)
81 {
82 if (score.spamicity >= spam_cutoff)
83 return RC_SPAM;
84
85 if ((ham_cutoff < EPS) ||
86 (score.spamicity <= ham_cutoff))
87 return RC_HAM;
88
89 return RC_UNSURE;
90 }
91
msg_print_stats(FILE * fp)92 void msg_print_stats(FILE *fp)
93 {
94 bool unsure = unsure_stats && (msg_status() == RC_UNSURE) && verbose;
95
96 (void)fp;
97
98 if (quiet)
99 return;
100
101 if (Rtable || unsure || verbose >= 2)
102 rstats_print(unsure);
103 }
104
105 /** search token in all lists according to precedence, summing up the
106 * counts (all lists at same precedence are used); if found, set cnts
107 * accordingly. */
lookup(const word_t * token,wordcnts_t * cnts)108 static int lookup(const word_t *token, wordcnts_t *cnts)
109 {
110 int override=0;
111 wordlist_t* list;
112
113 if (fBogotune) {
114 wordprop_t *wp = (wordprop_t *)wordhash_search_memory(token);
115 if (wp) {
116 cnts->good = wp->cnts.good;
117 cnts->bad = wp->cnts.bad;
118 }
119 return 0;
120 }
121
122 cnts->msgs_bad = cnts->msgs_good = 0;
123
124 for (list=word_lists; list != NULL; list=list->next)
125 {
126 dsv_t val;
127 int ret;
128
129 if (override > list->override) /* if already found */
130 break;
131
132 ret = ds_read(list->dsh, token, &val);
133
134 /* check if we have the token */
135 switch (ret) {
136 case 0:
137 /* token found, pass on */
138 break;
139 case 1:
140 /* token not found, clear counts */
141 val.count[IX_GOOD] = 0;
142 val.count[IX_SPAM] = 0;
143 break;
144 case DS_ABORT_RETRY:
145 /* sleep, reinitialize and start over */
146 rand_sleep(1000,1000000);
147 begin_wordlist(list);
148 /* FALLTHROUGH */
149 default:
150 return ret;
151 }
152
153 if (ret == 0 && list->type == WL_IGNORE) { /* if found on ignore list */
154 cnts->good = cnts->bad = 0;
155 break;
156 }
157
158 override=list->override;
159
160 if (DEBUG_ALGORITHM(2)) {
161 fprintf(dbgout, "%6d %5u %5u %5u %5u list=%s,%c,%d ",
162 ret, (uint)val.count[IX_GOOD], (uint)val.count[IX_SPAM],
163 (uint)list->msgcount[IX_GOOD], (uint)list->msgcount[IX_SPAM],
164 list->listname, list->type, list->override);
165 word_puts(token, 0, dbgout);
166 fputc('\n', dbgout);
167 }
168
169 cnts->good += val.count[IX_GOOD];
170 cnts->bad += val.count[IX_SPAM];
171 cnts->msgs_good += list->msgcount[IX_GOOD];
172 cnts->msgs_bad += list->msgcount[IX_SPAM];
173 }
174
175 if (DEBUG_ALGORITHM(1)) {
176 fprintf(dbgout, "%5u %5u ", (uint)cnts->bad, (uint)cnts->good);
177 word_puts(token, 0, dbgout);
178 fputc('\n', dbgout);
179 }
180
181 return 0;
182 }
183
184
185 /* do wordlist lookups for the words in the wordhash
186 */
lookup_words(wordhash_t * wh)187 void lookup_words(wordhash_t *wh)
188 {
189 int ret;
190 hashnode_t *node;
191
192 if (msg_count_file) /* if mc file, already done */
193 return;
194
195 retry:
196 for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
197 {
198 word_t *token = node->key;
199 wordprop_t *props = (wordprop_t *) node->data;
200 wordcnts_t *cnts = &props->cnts;
201 ret = lookup(token, cnts);
202 if (ret == DS_ABORT_RETRY)
203 /* start all over, the message counts may have changed
204 * lookup handles reinitializing the wordlist */
205 goto retry;
206 }
207
208 return;
209 }
210
211 /** selects the best spam/non-spam indicators and calculates Robinson's S,
212 * \return -1.0 for error, S otherwise */
msg_compute_spamicity(wordhash_t * wh)213 double msg_compute_spamicity(wordhash_t *wh) /*@globals errno@*/
214 {
215 FLOAT P = {1.0, 0}; /* Robinson's P */
216 FLOAT Q = {1.0, 0}; /* Robinson's Q */
217
218 double spamicity;
219 size_t robn = 0;
220
221 bool need_stats = !fBogotune && (Rtable || passthrough || (verbose > 0));
222
223 if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "### msg_compute_spamicity() begins\n");
224
225 if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "min_dev: %f, robs: %f, robx: %f\n",
226 min_dev, robs, robx);
227
228 /* compute scores for the wordhash's tokens */
229 robn = compute_count_and_scores(wh);
230
231 /* recalculate min_dev if necessary to satisfy token_count settings */
232 if (!need_scoring_boundary(robn))
233 score.min_dev = min_dev;
234 else
235 score.min_dev = find_scoring_boundary(wh);
236
237 /* compute message spamicity from the wordhash's scores */
238 robn = compute_count_and_spamicity(wh, &P, &Q, need_stats);
239
240 /* Robinson's P, Q and S
241 ** S = (P - Q) / (P + Q) [combined indicator]
242 */
243 spamicity = get_spamicity(robn, P, Q);
244
245 if (need_stats && robn != 0)
246 rstats_fini(robn, P, Q, spamicity);
247
248 if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "### msg_compute_spamicity() ends\n");
249
250 return spamicity;
251 }
252
253 /*
254 ** compute_count_and_scores()
255 ** compute the token probabilities from the linked list of tokens
256 */
compute_count_and_scores(wordhash_t * wh)257 static size_t compute_count_and_scores(wordhash_t *wh)
258 {
259 size_t count = 0;
260 hashnode_t *node;
261
262 if (fBogotune)
263 return count;
264
265 for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
266 {
267 wordcnts_t *cnts;
268 wordprop_t *props;
269
270 props = (wordprop_t *) node->data;
271 cnts = &props->cnts;
272 props->prob = calc_prob(cnts->good, cnts->bad,
273 cnts->msgs_good, cnts->msgs_bad);
274 props->used = fabs(props->prob - EVEN_ODDS) > min_dev;
275 if (props->used)
276 count += 1;
277 }
278
279 return count;
280 }
281
282 /*
283 ** compute_count_and_score()
284 ** compute the spamicity from the linked list of tokens using
285 ** min_dev to select tokens
286 */
compute_count_and_spamicity(wordhash_t * wh,FLOAT * P,FLOAT * Q,bool need_stats)287 static size_t compute_count_and_spamicity(wordhash_t *wh,
288 FLOAT *P, FLOAT *Q,
289 bool need_stats)
290 {
291 size_t count = 0;
292
293 hashnode_t *node;
294
295 for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
296 {
297 bool useflag;
298 double prob;
299 word_t *token;
300 wordcnts_t *cnts;
301 wordprop_t *props;
302
303 if (!fBogotune) {
304 token = node->key;
305 props = (wordprop_t *) node->data;
306 cnts = &props->cnts;
307 prob = props->prob;
308 useflag = props->used;
309 } else {
310 token = NULL;
311 cnts = (wordcnts_t *) node;
312 prob = calc_prob(cnts->good, cnts->bad,
313 cnts->msgs_good, cnts->msgs_bad);
314 useflag = fabs(prob - EVEN_ODDS) > score.min_dev;
315 }
316
317 if (need_stats)
318 rstats_add(token, prob, useflag, cnts);
319
320 /* Robinson's P and Q; accumulation step */
321 /*
322 * P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n) [spamminess]
323 * Q = 1 - (p1*p2*...*pn)^(1/n) [non-spamminess]
324 */
325 if (useflag ) {
326 int e;
327
328 P->mant *= 1-prob;
329 if (P->mant < 1.0e-200) {
330 P->mant = frexp(P->mant, &e);
331 P->exp += e;
332 }
333
334 Q->mant *= prob;
335 if (Q->mant < 1.0e-200) {
336 Q->mant = frexp(Q->mant, &e);
337 Q->exp += e;
338 }
339 count += 1;
340 }
341
342 if (DEBUG_ALGORITHM(3)) {
343 (void)fprintf(dbgout, "%3lu %3lu %f ",
344 (unsigned long)count, (unsigned long)count, prob);
345 (void)word_puts(token, 0, dbgout);
346 (void)fputc('\n', dbgout);
347 }
348 }
349
350 return count;
351 }
352
353 /* need_scoring_boundary( )
354 ** determine if min_dev gives a count fitting the token count limits
355 ** return True if so; False if not
356 */
need_scoring_boundary(size_t count)357 static bool need_scoring_boundary(size_t count)
358 {
359 /* Early out if no token count limits are set */
360 if (((token_count_min == 0) || (token_count_min <= count)) &&
361 ((token_count_max == 0) || (token_count_max >= count)) &&
362 ((token_count_fix == 0) || (token_count_fix == count)))
363 return false;
364 else
365 return true;
366 }
367
368 /* find_scoring_boundary( )
369 ** determine the token score that gives the desired token count
370 ** for scoring the message.
371 */
find_scoring_boundary(wordhash_t * wh)372 static double find_scoring_boundary(wordhash_t *wh)
373 {
374 size_t count = 0;
375
376 double min_prob = (token_count_max == 0.0) ? min_dev : 1.0;
377
378 hashnode_t *node;
379
380 /* sort by ascending score difference (from 0.5) */
381 wh->iter_head = (hashnode_t *)listsort((element *)wh->iter_head, (fcn_compare *)&compare_hashnode_t);
382
383 count = max(token_count_fix, max(token_count_min, token_count_max));
384
385 for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh)) {
386 wordcnts_t *cnts;
387 wordprop_t *props = NULL;
388 double prob;
389 double dev;
390
391 if (!fBogotune) {
392 props = (wordprop_t *) node->data;
393 cnts = &props->cnts;
394 } else {
395 cnts = (wordcnts_t *) node;
396 }
397 prob = calc_prob(cnts->good, cnts->bad,
398 cnts->msgs_good, cnts->msgs_bad);
399 dev = fabs(prob - EVEN_ODDS);
400
401 if (count > 0) {
402 count -= 1;
403 if (!fBogotune)
404 props->used = true;
405 min_prob = dev;
406 }
407 else if (dev >= min_prob) {
408 if (!fBogotune)
409 props->used = true;
410 }
411 else {
412 if (!fBogotune)
413 props->used = false;
414 }
415 }
416
417 return min_prob;
418 }
419
420 /* compare_hashnode_t - sort by ascending score difference (from 0.5) */
421
compare_hashnode_t(const void * const pv1,const void * const pv2)422 static int compare_hashnode_t(const void *const pv1, const void *const pv2)
423 {
424 double d1;
425 double d2;
426
427 if (!fBogotune) {
428 const hashnode_t *hn1 = (const hashnode_t *)pv1;
429 const hashnode_t *hn2 = (const hashnode_t *)pv2;
430 d1 = fabs(((wordprop_t *) hn1->data)->prob - EVEN_ODDS);
431 d2 = fabs(((wordprop_t *) hn2->data)->prob - EVEN_ODDS);
432 } else {
433 const wordcnts_t *cnts;
434 double prob;
435 cnts = (const wordcnts_t *) pv1;
436 prob = calc_prob(cnts->good, cnts->bad,
437 cnts->msgs_good, cnts->msgs_bad);
438 d1 = fabs(prob - EVEN_ODDS);
439
440 cnts = (const wordcnts_t *) pv2;
441 prob = calc_prob(cnts->good, cnts->bad,
442 cnts->msgs_good, cnts->msgs_bad);
443 d2 = fabs(prob - EVEN_ODDS);
444 }
445
446 if (d1 < d2)
447 return +1;
448 if (d1 > d2)
449 return -1;
450
451 return 0;
452 }
453
score_initialize(void)454 void score_initialize(void)
455 {
456 word_t *word_robx = word_news(ROBX_W);
457
458 wordlist_t *list = get_default_wordlist(word_lists);
459
460 if (fabs(min_dev) < EPS)
461 min_dev = MIN_DEV;
462 if (spam_cutoff < EPS)
463 spam_cutoff = SPAM_CUTOFF;
464
465 /*
466 ** If we're classifying messages, we need to compute the scalefactor
467 ** (from the .MSG_COUNT values)
468 ** If we're registering tokens, we needn't get .MSG_COUNT
469 */
470
471 if (fabs(robs) < EPS)
472 robs = ROBS;
473
474 if (fabs(robx) < EPS)
475 {
476 /* Assign default value in case there's no wordlist
477 * or no wordlist entry */
478 robx = ROBX;
479 if (list->dsh != NULL)
480 {
481 int ret;
482 dsv_t val;
483
484 /* Note: .ROBX is scaled by 1000000 in the wordlist */
485 ret = ds_read(list->dsh, word_robx, &val);
486 if (ret != 0)
487 robx = ROBX;
488 else {
489 /* If found, unscale; else use predefined value */
490 uint l_robx = val.count[IX_SPAM];
491 robx = l_robx ? (double)l_robx / 1000000 : ROBX;
492 }
493 }
494 }
495
496 if (robx < 0.0 || 1.0 < robx) {
497 fprintf(stderr, "Invalid robx value (%f). Must be between 0.0 and 1.0\n", robx);
498 exit(EX_ERROR);
499 }
500
501 word_free(word_robx);
502
503 return;
504 }
505
score_cleanup(void)506 void score_cleanup(void)
507 {
508 /* rstats_cleanup(); */
509 }
510
511 #ifdef GSL_INTEGRATE_PDF
chisq(double x,void * p)512 static double chisq(double x, void *p) {
513 return(gsl_ran_chisq_pdf(x, *(double *)p));
514 }
515
prbf(double x,double df)516 inline static double prbf(double x, double df) {
517 gsl_function chi;
518 int status;
519 double p, abserr;
520 const int intervals = 15;
521 const double eps = 1000 * DBL_EPSILON;
522
523 gsl_integration_workspace *w;
524 chi.function = chisq;
525 chi.params = &df;
526 gsl_set_error_handler_off();
527 w = gsl_integration_workspace_alloc(intervals);
528 if (!w) {
529 fprintf(stderr, "Out of memory! %s:%d\n", __FILE__, __LINE__);
530 exit(EX_ERROR);
531 }
532 status = gsl_integration_qag(&chi, 0, x, eps, eps,
533 intervals, GSL_INTEG_GAUSS41, w, &p, &abserr);
534 if (status && status != GSL_EMAXITER) {
535 fprintf(stderr, "Integration error: %s\n", gsl_strerror(status));
536 exit(EX_ERROR);
537 }
538 gsl_integration_workspace_free(w);
539 p = max(0.0, 1.0 - p);
540 return(min(1.0, p));
541 }
542 #else
prbf(double x,double df)543 inline static double prbf(double x, double df)
544 {
545 double r = gsl_cdf_chisq_Q(x, df);
546 return (r < DBL_EPSILON) ? 0.0 : r;
547 }
548 #endif
549
get_spamicity(size_t robn,FLOAT P,FLOAT Q)550 static double get_spamicity(size_t robn, FLOAT P, FLOAT Q)
551 {
552 if (robn == 0)
553 {
554 score.spamicity = robx;
555 }
556 else
557 {
558 double sp_df = 2.0 * robn * sp_esf;
559 double ns_df = 2.0 * robn * ns_esf;
560 double ln2 = log(2.0); /* ln(2) */
561
562 score.robn = robn;
563
564 /* convert to natural logs */
565 score.p_ln = (log(P.mant) + P.exp * ln2) * sp_esf; /* invlogsum */
566 score.q_ln = (log(Q.mant) + Q.exp * ln2) * ns_esf; /* logsum */
567
568 score.p_pr = prbf(-2.0 * score.p_ln, sp_df); /* compute P */
569 score.q_pr = prbf(-2.0 * score.q_ln, ns_df); /* compute Q */
570
571 if (!fBogotune && sp_esf >= 1.0 && ns_esf >= 1.0) {
572 score.spamicity = (1.0 + score.q_pr - score.p_pr) / 2.0;
573 } else if (score.q_pr < DBL_EPSILON && score.p_pr < DBL_EPSILON) {
574 score.spamicity = 0.5;
575 } else {
576 score.spamicity = score.q_pr / (score.q_pr + score.p_pr);
577 }
578 }
579
580 return score.spamicity;
581 }
582
msg_print_summary(const char * pfx)583 void msg_print_summary(const char *pfx)
584 {
585 if (!Rtable) {
586 (void)fprintf(fpo, "%s%-*s %6lu %9.6f %9.6f %9.6f\n",
587 pfx, max_token_len+2, "N_P_Q_S_s_x_md", (unsigned long)score.robn,
588 score.p_pr, score.q_pr, score.spamicity);
589 (void)fprintf(fpo, "%s%-*s %9.6f %9.6f %9.6f\n",
590 pfx, max_token_len+2+6, " ", robs, robx, score.min_dev);
591 }
592 else {
593 /* Trim token to 22 characters to accomodate R's default line length of 80 */
594 (void)fprintf(fpo, "%s%-24s %6lu %9.2e %9.2e %9.2e %9.2e %9.2e %5.3f\n",
595 pfx, "N_P_Q_S_s_x_md", (unsigned long)score.robn,
596 score.p_pr, score.q_pr, score.spamicity, robs, robx, score.min_dev);
597 }
598 }
599
600 /* Done */
601