bogofilter-1.2.5/src/score.c

/*****************************************************************************

NAME:
   score.c -- implements Fisher variant on Robinson algorithm.

******************************************************************************/

#include "common.h"

#include <math.h>
#include <string.h>
#include <stdlib.h>

#include "bogoconfig.h"
#include "bogofilter.h"
#include "collect.h"
#include "datastore.h"
#include "listsort.h"
#include "msgcounts.h"
#include "prob.h"
#include "rand_sleep.h"
#include "rstats.h"
#include "score.h"
#include "wordhash.h"
#include "wordlists.h"

#if defined(HAVE_GSL_10) && !defined(HAVE_GSL_14)
/* HAVE_GSL_14 implies HAVE_GSL_10
 * if we have neither, we'll use our included GSL 1.4, which knows CDFs
 * if we have both, we have GSL 1.4, which knows CDFs
 *
 * in other cases, we need to integrate the PDF to get the CDF
 */
#define GSL_INTEGRATE_PDF
#include "gsl/gsl_randist.h"
#include "gsl/gsl_integration.h"
#include "gsl/gsl_errno.h"
#else
#include "gsl/gsl_cdf.h"
#endif

/* Structure Definitions */

typedef struct probnode_t {
    hashnode_t * node;
    double	 prob;
} probnode_t;

/* struct for saving stats for printing. */
typedef struct score_s {
    double min_dev;
    double spamicity;
    u_int32_t robn;
    double p_ln;	/* Robinson P, as a log*/
    double q_ln;	/* Robinson Q, as a log*/
    double p_pr;	/* Robinson P */
    double q_pr;	/* Robinson Q */
} score_t;

/* Function Prototypes */

static	double	get_spamicity(size_t robn, FLOAT P, FLOAT Q);
static	bool	need_scoring_boundary(size_t count);
static	double	find_scoring_boundary(wordhash_t *wh);
static	size_t	compute_count_and_scores(wordhash_t *wh);
static	size_t	compute_count_and_spamicity(wordhash_t *wh, FLOAT *P, FLOAT *Q, bool need_stats);
static	int	compare_hashnode_t(const void *const pv1, const void *const pv2);

/* Static Variables */

static score_t	score;

/* Function Definitions */

double msg_spamicity(void)
{
    return score.spamicity;
}

rc_t msg_status(void)
{
    if (score.spamicity >= spam_cutoff)
	return RC_SPAM;

    if ((ham_cutoff < EPS) ||
	(score.spamicity <= ham_cutoff))
	return RC_HAM;

    return RC_UNSURE;
}

void msg_print_stats(FILE *fp)
{
    bool unsure = unsure_stats && (msg_status() == RC_UNSURE) && verbose;

    (void)fp;

    if (quiet)
	return;

    if (Rtable || unsure || verbose >= 2)
	rstats_print(unsure);
}

/** search token in all lists according to precedence, summing up the
 * counts (all lists at same precedence are used); if found, set cnts
 * accordingly. */
static int lookup(const word_t *token, wordcnts_t *cnts)
{
    int override=0;
    wordlist_t* list;

    if (fBogotune) {
	wordprop_t *wp = (wordprop_t *)wordhash_search_memory(token);
	if (wp) {
	    cnts->good = wp->cnts.good;
	    cnts->bad  = wp->cnts.bad;
	}
	return 0;
    }

    cnts->msgs_bad = cnts->msgs_good = 0;

    for (list=word_lists; list != NULL; list=list->next)
    {
	dsv_t val;
	int ret;

	if (override > list->override)	/* if already found */
	    break;

	ret = ds_read(list->dsh, token, &val);

	/* check if we have the token */
	switch (ret) {
	    case 0:
		/* token found, pass on */
		break;
	    case 1:
		/* token not found, clear counts */
		val.count[IX_GOOD] = 0;
		val.count[IX_SPAM] = 0;
		break;
	    case DS_ABORT_RETRY:
		/* sleep, reinitialize and start over */
		rand_sleep(1000,1000000);
		begin_wordlist(list);
		/* FALLTHROUGH */
	    default:
		return ret;
	}

	if (ret == 0 && list->type == WL_IGNORE) {	/* if found on ignore list */
	    cnts->good = cnts->bad = 0;
	    break;
	}

	override=list->override;

	if (DEBUG_ALGORITHM(2)) {
	    fprintf(dbgout, "%6d %5u %5u %5u %5u list=%s,%c,%d ",
		    ret, (uint)val.count[IX_GOOD], (uint)val.count[IX_SPAM],
		    (uint)list->msgcount[IX_GOOD], (uint)list->msgcount[IX_SPAM],
		    list->listname, list->type, list->override);
	    word_puts(token, 0, dbgout);
	    fputc('\n', dbgout);
	}

	cnts->good += val.count[IX_GOOD];
	cnts->bad += val.count[IX_SPAM];
	cnts->msgs_good += list->msgcount[IX_GOOD];
	cnts->msgs_bad += list->msgcount[IX_SPAM];
    }

    if (DEBUG_ALGORITHM(1)) {
	fprintf(dbgout, "%5u %5u ", (uint)cnts->bad, (uint)cnts->good);
	word_puts(token, 0, dbgout);
	fputc('\n', dbgout);
    }

    return 0;
}


/* do wordlist lookups for the words in the wordhash
 */
void lookup_words(wordhash_t *wh)
{
    int ret;
    hashnode_t *node;

    if (msg_count_file)	/* if mc file, already done */
	return;

retry:
    for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
    {
	word_t *token     = node->key;
	wordprop_t *props = (wordprop_t *) node->data;
	wordcnts_t *cnts  = &props->cnts;
	ret = lookup(token, cnts);
	if (ret == DS_ABORT_RETRY)
	    /* start all over, the message counts may have changed
	     * lookup handles reinitializing the wordlist */
	    goto retry;
    }

    return;
}

/** selects the best spam/non-spam indicators and calculates Robinson's S,
 * \return -1.0 for error, S otherwise */
double msg_compute_spamicity(wordhash_t *wh) /*@globals errno@*/
{
    FLOAT P = {1.0, 0};		/* Robinson's P */
    FLOAT Q = {1.0, 0};		/* Robinson's Q */

    double spamicity;
    size_t robn = 0;

    bool need_stats = !fBogotune && (Rtable || passthrough || (verbose > 0));

    if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "### msg_compute_spamicity() begins\n");

    if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "min_dev: %f, robs: %f, robx: %f\n",
				    min_dev, robs, robx);

    /* compute scores for the wordhash's tokens */
    robn = compute_count_and_scores(wh);

    /* recalculate min_dev if necessary to satisfy token_count settings */
    if (!need_scoring_boundary(robn))
	score.min_dev = min_dev;
    else
	score.min_dev = find_scoring_boundary(wh);

    /* compute message spamicity from the wordhash's scores */
    robn = compute_count_and_spamicity(wh, &P, &Q, need_stats);

    /* Robinson's P, Q and S
    ** S = (P - Q) / (P + Q)			    [combined indicator]
    */
    spamicity = get_spamicity(robn, P, Q);

    if (need_stats && robn != 0)
	rstats_fini(robn, P, Q, spamicity);

    if (DEBUG_ALGORITHM(2)) fprintf(dbgout, "### msg_compute_spamicity() ends\n");

    return spamicity;
}

/*
** compute_count_and_scores()
**	compute the token probabilities from the linked list of tokens
*/
static size_t compute_count_and_scores(wordhash_t *wh)
{
    size_t count = 0;
    hashnode_t *node;

    if (fBogotune)
	return count;

    for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
    {
	wordcnts_t *cnts;
	wordprop_t *props;

	props = (wordprop_t *) node->data;
	cnts  = &props->cnts;
	props->prob = calc_prob(cnts->good, cnts->bad,
				cnts->msgs_good, cnts->msgs_bad);
	props->used = fabs(props->prob - EVEN_ODDS) > min_dev;
	if (props->used)
	    count += 1;
    }

    return count;
}

/*
** compute_count_and_score()
**	compute the spamicity from the linked list of tokens using
**	min_dev to select tokens
*/
static size_t compute_count_and_spamicity(wordhash_t *wh,
					  FLOAT *P, FLOAT *Q,
					  bool need_stats)
{
    size_t count = 0;

    hashnode_t *node;

    for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh))
    {
	bool useflag;
	double prob;
	word_t *token;
	wordcnts_t *cnts;
	wordprop_t *props;

	if (!fBogotune) {
	    token = node->key;
	    props = (wordprop_t *) node->data;
	    cnts  = &props->cnts;
	    prob = props->prob;
	    useflag = props->used;
	} else {
	    token = NULL;
	    cnts = (wordcnts_t *) node;
	    prob = calc_prob(cnts->good, cnts->bad,
			     cnts->msgs_good, cnts->msgs_bad);
	    useflag = fabs(prob - EVEN_ODDS) > score.min_dev;
	}

	if (need_stats)
	    rstats_add(token, prob, useflag, cnts);

	/* Robinson's P and Q; accumulation step */
	/*
	 * P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n)	[spamminess]
	 * Q = 1 - (p1*p2*...*pn)^(1/n)			[non-spamminess]
	 */
	if (useflag ) {
	    int e;

	    P->mant *= 1-prob;
	    if (P->mant < 1.0e-200) {
		P->mant = frexp(P->mant, &e);
		P->exp += e;
	    }

	    Q->mant *= prob;
	    if (Q->mant < 1.0e-200) {
		Q->mant = frexp(Q->mant, &e);
		Q->exp += e;
	    }
	    count += 1;
	}

	if (DEBUG_ALGORITHM(3)) {
	    (void)fprintf(dbgout, "%3lu %3lu %f ",
			  (unsigned long)count, (unsigned long)count, prob);
	    (void)word_puts(token, 0, dbgout);
	    (void)fputc('\n', dbgout);
	}
    }

    return count;
}

/* need_scoring_boundary( )
**	determine if min_dev gives a count fitting the token count limits
**	return True if so; False if not
*/
static bool need_scoring_boundary(size_t count)
{
    /* Early out if no token count limits are set */
    if (((token_count_min == 0) || (token_count_min <= count)) &&
	((token_count_max == 0) || (token_count_max >= count)) &&
	((token_count_fix == 0) || (token_count_fix == count)))
	return false;
    else
	return true;
}

/* find_scoring_boundary( )
**	determine the token score that gives the desired token count
**	for scoring the message.
*/
static double find_scoring_boundary(wordhash_t *wh)
{
    size_t count = 0;

    double min_prob = (token_count_max == 0.0) ? min_dev : 1.0;

    hashnode_t *node;

    /* sort by ascending score difference (from 0.5) */
    wh->iter_head = (hashnode_t *)listsort((element *)wh->iter_head, (fcn_compare *)&compare_hashnode_t);

    count = max(token_count_fix, max(token_count_min, token_count_max));

    for (node = (hashnode_t *)wordhash_first(wh); node != NULL; node = (hashnode_t *)wordhash_next(wh)) {
	wordcnts_t *cnts;
	wordprop_t *props = NULL;
	double prob;
	double dev;

	if (!fBogotune) {
	    props = (wordprop_t *) node->data;
	    cnts  = &props->cnts;
	} else {
	    cnts = (wordcnts_t *) node;
	}
	prob = calc_prob(cnts->good, cnts->bad,
			 cnts->msgs_good, cnts->msgs_bad);
	dev = fabs(prob - EVEN_ODDS);

	if (count > 0) {
	    count -= 1;
	    if (!fBogotune)
		props->used = true;
	    min_prob = dev;
	}
	else if (dev >= min_prob) {
	    if (!fBogotune)
		props->used = true;
	}
	else {
	    if (!fBogotune)
		props->used = false;
	}
    }

    return min_prob;
}

/* compare_hashnode_t - sort by ascending score difference (from 0.5) */

static int compare_hashnode_t(const void *const pv1, const void *const pv2)
{
    double d1;
    double d2;

    if (!fBogotune) {
	const hashnode_t *hn1 = (const hashnode_t *)pv1;
	const hashnode_t *hn2 = (const hashnode_t *)pv2;
	d1 = fabs(((wordprop_t *) hn1->data)->prob - EVEN_ODDS);
	d2 = fabs(((wordprop_t *) hn2->data)->prob - EVEN_ODDS);
    } else {
	const wordcnts_t *cnts;
	double prob;
	cnts = (const wordcnts_t *) pv1;
	prob = calc_prob(cnts->good, cnts->bad,
			 cnts->msgs_good, cnts->msgs_bad);
	d1 = fabs(prob - EVEN_ODDS);

	cnts = (const wordcnts_t *) pv2;
	prob = calc_prob(cnts->good, cnts->bad,
			 cnts->msgs_good, cnts->msgs_bad);
	d2 = fabs(prob - EVEN_ODDS);
    }

    if (d1 < d2)
	return +1;
    if (d1 > d2)
	return -1;

    return 0;
}

void score_initialize(void)
{
    word_t *word_robx = word_news(ROBX_W);

    wordlist_t *list = get_default_wordlist(word_lists);

    if (fabs(min_dev) < EPS)
	min_dev = MIN_DEV;
    if (spam_cutoff < EPS)
	spam_cutoff = SPAM_CUTOFF;

    /*
    ** If we're classifying messages, we need to compute the scalefactor
    ** (from the .MSG_COUNT values)
    ** If we're registering tokens, we needn't get .MSG_COUNT
    */

    if (fabs(robs) < EPS)
	robs = ROBS;

    if (fabs(robx) < EPS)
    {
	/* Assign default value in case there's no wordlist
	 * or no wordlist entry */
	robx = ROBX;
	if (list->dsh != NULL)
	{
	    int ret;
	    dsv_t val;

	    /* Note: .ROBX is scaled by 1000000 in the wordlist */
	    ret = ds_read(list->dsh, word_robx, &val);
	    if (ret != 0)
		robx = ROBX;
	    else {
		/* If found, unscale; else use predefined value */
		uint l_robx = val.count[IX_SPAM];
		robx = l_robx ? (double)l_robx / 1000000 : ROBX;
	    }
	}
    }

    if (robx < 0.0 || 1.0 < robx) {
	fprintf(stderr, "Invalid robx value (%f).  Must be between 0.0 and 1.0\n", robx);
	exit(EX_ERROR);
    }

    word_free(word_robx);

    return;
}

void score_cleanup(void)
{
/*    rstats_cleanup(); */
}

#ifdef GSL_INTEGRATE_PDF
static double chisq(double x, void *p) {
     return(gsl_ran_chisq_pdf(x, *(double *)p));
}

inline static double prbf(double x, double df) {
    gsl_function chi;
    int status;
    double p, abserr;
    const int intervals = 15;
    const double eps = 1000 * DBL_EPSILON;

    gsl_integration_workspace *w;
    chi.function = chisq;
    chi.params = &df;
    gsl_set_error_handler_off();
    w = gsl_integration_workspace_alloc(intervals);
    if (!w) {
	fprintf(stderr, "Out of memory! %s:%d\n", __FILE__, __LINE__);
	exit(EX_ERROR);
    }
    status = gsl_integration_qag(&chi, 0, x, eps, eps,
	    intervals, GSL_INTEG_GAUSS41, w, &p, &abserr);
    if (status && status != GSL_EMAXITER) {
	fprintf(stderr, "Integration error: %s\n", gsl_strerror(status));
	exit(EX_ERROR);
    }
    gsl_integration_workspace_free(w);
    p = max(0.0, 1.0 - p);
    return(min(1.0, p));
}
#else
inline static double prbf(double x, double df)
{
    double r = gsl_cdf_chisq_Q(x, df);
    return (r < DBL_EPSILON) ? 0.0 : r;
}
#endif

static double get_spamicity(size_t robn, FLOAT P, FLOAT Q)
{
    if (robn == 0)
    {
	score.spamicity = robx;
    }
    else
    {
	double sp_df = 2.0 * robn * sp_esf;
	double ns_df = 2.0 * robn * ns_esf;
	double ln2 = log(2.0);					/* ln(2) */

	score.robn = robn;

	/* convert to natural logs */
	score.p_ln = (log(P.mant) + P.exp * ln2) * sp_esf;	/* invlogsum */
	score.q_ln = (log(Q.mant) + Q.exp * ln2) * ns_esf;	/* logsum */

	score.p_pr = prbf(-2.0 * score.p_ln, sp_df);		/* compute P */
	score.q_pr = prbf(-2.0 * score.q_ln, ns_df);		/* compute Q */

	if (!fBogotune && sp_esf >= 1.0 && ns_esf >= 1.0) {
	    score.spamicity = (1.0 + score.q_pr - score.p_pr) / 2.0;
	} else if (score.q_pr < DBL_EPSILON && score.p_pr < DBL_EPSILON) {
	    score.spamicity = 0.5;
	} else {
	    score.spamicity = score.q_pr / (score.q_pr + score.p_pr);
	}
    }

    return score.spamicity;
}

void msg_print_summary(const char *pfx)
{
    if (!Rtable) {
	(void)fprintf(fpo, "%s%-*s %6lu %9.6f %9.6f %9.6f\n",
		      pfx, max_token_len+2, "N_P_Q_S_s_x_md", (unsigned long)score.robn,
		      score.p_pr, score.q_pr, score.spamicity);
	(void)fprintf(fpo, "%s%-*s  %9.6f %9.6f %9.6f\n",
		      pfx, max_token_len+2+6, " ", robs, robx, score.min_dev);
    }
    else {
	/* Trim token to 22 characters to accomodate R's default line length of 80 */
	(void)fprintf(fpo, "%s%-24s %6lu %9.2e %9.2e %9.2e %9.2e %9.2e %5.3f\n",
		      pfx, "N_P_Q_S_s_x_md", (unsigned long)score.robn,
		      score.p_pr, score.q_pr, score.spamicity, robs, robx, score.min_dev);
    }
}

/* Done */