1 /*****************************************************************************
2 
3 NAME:
4    bogofilter.c -- detect spam and bogons presented on standard input.
5 
6 AUTHORS:
7    Eric S. Raymond <esr@thyrsus.com>
8    David Relson    <relson@osagesoftware.com>
9    Matthias Andree <matthias.andree@gmx.de>
10    Greg Louis      <glouis@dynamicro.on.ca>
11 
12 THEORY:
13 
14    Originally implemented as Paul Graham's variant of Bayes filtering,
15    as described in
16 
17      "A Plan For Spam", http://www.paulgraham.com/spam.html
18 
19    Updated in accordance with Gary Robinson's proposed modifications,
20    as described at
21 
22     http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
23 
24 ******************************************************************************/
25 
26 #include "common.h"
27 
28 #include <string.h>
29 #include <stdlib.h>
30 
31 #include "bogofilter.h"
32 #include "bogoconfig.h"
33 #include "bogoreader.h"
34 #include "collect.h"
35 #include "format.h"
36 #include "passthrough.h"
37 #include "register.h"
38 #include "rstats.h"
39 #include "score.h"
40 
41 /*
42 **	case B_NORMAL:
43 **	case B_STDIN:		* '-b' - streaming (stdin) mode *
44 **	case B_CMDLINE:		* '-B' - command line mode *
45 **
46 **loop:
47 **    read & parse a message
48 **	if -p, save textblocks
49 **    register if -snSN && -pe
50 **    classify if -pue && ! -snSN
51 **    register if -u
52 **    write    if -p
53 **    if (-snSN && -pe) || -u
54 **	free tokens
55 **    else
56 **	accumulate tokens
57 **
58 **end:	register if -snSN && ! -pe
59 */
60 
61 /* Function Definitions */
62 
/*
 * print_stats -- forward the output stream to msg_print_stats().
 *
 * fp: stream handed unchanged to msg_print_stats(); this wrapper does
 *     nothing else with it.
 */
void print_stats(FILE *fp)
{
    msg_print_stats(fp);
}
67 
bogofilter(int argc,char ** argv)68 rc_t bogofilter(int argc, char **argv)
69 {
70     uint msgcount = 0;
71     rc_t status = RC_OK;
72     bool register_opt = (run_type & (REG_SPAM | UNREG_SPAM | REG_GOOD | UNREG_GOOD)) != 0;
73     bool register_bef = register_opt && passthrough;
74     bool register_aft = ((register_opt && !passthrough) || (run_type & RUN_UPDATE)) != 0;
75     bool write_msg    = passthrough || Rtable;
76     bool classify_msg = write_msg || ((run_type & (RUN_NORMAL | RUN_UPDATE))) != 0;
77 
78     wordhash_t *words;
79 
80     score_initialize();			/* initialize constants */
81 
82     if (query)
83 	return query_config();
84 
85     words = register_aft ? wordhash_new() : NULL;
86 
87     bogoreader_init(argc, (const char * const *) argv);
88 
89     while ((*reader_more)()) {
90 	wordhash_t *w = wordhash_new();
91 
92 	rstats_init();
93 	passthrough_setup();
94 
95 	collect_words(w);
96 	wordhash_sort(w);
97 	msgcount += 1;
98 
99 	format_set_counts(w->count, msgcount);
100 
101         if (!passthrough_keepopen())
102             bogoreader_close_ifeof();
103 
104 	if (register_opt && DEBUG_REGISTER(1))
105 	    fprintf(dbgout, "Message #%ld\n", (long) msgcount);
106 	if (register_bef)
107 	    register_words(run_type, w, 1);
108 	if (register_aft)
109 	    wordhash_add(words, w, &wordprop_init);
110 
111 	if (classify_msg || write_msg) {
112 	    double spamicity;
113 	    lookup_words(w);			/* This reads the database */
114 	    spamicity = msg_compute_spamicity(w);
115 	    status = msg_status();
116 	    if (run_type & RUN_UPDATE)		/* Note: don't register if RC_UNSURE */
117 	    {
118 		if (status == RC_SPAM && spamicity <= 1.0 - thresh_update)
119 		    register_words(REG_SPAM, w, msgcount);
120 		if (status == RC_HAM && spamicity >= thresh_update)
121 		    register_words(REG_GOOD, w, msgcount);
122 	    }
123 
124 	    if (verbose && !passthrough && !quiet) {
125 		const char *filename = (*reader_filename)();
126 		if (filename)
127 		    fprintf(fpo, "%s ", filename);
128 	    }
129 
130 	    write_message(status);		/* passthrough */
131 	    if (logflag && !register_opt) {
132 		write_log_message(status);
133 		msgcount = 0;
134 	    }
135 	}
136 	wordhash_free(w);
137 
138 	passthrough_cleanup();
139 	rstats_cleanup();
140 
141 	if (DEBUG_MEMORY(2))
142 	    MEMDISPLAY;
143 
144 	if (fDie)
145 	    exit(EX_ERROR);
146     }
147 
148     bogoreader_fini();
149 
150     if (DEBUG_MEMORY(1))
151 	MEMDISPLAY;
152 
153     if (register_aft && ((run_type & RUN_UPDATE) == 0)) {
154 	wordhash_sort(words);
155 	register_words(run_type, words, msgcount);
156     }
157 
158     score_cleanup();
159 
160     if (logflag && register_opt)
161 	write_log_message(status);
162 
163     wordhash_free(words);
164 
165     if (DEBUG_MEMORY(1))
166 	MEMDISPLAY;
167 
168     return status;
169 }
170 
171 /* Done */
172