1 /*****************************************************************************
2
3 NAME:
4 bogofilter.c -- detect spam and bogons presented on standard input.
5
6 AUTHORS:
7 Eric S. Raymond <esr@thyrsus.com>
8 David Relson <relson@osagesoftware.com>
9 Matthias Andree <matthias.andree@gmx.de>
10 Greg Louis <glouis@dynamicro.on.ca>
11
12 THEORY:
13
14 Originally implemented as Paul Graham's variant of Bayes filtering,
15 as described in
16
17 "A Plan For Spam", http://www.paulgraham.com/spam.html
18
19 Updated in accordance with Gary Robinson's proposed modifications,
20 as described at
21
22 http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
23
24 ******************************************************************************/
25
26 #include "common.h"
27
28 #include <string.h>
29 #include <stdlib.h>
30
31 #include "bogofilter.h"
32 #include "bogoconfig.h"
33 #include "bogoreader.h"
34 #include "collect.h"
35 #include "format.h"
36 #include "passthrough.h"
37 #include "register.h"
38 #include "rstats.h"
39 #include "score.h"
40
/*
**	case B_NORMAL:
**	case B_STDIN:		* '-b' - streaming (stdin) mode *
**	case B_CMDLINE:		* '-B' - command line mode *
**
**loop:
**    read & parse a message
**	if -p, save textblocks
**    register if -snSN && -pe
**    classify if -pue && ! -snSN
**    register if -u
**    write    if -p
**    if (-snSN && -pe) || -u
**	free tokens
**    else
**	accumulate tokens
**
**end:	register if -snSN && ! -pe
*/
60
61 /* Function Definitions */
62
/*
 * Public entry point for statistics output: hands the caller's stream,
 * unchanged, to msg_print_stats().
 */
void print_stats(FILE *fp)
{
    msg_print_stats(fp);
}
67
bogofilter(int argc,char ** argv)68 rc_t bogofilter(int argc, char **argv)
69 {
70 uint msgcount = 0;
71 rc_t status = RC_OK;
72 bool register_opt = (run_type & (REG_SPAM | UNREG_SPAM | REG_GOOD | UNREG_GOOD)) != 0;
73 bool register_bef = register_opt && passthrough;
74 bool register_aft = ((register_opt && !passthrough) || (run_type & RUN_UPDATE)) != 0;
75 bool write_msg = passthrough || Rtable;
76 bool classify_msg = write_msg || ((run_type & (RUN_NORMAL | RUN_UPDATE))) != 0;
77
78 wordhash_t *words;
79
80 score_initialize(); /* initialize constants */
81
82 if (query)
83 return query_config();
84
85 words = register_aft ? wordhash_new() : NULL;
86
87 bogoreader_init(argc, (const char * const *) argv);
88
89 while ((*reader_more)()) {
90 wordhash_t *w = wordhash_new();
91
92 rstats_init();
93 passthrough_setup();
94
95 collect_words(w);
96 wordhash_sort(w);
97 msgcount += 1;
98
99 format_set_counts(w->count, msgcount);
100
101 if (!passthrough_keepopen())
102 bogoreader_close_ifeof();
103
104 if (register_opt && DEBUG_REGISTER(1))
105 fprintf(dbgout, "Message #%ld\n", (long) msgcount);
106 if (register_bef)
107 register_words(run_type, w, 1);
108 if (register_aft)
109 wordhash_add(words, w, &wordprop_init);
110
111 if (classify_msg || write_msg) {
112 double spamicity;
113 lookup_words(w); /* This reads the database */
114 spamicity = msg_compute_spamicity(w);
115 status = msg_status();
116 if (run_type & RUN_UPDATE) /* Note: don't register if RC_UNSURE */
117 {
118 if (status == RC_SPAM && spamicity <= 1.0 - thresh_update)
119 register_words(REG_SPAM, w, msgcount);
120 if (status == RC_HAM && spamicity >= thresh_update)
121 register_words(REG_GOOD, w, msgcount);
122 }
123
124 if (verbose && !passthrough && !quiet) {
125 const char *filename = (*reader_filename)();
126 if (filename)
127 fprintf(fpo, "%s ", filename);
128 }
129
130 write_message(status); /* passthrough */
131 if (logflag && !register_opt) {
132 write_log_message(status);
133 msgcount = 0;
134 }
135 }
136 wordhash_free(w);
137
138 passthrough_cleanup();
139 rstats_cleanup();
140
141 if (DEBUG_MEMORY(2))
142 MEMDISPLAY;
143
144 if (fDie)
145 exit(EX_ERROR);
146 }
147
148 bogoreader_fini();
149
150 if (DEBUG_MEMORY(1))
151 MEMDISPLAY;
152
153 if (register_aft && ((run_type & RUN_UPDATE) == 0)) {
154 wordhash_sort(words);
155 register_words(run_type, words, msgcount);
156 }
157
158 score_cleanup();
159
160 if (logflag && register_opt)
161 write_log_message(status);
162
163 wordhash_free(words);
164
165 if (DEBUG_MEMORY(1))
166 MEMDISPLAY;
167
168 return status;
169 }
170
171 /* Done */
172