1 /*
2 * Heirloom mailx - a mail user agent derived from Berkeley Mail.
3 *
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
5 */
6 /*
7 * Copyright (c) 2004
8 * Gunnar Ritter. All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by Gunnar Ritter
21 * and his contributors.
22 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #ifndef lint
40 #ifdef DOSCCS
41 static char sccsid[] = "@(#)junk.c 1.73 (gritter) 3/4/06";
42 #endif
43 #endif /* not lint */
44
45 #include "config.h"
46 #include "rcv.h"
47
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #include <limits.h>
51 #include <time.h>
52 #include <unistd.h>
53 #include <utime.h>
54
55 #ifdef HAVE_MMAP
56 #include <sys/mman.h>
57 #else /* !HAVE_MMAP */
58 #define mmap(a, b, c, d, e, f) MAP_FAILED
59 #define munmap(a, b)
60 #endif /* !HAVE_MMAP */
61 #ifndef HAVE_MREMAP
62 #define mremap(a, b, c, d) MAP_FAILED
63 #endif /* !HAVE_MREMAP */
64
65 #ifndef MAP_FAILED
66 #define MAP_FAILED ((void *)-1)
67 #endif /* !MAP_FAILED */
68
69 #include "extern.h"
70 #include "md5.h"
71
72 /*
73 * Mail -- a mail program
74 *
75 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
76 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
77 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
78 *
79 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
80 * Classification using Chained Tokens", February 2004,
81 * <http://www.nuclearelephant.com/papers/chained.html>.
82 */
83
84 #define DFL .40
85 #define THR .9
86 #define MID .5
87
88 #define MAX2 0x0000ffff
89 #define MAX3 0x00ffffffUL
90 #define MAX4 0xffffffffUL
91
92 /*
93 * The dictionary consists of two files forming a hash table. The hash
94 * consists of the first 56 bits of the result of applying MD5 to the
95 * input word. This scheme ensures that collisions are unlikely enough
96 * to make junk detection work; according to the birthday paradox, a
97 * 50 % probability for one single collision is reached at 2^28 entries.
98 *
99 * To make the chain structure independent from input, the MD5 input is
100 * xor'ed with a random number. This makes it impossible that someone uses
101 * a carefully crafted message for a denial-of-service attack against the
102 * database.
103 */
104 #define SIZEOF_node 17
105 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
106 #define OF_node_next 4 /* bit-negated table index of next node */
107 #define OF_node_good 8 /* number of times this appeared in good msgs */
108 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
109 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
110 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
111 static char *nodes;
112
113 #define SIZEOF_super 262164
114 #define OF_super_size 0 /* allocated nodes in the chain file */
115 #define OF_super_used 4 /* used nodes in the chain file */
116 #define OF_super_ngood 8 /* number of good messages scanned so far */
117 #define OF_super_nbad 12 /* number of bad messages scanned so far */
118 #define OF_super_mangle 16 /* used to mangle the MD5 input */
119 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
120 #define SIZEOF_entry 4
121 static char *super;
122
123 static size_t super_mmapped;
124 static size_t nodes_mmapped;
125 static int rw_map;
126 static int chained_tokens;
127
128 /*
129 * Version history
130 * ---------------
131 * 0 Initial version
132 * 1 Fixed the mangling; it was ineffective in version 0.
133 * Hash extended to 56 bits.
134 */
135 static int table_version;
136 #define current_table_version 1
137
138 #define get(e) \
139 ((unsigned)(((char *)(e))[0]&0377) + \
140 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
141 ((unsigned)(((char *)(e))[2]&0377) << 16))
142
143 #define put(e, n) \
144 (((char *)(e))[0] = (n) & 0x0000ff, \
145 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
146 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
147
148 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
149
150 #define s2f(s) ((float)(s) / MAX3)
151
152 #define getn(p) \
153 ((unsigned long)(((char *)(p))[0]&0377) + \
154 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
155 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
156 ((unsigned long)(((char *)(p))[3]&0377) << 24))
157
158 #define putn(p, n) \
159 (((char *)(p))[0] = (n) & 0x000000ffUL, \
160 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
161 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
162 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
163
164 struct lexstat {
165 char *save;
166 int price;
167 int url;
168 int lastc;
169 int hadamp;
170 enum loc {
171 FROM_LINE = 0,
172 HEADER = 1,
173 BODY = 2
174 } loc;
175 enum html {
176 HTML_NONE = 0,
177 HTML_TEXT = 1,
178 HTML_TAG = 2,
179 HTML_SKIP = 3
180 } html;
181 char tag[8];
182 char *tagp;
183 char field[LINESIZE];
184 };
185
186 #define constituent(c, b, i, price, hadamp) \
187 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
188 (c) == '$' || (c) == '!' || (c) == '_' || \
189 (c) == '#' || (c) == '%' || (c) == '&' || \
190 ((c) == ';' && hadamp) || \
191 ((c) == '-' && !(price)) || \
192 (((c) == '.' || (c) == ',' || (c) == '/') && \
193 (i) > 0 && digitchar((b)[(i)-1]&0377)))
194
195 #define url_xchar(c) \
196 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
197 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
198 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
199 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
200 (c) != '#' && (c) != '"')
201
202 enum db {
203 SUPER = 0,
204 NODES = 1
205 };
206
207 enum entry {
208 GOOD = 0,
209 BAD = 1
210 };
211
212 static const char README1[] = "\
213 This is a junk mail database maintained by mailx(1). It does not contain any\n\
214 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
215 used for lookup. It is thus possible to tell if some given word was likely\n\
216 contained in your mail from examining this data, at best.\n";
217 static const char README2[] = "\n\
218 The database files are stored in compress(1) format by default. This saves\n\
219 some space, but leads to higher processor usage when the database is read\n\
220 or updated. You can use uncompress(1) on these files if you prefer to store\n\
221 them in flat form.\n";
222
223 static int verbose;
224 static int _debug;
225 static FILE *sfp, *nfp;
226 static char *sname, *nname;
227
228 static enum okay getdb(int rw);
229 static void putdb(void);
230 static void relsedb(void);
231 static FILE *dbfp(enum db db, int rw, int *compressed, char **fn);
232 static char *lookup(unsigned long h1, unsigned long h2, int create);
233 static unsigned long grow(unsigned long size);
234 static char *nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
235 struct lexstat *sp, int *stop);
236 static void join(char **buf, size_t *bufsize, const char *s1, const char *s2);
237 static void add(const char *word, enum entry entry, struct lexstat *sp,
238 int incr);
239 static enum okay scan(struct message *m, enum entry entry,
240 void (*func)(const char *, enum entry, struct lexstat *, int),
241 int arg);
242 static void recompute(void);
243 static float getprob(char *n);
244 static int insert(int *msgvec, enum entry entry, int incr);
245 static void clsf(struct message *m);
246 static void rate(const char *word, enum entry entry, struct lexstat *sp,
247 int unused);
248 static void dbhash(const char *word, unsigned long *h1, unsigned long *h2);
249 static void mkmangle(void);
250
251 static enum okay
getdb(int rw)252 getdb(int rw)
253 {
254 void *zp = NULL;
255 long n;
256 int compressed;
257
258 chained_tokens = value("chained-junk-tokens") != NULL;
259 if ((sfp = dbfp(SUPER, rw, &compressed, &sname)) == (FILE *)-1)
260 return STOP;
261 if (sfp && !compressed) {
262 super = mmap(NULL, SIZEOF_super,
263 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
264 MAP_SHARED, fileno(sfp), 0);
265 if (super != MAP_FAILED) {
266 super_mmapped = SIZEOF_super;
267 goto skip;
268 }
269 }
270 super_mmapped = 0;
271 super = smalloc(SIZEOF_super);
272 if (sfp) {
273 if (compressed)
274 zp = zalloc(sfp);
275 if ((compressed ? zread(zp, super, SIZEOF_super)
276 != SIZEOF_super :
277 fread(super, 1, SIZEOF_super, sfp)
278 != SIZEOF_super) ||
279 ferror(sfp)) {
280 fprintf(stderr, "Error reading junk mail database.\n");
281 memset(super, 0, SIZEOF_super);
282 mkmangle();
283 if (compressed)
284 zfree(zp);
285 Fclose(sfp);
286 sfp = NULL;
287 } else if (compressed)
288 zfree(zp);
289 } else {
290 memset(super, 0, SIZEOF_super);
291 mkmangle();
292 }
293 skip: if ((n = getn(&super[OF_super_size])) == 0) {
294 n = 1;
295 putn(&super[OF_super_size], 1);
296 }
297 if (sfp && (nfp = dbfp(NODES, rw, &compressed, &nname)) != NULL) {
298 if (nfp == (FILE *)-1) {
299 relsedb();
300 return STOP;
301 }
302 }
303 rw_map = rw;
304 if (sfp && nfp && !compressed) {
305 nodes = mmap(NULL, n * SIZEOF_node,
306 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
307 MAP_SHARED, fileno(nfp), 0);
308 if (nodes != MAP_FAILED) {
309 nodes_mmapped = n * SIZEOF_node;
310 return OKAY;
311 }
312 }
313 nodes_mmapped = 0;
314 nodes = smalloc(n * SIZEOF_node);
315 if (sfp && nfp) {
316 if (compressed)
317 zp = zalloc(nfp);
318 if ((compressed ? zread(zp, nodes, n * SIZEOF_node)
319 != n * SIZEOF_node :
320 fread(nodes, 1, n * SIZEOF_node, nfp)
321 != n * SIZEOF_node) ||
322 ferror(nfp)) {
323 fprintf(stderr, "Error reading junk mail database.\n");
324 memset(nodes, 0, n * SIZEOF_node);
325 memset(super, 0, SIZEOF_super);
326 mkmangle();
327 putn(&super[OF_super_size], n);
328 }
329 if (compressed)
330 zfree(zp);
331 Fclose(nfp);
332 nfp = NULL;
333 } else
334 memset(nodes, 0, n * SIZEOF_node);
335 if (sfp) {
336 Fclose(sfp);
337 sfp = NULL;
338 }
339 return OKAY;
340 }
341
342 static void
putdb(void)343 putdb(void)
344 {
345 void *zp;
346 int scomp, ncomp;
347
348 if (!super_mmapped && (sfp = dbfp(SUPER, O_WRONLY, &scomp, &sname))
349 == NULL || sfp == (FILE *)-1)
350 return;
351 if (!nodes_mmapped && (nfp = dbfp(NODES, O_WRONLY, &ncomp, &nname))
352 == NULL || nfp == (FILE *)-1)
353 return;
354 if (super_mmapped == 0 || nodes_mmapped == 0)
355 holdint();
356 /*
357 * Use utime() with mmap() since Linux does not update st_mtime
358 * reliably otherwise.
359 */
360 if (super_mmapped)
361 utime(sname, NULL);
362 else if (scomp) {
363 zp = zalloc(sfp);
364 zwrite(zp, super, SIZEOF_super);
365 zfree(zp);
366 trunc(sfp);
367 } else
368 fwrite(super, 1, SIZEOF_super, sfp);
369 if (nodes_mmapped)
370 utime(nname, NULL);
371 else if (ncomp) {
372 zp = zalloc(nfp);
373 zwrite(zp, nodes, getn(&super[OF_super_size]) * SIZEOF_node);
374 zfree(zp);
375 trunc(nfp);
376 } else
377 fwrite(nodes, 1,
378 getn(&super[OF_super_size]) * SIZEOF_node, nfp);
379 if (super_mmapped == 0 || nodes_mmapped == 0)
380 relseint();
381 }
382
383 static void
relsedb(void)384 relsedb(void)
385 {
386 if (super_mmapped) {
387 munmap(super, super_mmapped);
388 super_mmapped = 0;
389 } else
390 free(super);
391 if (nodes_mmapped) {
392 munmap(nodes, nodes_mmapped);
393 nodes_mmapped = 0;
394 } else
395 free(nodes);
396 if (sfp && sfp != (FILE *)-1) {
397 Fclose(sfp);
398 sfp = NULL;
399 }
400 if (nfp && nfp != (FILE *)-1) {
401 Fclose(nfp);
402 nfp = NULL;
403 }
404 }
405
406 static FILE *
dbfp(enum db db,int rw,int * compressed,char ** fn)407 dbfp(enum db db, int rw, int *compressed, char **fn)
408 {
409 FILE *fp, *rp;
410 char *dir;
411 struct flock flp;
412 char *sfx[][2] = {
413 { "super", "nodes" },
414 { "super1", "nodes1" }
415 };
416 char **sf;
417 char *zfx[][2] = {
418 { "super.Z", "nodes.Z" },
419 { "super1.Z", "nodes1.Z" }
420 };
421 char **zf;
422 int n;
423
424 if ((dir = value("junkdb")) == NULL) {
425 fprintf(stderr, "No junk mail database specified. "
426 "Set the junkdb variable.\n");
427 return (FILE *)-1;
428 }
429 dir = expand(dir);
430 if (makedir(dir) == STOP) {
431 fprintf(stderr, "Cannot create directory \"%s\"\n.", dir);
432 return (FILE *)-1;
433 }
434 if (rw!=O_WRONLY)
435 table_version = current_table_version;
436 loop: sf = sfx[table_version];
437 zf = zfx[table_version];
438 *fn = salloc((n = strlen(dir)) + 40);
439 strcpy(*fn, dir);
440 (*fn)[n] = '/';
441 *compressed = 0;
442 strcpy(&(*fn)[n+1], sf[db]);
443 if ((fp = Fopen(*fn, rw!=O_RDONLY ? "r+" : "r")) != NULL)
444 goto okay;
445 *compressed = 1;
446 strcpy(&(*fn)[n+1], zf[db]);
447 if ((fp = Fopen(*fn, rw ? "r+" : "r")) == NULL &&
448 rw==O_WRONLY ? (fp = Fopen(*fn, "w+")) == NULL : 0) {
449 fprintf(stderr, "Cannot open junk mail database \"%s\".\n",*fn);
450 return NULL;
451 }
452 if (rw==O_WRONLY) {
453 strcpy(&(*fn)[n+1], "README");
454 if (access(*fn, F_OK) < 0 && (rp = Fopen(*fn, "w")) != NULL) {
455 fputs(README1, rp);
456 fputs(README2, rp);
457 Fclose(rp);
458 }
459 } else if (fp == NULL) {
460 if (table_version > 0) {
461 table_version--;
462 goto loop;
463 } else
464 table_version = current_table_version;
465 }
466 okay: if (fp) {
467 flp.l_type = rw!=O_RDONLY ? F_WRLCK : F_RDLCK;
468 flp.l_start = 0;
469 flp.l_len = 0;
470 flp.l_whence = SEEK_SET;
471 fcntl(fileno(fp), F_SETLKW, &flp);
472 }
473 return fp;
474 }
475
476 static char *
lookup(unsigned long h1,unsigned long h2,int create)477 lookup(unsigned long h1, unsigned long h2, int create)
478 {
479 char *n, *lastn = NULL;
480 unsigned long c, lastc = MAX4, used, size;
481
482 used = getn(&super[OF_super_used]);
483 size = getn(&super[OF_super_size]);
484 c = ~getn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry]);
485 n = &nodes[c*SIZEOF_node];
486 while (c < used) {
487 if (getn(&n[OF_node_hash]) == h1 &&
488 (table_version < 1 ? 1 :
489 get(&n[OF_node_hash2]) == h2))
490 return n;
491 lastc = c;
492 lastn = n;
493 c = ~getn(&n[OF_node_next]);
494 n = &nodes[c*SIZEOF_node];
495 }
496 if (create) {
497 if (used >= size) {
498 if ((size = grow(size)) == 0)
499 return NULL;
500 lastn = &nodes[lastc*SIZEOF_node];
501 }
502 putn(&super[OF_super_used], used+1);
503 n = &nodes[used*SIZEOF_node];
504 putn(&n[OF_node_hash], h1);
505 put(&n[OF_node_hash2], h2);
506 if (lastc < used)
507 putn(&lastn[OF_node_next], ~used);
508 else
509 putn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry],
510 ~used);
511 return n;
512 } else
513 return NULL;
514 }
515
516 static unsigned long
grow(unsigned long size)517 grow(unsigned long size)
518 {
519 unsigned long incr, newsize;
520 void *onodes;
521
522 incr = size > MAX2 ? MAX2 : size;
523 newsize = size + incr;
524 if (newsize > MAX4-MAX2) {
525 oflo: fprintf(stderr, "Junk mail database overflow.\n");
526 return 0;
527 }
528 if (nodes_mmapped) {
529 if (lseek(fileno(nfp), newsize*SIZEOF_node-1, SEEK_SET)
530 == (off_t)-1 || write(fileno(nfp),"\0",1) != 1)
531 goto oflo;
532 onodes = nodes;
533 if ((nodes = mremap(nodes, nodes_mmapped, newsize*SIZEOF_node,
534 MREMAP_MAYMOVE)) == MAP_FAILED) {
535 if ((nodes = mmap(NULL, newsize*SIZEOF_node,
536 rw_map!=O_RDONLY ?
537 PROT_READ|PROT_WRITE :
538 PROT_READ,
539 MAP_SHARED, fileno(nfp), 0))
540 == MAP_FAILED) {
541 nodes = onodes;
542 goto oflo;
543 }
544 munmap(onodes, nodes_mmapped);
545 }
546 nodes_mmapped = newsize*SIZEOF_node;
547 } else {
548 nodes = srealloc(nodes, newsize*SIZEOF_node);
549 memset(&nodes[size*SIZEOF_node], 0, incr*SIZEOF_node);
550 }
551 size = newsize;
552 putn(&super[OF_super_size], size);
553 return size;
554 }
555
556 #define SAVE(c) { \
557 if (i+j >= (long)*bufsize-4) \
558 *buf = srealloc(*buf, *bufsize += 32); \
559 (*buf)[j+i] = (c); \
560 i += (*buf)[j+i] != '\0'; \
561 }
562
563 static char *
nextword(char ** buf,size_t * bufsize,size_t * count,FILE * fp,struct lexstat * sp,int * stop)564 nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
565 struct lexstat *sp, int *stop)
566 {
567 int c, i, j, k;
568 char *cp, *cq;
569
570 loop: *stop = 0;
571 sp->hadamp = 0;
572 if (sp->save) {
573 i = j = 0;
574 for (cp = sp->save; *cp; cp++) {
575 SAVE(*cp&0377)
576 }
577 SAVE('\0')
578 free(sp->save);
579 sp->save = NULL;
580 goto out;
581 }
582 if (sp->loc == FROM_LINE)
583 while (*count > 0 && (c = getc(fp)) != EOF) {
584 sp->lastc = c;
585 if (c == '\n') {
586 sp->loc = HEADER;
587 break;
588 }
589 }
590 i = 0;
591 j = 0;
592 if (sp->loc == HEADER && sp->field[0]) {
593 field: cp = sp->field;
594 do {
595 c = *cp&0377;
596 SAVE(c)
597 cp++;
598 } while (*cp);
599 j = i;
600 i = 0;
601 }
602 if (sp->price) {
603 sp->price = 0;
604 SAVE('$')
605 }
606 while (*count > 0 && (c = getc(fp)) != EOF) {
607 (*count)--;
608 if (c == '\0' && table_version >= 1) {
609 sp->loc = HEADER;
610 sp->lastc = '\n';
611 *stop = 1;
612 continue;
613 }
614 if (c == '\b' && table_version >= 1) {
615 sp->html = HTML_TEXT;
616 continue;
617 }
618 if (c == '<' && sp->html == HTML_TEXT) {
619 sp->html = HTML_TAG;
620 sp->tagp = sp->tag;
621 continue;
622 }
623 if (sp->html == HTML_TAG) {
624 if (spacechar(c)) {
625 *sp->tagp = '\0';
626 if (!asccasecmp(sp->tag, "a") ||
627 !asccasecmp(sp->tag, "img") ||
628 !asccasecmp(sp->tag, "font") ||
629 !asccasecmp(sp->tag, "span") ||
630 !asccasecmp(sp->tag, "meta") ||
631 !asccasecmp(sp->tag, "table") ||
632 !asccasecmp(sp->tag, "tr") ||
633 !asccasecmp(sp->tag, "td") ||
634 !asccasecmp(sp->tag, "p"))
635 sp->html = HTML_TEXT;
636 else
637 sp->html = HTML_SKIP;
638 } else if (c == '>') {
639 sp->html = HTML_TEXT;
640 continue;
641 } else {
642 if (sp->tagp - sp->tag < sizeof sp->tag - 1)
643 *sp->tagp++ = c;
644 continue;
645 }
646 }
647 if (sp->html == HTML_SKIP) {
648 if (c == '>')
649 sp->html = HTML_TEXT;
650 continue;
651 }
652 if (c == '$' && i == 0)
653 sp->price = 1;
654 if (sp->loc == HEADER && sp->lastc == '\n') {
655 if (!spacechar(c)) {
656 k = 0;
657 while (k < sizeof sp->field - 3) {
658 sp->field[k++] = c;
659 if (*count <= 0 ||
660 (c = getc(fp)) == EOF)
661 break;
662 if (spacechar(c) || c == ':') {
663 ungetc(c, fp);
664 break;
665 }
666 sp->lastc = c;
667 (*count)--;
668 }
669 sp->field[k++] = '*';
670 sp->field[k] = '\0';
671 j = 0;
672 *stop = 1;
673 goto field;
674 } else if (c == '\n') {
675 j = 0;
676 sp->loc = BODY;
677 sp->html = HTML_NONE;
678 *stop = 1;
679 }
680 }
681 if (sp->url) {
682 if (!url_xchar(c)) {
683 sp->url = 0;
684 cp = sp->save = smalloc(i+6);
685 for (cq = "HOST*"; *cq; cq++)
686 *cp++ = *cq;
687 for (cq = &(*buf)[j]; *cq != ':'; cq++);
688 cq += 3; /* skip "://" */
689 while (cq < &(*buf)[i+j] &&
690 (alnumchar(*cq&0377) ||
691 *cq == '.' || *cq == '-'))
692 *cp++ = *cq++;
693 *cp = '\0';
694 *stop = 1;
695 break;
696 }
697 SAVE(c)
698 } else if (constituent(c, *buf, i+j, sp->price, sp->hadamp) ||
699 sp->loc == HEADER && c == '.' &&
700 asccasecmp(sp->field, "subject*")) {
701 if (c == '&')
702 sp->hadamp = 1;
703 SAVE(c)
704 } else if (i > 0 && c == ':' && *count > 2) {
705 if ((c = getc(fp)) != '/') {
706 ungetc(c, fp);
707 break;
708 }
709 (*count)--;
710 if ((c = getc(fp)) != '/') {
711 ungetc(c, fp);
712 break;
713 }
714 (*count)--;
715 sp->url = 1;
716 SAVE('\0')
717 cp = savestr(*buf);
718 j = i = 0;
719 for (cq = "URL*"; *cq; cq++) {
720 SAVE(*cq&0377)
721 }
722 j = i;
723 i = 0;
724 do {
725 if (alnumchar(*cp&0377)) {
726 SAVE(*cp&0377)
727 } else
728 i = 0;
729 } while (*++cp);
730 for (cq = "://"; *cq; cq++) {
731 SAVE(*cq&0377)
732 }
733 } else if (i > 1 && ((*buf)[i+j-1] == ',' ||
734 (*buf)[i+j-1] == '.') && !digitchar(c)) {
735 i--;
736 ungetc(c, fp);
737 (*count)++;
738 break;
739 } else if (i > 0) {
740 sp->lastc = c;
741 break;
742 }
743 sp->lastc = c;
744 }
745 out: if (i > 0) {
746 SAVE('\0')
747 c = 0;
748 for (k = 0; k < i; k++)
749 if (digitchar((*buf)[k+j]&0377))
750 c++;
751 else if (!alphachar((*buf)[k+j]&0377) &&
752 (*buf)[k+j] != '$') {
753 c = 0;
754 break;
755 }
756 if (c == i)
757 goto loop;
758 /*
759 * Including the results of other filtering software (the
760 * 'X-Spam' fields) might seem tempting, but will also rate
761 * their false negatives good with this filter. Therefore
762 * these fields are ignored.
763 *
764 * Handling 'Received' fields is difficult since they include
765 * lots of both useless and interesting words for our purposes.
766 */
767 if (sp->loc == HEADER &&
768 (asccasecmp(sp->field, "message-id*") == 0 ||
769 asccasecmp(sp->field, "references*") == 0 ||
770 asccasecmp(sp->field, "in-reply-to*") == 0 ||
771 asccasecmp(sp->field, "status*") == 0 ||
772 asccasecmp(sp->field, "x-status*") == 0 ||
773 asccasecmp(sp->field, "date*") == 0 ||
774 asccasecmp(sp->field, "delivery-date*") == 0 ||
775 ascncasecmp(sp->field, "x-spam", 6) == 0 ||
776 ascncasecmp(sp->field, "x-pstn", 6) == 0 ||
777 ascncasecmp(sp->field, "x-scanned", 9) == 0 ||
778 asccasecmp(sp->field, "received*") == 0 &&
779 ((2*c > i) || i < 4 ||
780 asccasestr(*buf, "localhost") != NULL)))
781 goto loop;
782 return *buf;
783 }
784 return NULL;
785 }
786
787 #define JOINCHECK if (i >= *bufsize) \
788 *buf = srealloc(*buf, *bufsize += 32)
789 static void
join(char ** buf,size_t * bufsize,const char * s1,const char * s2)790 join(char **buf, size_t *bufsize, const char *s1, const char *s2)
791 {
792 int i = 0;
793
794 while (*s1) {
795 JOINCHECK;
796 (*buf)[i++] = *s1++;
797 }
798 JOINCHECK;
799 (*buf)[i++] = ' ';
800 do {
801 JOINCHECK;
802 (*buf)[i++] = *s2;
803 } while (*s2++);
804 }
805
806 /*ARGSUSED3*/
807 static void
add(const char * word,enum entry entry,struct lexstat * sp,int incr)808 add(const char *word, enum entry entry, struct lexstat *sp, int incr)
809 {
810 unsigned c;
811 unsigned long h1, h2;
812 char *n;
813
814 dbhash(word, &h1, &h2);
815 if ((n = lookup(h1, h2, 1)) != NULL) {
816 switch (entry) {
817 case GOOD:
818 c = get(&n[OF_node_good]);
819 if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
820 c += incr;
821 put(&n[OF_node_good], c);
822 }
823 break;
824 case BAD:
825 c = get(&n[OF_node_bad]);
826 if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
827 c += incr;
828 put(&n[OF_node_bad], c);
829 }
830 break;
831 }
832 }
833 }
834
835 static enum okay
scan(struct message * m,enum entry entry,void (* func)(const char *,enum entry,struct lexstat *,int),int arg)836 scan(struct message *m, enum entry entry,
837 void (*func)(const char *, enum entry, struct lexstat *, int),
838 int arg)
839 {
840 FILE *fp;
841 char *buf0 = NULL, *buf1 = NULL, *buf2 = NULL, **bp, *cp;
842 size_t bufsize0 = 0, bufsize1 = 0, bufsize2 = 0, *zp, count;
843 struct lexstat *sp;
844 int stop;
845
846 if ((fp = Ftemp(&cp, "Ra", "w+", 0600, 1)) == NULL) {
847 perror("tempfile");
848 return STOP;
849 }
850 rm(cp);
851 Ftfree(&cp);
852 if (send(m, fp, NULL, NULL, SEND_TOFLTR, NULL) < 0) {
853 Fclose(fp);
854 return STOP;
855 }
856 fflush(fp);
857 rewind(fp);
858 sp = scalloc(1, sizeof *sp);
859 count = fsize(fp);
860 bp = &buf0;
861 zp = &bufsize0;
862 while (nextword(bp, zp, &count, fp, sp, &stop) != NULL) {
863 (*func)(*bp, entry, sp, arg);
864 if (chained_tokens && buf0 && *buf0 && buf1 && *buf1 && !stop) {
865 join(&buf2, &bufsize2, bp == &buf1 ? buf0 : buf1, *bp);
866 (*func)(buf2, entry, sp, arg);
867 }
868 bp = bp == &buf1 ? &buf0 : &buf1;
869 zp = zp == &bufsize1 ? &bufsize0 : &bufsize1;
870 }
871 free(buf0);
872 free(buf1);
873 free(buf2);
874 free(sp);
875 Fclose(fp);
876 return OKAY;
877 }
878
879 static void
recompute(void)880 recompute(void)
881 {
882 unsigned long used, i;
883 unsigned s;
884 char *n;
885 float p;
886
887 used = getn(&super[OF_super_used]);
888 for (i = 0; i < used; i++) {
889 n = &nodes[i*SIZEOF_node];
890 p = getprob(n);
891 s = f2s(p);
892 put(&n[OF_node_prob_O], s);
893 }
894 }
895
896 static float
getprob(char * n)897 getprob(char *n)
898 {
899 unsigned long ngood, nbad;
900 unsigned g, b;
901 float p, BOT, TOP;
902
903 ngood = getn(&super[OF_super_ngood]);
904 nbad = getn(&super[OF_super_nbad]);
905 if (ngood + nbad >= 18000) {
906 BOT = .0001;
907 TOP = .9999;
908 } else if (ngood + nbad >= 9000) {
909 BOT = .001;
910 TOP = .999;
911 } else {
912 BOT = .01;
913 TOP = .99;
914 }
915 g = get(&n[OF_node_good]) * 2;
916 b = get(&n[OF_node_bad]);
917 if (g + b >= 5) {
918 p = smin(1.0, nbad ? (float)b/nbad : 0.0) /
919 (smin(1.0, ngood ? (float)g/ngood : 0.0) +
920 smin(1.0, nbad ? (float)b/nbad : 0.0));
921 p = smin(TOP, p);
922 p = smax(BOT, p);
923 } else if (g == 0 && b == 0)
924 p = DFL;
925 else
926 p = 0;
927 return p;
928 }
929
930 static int
insert(int * msgvec,enum entry entry,int incr)931 insert(int *msgvec, enum entry entry, int incr)
932 {
933 int *ip;
934 unsigned long u = 0;
935
936 verbose = value("verbose") != NULL;
937 if (getdb(O_RDWR) != OKAY)
938 return 1;
939 switch (entry) {
940 case GOOD:
941 u = getn(&super[OF_super_ngood]);
942 break;
943 case BAD:
944 u = getn(&super[OF_super_nbad]);
945 break;
946 }
947 for (ip = msgvec; *ip; ip++) {
948 setdot(&message[*ip-1]);
949 if (incr > 0 && u == MAX4-incr+1) {
950 fprintf(stderr, "Junk mail database overflow.\n");
951 break;
952 } else if (incr < 0 && -incr > u) {
953 fprintf(stderr, "Junk mail database underflow.\n");
954 break;
955 }
956 u += incr;
957 if (entry == GOOD && incr > 0 || entry == BAD && incr < 0)
958 message[*ip-1].m_flag &= ~MJUNK;
959 else
960 message[*ip-1].m_flag |= MJUNK;
961 scan(&message[*ip-1], entry, add, incr);
962 }
963 switch (entry) {
964 case GOOD:
965 putn(&super[OF_super_ngood], u);
966 break;
967 case BAD:
968 putn(&super[OF_super_nbad], u);
969 break;
970 }
971 if (table_version < 1)
972 recompute();
973 putdb();
974 relsedb();
975 return 0;
976 }
977
978 int
cgood(void * v)979 cgood(void *v)
980 {
981 return insert(v, GOOD, 1);
982 }
983
984 int
cjunk(void * v)985 cjunk(void *v)
986 {
987 return insert(v, BAD, 1);
988 }
989
990 int
cungood(void * v)991 cungood(void *v)
992 {
993 return insert(v, GOOD, -1);
994 }
995
996 int
cunjunk(void * v)997 cunjunk(void *v)
998 {
999 return insert(v, BAD, -1);
1000 }
1001
1002 int
cclassify(void * v)1003 cclassify(void *v)
1004 {
1005 int *msgvec = v, *ip;
1006
1007 verbose = value("verbose") != NULL;
1008 _debug = debug || value("debug") != NULL;
1009 if (getdb(O_RDONLY) != OKAY)
1010 return 1;
1011 for (ip = msgvec; *ip; ip++) {
1012 setdot(&message[*ip-1]);
1013 clsf(&message[*ip-1]);
1014 }
1015 relsedb();
1016 return 0;
1017 }
1018
1019 #define BEST 15
1020 static struct {
1021 float dist;
1022 float prob;
1023 char *word;
1024 unsigned long hash1;
1025 unsigned long hash2;
1026 enum loc loc;
1027 } best[BEST];
1028
1029 static void
clsf(struct message * m)1030 clsf(struct message *m)
1031 {
1032 int i;
1033 float a = 1, b = 1, r;
1034
1035 if (verbose)
1036 fprintf(stderr, "Examining message %d\n", m - &message[0] + 1);
1037 for (i = 0; i < BEST; i++) {
1038 best[i].dist = 0;
1039 best[i].prob = -1;
1040 }
1041 if (scan(m, -1, rate, 0) != OKAY)
1042 return;
1043 if (best[0].prob == -1) {
1044 if (verbose)
1045 fprintf(stderr, "No information found.\n");
1046 m->m_flag &= ~MJUNK;
1047 return;
1048 }
1049 for (i = 0; i < BEST; i++) {
1050 if (best[i].prob == -1)
1051 break;
1052 if (verbose)
1053 fprintf(stderr, "Probe %2d: \"%s\", hash=%lu:%lu "
1054 "prob=%.4g dist=%.4g\n",
1055 i+1, prstr(best[i].word),
1056 best[i].hash1, best[i].hash2,
1057 best[i].prob, best[i].dist);
1058 a *= best[i].prob;
1059 b *= 1 - best[i].prob;
1060 }
1061 r = a+b > 0 ? a / (a+b) : 0;
1062 if (verbose)
1063 fprintf(stderr, "Junk probability of message %d: %g\n",
1064 m - &message[0] + 1, r);
1065 if (r > THR)
1066 m->m_flag |= MJUNK;
1067 else
1068 m->m_flag &= ~MJUNK;
1069 }
1070
1071 /*ARGSUSED4*/
1072 static void
rate(const char * word,enum entry entry,struct lexstat * sp,int unused)1073 rate(const char *word, enum entry entry, struct lexstat *sp, int unused)
1074 {
1075 char *n;
1076 unsigned long h1, h2;
1077 float p, d;
1078 int i, j;
1079
1080 dbhash(word, &h1, &h2);
1081 if ((n = lookup(h1, h2, 0)) != NULL) {
1082 p = getprob(n);
1083 } else
1084 p = DFL;
1085 if (_debug)
1086 fprintf(stderr, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1, h2,
1087 n ? get(&n[OF_node_good]) : 0,
1088 n ? get(&n[OF_node_bad]) : 0,
1089 p, prstr(word));
1090 if (p == 0)
1091 return;
1092 d = p >= MID ? p - MID : MID - p;
1093 if (d >= best[BEST-1].dist)
1094 for (i = 0; i < BEST; i++) {
1095 if (h1 == best[i].hash1 && h2 == best[i].hash2)
1096 break;
1097 /*
1098 * This selection prefers words from the end of the
1099 * header and from the start of the body. It does
1100 * probably not matter much at all, but gives at
1101 * least the most interesting verbose output.
1102 */
1103 if (d > best[i].dist || best[i].loc == HEADER &&
1104 d >= best[i].dist) {
1105 for (j = BEST-2; j >= i; j--)
1106 best[j+1] = best[j];
1107 best[i].dist = d;
1108 best[i].prob = p;
1109 best[i].word = savestr(word);
1110 best[i].hash1 = h1;
1111 best[i].hash2 = h2;
1112 best[i].loc = sp->loc;
1113 break;
1114 }
1115 }
1116 }
1117
1118 static void
dbhash(const char * word,unsigned long * h1,unsigned long * h2)1119 dbhash(const char *word, unsigned long *h1, unsigned long *h2)
1120 {
1121 unsigned char digest[16];
1122 MD5_CTX ctx;
1123
1124 MD5Init(&ctx);
1125 MD5Update(&ctx, (unsigned char *)word, strlen(word));
1126 if (table_version >= 1)
1127 MD5Update(&ctx, (unsigned char *)&super[OF_super_mangle], 4);
1128 MD5Final(digest, &ctx);
1129 *h1 = getn(digest);
1130 if (table_version < 1) {
1131 *h1 ^= getn(&super[OF_super_mangle]);
1132 *h2 = 0;
1133 } else
1134 *h2 = get(&digest[4]);
1135 }
1136
1137 /*
1138 * The selection of the value for mangling is not critical. It is practically
1139 * impossible for any person to determine the exact time when the database
1140 * was created first (without looking at the database, which would reveal the
1141 * value anyway), so we just use this. The MD5 hash here ensures that each
1142 * single second gives a completely different mangling value (which is not
1143 * necessary anymore if table_version>=1, but does not hurt).
1144 */
1145 static void
mkmangle(void)1146 mkmangle(void)
1147 {
1148 union {
1149 time_t t;
1150 char c[16];
1151 } u;
1152 unsigned long s;
1153 unsigned char digest[16];
1154 MD5_CTX ctx;
1155
1156 memset(&u, 0, sizeof u);
1157 time(&u.t);
1158 MD5Init(&ctx);
1159 MD5Update(&ctx, (unsigned char *)u.c, sizeof u.c);
1160 MD5Final(digest, &ctx);
1161 s = getn(digest);
1162 putn(&super[OF_super_mangle], s);
1163 }
1164
1165 int
cprobability(void * v)1166 cprobability(void *v)
1167 {
1168 char **args = v;
1169 unsigned long used, ngood, nbad;
1170 unsigned long h1, h2;
1171 unsigned g, b;
1172 float p, d;
1173 char *n;
1174
1175 if (*args == NULL) {
1176 fprintf(stderr, "No words given.\n");
1177 return 1;
1178 }
1179 if (getdb(O_RDONLY) != OKAY)
1180 return 1;
1181 used = getn(&super[OF_super_used]);
1182 ngood = getn(&super[OF_super_ngood]);
1183 nbad = getn(&super[OF_super_nbad]);
1184 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1185 used, ngood, nbad);
1186 do {
1187 dbhash(*args, &h1, &h2);
1188 printf("\"%s\", hash=%lu:%lu ", *args, h1, h2);
1189 if ((n = lookup(h1, h2, 0)) != NULL) {
1190 g = get(&n[OF_node_good]);
1191 b = get(&n[OF_node_bad]);
1192 printf("good=%u bad=%u ", g, b);
1193 p = getprob(n);
1194 if (p != 0) {
1195 d = p >= MID ? p - MID : MID - p;
1196 printf("prob=%.4g dist=%.4g", p, d);
1197 } else
1198 printf("too infrequent");
1199 } else
1200 printf("not in database");
1201 putchar('\n');
1202 } while (*++args);
1203 relsedb();
1204 return 0;
1205 }
1206