1 /*
2  * Heirloom mailx - a mail user agent derived from Berkeley Mail.
3  *
4  * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
5  */
6 /*
7  * Copyright (c) 2004
8  *	Gunnar Ritter.  All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by Gunnar Ritter
21  *	and his contributors.
22  * 4. Neither the name of Gunnar Ritter nor the names of his contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #ifndef lint
40 #ifdef	DOSCCS
41 static char sccsid[] = "@(#)junk.c	1.73 (gritter) 3/4/06";
42 #endif
43 #endif /* not lint */
44 
45 #include "config.h"
46 #include "rcv.h"
47 
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #include <limits.h>
51 #include <time.h>
52 #include <unistd.h>
53 #include <utime.h>
54 
55 #ifdef	HAVE_MMAP
56 #include <sys/mman.h>
57 #else	/* !HAVE_MMAP */
58 #define	mmap(a, b, c, d, e, f)	MAP_FAILED
59 #define	munmap(a, b)
60 #endif	/* !HAVE_MMAP */
61 #ifndef	HAVE_MREMAP
62 #define	mremap(a, b, c, d)	MAP_FAILED
63 #endif	/* !HAVE_MREMAP */
64 
65 #ifndef	MAP_FAILED
66 #define	MAP_FAILED	((void *)-1)
67 #endif	/* !MAP_FAILED */
68 
69 #include "extern.h"
70 #include "md5.h"
71 
72 /*
73  * Mail -- a mail program
74  *
75  * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
76  * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
77  * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
78  *
79  * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
80  * Classification using Chained Tokens", February 2004,
81  * <http://www.nuclearelephant.com/papers/chained.html>.
82  */
83 
84 #define	DFL	.40
85 #define	THR	.9
86 #define	MID	.5
87 
88 #define	MAX2	0x0000ffff
89 #define	MAX3	0x00ffffffUL
90 #define	MAX4	0xffffffffUL
91 
92 /*
93  * The dictionary consists of two files forming a hash table. The hash
94  * consists of the first 56 bits of the result of applying MD5 to the
95  * input word. This scheme ensures that collisions are unlikely enough
96  * to make junk detection work; according to the birthday paradox, a
97  * 50 % probability for one single collision is reached at 2^28 entries.
98  *
99  * To make the chain structure independent from input, the MD5 input is
100  * xor'ed with a random number. This makes it impossible that someone uses
101  * a carefully crafted message for a denial-of-service attack against the
102  * database.
103  */
104 #define	SIZEOF_node	17
105 #define	OF_node_hash	0	/* first 32 bits of MD5 of word|mangle */
106 #define	OF_node_next	4	/* bit-negated table index of next node */
107 #define	OF_node_good	8	/* number of times this appeared in good msgs */
108 #define	OF_node_bad	11	/* number of times this appeared in bad msgs */
109 #define	OF_node_prob_O	14	/* table_version<1: precomputed probability */
110 #define	OF_node_hash2	14	/* upper 3 bytes of MD5 hash */
111 static char	*nodes;
112 
113 #define	SIZEOF_super	262164
114 #define	OF_super_size	0	/* allocated nodes in the chain file */
115 #define	OF_super_used	4	/* used nodes in the chain file */
116 #define	OF_super_ngood	8	/* number of good messages scanned so far */
117 #define	OF_super_nbad	12	/* number of bad messages scanned so far */
118 #define	OF_super_mangle	16	/* used to mangle the MD5 input */
119 #define	OF_super_bucket	20	/* 65536 bit-negated node indices */
120 #define	SIZEOF_entry	4
121 static char	*super;
122 
123 static size_t	super_mmapped;
124 static size_t	nodes_mmapped;
125 static int	rw_map;
126 static int	chained_tokens;
127 
128 /*
129  * Version history
130  * ---------------
131  * 0	Initial version
132  * 1	Fixed the mangling; it was ineffective in version 0.
133  *      Hash extended to 56 bits.
134  */
135 static int	table_version;
136 #define	current_table_version	1
137 
138 #define	get(e)	\
139 	((unsigned)(((char *)(e))[0]&0377) + \
140 	 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
141 	 ((unsigned)(((char *)(e))[2]&0377) << 16))
142 
143 #define	put(e, n) \
144 	(((char *)(e))[0] = (n) & 0x0000ff, \
145 	 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
146 	 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
147 
148 #define	f2s(d)	(smin(((unsigned)((d) * MAX3)), MAX3))
149 
150 #define	s2f(s)	((float)(s) / MAX3)
151 
152 #define	getn(p)	\
153 	((unsigned long)(((char *)(p))[0]&0377) + \
154 	 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
155 	 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
156 	 ((unsigned long)(((char *)(p))[3]&0377) << 24))
157 
158 #define	putn(p, n) \
159 	(((char *)(p))[0] = (n) & 0x000000ffUL, \
160 	 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
161 	 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
162 	 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
163 
164 struct lexstat {
165 	char	*save;
166 	int	price;
167 	int	url;
168 	int	lastc;
169 	int	hadamp;
170 	enum loc {
171 		FROM_LINE	= 0,
172 		HEADER		= 1,
173 		BODY		= 2
174 	} loc;
175 	enum html {
176 		HTML_NONE	= 0,
177 		HTML_TEXT	= 1,
178 		HTML_TAG	= 2,
179 		HTML_SKIP	= 3
180 	} html;
181 	char	tag[8];
182 	char	*tagp;
183 	char	field[LINESIZE];
184 };
185 
186 #define	constituent(c, b, i, price, hadamp) \
187 	((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
188 		(c) == '$' || (c) == '!' || (c) == '_' || \
189 		(c) == '#' || (c) == '%' || (c) == '&' || \
190 		((c) == ';' && hadamp) || \
191 		((c) == '-' && !(price)) || \
192 		(((c) == '.' || (c) == ',' || (c) == '/') && \
193 		 (i) > 0 && digitchar((b)[(i)-1]&0377)))
194 
195 #define	url_xchar(c) \
196 	(((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
197 	 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
198 	 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
199 	 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
200 	 (c) != '#' && (c) != '"')
201 
202 enum db {
203 	SUPER = 0,
204 	NODES = 1
205 };
206 
207 enum entry {
208 	GOOD = 0,
209 	BAD  = 1
210 };
211 
212 static const char	README1[] = "\
213 This is a junk mail database maintained by mailx(1). It does not contain any\n\
214 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
215 used for lookup. It is thus possible to tell if some given word was likely\n\
216 contained in your mail from examining this data, at best.\n";
217 static const char	README2[] = "\n\
218 The database files are stored in compress(1) format by default. This saves\n\
219 some space, but leads to higher processor usage when the database is read\n\
220 or updated. You can use uncompress(1) on these files if you prefer to store\n\
221 them in flat form.\n";
222 
223 static int	verbose;
224 static int	_debug;
225 static FILE	*sfp, *nfp;
226 static char	*sname, *nname;
227 
228 static enum okay getdb(int rw);
229 static void putdb(void);
230 static void relsedb(void);
231 static FILE *dbfp(enum db db, int rw, int *compressed, char **fn);
232 static char *lookup(unsigned long h1, unsigned long h2, int create);
233 static unsigned long grow(unsigned long size);
234 static char *nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
235 		struct lexstat *sp, int *stop);
236 static void join(char **buf, size_t *bufsize, const char *s1, const char *s2);
237 static void add(const char *word, enum entry entry, struct lexstat *sp,
238 		int incr);
239 static enum okay scan(struct message *m, enum entry entry,
240 		void (*func)(const char *, enum entry, struct lexstat *, int),
241 		int arg);
242 static void recompute(void);
243 static float getprob(char *n);
244 static int insert(int *msgvec, enum entry entry, int incr);
245 static void clsf(struct message *m);
246 static void rate(const char *word, enum entry entry, struct lexstat *sp,
247 		int unused);
248 static void dbhash(const char *word, unsigned long *h1, unsigned long *h2);
249 static void mkmangle(void);
250 
251 static enum okay
getdb(int rw)252 getdb(int rw)
253 {
254 	void	*zp = NULL;
255 	long	n;
256 	int	compressed;
257 
258 	chained_tokens = value("chained-junk-tokens") != NULL;
259 	if ((sfp = dbfp(SUPER, rw, &compressed, &sname)) == (FILE *)-1)
260 		return STOP;
261 	if (sfp && !compressed) {
262 		super = mmap(NULL, SIZEOF_super,
263 				rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
264 				MAP_SHARED, fileno(sfp), 0);
265 		if (super != MAP_FAILED) {
266 			super_mmapped = SIZEOF_super;
267 			goto skip;
268 		}
269 	}
270 	super_mmapped = 0;
271 	super = smalloc(SIZEOF_super);
272 	if (sfp) {
273 		if (compressed)
274 			zp = zalloc(sfp);
275 		if ((compressed ? zread(zp, super, SIZEOF_super)
276 					!= SIZEOF_super :
277 				fread(super, 1, SIZEOF_super, sfp)
278 					!= SIZEOF_super) ||
279 				ferror(sfp)) {
280 			fprintf(stderr, "Error reading junk mail database.\n");
281 			memset(super, 0, SIZEOF_super);
282 			mkmangle();
283 			if (compressed)
284 				zfree(zp);
285 			Fclose(sfp);
286 			sfp = NULL;
287 		} else if (compressed)
288 			zfree(zp);
289 	} else {
290 		memset(super, 0, SIZEOF_super);
291 		mkmangle();
292 	}
293 skip:	if ((n = getn(&super[OF_super_size])) == 0) {
294 		n = 1;
295 		putn(&super[OF_super_size], 1);
296 	}
297 	if (sfp && (nfp = dbfp(NODES, rw, &compressed, &nname)) != NULL) {
298 		if (nfp == (FILE *)-1) {
299 			relsedb();
300 			return STOP;
301 		}
302 	}
303 	rw_map = rw;
304 	if (sfp && nfp && !compressed) {
305 		nodes = mmap(NULL, n * SIZEOF_node,
306 				rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
307 				MAP_SHARED, fileno(nfp), 0);
308 		if (nodes != MAP_FAILED) {
309 			nodes_mmapped = n * SIZEOF_node;
310 			return OKAY;
311 		}
312 	}
313 	nodes_mmapped = 0;
314 	nodes = smalloc(n * SIZEOF_node);
315 	if (sfp && nfp) {
316 		if (compressed)
317 			zp = zalloc(nfp);
318 		if ((compressed ? zread(zp, nodes, n * SIZEOF_node)
319 				!= n * SIZEOF_node :
320 				fread(nodes, 1, n * SIZEOF_node, nfp)
321 				!= n * SIZEOF_node) ||
322 				ferror(nfp)) {
323 			fprintf(stderr, "Error reading junk mail database.\n");
324 			memset(nodes, 0, n * SIZEOF_node);
325 			memset(super, 0, SIZEOF_super);
326 			mkmangle();
327 			putn(&super[OF_super_size], n);
328 		}
329 		if (compressed)
330 			zfree(zp);
331 		Fclose(nfp);
332 		nfp = NULL;
333 	} else
334 		memset(nodes, 0, n * SIZEOF_node);
335 	if (sfp) {
336 		Fclose(sfp);
337 		sfp = NULL;
338 	}
339 	return OKAY;
340 }
341 
342 static void
putdb(void)343 putdb(void)
344 {
345 	void	*zp;
346 	int	scomp, ncomp;
347 
348 	if (!super_mmapped && (sfp = dbfp(SUPER, O_WRONLY, &scomp, &sname))
349 			== NULL || sfp == (FILE *)-1)
350 		return;
351 	if (!nodes_mmapped && (nfp = dbfp(NODES, O_WRONLY, &ncomp, &nname))
352 			== NULL || nfp == (FILE *)-1)
353 		return;
354 	if (super_mmapped == 0 || nodes_mmapped == 0)
355 		holdint();
356 	/*
357 	 * Use utime() with mmap() since Linux does not update st_mtime
358 	 * reliably otherwise.
359 	 */
360 	if (super_mmapped)
361 		utime(sname, NULL);
362 	else if (scomp) {
363 		zp = zalloc(sfp);
364 		zwrite(zp, super, SIZEOF_super);
365 		zfree(zp);
366 		trunc(sfp);
367 	} else
368 		fwrite(super, 1, SIZEOF_super, sfp);
369 	if (nodes_mmapped)
370 		utime(nname, NULL);
371 	else if (ncomp) {
372 		zp = zalloc(nfp);
373 		zwrite(zp, nodes, getn(&super[OF_super_size]) * SIZEOF_node);
374 		zfree(zp);
375 		trunc(nfp);
376 	} else
377 		fwrite(nodes, 1,
378 			getn(&super[OF_super_size]) * SIZEOF_node, nfp);
379 	if (super_mmapped == 0 || nodes_mmapped == 0)
380 		relseint();
381 }
382 
383 static void
relsedb(void)384 relsedb(void)
385 {
386 	if (super_mmapped) {
387 		munmap(super, super_mmapped);
388 		super_mmapped = 0;
389 	} else
390 		free(super);
391 	if (nodes_mmapped) {
392 		munmap(nodes, nodes_mmapped);
393 		nodes_mmapped = 0;
394 	} else
395 		free(nodes);
396 	if (sfp && sfp != (FILE *)-1) {
397 		Fclose(sfp);
398 		sfp = NULL;
399 	}
400 	if (nfp && nfp != (FILE *)-1) {
401 		Fclose(nfp);
402 		nfp = NULL;
403 	}
404 }
405 
406 static FILE *
dbfp(enum db db,int rw,int * compressed,char ** fn)407 dbfp(enum db db, int rw, int *compressed, char **fn)
408 {
409 	FILE	*fp, *rp;
410 	char	*dir;
411 	struct flock	flp;
412 	char *sfx[][2] = {
413 		{ "super",	"nodes" },
414 		{ "super1",	"nodes1" }
415 	};
416 	char **sf;
417 	char *zfx[][2] = {
418 		{ "super.Z",	"nodes.Z" },
419 		{ "super1.Z",	"nodes1.Z" }
420 	};
421 	char **zf;
422 	int	n;
423 
424 	if ((dir = value("junkdb")) == NULL) {
425 		fprintf(stderr, "No junk mail database specified. "
426 				"Set the junkdb variable.\n");
427 		return (FILE *)-1;
428 	}
429 	dir = expand(dir);
430 	if (makedir(dir) == STOP) {
431 		fprintf(stderr, "Cannot create directory \"%s\"\n.", dir);
432 		return (FILE *)-1;
433 	}
434 	if (rw!=O_WRONLY)
435 		table_version = current_table_version;
436 loop:	sf = sfx[table_version];
437 	zf = zfx[table_version];
438 	*fn = salloc((n = strlen(dir)) + 40);
439 	strcpy(*fn, dir);
440 	(*fn)[n] = '/';
441 	*compressed = 0;
442 	strcpy(&(*fn)[n+1], sf[db]);
443 	if ((fp = Fopen(*fn, rw!=O_RDONLY ? "r+" : "r")) != NULL)
444 		goto okay;
445 	*compressed = 1;
446 	strcpy(&(*fn)[n+1], zf[db]);
447 	if ((fp = Fopen(*fn, rw ? "r+" : "r")) == NULL &&
448 			rw==O_WRONLY ? (fp = Fopen(*fn, "w+")) == NULL : 0) {
449 		fprintf(stderr, "Cannot open junk mail database \"%s\".\n",*fn);
450 		return NULL;
451 	}
452 	if (rw==O_WRONLY) {
453 		strcpy(&(*fn)[n+1], "README");
454 		if (access(*fn, F_OK) < 0 && (rp = Fopen(*fn, "w")) != NULL) {
455 			fputs(README1, rp);
456 			fputs(README2, rp);
457 			Fclose(rp);
458 		}
459 	} else if (fp == NULL) {
460 		if (table_version > 0) {
461 			table_version--;
462 			goto loop;
463 		} else
464 			table_version = current_table_version;
465 	}
466 okay:	if (fp) {
467 		flp.l_type = rw!=O_RDONLY ? F_WRLCK : F_RDLCK;
468 		flp.l_start = 0;
469 		flp.l_len = 0;
470 		flp.l_whence = SEEK_SET;
471 		fcntl(fileno(fp), F_SETLKW, &flp);
472 	}
473 	return fp;
474 }
475 
476 static char *
lookup(unsigned long h1,unsigned long h2,int create)477 lookup(unsigned long h1, unsigned long h2, int create)
478 {
479 	char	*n, *lastn = NULL;
480 	unsigned long	c, lastc = MAX4, used, size;
481 
482 	used = getn(&super[OF_super_used]);
483 	size = getn(&super[OF_super_size]);
484 	c = ~getn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry]);
485 	n = &nodes[c*SIZEOF_node];
486 	while (c < used) {
487 		if (getn(&n[OF_node_hash]) == h1 &&
488 				(table_version < 1 ? 1 :
489 				get(&n[OF_node_hash2]) == h2))
490 			return n;
491 		lastc = c;
492 		lastn = n;
493 		c = ~getn(&n[OF_node_next]);
494 		n = &nodes[c*SIZEOF_node];
495 	}
496 	if (create) {
497 		if (used >= size) {
498 			if ((size = grow(size)) == 0)
499 				return NULL;
500 			lastn = &nodes[lastc*SIZEOF_node];
501 		}
502 		putn(&super[OF_super_used], used+1);
503 		n = &nodes[used*SIZEOF_node];
504 		putn(&n[OF_node_hash], h1);
505 		put(&n[OF_node_hash2], h2);
506 		if (lastc < used)
507 			putn(&lastn[OF_node_next], ~used);
508 		else
509 			putn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry],
510 					~used);
511 		return n;
512 	} else
513 		return NULL;
514 }
515 
516 static unsigned long
grow(unsigned long size)517 grow(unsigned long size)
518 {
519 	unsigned long	incr, newsize;
520 	void	*onodes;
521 
522 	incr = size > MAX2 ? MAX2 : size;
523 	newsize = size + incr;
524 	if (newsize > MAX4-MAX2) {
525 	oflo:	fprintf(stderr, "Junk mail database overflow.\n");
526 		return 0;
527 	}
528 	if (nodes_mmapped) {
529 		if (lseek(fileno(nfp), newsize*SIZEOF_node-1, SEEK_SET)
530 				== (off_t)-1 || write(fileno(nfp),"\0",1) != 1)
531 			goto oflo;
532 		onodes = nodes;
533 		if ((nodes = mremap(nodes, nodes_mmapped, newsize*SIZEOF_node,
534 				MREMAP_MAYMOVE)) == MAP_FAILED) {
535 			if ((nodes = mmap(NULL, newsize*SIZEOF_node,
536 						rw_map!=O_RDONLY ?
537 							PROT_READ|PROT_WRITE :
538 							PROT_READ,
539 						MAP_SHARED, fileno(nfp), 0))
540 					== MAP_FAILED) {
541 				nodes = onodes;
542 				goto oflo;
543 			}
544 			munmap(onodes, nodes_mmapped);
545 		}
546 		nodes_mmapped = newsize*SIZEOF_node;
547 	} else {
548 		nodes = srealloc(nodes, newsize*SIZEOF_node);
549 		memset(&nodes[size*SIZEOF_node], 0, incr*SIZEOF_node);
550 	}
551 	size = newsize;
552 	putn(&super[OF_super_size], size);
553 	return size;
554 }
555 
556 #define	SAVE(c)	{ \
557 	if (i+j >= (long)*bufsize-4) \
558 		*buf = srealloc(*buf, *bufsize += 32); \
559 	(*buf)[j+i] = (c); \
560 	i += (*buf)[j+i] != '\0'; \
561 }
562 
563 static char *
nextword(char ** buf,size_t * bufsize,size_t * count,FILE * fp,struct lexstat * sp,int * stop)564 nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
565 		struct lexstat *sp, int *stop)
566 {
567 	int	c, i, j, k;
568 	char	*cp, *cq;
569 
570 loop:	*stop = 0;
571 	sp->hadamp = 0;
572 	if (sp->save) {
573 		i = j = 0;
574 		for (cp = sp->save; *cp; cp++) {
575 			SAVE(*cp&0377)
576 		}
577 		SAVE('\0')
578 		free(sp->save);
579 		sp->save = NULL;
580 		goto out;
581 	}
582 	if (sp->loc == FROM_LINE)
583 		while (*count > 0 && (c = getc(fp)) != EOF) {
584 			sp->lastc = c;
585 			if (c == '\n') {
586 				sp->loc = HEADER;
587 				break;
588 			}
589 		}
590 	i = 0;
591 	j = 0;
592 	if (sp->loc == HEADER && sp->field[0]) {
593 	field:	cp = sp->field;
594 		do {
595 			c = *cp&0377;
596 			SAVE(c)
597 			cp++;
598 		} while (*cp);
599 		j = i;
600 		i = 0;
601 	}
602 	if (sp->price) {
603 		sp->price = 0;
604 		SAVE('$')
605 	}
606 	while (*count > 0 && (c = getc(fp)) != EOF) {
607 		(*count)--;
608 		if (c == '\0' && table_version >= 1) {
609 			sp->loc = HEADER;
610 			sp->lastc = '\n';
611 			*stop = 1;
612 			continue;
613 		}
614 		if (c == '\b' && table_version >= 1) {
615 			sp->html = HTML_TEXT;
616 			continue;
617 		}
618 		if (c == '<' && sp->html == HTML_TEXT) {
619 			sp->html = HTML_TAG;
620 			sp->tagp = sp->tag;
621 			continue;
622 		}
623 		if (sp->html == HTML_TAG) {
624 			if (spacechar(c)) {
625 				*sp->tagp = '\0';
626 				if (!asccasecmp(sp->tag, "a") ||
627 						!asccasecmp(sp->tag, "img") ||
628 						!asccasecmp(sp->tag, "font") ||
629 						!asccasecmp(sp->tag, "span") ||
630 						!asccasecmp(sp->tag, "meta") ||
631 						!asccasecmp(sp->tag, "table") ||
632 						!asccasecmp(sp->tag, "tr") ||
633 						!asccasecmp(sp->tag, "td") ||
634 						!asccasecmp(sp->tag, "p"))
635 					sp->html = HTML_TEXT;
636 				else
637 					sp->html = HTML_SKIP;
638 			} else if (c == '>') {
639 				sp->html = HTML_TEXT;
640 				continue;
641 			} else {
642 				if (sp->tagp - sp->tag < sizeof sp->tag - 1)
643 					*sp->tagp++ = c;
644 				continue;
645 			}
646 		}
647 		if (sp->html == HTML_SKIP) {
648 			if (c == '>')
649 				sp->html = HTML_TEXT;
650 			continue;
651 		}
652 		if (c == '$' && i == 0)
653 			sp->price = 1;
654 		if (sp->loc == HEADER && sp->lastc == '\n') {
655 			if (!spacechar(c)) {
656 				k = 0;
657 				while (k < sizeof sp->field - 3) {
658 					sp->field[k++] = c;
659 					if (*count <= 0 ||
660 							(c = getc(fp)) == EOF)
661 						break;
662 					if (spacechar(c) || c == ':') {
663 						ungetc(c, fp);
664 						break;
665 					}
666 					sp->lastc = c;
667 					(*count)--;
668 				}
669 				sp->field[k++] = '*';
670 				sp->field[k] = '\0';
671 				j = 0;
672 				*stop = 1;
673 				goto field;
674 			} else if (c == '\n') {
675 				j = 0;
676 				sp->loc = BODY;
677 				sp->html = HTML_NONE;
678 				*stop = 1;
679 			}
680 		}
681 		if (sp->url) {
682 			if (!url_xchar(c)) {
683 				sp->url = 0;
684 				cp = sp->save = smalloc(i+6);
685 				for (cq = "HOST*"; *cq; cq++)
686 					*cp++ = *cq;
687 				for (cq = &(*buf)[j]; *cq != ':'; cq++);
688 				cq += 3;	/* skip "://" */
689 				while (cq < &(*buf)[i+j] &&
690 						(alnumchar(*cq&0377) ||
691 						 *cq == '.' || *cq == '-'))
692 					*cp++ = *cq++;
693 				*cp = '\0';
694 				*stop = 1;
695 				break;
696 			}
697 			SAVE(c)
698 		} else if (constituent(c, *buf, i+j, sp->price, sp->hadamp) ||
699 				sp->loc == HEADER && c == '.' &&
700 				asccasecmp(sp->field, "subject*")) {
701 			if (c == '&')
702 				sp->hadamp = 1;
703 			SAVE(c)
704 		} else if (i > 0 && c == ':' && *count > 2) {
705 			if ((c = getc(fp)) != '/') {
706 				ungetc(c, fp);
707 				break;
708 			}
709 			(*count)--;
710 			if ((c = getc(fp)) != '/') {
711 				ungetc(c, fp);
712 				break;
713 			}
714 			(*count)--;
715 			sp->url = 1;
716 			SAVE('\0')
717 			cp = savestr(*buf);
718 			j = i = 0;
719 			for (cq = "URL*"; *cq; cq++) {
720 				SAVE(*cq&0377)
721 			}
722 			j = i;
723 			i = 0;
724 			do {
725 				if (alnumchar(*cp&0377)) {
726 					SAVE(*cp&0377)
727 				} else
728 					i = 0;
729 			} while (*++cp);
730 			for (cq = "://"; *cq; cq++) {
731 				SAVE(*cq&0377)
732 			}
733 		} else if (i > 1 && ((*buf)[i+j-1] == ',' ||
734 				 (*buf)[i+j-1] == '.') && !digitchar(c)) {
735 			i--;
736 			ungetc(c, fp);
737 			(*count)++;
738 			break;
739 		} else if (i > 0) {
740 			sp->lastc = c;
741 			break;
742 		}
743 		sp->lastc = c;
744 	}
745 out:	if (i > 0) {
746 		SAVE('\0')
747 		c = 0;
748 		for (k = 0; k < i; k++)
749 			if (digitchar((*buf)[k+j]&0377))
750 				c++;
751 			else if (!alphachar((*buf)[k+j]&0377) &&
752 					(*buf)[k+j] != '$') {
753 				c = 0;
754 				break;
755 			}
756 		if (c == i)
757 			goto loop;
758 		/*
759 		 * Including the results of other filtering software (the
760 		 * 'X-Spam' fields) might seem tempting, but will also rate
761 		 * their false negatives good with this filter. Therefore
762 		 * these fields are ignored.
763 		 *
764 		 * Handling 'Received' fields is difficult since they include
765 		 * lots of both useless and interesting words for our purposes.
766 		 */
767 		if (sp->loc == HEADER &&
768 				(asccasecmp(sp->field, "message-id*") == 0 ||
769 				 asccasecmp(sp->field, "references*") == 0 ||
770 				 asccasecmp(sp->field, "in-reply-to*") == 0 ||
771 				 asccasecmp(sp->field, "status*") == 0 ||
772 				 asccasecmp(sp->field, "x-status*") == 0 ||
773 				 asccasecmp(sp->field, "date*") == 0 ||
774 				 asccasecmp(sp->field, "delivery-date*") == 0 ||
775 				 ascncasecmp(sp->field, "x-spam", 6) == 0 ||
776 				 ascncasecmp(sp->field, "x-pstn", 6) == 0 ||
777 				 ascncasecmp(sp->field, "x-scanned", 9) == 0 ||
778 				 asccasecmp(sp->field, "received*") == 0 &&
779 				 	((2*c > i) || i < 4 ||
780 					asccasestr(*buf, "localhost") != NULL)))
781 			goto loop;
782 		return *buf;
783 	}
784 	return NULL;
785 }
786 
787 #define	JOINCHECK	if (i >= *bufsize) \
788 				*buf = srealloc(*buf, *bufsize += 32)
789 static void
join(char ** buf,size_t * bufsize,const char * s1,const char * s2)790 join(char **buf, size_t *bufsize, const char *s1, const char *s2)
791 {
792 	int	i = 0;
793 
794 	while (*s1) {
795 		JOINCHECK;
796 		(*buf)[i++] = *s1++;
797 	}
798 	JOINCHECK;
799 	(*buf)[i++] = ' ';
800 	do {
801 		JOINCHECK;
802 		(*buf)[i++] = *s2;
803 	} while (*s2++);
804 }
805 
806 /*ARGSUSED3*/
807 static void
add(const char * word,enum entry entry,struct lexstat * sp,int incr)808 add(const char *word, enum entry entry, struct lexstat *sp, int incr)
809 {
810 	unsigned	c;
811 	unsigned long	h1, h2;
812 	char	*n;
813 
814 	dbhash(word, &h1, &h2);
815 	if ((n = lookup(h1, h2, 1)) != NULL) {
816 		switch (entry) {
817 		case GOOD:
818 			c = get(&n[OF_node_good]);
819 			if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
820 				c += incr;
821 				put(&n[OF_node_good], c);
822 			}
823 			break;
824 		case BAD:
825 			c = get(&n[OF_node_bad]);
826 			if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
827 				c += incr;
828 				put(&n[OF_node_bad], c);
829 			}
830 			break;
831 		}
832 	}
833 }
834 
835 static enum okay
scan(struct message * m,enum entry entry,void (* func)(const char *,enum entry,struct lexstat *,int),int arg)836 scan(struct message *m, enum entry entry,
837 		void (*func)(const char *, enum entry, struct lexstat *, int),
838 		int arg)
839 {
840 	FILE	*fp;
841 	char	*buf0 = NULL, *buf1 = NULL, *buf2 = NULL, **bp, *cp;
842 	size_t	bufsize0 = 0, bufsize1 = 0, bufsize2 = 0, *zp, count;
843 	struct lexstat	*sp;
844 	int	stop;
845 
846 	if ((fp = Ftemp(&cp, "Ra", "w+", 0600, 1)) == NULL) {
847 		perror("tempfile");
848 		return STOP;
849 	}
850 	rm(cp);
851 	Ftfree(&cp);
852 	if (send(m, fp, NULL, NULL, SEND_TOFLTR, NULL) < 0) {
853 		Fclose(fp);
854 		return STOP;
855 	}
856 	fflush(fp);
857 	rewind(fp);
858 	sp = scalloc(1, sizeof *sp);
859 	count = fsize(fp);
860 	bp = &buf0;
861 	zp = &bufsize0;
862 	while (nextword(bp, zp, &count, fp, sp, &stop) != NULL) {
863 		(*func)(*bp, entry, sp, arg);
864 		if (chained_tokens && buf0 && *buf0 && buf1 && *buf1 && !stop) {
865 			join(&buf2, &bufsize2, bp == &buf1 ? buf0 : buf1, *bp);
866 			(*func)(buf2, entry, sp, arg);
867 		}
868 		bp = bp == &buf1 ? &buf0 : &buf1;
869 		zp = zp == &bufsize1 ? &bufsize0 : &bufsize1;
870 	}
871 	free(buf0);
872 	free(buf1);
873 	free(buf2);
874 	free(sp);
875 	Fclose(fp);
876 	return OKAY;
877 }
878 
879 static void
recompute(void)880 recompute(void)
881 {
882 	unsigned long	used, i;
883 	unsigned	s;
884 	char	*n;
885 	float	p;
886 
887 	used = getn(&super[OF_super_used]);
888 	for (i = 0; i < used; i++) {
889 		n = &nodes[i*SIZEOF_node];
890 		p = getprob(n);
891 		s = f2s(p);
892 		put(&n[OF_node_prob_O], s);
893 	}
894 }
895 
896 static float
getprob(char * n)897 getprob(char *n)
898 {
899 	unsigned long	ngood, nbad;
900 	unsigned	g, b;
901 	float	p, BOT, TOP;
902 
903 	ngood = getn(&super[OF_super_ngood]);
904 	nbad = getn(&super[OF_super_nbad]);
905 	if (ngood + nbad >= 18000) {
906 		BOT = .0001;
907 		TOP = .9999;
908 	} else if (ngood + nbad >= 9000) {
909 		BOT = .001;
910 		TOP = .999;
911 	} else {
912 		BOT = .01;
913 		TOP = .99;
914 	}
915 	g = get(&n[OF_node_good]) * 2;
916 	b = get(&n[OF_node_bad]);
917 	if (g + b >= 5) {
918 		p = smin(1.0, nbad ? (float)b/nbad : 0.0) /
919 			(smin(1.0, ngood ? (float)g/ngood : 0.0) +
920 			 smin(1.0, nbad ? (float)b/nbad : 0.0));
921 		p = smin(TOP, p);
922 		p = smax(BOT, p);
923 	} else if (g == 0 && b == 0)
924 		p = DFL;
925 	else
926 		p = 0;
927 	return p;
928 }
929 
930 static int
insert(int * msgvec,enum entry entry,int incr)931 insert(int *msgvec, enum entry entry, int incr)
932 {
933 	int	*ip;
934 	unsigned long	u = 0;
935 
936 	verbose = value("verbose") != NULL;
937 	if (getdb(O_RDWR) != OKAY)
938 		return 1;
939 	switch (entry) {
940 	case GOOD:
941 		u = getn(&super[OF_super_ngood]);
942 		break;
943 	case BAD:
944 		u = getn(&super[OF_super_nbad]);
945 		break;
946 	}
947 	for (ip = msgvec; *ip; ip++) {
948 		setdot(&message[*ip-1]);
949 		if (incr > 0 && u == MAX4-incr+1) {
950 			fprintf(stderr, "Junk mail database overflow.\n");
951 			break;
952 		} else if (incr < 0 && -incr > u) {
953 			fprintf(stderr, "Junk mail database underflow.\n");
954 			break;
955 		}
956 		u += incr;
957 		if (entry == GOOD && incr > 0 || entry == BAD && incr < 0)
958 			message[*ip-1].m_flag &= ~MJUNK;
959 		else
960 			message[*ip-1].m_flag |= MJUNK;
961 		scan(&message[*ip-1], entry, add, incr);
962 	}
963 	switch (entry) {
964 	case GOOD:
965 		putn(&super[OF_super_ngood], u);
966 		break;
967 	case BAD:
968 		putn(&super[OF_super_nbad], u);
969 		break;
970 	}
971 	if (table_version < 1)
972 		recompute();
973 	putdb();
974 	relsedb();
975 	return 0;
976 }
977 
978 int
cgood(void * v)979 cgood(void *v)
980 {
981 	return insert(v, GOOD, 1);
982 }
983 
984 int
cjunk(void * v)985 cjunk(void *v)
986 {
987 	return insert(v, BAD, 1);
988 }
989 
990 int
cungood(void * v)991 cungood(void *v)
992 {
993 	return insert(v, GOOD, -1);
994 }
995 
996 int
cunjunk(void * v)997 cunjunk(void *v)
998 {
999 	return insert(v, BAD, -1);
1000 }
1001 
1002 int
cclassify(void * v)1003 cclassify(void *v)
1004 {
1005 	int	*msgvec = v, *ip;
1006 
1007 	verbose = value("verbose") != NULL;
1008 	_debug = debug || value("debug") != NULL;
1009 	if (getdb(O_RDONLY) != OKAY)
1010 		return 1;
1011 	for (ip = msgvec; *ip; ip++) {
1012 		setdot(&message[*ip-1]);
1013 		clsf(&message[*ip-1]);
1014 	}
1015 	relsedb();
1016 	return 0;
1017 }
1018 
1019 #define	BEST	15
1020 static struct {
1021 	float	dist;
1022 	float	prob;
1023 	char	*word;
1024 	unsigned long	hash1;
1025 	unsigned long	hash2;
1026 	enum loc	loc;
1027 } best[BEST];
1028 
1029 static void
clsf(struct message * m)1030 clsf(struct message *m)
1031 {
1032 	int	i;
1033 	float	a = 1, b = 1, r;
1034 
1035 	if (verbose)
1036 		fprintf(stderr, "Examining message %d\n", m - &message[0] + 1);
1037 	for (i = 0; i < BEST; i++) {
1038 		best[i].dist = 0;
1039 		best[i].prob = -1;
1040 	}
1041 	if (scan(m, -1, rate, 0) != OKAY)
1042 		return;
1043 	if (best[0].prob == -1) {
1044 		if (verbose)
1045 			fprintf(stderr, "No information found.\n");
1046 		m->m_flag &= ~MJUNK;
1047 		return;
1048 	}
1049 	for (i = 0; i < BEST; i++) {
1050 		if (best[i].prob == -1)
1051 			break;
1052 		if (verbose)
1053 			fprintf(stderr, "Probe %2d: \"%s\", hash=%lu:%lu "
1054 				"prob=%.4g dist=%.4g\n",
1055 				i+1, prstr(best[i].word),
1056 				best[i].hash1, best[i].hash2,
1057 				best[i].prob, best[i].dist);
1058 		a *= best[i].prob;
1059 		b *= 1 - best[i].prob;
1060 	}
1061 	r = a+b > 0 ? a / (a+b) : 0;
1062 	if (verbose)
1063 		fprintf(stderr, "Junk probability of message %d: %g\n",
1064 				m - &message[0] + 1, r);
1065 	if (r > THR)
1066 		m->m_flag |= MJUNK;
1067 	else
1068 		m->m_flag &= ~MJUNK;
1069 }
1070 
1071 /*ARGSUSED4*/
1072 static void
rate(const char * word,enum entry entry,struct lexstat * sp,int unused)1073 rate(const char *word, enum entry entry, struct lexstat *sp, int unused)
1074 {
1075 	char	*n;
1076 	unsigned long	h1, h2;
1077 	float	p, d;
1078 	int	i, j;
1079 
1080 	dbhash(word, &h1, &h2);
1081 	if ((n = lookup(h1, h2, 0)) != NULL) {
1082 		p = getprob(n);
1083 	} else
1084 		p = DFL;
1085 	if (_debug)
1086 		fprintf(stderr, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1, h2,
1087 				n ? get(&n[OF_node_good]) : 0,
1088 				n ? get(&n[OF_node_bad]) : 0,
1089 				p, prstr(word));
1090 	if (p == 0)
1091 		return;
1092 	d = p >= MID ? p - MID : MID - p;
1093 	if (d >= best[BEST-1].dist)
1094 		for (i = 0; i < BEST; i++) {
1095 			if (h1 == best[i].hash1 && h2 == best[i].hash2)
1096 				break;
1097 			/*
1098 			 * This selection prefers words from the end of the
1099 			 * header and from the start of the body. It does
1100 			 * probably not matter much at all, but gives at
1101 			 * least the most interesting verbose output.
1102 			 */
1103 			if (d > best[i].dist || best[i].loc == HEADER &&
1104 					d >= best[i].dist) {
1105 				for (j = BEST-2; j >= i; j--)
1106 					best[j+1] = best[j];
1107 				best[i].dist = d;
1108 				best[i].prob = p;
1109 				best[i].word = savestr(word);
1110 				best[i].hash1 = h1;
1111 				best[i].hash2 = h2;
1112 				best[i].loc = sp->loc;
1113 				break;
1114 			}
1115 		}
1116 }
1117 
1118 static void
dbhash(const char * word,unsigned long * h1,unsigned long * h2)1119 dbhash(const char *word, unsigned long *h1, unsigned long *h2)
1120 {
1121 	unsigned char	digest[16];
1122 	MD5_CTX	ctx;
1123 
1124 	MD5Init(&ctx);
1125 	MD5Update(&ctx, (unsigned char *)word, strlen(word));
1126 	if (table_version >= 1)
1127 		MD5Update(&ctx, (unsigned char *)&super[OF_super_mangle], 4);
1128 	MD5Final(digest, &ctx);
1129 	*h1 = getn(digest);
1130 	if (table_version < 1) {
1131 		*h1 ^= getn(&super[OF_super_mangle]);
1132 		*h2 = 0;
1133 	} else
1134 		*h2 = get(&digest[4]);
1135 }
1136 
1137 /*
1138  * The selection of the value for mangling is not critical. It is practically
1139  * impossible for any person to determine the exact time when the database
1140  * was created first (without looking at the database, which would reveal the
1141  * value anyway), so we just use this. The MD5 hash here ensures that each
1142  * single second gives a completely different mangling value (which is not
1143  * necessary anymore if table_version>=1, but does not hurt).
1144  */
1145 static void
mkmangle(void)1146 mkmangle(void)
1147 {
1148 	union {
1149 		time_t	t;
1150 		char	c[16];
1151 	} u;
1152 	unsigned long	s;
1153 	unsigned char	digest[16];
1154 	MD5_CTX	ctx;
1155 
1156 	memset(&u, 0, sizeof u);
1157 	time(&u.t);
1158 	MD5Init(&ctx);
1159 	MD5Update(&ctx, (unsigned char *)u.c, sizeof u.c);
1160 	MD5Final(digest, &ctx);
1161 	s = getn(digest);
1162 	putn(&super[OF_super_mangle], s);
1163 }
1164 
1165 int
cprobability(void * v)1166 cprobability(void *v)
1167 {
1168 	char	**args = v;
1169 	unsigned long	used, ngood, nbad;
1170 	unsigned long	h1, h2;
1171 	unsigned	g, b;
1172 	float	p, d;
1173 	char	*n;
1174 
1175 	if (*args == NULL) {
1176 		fprintf(stderr, "No words given.\n");
1177 		return 1;
1178 	}
1179 	if (getdb(O_RDONLY) != OKAY)
1180 		return 1;
1181 	used = getn(&super[OF_super_used]);
1182 	ngood = getn(&super[OF_super_ngood]);
1183 	nbad = getn(&super[OF_super_nbad]);
1184 	printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1185 			used, ngood, nbad);
1186 	do {
1187 		dbhash(*args, &h1, &h2);
1188 		printf("\"%s\", hash=%lu:%lu ", *args, h1, h2);
1189 		if ((n = lookup(h1, h2, 0)) != NULL) {
1190 			g = get(&n[OF_node_good]);
1191 			b = get(&n[OF_node_bad]);
1192 			printf("good=%u bad=%u ", g, b);
1193 			p = getprob(n);
1194 			if (p != 0) {
1195 				d = p >= MID ? p - MID : MID - p;
1196 				printf("prob=%.4g dist=%.4g", p, d);
1197 			} else
1198 				printf("too infrequent");
1199 		} else
1200 			printf("not in database");
1201 		putchar('\n');
1202 	} while (*++args);
1203 	relsedb();
1204 	return 0;
1205 }
1206