1/* This file is part of Mailfromd.             -*- c -*-
2   Copyright (C) 2006-2021 Sergey Poznyakoff
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 3, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16
17MF_BUILTIN_MODULE
18MF_COND(WITH_DSPAM)
19
20#include "srvcfg.h"
21#undef HAVE_CONFIG_H
22#define CONFIG_DEFAULT "/dev/null"
23#define LOGDIR "/dev/null"
24#include <libdspam.h>
25#include "mflib/dspam.h"
26#include "msg.h"
27
28/* User parameters.
   dspam_user and dspam_group are handed to dspam_create when the
   context is built; if dspam_user is unset at initialization time it
   is set to mf_server_user.  dspam_config names the configuration
   file that is read once, on the first call.  dspam_profile selects
   the configuration profile; when it is empty the DefaultProfile
   setting from the configuration file is used instead. */
29MF_VAR(dspam_user, STRING, SYM_PRECIOUS);
30MF_VAR(dspam_group, STRING, SYM_PRECIOUS);
31MF_VAR(dspam_config, STRING, SYM_PRECIOUS);
32MF_VAR(dspam_profile, STRING, SYM_PRECIOUS);
33/* Output variables.
   The signature is stored only when the signature flag was requested.
   Probability and confidence are stored as integers: the value
   reported by the library multiplied by ten to the power of the
   precision variable.  A precision of zero at initialization time is
   replaced by the default defined below. */
34MF_VAR(dspam_signature, STRING, SYM_PRECIOUS);
35MF_VAR(dspam_probability, NUMBER);
36MF_VAR(dspam_confidence, NUMBER);
37MF_VAR(dspam_prec, NUMBER);
38#define DEFAULT_DSPAM_PREC 3
39
/* Nonzero once get_config has initialized the DSPAM library and its
   storage driver. */
static int _dspam_initialized;

/* Exit-time hook, registered with atexit by get_config: shuts the
   DSPAM storage driver down. */
static void
_dspam_shutdown(void)
{
	dspam_shutdown_driver(NULL);
}
47
/* A pair of integer codes: a source value and the value it maps to.
   NOTE(review): nothing visible in this file uses this structure;
   confirm before removing it. */
struct transtab {
	int trans_from;		/* value to translate */
	int trans_to;		/* its translation */
};
53
54static struct builtin_const_trans mode_trans[] = {
55	MF_TRANS(DSM_PROCESS),
56	MF_TRANS(DSM_CLASSIFY)
57};
58
59static struct builtin_const_trans flag_trans[] = {
60	MF_TRANS(DSF_SIGNATURE),
61	MF_TRANS(DSF_NOISE),
62	MF_TRANS(DSF_WHITELIST)
63};
64
65static struct builtin_const_trans tokenizer_trans[] = {
66	MF_TRANS(DSZ_WORD),
67	MF_TRANS(DSZ_CHAIN),
68	MF_TRANS(DSZ_SBPH),
69	MF_TRANS(DSZ_OSB),
70};
71
72static struct builtin_const_trans tmod_trans[] = {
73	MF_TRANS(DST_TEFT),
74	MF_TRANS(DST_TOE),
75	MF_TRANS(DST_TUM)
76};
77
78static struct builtin_const_trans class_trans[] = {
79	MF_TRANS(DSR_ISSPAM),
80	MF_TRANS(DSR_ISINNOCENT),
81	MF_TRANS(DSR_NONE)
82};
83
84static struct builtin_const_trans source_trans[] = {
85	MF_TRANS(DSS_ERROR),
86	MF_TRANS(DSS_CORPUS),
87	MF_TRANS(DSS_INOCULATION),
88	MF_TRANS(DSS_NONE)
89};
90
91static void
92ctx_cleanup(void *ptr)
93{
94	DSPAM_CTX *ctx = ptr;
95	dspam_destroy(ctx);
96}
97
98
/* One non-empty line of the DSPAM configuration file, kept as the
   word vector produced by mu_wordsplit. */
struct config_entry {
	int argc;		/* number of words in argv */
	char **argv;		/* the words; argv[0] is the keyword */
/* Shorthand for the keyword and for the first value word. */
#define config_keyword argv[0]
#define config_value   argv[1]
};
105
106static void
107free_config_entry(void *data)
108{
109	struct config_entry *entry = data;
110	mu_argcv_free(entry->argc, entry->argv);
111}
112
113static int
114compare_config_entry(const void *a, const void *b)
115{
116	struct config_entry const *ent_a = a;
117	struct config_entry const *ent_b = b;
118	return strcasecmp(ent_a->config_keyword, ent_b->config_keyword);
119}
120
121struct config_entry *
122config_find(mu_list_t config, const char *kw)
123{
124	if (config) {
125		struct config_entry key, *ret;
126		key.argc = 1;
127		key.argv = (char **)&kw;
128		if (mu_list_locate(config, &key, (void **)&ret) == 0)
129			return ret;
130	}
131	return NULL;
132}
133
134const char *
135config_find_value(mu_list_t config, const char *kw)
136{
137	struct config_entry *ent = config_find(config, kw);
138	if (ent)
139		return ent->config_value;
140	return NULL;
141}
142
143static int
144read_config(mu_list_t config, const char *file_name)
145{
146	int rc;
147	mu_stream_t str, flt;
148	char *buf = NULL;
149	size_t size = 0, n;
150	static const char *args[] = { "INLINE-COMMENT", "#", "-r" };
151
152	if ((rc = mu_file_stream_create(&str, file_name, MU_STREAM_READ))) {
153		mu_error(_("cannot open configuration file `%s': %s"),
154			 file_name, mu_strerror(rc));
155		return rc;
156	}
157
158	rc = mu_filter_create_args(&flt, str,
159				   "INLINE-COMMENT",
160				   MU_ARRAY_SIZE(args), args,
161				   MU_FILTER_DECODE,
162				   MU_STREAM_READ);
163	mu_stream_unref(str);
164	if (rc) {
165		mu_error (_("cannot open filter stream for `%s': %s"),
166			  file_name, mu_strerror (rc));
167		return rc;
168	}
169	str = flt;
170
171	while (mu_stream_getline(str, &buf, &size, &n) == 0 && n > 0) {
172		struct config_entry *ent;
173		struct mu_wordsplit ws;
174
175		if (mu_wordsplit(buf, &ws, MU_WRDSF_DEFFLAGS)) {
176			mu_error("mu_wordsplit: %s",
177				 mu_wordsplit_strerror(&ws));
178			break;
179		}
180
181		if (ws.ws_wordc) {
182			ent = mu_alloc(sizeof(*ent));
183			ent->argc = ws.ws_wordc;
184			ent->argv = ws.ws_wordv;
185			mu_list_append(config, ent);
186			ws.ws_wordc = 0;
187			ws.ws_wordv = NULL;
188		} /* FIXME: diagnostics */
189		mu_wordsplit_free(&ws);
190	}
191	free(buf);
192	mu_stream_close(str);
193	mu_stream_destroy(&str);
194	return 0;
195}
196
197
198static void *
199alloc_config()
200{
201	mu_list_t config;
202
203	mu_list_create(&config);
204	mu_list_set_destroy_item(config, free_config_entry);
205	mu_list_set_comparator(config, compare_config_entry);
206	return config;
207}
208
209static void
210destroy_config(void *data)
211{
212	mu_list_t config = data;
213	mu_list_destroy(&config);
214}
215
/* Declare the module data slot that carries the configuration list,
   naming alloc_config and destroy_config as its two handlers.
   NOTE(review): presumably they serve as constructor and destructor
   of the slot; confirm against the definition of the macro. */
216MF_DECLARE_DATA(DSPAM_CONFIG, alloc_config, destroy_config)
217
218MF_DSEXP_SUPPRESS([<get_config>],[<
219static mu_list_t
220get_config(eval_environ_t env, mu_list_t config)
221{
222	/* Initialize dspam library and set up global variables, if
223	   needed */
224	if (!_dspam_initialized) {
225		const char *config_file = MF_VAR_STRING(dspam_config);
226		if (config_file && config_file[0])
227			read_config(config, config_file);
228
229		MF_ASSERT(libdspam_init(config_find_value(config,
230							  "StorageDriver"))
231			  == 0,
232			  mfe_failure,
233			  "libdspam_init failed");
234
235		dspam_init_driver(NULL);
236		atexit(_dspam_shutdown);
237		_dspam_initialized = 1;
238
239		if (MF_VAR_STRING(dspam_user) == NULL)
240			MF_VAR_SET_STRING(dspam_user, mf_server_user);
241
242		if (MF_VAR_REF(dspam_prec, uint) == 0)
243			MF_VAR_REF(dspam_prec, uint, DEFAULT_DSPAM_PREC);
244	}
245
246	return config;
247}
248>])
249
/* Describes how one configuration keyword is handled. */
struct keyword_prop {
	char *name;	/* keyword, or keyword prefix if len is not 0 */
	int len;	/* number of leading characters to compare;
			   0 means compare the whole keyword */
	int flag;	/* what to do with the entry: one of the
			   constants below */
};

/* Values for the flag member of struct keyword_prop. */
enum {
	PROP_ATTACH = 1,	/* add the entry to the context as is */
	PROP_ALGORITHM = 2,	/* the value names an algorithm */
	PROP_TOKENIZER = 3,	/* the value names a tokenizer */
	PROP_PVALUE = 4		/* the value names a p-value method */
};
260
261static struct keyword_prop keyword_prop[] = {
262	{ "IgnoreHeader", 0, PROP_ATTACH },
263	{ "MySQL", 5, PROP_ATTACH },
264	{ "PgSQL", 5, PROP_ATTACH },
265	{ "SQLite", 6, PROP_ATTACH },
266	{ "LocalMX", 0, PROP_ATTACH },
267	{ "Storage", 7, PROP_ATTACH },
268	{ "Processor", 9, PROP_ATTACH },
269	{ "Hash", 4, PROP_ATTACH },
270	{ "Algorithm", 0, PROP_ALGORITHM },
271	{ "PValue", 0, PROP_PVALUE },
272	{ "Tokenizer", 0, PROP_TOKENIZER },
273	{ NULL }
274};
275
276static struct mu_kwd algorithm_kwd[] = {
277	{ "graham", DSA_GRAHAM },
278	{ "burton", DSA_BURTON },
279	{ "robinson", DSA_ROBINSON },
280	{ "naive", DSA_NAIVE },
281	{ "chi-square", DSA_CHI_SQUARE },
282	{ NULL }
283};
284
285static struct mu_kwd pvalue_kwd[] = {
286	{ "robinson", DSP_ROBINSON },
287	{ "markov", DSP_MARKOV },
288	{ NULL }
289};
290
291static struct mu_kwd tokenizer_kwd[] = {
292	{ "word", DSZ_WORD },
293	{ "chain", DSZ_CHAIN },
294	{ "chained", DSZ_CHAIN },
295	{ "sbph", DSZ_SBPH },
296	{ "osb", DSZ_OSB },
297	{ NULL }
298};
299
300static void
301set_context_attributes(DSPAM_CTX *ctx, mu_list_t config, const char *profile,
302		       int ignore_tokenizer)
303{
304	mu_iterator_t itr;
305	int algo = 0;
306	int algo_set = 0;
307	int pvalue = 0;
308	int pvalue_set = 0;
309	int tokenizer = 0;
310	int tokenizer_set = 0;
311	int n;
312
313	if (!profile || !profile[0])
314		profile = config_find_value(config, "DefaultProfile");
315
316	mu_list_get_iterator(config, &itr);
317	for (mu_iterator_first(itr); !mu_iterator_is_done(itr);
318	     mu_iterator_next(itr)) {
319		struct config_entry *ent;
320		struct keyword_prop *prop;
321
322		mu_iterator_current (itr, (void **)&ent);
323		for (prop = keyword_prop; prop->name; prop++) {
324			char *p;
325
326			if ((prop->len ?
327			     strncasecmp(ent->config_keyword, prop->name,
328					 prop->len) :
329			     strcasecmp(ent->config_keyword, prop->name))
330			    == 0) {
331				switch (prop->flag) {
332				case PROP_ATTACH:
333					dspam_addattribute(ctx,
334							   ent->config_keyword,
335							   ent->config_value);
336					break;
337
338				case PROP_ALGORITHM:
339					algo_set = 1;
340					if (mu_kwd_xlat_name_ci(algorithm_kwd,
341							 ent->config_value,
342								&n) == 0)
343						algo |= n;
344					break;
345
346				case PROP_PVALUE:
347					if (pvalue_set)
348						continue;
349					if (mu_kwd_xlat_name_ci(pvalue_kwd,
350							 ent->config_value,
351								&n) == 0) {
352						pvalue = n;
353						pvalue_set = 1;
354					}
355					break;
356
357				case PROP_TOKENIZER:
358					tokenizer_set = 1;
359					if (mu_kwd_xlat_name_ci(tokenizer_kwd,
360							 ent->config_value,
361								&n) == 0)
362						tokenizer |= n;
363					break;
364				}
365			} else if (profile &&
366				   (p = strchr(ent->config_keyword, '.')) &&
367				   strcasecmp(p + 1, profile) == 0) {
368				size_t len = p - ent->config_keyword;
369				char *key = mu_alloc(len + 1);
370				memcpy(key, ent->config_keyword, len);
371				key[len] = 0;
372				dspam_addattribute(ctx, key,
373						   ent->config_value);
374				free(key);
375			}
376		}
377	}
378	mu_iterator_destroy(&itr);
379
380	if (algo_set)
381		ctx->algorithms = algo | (pvalue_set ? pvalue : DSP_GRAHAM);
382
383	if (!ignore_tokenizer && tokenizer_set)
384		ctx->tokenizer = tokenizer;
385
386	if ((ctx->algorithms & DSA_CHI_SQUARE) &&
387	    !(ctx->algorithms & DSP_ROBINSON))
388		mu_diag_output(MU_DIAG_WARNING,
389			       "Chi-Square algorithm enabled with other "
390			       "algorithms: false positives may ensue");
391}
392
393/* number dspam(number msg, number flags; number class_source) */
394MF_DSEXP
395MF_DEFUN(dspam, NUMBER, NUMBER nmsg, NUMBER mode_flags, OPTIONAL, NUMBER class_src)
396{
397	int rc;
398	DSPAM_CTX *ctx;               	/* DSPAM Context */
399	int mode;
400	int flags;
401	mu_message_t msg;
402	mu_stream_t msgstr, instr;
403	const char *msgbuf;
404	size_t msgsize;
405	unsigned prec;
406	mu_transport_t trans[2];
407	mu_list_t config = get_config(env, MF_GET_DATA);
408	int tokenizer;
409
410	/* Prepare message buffer */
411	msg = bi_message_from_descr(env, nmsg);
412	rc = mu_message_size(msg, &msgsize);
413	MF_ASSERT(rc == 0,
414		  mfe_failure,
415 		  "mu_message_size: %s", mu_strerror(rc));
416
417	rc = mu_memory_stream_create(&msgstr, MU_STREAM_RDWR);
418	MF_ASSERT(rc == 0,
419		  mfe_failure,
420		 "mu_static_memory_stream_create: %s",
421		  mu_strerror(rc));
422	MF_DCL_CLEANUP(msgstr, _builtin_stream_cleanup);
423
424	rc = mu_message_get_streamref(msg, &instr);
425	MF_ASSERT(rc == 0,
426		  mfe_failure,
427		  "mu_message_get_streamref: %s",
428		  mu_strerror(rc));
429	MF_DCL_CLEANUP(instr, _builtin_stream_cleanup);
430
431	rc = mu_stream_copy(msgstr, instr, msgsize, NULL);
432	MF_ASSERT(rc == 0,
433		  mfe_failure,
434		  "mu_stream_copy: %s",
435		  mu_strerror(rc));
436
437	MF_CLEANUP(instr);
438
439	mu_stream_ioctl(msgstr, MU_IOCTL_TRANSPORT, MU_IOCTL_OP_GET, trans);
440	msgbuf = (const char*)trans[0];
441
442	/* Prepare DSPAM context */
443	MF_ASSERT(_builtin_const_to_c(mode_trans, MU_ARRAY_SIZE(mode_trans),
444				      mode_flags & _MFL__DSM_MASK, &mode) == 0,
445		  mfe_failure,
446		  "bad dspam mode");
447	flags = _builtin_const_to_bitmap(flag_trans, MU_ARRAY_SIZE(flag_trans),
448					 mode_flags);
449
450	/* Create the DSPAM context */
451	ctx = dspam_create(MF_VAR_STRING(dspam_user),
452			   MF_VAR_STRING(dspam_group),
453			   config_find_value(config, "Home"), mode,
454			   flags);
455	MF_ASSERT(ctx != NULL,
456		  mfe_failure,
457		  "dspam_create failed");
458	MF_DCL_CLEANUP(ctx, ctx_cleanup);
459
460	/* Use graham and robinson algorithms, graham's p-values */
461	ctx->algorithms = DSA_GRAHAM | DSA_BURTON | DSP_GRAHAM;
462
463	tokenizer = mode_flags & _MFL__DSZ_MASK;
464	set_context_attributes(ctx, config, MF_VAR_STRING(dspam_profile),
465			       tokenizer);
466
467	MF_ASSERT(dspam_attach(ctx, NULL) == 0,
468		  mfe_failure,
469		  "dspam_attach failed");
470
471	/* Configure tokenizer */
472	if (tokenizer)
473		MF_ASSERT(_builtin_const_to_c(tokenizer_trans,
474					      MU_ARRAY_SIZE(tokenizer_trans),
475					      tokenizer, &ctx->tokenizer) == 0,
476			  mfe_failure,
477			  "bad dspam tokenizer");
478
479	/* Set training mode */
480	MF_ASSERT(_builtin_const_to_c(tmod_trans, MU_ARRAY_SIZE(tmod_trans),
481			    mode_flags & _MFL__DST_MASK, &ctx->training_mode)
482		  == 0,
483		  mfe_failure,
484		  "bad dspam training mode");
485
486	/* Set up classification and source */
487	if (MF_DEFINED(class_src)) {
488		MF_ASSERT(_builtin_const_to_c(class_trans,
489					      MU_ARRAY_SIZE(class_trans),
490					      class_src & _MFL__DSR_MASK,
491					      &ctx->classification) == 0,
492			  mfe_failure,
493			  "bad dspam classification flag");
494		MF_ASSERT(_builtin_const_to_c(source_trans,
495					      MU_ARRAY_SIZE(source_trans),
496					      class_src & _MFL__DSS_MASK,
497					      &ctx->source) == 0,
498			  mfe_failure,
499			  "bad dspam source flag");
500	}
501
502	/* Process the message */
503	MF_ASSERT(dspam_process(ctx, msgbuf) == 0,
504		  mfe_failure,
505		  "dspam_process failed");
506
507	rc = MF_VAR_REF(dspam_prec, uint);
508	prec = 1;
509	while (rc--)
510		prec *= 10;
511	MF_VAR_REF(dspam_probability, ulong, ctx->probability * prec);
512	MF_VAR_REF(dspam_confidence, ulong, ctx->confidence * prec);
513	if (flags & DSF_SIGNATURE) {
514		char signame[128];
515		_ds_create_signature_id(ctx, signame, sizeof(signame));
516		_ds_set_signature(ctx, ctx->signature, signame);
517		MF_VAR_SET_STRING(dspam_signature, signame);
518	}
519	MF_ASSERT(_builtin_c_to_const(class_trans,
520				      MU_ARRAY_SIZE(class_trans),
521				      ctx->result,
522				      &rc) == 0,
523		  mfe_failure,
524		  "unrecognized dspam result");
525	MF_CLEANUP(ctx);
526
527	/* FIXME: Any additional processing? */
528
529	MF_RETURN(rc);
530}
531END
532
533