1 /*-
2  * Copyright 2016 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "config.h"
18 #include "stat_api.h"
19 #include "rspamd.h"
20 #include "cfg_rcl.h"
21 #include "stat_internal.h"
22 #include "lua/lua_common.h"
23 
/*
 * Process-wide statistics context: allocated by rspamd_stat_init () and
 * reset to NULL by rspamd_stat_close ().
 */
static struct rspamd_stat_ctx *stat_ctx = NULL;
25 
26 static struct rspamd_stat_classifier lua_classifier = {
27 	.name = "lua",
28 	.init_func = lua_classifier_init,
29 	.classify_func = lua_classifier_classify,
30 	.learn_spam_func = lua_classifier_learn_spam,
31 	.fin_func = NULL,
32 };
33 
34 static struct rspamd_stat_classifier stat_classifiers[] = {
35 	{
36 		.name = "bayes",
37 		.init_func = bayes_init,
38 		.classify_func = bayes_classify,
39 		.learn_spam_func = bayes_learn_spam,
40 		.fin_func = bayes_fin,
41 	}
42 };
43 
44 static struct rspamd_stat_tokenizer stat_tokenizers[] = {
45 	{
46 		.name = "osb-text",
47 		.get_config = rspamd_tokenizer_osb_get_config,
48 		.tokenize_func = rspamd_tokenizer_osb,
49 	},
50 	{
51 		.name = "osb",
52 		.get_config = rspamd_tokenizer_osb_get_config,
53 		.tokenize_func = rspamd_tokenizer_osb,
54 	},
55 };
56 
/*
 * Expands to an initializer for a writable element of stat_backends[]:
 * `nam` is stringified into the backend name and `eltn` selects the
 * rspamd_<eltn>_* callback family.
 */
#define RSPAMD_STAT_BACKEND_ELT(nam, eltn) { \
		.name = #nam, \
		.read_only = false, \
		.init = rspamd_##eltn##_init, \
		.close = rspamd_##eltn##_close, \
		.runtime = rspamd_##eltn##_runtime, \
		.process_tokens = rspamd_##eltn##_process_tokens, \
		.finalize_process = rspamd_##eltn##_finalize_process, \
		.learn_tokens = rspamd_##eltn##_learn_tokens, \
		.finalize_learn = rspamd_##eltn##_finalize_learn, \
		.total_learns = rspamd_##eltn##_total_learns, \
		.inc_learns = rspamd_##eltn##_inc_learns, \
		.dec_learns = rspamd_##eltn##_dec_learns, \
		.get_stat = rspamd_##eltn##_get_stat, \
		.load_tokenizer_config = rspamd_##eltn##_load_tokenizer_config \
	}
/*
 * Same as RSPAMD_STAT_BACKEND_ELT but for a read-only backend: the
 * `read_only` flag is set and every learning callback (learn_tokens,
 * finalize_learn, inc_learns, dec_learns) is left NULL.
 */
#define RSPAMD_STAT_BACKEND_ELT_READONLY(nam, eltn) { \
		.name = #nam, \
		.read_only = true, \
		.init = rspamd_##eltn##_init, \
		.close = rspamd_##eltn##_close, \
		.runtime = rspamd_##eltn##_runtime, \
		.process_tokens = rspamd_##eltn##_process_tokens, \
		.finalize_process = rspamd_##eltn##_finalize_process, \
		.learn_tokens = NULL, \
		.finalize_learn = NULL, \
		.total_learns = rspamd_##eltn##_total_learns, \
		.inc_learns = NULL, \
		.dec_learns = NULL, \
		.get_stat = rspamd_##eltn##_get_stat, \
		.load_tokenizer_config = rspamd_##eltn##_load_tokenizer_config \
	}
89 
90 static struct rspamd_stat_backend stat_backends[] = {
91 		RSPAMD_STAT_BACKEND_ELT(mmap, mmaped_file),
92 		RSPAMD_STAT_BACKEND_ELT(sqlite3, sqlite3),
93 		RSPAMD_STAT_BACKEND_ELT_READONLY(cdb, cdb),
94 #ifdef WITH_HIREDIS
95 		RSPAMD_STAT_BACKEND_ELT(redis, redis)
96 #endif
97 };
98 
/*
 * Expands to an initializer for an element of stat_caches[]: `nam` is
 * stringified into the cache name and `eltn` selects the
 * rspamd_stat_cache_<eltn>_* callback family.
 */
#define RSPAMD_STAT_CACHE_ELT(nam, eltn) { \
		.name = #nam, \
		.init = rspamd_stat_cache_##eltn##_init, \
		.close = rspamd_stat_cache_##eltn##_close, \
		.runtime = rspamd_stat_cache_##eltn##_runtime, \
		.check = rspamd_stat_cache_##eltn##_check, \
		.learn = rspamd_stat_cache_##eltn##_learn \
	}
107 
108 static struct rspamd_stat_cache stat_caches[] = {
109 		RSPAMD_STAT_CACHE_ELT(sqlite3, sqlite3),
110 #ifdef WITH_HIREDIS
111 		RSPAMD_STAT_CACHE_ELT(redis, redis),
112 #endif
113 };
114 
115 void
rspamd_stat_init(struct rspamd_config * cfg,struct ev_loop * ev_base)116 rspamd_stat_init (struct rspamd_config *cfg, struct ev_loop *ev_base)
117 {
118 	GList *cur, *curst;
119 	struct rspamd_classifier_config *clf;
120 	struct rspamd_statfile_config *stf;
121 	struct rspamd_stat_backend *bk;
122 	struct rspamd_statfile *st;
123 	struct rspamd_classifier *cl;
124 	const ucl_object_t *cache_obj = NULL, *cache_name_obj;
125 	const gchar *cache_name = NULL;
126 	lua_State *L = cfg->lua_state;
127 	guint lua_classifiers_cnt = 0, i;
128 	gboolean skip_cache = FALSE;
129 
130 	if (stat_ctx == NULL) {
131 		stat_ctx = g_malloc0 (sizeof (*stat_ctx));
132 	}
133 
134 	lua_getglobal (L, "rspamd_classifiers");
135 
136 	if (lua_type (L, -1) == LUA_TTABLE) {
137 		lua_pushnil (L);
138 
139 		while (lua_next (L, -2) != 0) {
140 			lua_classifiers_cnt ++;
141 			lua_pop (L, 1);
142 		}
143 	}
144 
145 	lua_pop (L, 1);
146 
147 	stat_ctx->classifiers_count = G_N_ELEMENTS (stat_classifiers) +
148 				lua_classifiers_cnt;
149 	stat_ctx->classifiers_subrs = g_new0 (struct rspamd_stat_classifier,
150 			stat_ctx->classifiers_count);
151 
152 	for (i = 0; i < G_N_ELEMENTS (stat_classifiers); i ++) {
153 		memcpy (&stat_ctx->classifiers_subrs[i], &stat_classifiers[i],
154 				sizeof (struct rspamd_stat_classifier));
155 	}
156 
157 	lua_getglobal (L, "rspamd_classifiers");
158 
159 	if (lua_type (L, -1) == LUA_TTABLE) {
160 		lua_pushnil (L);
161 
162 		while (lua_next (L, -2) != 0) {
163 			lua_pushvalue (L, -2);
164 			memcpy (&stat_ctx->classifiers_subrs[i], &lua_classifier,
165 							sizeof (struct rspamd_stat_classifier));
166 			stat_ctx->classifiers_subrs[i].name = g_strdup (lua_tostring (L, -1));
167 			i ++;
168 			lua_pop (L, 2);
169 		}
170 	}
171 
172 	lua_pop (L, 1);
173 	stat_ctx->backends_subrs = stat_backends;
174 	stat_ctx->backends_count = G_N_ELEMENTS (stat_backends);
175 
176 	stat_ctx->tokenizers_subrs = stat_tokenizers;
177 	stat_ctx->tokenizers_count = G_N_ELEMENTS (stat_tokenizers);
178 	stat_ctx->caches_subrs = stat_caches;
179 	stat_ctx->caches_count = G_N_ELEMENTS (stat_caches);
180 	stat_ctx->cfg = cfg;
181 	stat_ctx->statfiles = g_ptr_array_new ();
182 	stat_ctx->classifiers = g_ptr_array_new ();
183 	stat_ctx->async_elts = g_queue_new ();
184 	stat_ctx->event_loop = ev_base;
185 	stat_ctx->lua_stat_tokens_ref = -1;
186 
187 	/* Interact with lua_stat */
188 	if (luaL_dostring (L, "return require \"lua_stat\"") != 0) {
189 		msg_err_config ("cannot require lua_stat: %s",
190 				lua_tostring (L, -1));
191 	}
192 	else {
193 #if LUA_VERSION_NUM >= 504
194 		lua_settop(L, -2);
195 #endif
196 		if (lua_type (L, -1) != LUA_TTABLE) {
197 			msg_err_config ("lua stat must return "
198 							"table and not %s",
199 					lua_typename (L, lua_type (L, -1)));
200 		}
201 		else {
202 			lua_pushstring (L, "gen_stat_tokens");
203 			lua_gettable (L, -2);
204 
205 			if (lua_type (L, -1) != LUA_TFUNCTION) {
206 				msg_err_config ("gen_stat_tokens must return "
207 								"function and not %s",
208 						lua_typename (L, lua_type (L, -1)));
209 			}
210 			else {
211 				/* Call this function to obtain closure */
212 				gint err_idx, ret;
213 				struct rspamd_config **pcfg;
214 
215 				lua_pushcfunction (L, &rspamd_lua_traceback);
216 				err_idx = lua_gettop (L);
217 				lua_pushvalue (L, err_idx - 1);
218 
219 				pcfg = lua_newuserdata (L, sizeof (*pcfg));
220 				*pcfg = cfg;
221 				rspamd_lua_setclass (L, "rspamd{config}", -1);
222 
223 				if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) {
224 					msg_err_config ("call to gen_stat_tokens lua "
225 									"script failed (%d): %s", ret,
226 									lua_tostring (L, -1));
227 				}
228 				else {
229 					if (lua_type (L, -1) != LUA_TFUNCTION) {
230 						msg_err_config ("gen_stat_tokens invocation must return "
231 										"function and not %s",
232 								lua_typename (L, lua_type (L, -1)));
233 					}
234 					else {
235 						stat_ctx->lua_stat_tokens_ref = luaL_ref (L, LUA_REGISTRYINDEX);
236 					}
237 				}
238 			}
239 		}
240 	}
241 
242 	/* Cleanup mess */
243 	lua_settop (L, 0);
244 
245 	/* Create statfiles from the classifiers */
246 	cur = cfg->classifiers;
247 
248 	while (cur) {
249 		bk = NULL;
250 		clf = cur->data;
251 		cl = g_malloc0 (sizeof (*cl));
252 		cl->cfg = clf;
253 		cl->ctx = stat_ctx;
254 		cl->statfiles_ids = g_array_new (FALSE, FALSE, sizeof (gint));
255 		cl->subrs = rspamd_stat_get_classifier (clf->classifier);
256 
257 		if (cl->subrs == NULL) {
258 			g_free (cl);
259 			msg_err_config ("cannot init classifier type %s", clf->name);
260 			cur = g_list_next (cur);
261 			continue;
262 		}
263 
264 		if (!cl->subrs->init_func (cfg, ev_base, cl)) {
265 			g_free (cl);
266 			msg_err_config ("cannot init classifier type %s", clf->name);
267 			cur = g_list_next (cur);
268 			continue;
269 		}
270 
271 		if (!(clf->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
272 			bk = rspamd_stat_get_backend (clf->backend);
273 
274 			if (bk == NULL) {
275 				msg_err_config ("cannot get backend of type %s, so disable classifier"
276 						" %s completely", clf->backend, clf->name);
277 				cur = g_list_next (cur);
278 				continue;
279 			}
280 		}
281 
282 		/* XXX:
283 		 * Here we get the first classifier tokenizer config as the only one
284 		 * We NO LONGER support multiple tokenizers per rspamd instance
285 		 */
286 		if (stat_ctx->tkcf == NULL) {
287 			stat_ctx->tokenizer = rspamd_stat_get_tokenizer (clf->tokenizer->name);
288 			g_assert (stat_ctx->tokenizer != NULL);
289 			stat_ctx->tkcf = stat_ctx->tokenizer->get_config (cfg->cfg_pool,
290 					clf->tokenizer, NULL);
291 		}
292 
293 		/* Init classifier cache */
294 		cache_name = NULL;
295 
296 		if (!bk->read_only) {
297 			if (clf->opts) {
298 				cache_obj = ucl_object_lookup(clf->opts, "cache");
299 				cache_name_obj = NULL;
300 
301 				if (cache_obj && ucl_object_type(cache_obj) == UCL_NULL) {
302 					skip_cache = TRUE;
303 				}
304 				else {
305 					if (cache_obj) {
306 						cache_name_obj = ucl_object_lookup_any(cache_obj,
307 								"name", "type", NULL);
308 					}
309 
310 					if (cache_name_obj) {
311 						cache_name = ucl_object_tostring(cache_name_obj);
312 					}
313 				}
314 			}
315 		}
316 		else {
317 			skip_cache = true;
318 		}
319 
320 		if (cache_name == NULL && !skip_cache) {
321 			/* We assume that learn cache is the same as backend */
322 			cache_name = clf->backend;
323 		}
324 
325 		curst = clf->statfiles;
326 
327 		while (curst) {
328 			stf = curst->data;
329 			st = g_malloc0 (sizeof (*st));
330 			st->classifier = cl;
331 			st->stcf = stf;
332 
333 			if (!(cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
334 				st->backend = bk;
335 				st->bkcf = bk->init (stat_ctx, cfg, st);
336 				msg_info_config ("added backend %s for symbol %s",
337 						bk->name, stf->symbol);
338 			}
339 			else {
340 				msg_debug_config ("added backend-less statfile for symbol %s",
341 						stf->symbol);
342 			}
343 
344 			/* XXX: bad hack to pass statfiles configuration to cache */
345 			if (cl->cache == NULL && !skip_cache) {
346 				cl->cache = rspamd_stat_get_cache (cache_name);
347 				g_assert (cl->cache != NULL);
348 				cl->cachecf = cl->cache->init (stat_ctx, cfg, st, cache_obj);
349 
350 				if (cl->cachecf == NULL) {
351 					msg_err_config ("error adding cache %s for symbol %s",
352 							cl->cache->name, stf->symbol);
353 					cl->cache = NULL;
354 				}
355 				else {
356 					msg_debug_config ("added cache %s for symbol %s",
357 							cl->cache->name, stf->symbol);
358 				}
359 			}
360 
361 			if (st->bkcf == NULL &&
362 					!(cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
363 				msg_err_config ("cannot init backend %s for statfile %s",
364 						clf->backend, stf->symbol);
365 
366 				g_free (st);
367 			}
368 			else {
369 				st->id = stat_ctx->statfiles->len;
370 				g_ptr_array_add (stat_ctx->statfiles, st);
371 				g_array_append_val (cl->statfiles_ids, st->id);
372 			}
373 
374 			curst = curst->next;
375 		}
376 
377 		g_ptr_array_add (stat_ctx->classifiers, cl);
378 
379 		cur = cur->next;
380 	}
381 }
382 
383 void
rspamd_stat_close(void)384 rspamd_stat_close (void)
385 {
386 	struct rspamd_classifier *cl;
387 	struct rspamd_statfile *st;
388 	struct rspamd_stat_ctx *st_ctx;
389 	struct rspamd_stat_async_elt *aelt;
390 	GList *cur;
391 	guint i, j;
392 	gint id;
393 
394 	st_ctx = rspamd_stat_get_ctx ();
395 	g_assert (st_ctx != NULL);
396 
397 	for (i = 0; i < st_ctx->classifiers->len; i ++) {
398 		cl = g_ptr_array_index (st_ctx->classifiers, i);
399 
400 		for (j = 0; j < cl->statfiles_ids->len; j ++) {
401 			id = g_array_index (cl->statfiles_ids, gint, j);
402 			st = g_ptr_array_index (st_ctx->statfiles, id);
403 			if (!(st->classifier->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
404 				st->backend->close (st->bkcf);
405 			}
406 
407 			g_free (st);
408 		}
409 
410 		if (cl->cache && cl->cachecf) {
411 			cl->cache->close (cl->cachecf);
412 		}
413 
414 		g_array_free (cl->statfiles_ids, TRUE);
415 
416 		if (cl->subrs->fin_func) {
417 			cl->subrs->fin_func (cl);
418 		}
419 
420 		g_free (cl);
421 	}
422 
423 	cur = st_ctx->async_elts->head;
424 
425 	while (cur) {
426 		aelt = cur->data;
427 		REF_RELEASE (aelt);
428 		cur = g_list_next (cur);
429 	}
430 
431 	g_queue_free (stat_ctx->async_elts);
432 	g_ptr_array_free (st_ctx->statfiles, TRUE);
433 	g_ptr_array_free (st_ctx->classifiers, TRUE);
434 
435 	if (st_ctx->lua_stat_tokens_ref != -1) {
436 		luaL_unref (st_ctx->cfg->lua_state, LUA_REGISTRYINDEX,
437 				st_ctx->lua_stat_tokens_ref);
438 	}
439 
440 	g_free (st_ctx);
441 
442 	/* Set global var to NULL */
443 	stat_ctx = NULL;
444 }
445 
446 struct rspamd_stat_ctx *
rspamd_stat_get_ctx(void)447 rspamd_stat_get_ctx (void)
448 {
449 	return stat_ctx;
450 }
451 
452 struct rspamd_stat_classifier *
rspamd_stat_get_classifier(const gchar * name)453 rspamd_stat_get_classifier (const gchar *name)
454 {
455 	guint i;
456 
457 	if (name == NULL || name[0] == '\0') {
458 		name = RSPAMD_DEFAULT_CLASSIFIER;
459 	}
460 
461 	for (i = 0; i < stat_ctx->classifiers_count; i ++) {
462 		if (strcmp (name, stat_ctx->classifiers_subrs[i].name) == 0) {
463 			return &stat_ctx->classifiers_subrs[i];
464 		}
465 	}
466 
467 	msg_err ("cannot find classifier named %s", name);
468 
469 	return NULL;
470 }
471 
472 struct rspamd_stat_backend *
rspamd_stat_get_backend(const gchar * name)473 rspamd_stat_get_backend (const gchar *name)
474 {
475 	guint i;
476 
477 	if (name == NULL || name[0] == '\0') {
478 		name = RSPAMD_DEFAULT_BACKEND;
479 	}
480 
481 	for (i = 0; i < stat_ctx->backends_count; i ++) {
482 		if (strcmp (name, stat_ctx->backends_subrs[i].name) == 0) {
483 			return &stat_ctx->backends_subrs[i];
484 		}
485 	}
486 
487 	msg_err ("cannot find backend named %s", name);
488 
489 	return NULL;
490 }
491 
492 struct rspamd_stat_tokenizer *
rspamd_stat_get_tokenizer(const gchar * name)493 rspamd_stat_get_tokenizer (const gchar *name)
494 {
495 	guint i;
496 
497 	if (name == NULL || name[0] == '\0') {
498 		name = RSPAMD_DEFAULT_TOKENIZER;
499 	}
500 
501 	for (i = 0; i < stat_ctx->tokenizers_count; i ++) {
502 		if (strcmp (name, stat_ctx->tokenizers_subrs[i].name) == 0) {
503 			return &stat_ctx->tokenizers_subrs[i];
504 		}
505 	}
506 
507 	msg_err ("cannot find tokenizer named %s", name);
508 
509 	return NULL;
510 }
511 
512 struct rspamd_stat_cache *
rspamd_stat_get_cache(const gchar * name)513 rspamd_stat_get_cache (const gchar *name)
514 {
515 	guint i;
516 
517 	if (name == NULL || name[0] == '\0') {
518 		name = RSPAMD_DEFAULT_CACHE;
519 	}
520 
521 	for (i = 0; i < stat_ctx->caches_count; i++) {
522 		if (strcmp (name, stat_ctx->caches_subrs[i].name) == 0) {
523 			return &stat_ctx->caches_subrs[i];
524 		}
525 	}
526 
527 	msg_err ("cannot find cache named %s", name);
528 
529 	return NULL;
530 }
531 
532 static void
rspamd_async_elt_dtor(struct rspamd_stat_async_elt * elt)533 rspamd_async_elt_dtor (struct rspamd_stat_async_elt *elt)
534 {
535 	if (elt->cleanup) {
536 		elt->cleanup (elt, elt->ud);
537 	}
538 
539 	ev_timer_stop (elt->event_loop, &elt->timer_ev);
540 	g_free (elt);
541 }
542 
543 static void
rspamd_async_elt_on_timer(EV_P_ ev_timer * w,int revents)544 rspamd_async_elt_on_timer (EV_P_ ev_timer *w, int revents)
545 {
546 	struct rspamd_stat_async_elt *elt = (struct rspamd_stat_async_elt *)w->data;
547 	gdouble jittered_time;
548 
549 
550 	if (elt->enabled) {
551 		elt->handler (elt, elt->ud);
552 	}
553 
554 	jittered_time = rspamd_time_jitter (elt->timeout, 0);
555 	elt->timer_ev.repeat = jittered_time;
556 	ev_timer_again (EV_A_ w);
557 }
558 
559 struct rspamd_stat_async_elt*
rspamd_stat_ctx_register_async(rspamd_stat_async_handler handler,rspamd_stat_async_cleanup cleanup,gpointer d,gdouble timeout)560 rspamd_stat_ctx_register_async (rspamd_stat_async_handler handler,
561 		rspamd_stat_async_cleanup cleanup,
562 		gpointer d,
563 		gdouble timeout)
564 {
565 	struct rspamd_stat_async_elt *elt;
566 	struct rspamd_stat_ctx *st_ctx;
567 
568 	st_ctx = rspamd_stat_get_ctx ();
569 	g_assert (st_ctx != NULL);
570 
571 	elt = g_malloc0 (sizeof (*elt));
572 	elt->handler = handler;
573 	elt->cleanup = cleanup;
574 	elt->ud = d;
575 	elt->timeout = timeout;
576 	elt->event_loop = st_ctx->event_loop;
577 	REF_INIT_RETAIN (elt, rspamd_async_elt_dtor);
578 	/* Enabled by default */
579 
580 
581 	if (st_ctx->event_loop) {
582 		elt->enabled = TRUE;
583 		/*
584 		 * First we set timeval to zero as we want cb to be executed as
585 		 * fast as possible
586 		 */
587 		elt->timer_ev.data = elt;
588 		ev_timer_init (&elt->timer_ev, rspamd_async_elt_on_timer,
589 				0.1, 0.0);
590 		ev_timer_start (st_ctx->event_loop, &elt->timer_ev);
591 	}
592 	else {
593 		elt->enabled = FALSE;
594 	}
595 
596 	g_queue_push_tail (st_ctx->async_elts, elt);
597 
598 	return elt;
599 }
600