1 /*-
2 * Copyright 2016 Vsevolod Stakhov
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "config.h"
18 #include "stat_api.h"
19 #include "rspamd.h"
20 #include "cfg_rcl.h"
21 #include "stat_internal.h"
22 #include "lua/lua_common.h"
23
/*
 * Process-wide statistics context. It is created by rspamd_stat_init (),
 * handed out by rspamd_stat_get_ctx () and reset to NULL by
 * rspamd_stat_close (). Static storage makes it start out as NULL.
 */
static struct rspamd_stat_ctx *stat_ctx;
25
26 static struct rspamd_stat_classifier lua_classifier = {
27 .name = "lua",
28 .init_func = lua_classifier_init,
29 .classify_func = lua_classifier_classify,
30 .learn_spam_func = lua_classifier_learn_spam,
31 .fin_func = NULL,
32 };
33
34 static struct rspamd_stat_classifier stat_classifiers[] = {
35 {
36 .name = "bayes",
37 .init_func = bayes_init,
38 .classify_func = bayes_classify,
39 .learn_spam_func = bayes_learn_spam,
40 .fin_func = bayes_fin,
41 }
42 };
43
44 static struct rspamd_stat_tokenizer stat_tokenizers[] = {
45 {
46 .name = "osb-text",
47 .get_config = rspamd_tokenizer_osb_get_config,
48 .tokenize_func = rspamd_tokenizer_osb,
49 },
50 {
51 .name = "osb",
52 .get_config = rspamd_tokenizer_osb_get_config,
53 .tokenize_func = rspamd_tokenizer_osb,
54 },
55 };
56
/*
 * Expands to an initializer for a writable struct rspamd_stat_backend.
 * `bk_label` is stringified into the backend name; `bk_prefix` selects the
 * rspamd_<bk_prefix>_* family of functions that implement the backend.
 */
#define RSPAMD_STAT_BACKEND_ELT(bk_label, bk_prefix) { \
		.name = #bk_label, \
		.read_only = false, \
		.init = rspamd_##bk_prefix##_init, \
		.runtime = rspamd_##bk_prefix##_runtime, \
		.process_tokens = rspamd_##bk_prefix##_process_tokens, \
		.finalize_process = rspamd_##bk_prefix##_finalize_process, \
		.learn_tokens = rspamd_##bk_prefix##_learn_tokens, \
		.finalize_learn = rspamd_##bk_prefix##_finalize_learn, \
		.total_learns = rspamd_##bk_prefix##_total_learns, \
		.inc_learns = rspamd_##bk_prefix##_inc_learns, \
		.dec_learns = rspamd_##bk_prefix##_dec_learns, \
		.get_stat = rspamd_##bk_prefix##_get_stat, \
		.load_tokenizer_config = rspamd_##bk_prefix##_load_tokenizer_config, \
		.close = rspamd_##bk_prefix##_close \
	}
/*
 * Read-only flavour of the backend initializer: `read_only` is set and
 * every learning hook (learn_tokens, finalize_learn, inc_learns,
 * dec_learns) is left NULL, so only the lookup side of the
 * rspamd_<bk_prefix>_* family is referenced.
 */
#define RSPAMD_STAT_BACKEND_ELT_READONLY(bk_label, bk_prefix) { \
		.name = #bk_label, \
		.read_only = true, \
		.init = rspamd_##bk_prefix##_init, \
		.runtime = rspamd_##bk_prefix##_runtime, \
		.process_tokens = rspamd_##bk_prefix##_process_tokens, \
		.finalize_process = rspamd_##bk_prefix##_finalize_process, \
		.learn_tokens = NULL, \
		.finalize_learn = NULL, \
		.total_learns = rspamd_##bk_prefix##_total_learns, \
		.inc_learns = NULL, \
		.dec_learns = NULL, \
		.get_stat = rspamd_##bk_prefix##_get_stat, \
		.load_tokenizer_config = rspamd_##bk_prefix##_load_tokenizer_config, \
		.close = rspamd_##bk_prefix##_close \
	}
89
90 static struct rspamd_stat_backend stat_backends[] = {
91 RSPAMD_STAT_BACKEND_ELT(mmap, mmaped_file),
92 RSPAMD_STAT_BACKEND_ELT(sqlite3, sqlite3),
93 RSPAMD_STAT_BACKEND_ELT_READONLY(cdb, cdb),
94 #ifdef WITH_HIREDIS
95 RSPAMD_STAT_BACKEND_ELT(redis, redis)
96 #endif
97 };
98
/*
 * Expands to an initializer for struct rspamd_stat_cache. `cache_label` is
 * stringified into the cache name; `cache_prefix` selects the
 * rspamd_stat_cache_<cache_prefix>_* family of functions.
 */
#define RSPAMD_STAT_CACHE_ELT(cache_label, cache_prefix) { \
		.name = #cache_label, \
		.init = rspamd_stat_cache_##cache_prefix##_init, \
		.runtime = rspamd_stat_cache_##cache_prefix##_runtime, \
		.check = rspamd_stat_cache_##cache_prefix##_check, \
		.learn = rspamd_stat_cache_##cache_prefix##_learn, \
		.close = rspamd_stat_cache_##cache_prefix##_close \
	}
107
108 static struct rspamd_stat_cache stat_caches[] = {
109 RSPAMD_STAT_CACHE_ELT(sqlite3, sqlite3),
110 #ifdef WITH_HIREDIS
111 RSPAMD_STAT_CACHE_ELT(redis, redis),
112 #endif
113 };
114
115 void
rspamd_stat_init(struct rspamd_config * cfg,struct ev_loop * ev_base)116 rspamd_stat_init (struct rspamd_config *cfg, struct ev_loop *ev_base)
117 {
118 GList *cur, *curst;
119 struct rspamd_classifier_config *clf;
120 struct rspamd_statfile_config *stf;
121 struct rspamd_stat_backend *bk;
122 struct rspamd_statfile *st;
123 struct rspamd_classifier *cl;
124 const ucl_object_t *cache_obj = NULL, *cache_name_obj;
125 const gchar *cache_name = NULL;
126 lua_State *L = cfg->lua_state;
127 guint lua_classifiers_cnt = 0, i;
128 gboolean skip_cache = FALSE;
129
130 if (stat_ctx == NULL) {
131 stat_ctx = g_malloc0 (sizeof (*stat_ctx));
132 }
133
134 lua_getglobal (L, "rspamd_classifiers");
135
136 if (lua_type (L, -1) == LUA_TTABLE) {
137 lua_pushnil (L);
138
139 while (lua_next (L, -2) != 0) {
140 lua_classifiers_cnt ++;
141 lua_pop (L, 1);
142 }
143 }
144
145 lua_pop (L, 1);
146
147 stat_ctx->classifiers_count = G_N_ELEMENTS (stat_classifiers) +
148 lua_classifiers_cnt;
149 stat_ctx->classifiers_subrs = g_new0 (struct rspamd_stat_classifier,
150 stat_ctx->classifiers_count);
151
152 for (i = 0; i < G_N_ELEMENTS (stat_classifiers); i ++) {
153 memcpy (&stat_ctx->classifiers_subrs[i], &stat_classifiers[i],
154 sizeof (struct rspamd_stat_classifier));
155 }
156
157 lua_getglobal (L, "rspamd_classifiers");
158
159 if (lua_type (L, -1) == LUA_TTABLE) {
160 lua_pushnil (L);
161
162 while (lua_next (L, -2) != 0) {
163 lua_pushvalue (L, -2);
164 memcpy (&stat_ctx->classifiers_subrs[i], &lua_classifier,
165 sizeof (struct rspamd_stat_classifier));
166 stat_ctx->classifiers_subrs[i].name = g_strdup (lua_tostring (L, -1));
167 i ++;
168 lua_pop (L, 2);
169 }
170 }
171
172 lua_pop (L, 1);
173 stat_ctx->backends_subrs = stat_backends;
174 stat_ctx->backends_count = G_N_ELEMENTS (stat_backends);
175
176 stat_ctx->tokenizers_subrs = stat_tokenizers;
177 stat_ctx->tokenizers_count = G_N_ELEMENTS (stat_tokenizers);
178 stat_ctx->caches_subrs = stat_caches;
179 stat_ctx->caches_count = G_N_ELEMENTS (stat_caches);
180 stat_ctx->cfg = cfg;
181 stat_ctx->statfiles = g_ptr_array_new ();
182 stat_ctx->classifiers = g_ptr_array_new ();
183 stat_ctx->async_elts = g_queue_new ();
184 stat_ctx->event_loop = ev_base;
185 stat_ctx->lua_stat_tokens_ref = -1;
186
187 /* Interact with lua_stat */
188 if (luaL_dostring (L, "return require \"lua_stat\"") != 0) {
189 msg_err_config ("cannot require lua_stat: %s",
190 lua_tostring (L, -1));
191 }
192 else {
193 #if LUA_VERSION_NUM >= 504
194 lua_settop(L, -2);
195 #endif
196 if (lua_type (L, -1) != LUA_TTABLE) {
197 msg_err_config ("lua stat must return "
198 "table and not %s",
199 lua_typename (L, lua_type (L, -1)));
200 }
201 else {
202 lua_pushstring (L, "gen_stat_tokens");
203 lua_gettable (L, -2);
204
205 if (lua_type (L, -1) != LUA_TFUNCTION) {
206 msg_err_config ("gen_stat_tokens must return "
207 "function and not %s",
208 lua_typename (L, lua_type (L, -1)));
209 }
210 else {
211 /* Call this function to obtain closure */
212 gint err_idx, ret;
213 struct rspamd_config **pcfg;
214
215 lua_pushcfunction (L, &rspamd_lua_traceback);
216 err_idx = lua_gettop (L);
217 lua_pushvalue (L, err_idx - 1);
218
219 pcfg = lua_newuserdata (L, sizeof (*pcfg));
220 *pcfg = cfg;
221 rspamd_lua_setclass (L, "rspamd{config}", -1);
222
223 if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) {
224 msg_err_config ("call to gen_stat_tokens lua "
225 "script failed (%d): %s", ret,
226 lua_tostring (L, -1));
227 }
228 else {
229 if (lua_type (L, -1) != LUA_TFUNCTION) {
230 msg_err_config ("gen_stat_tokens invocation must return "
231 "function and not %s",
232 lua_typename (L, lua_type (L, -1)));
233 }
234 else {
235 stat_ctx->lua_stat_tokens_ref = luaL_ref (L, LUA_REGISTRYINDEX);
236 }
237 }
238 }
239 }
240 }
241
242 /* Cleanup mess */
243 lua_settop (L, 0);
244
245 /* Create statfiles from the classifiers */
246 cur = cfg->classifiers;
247
248 while (cur) {
249 bk = NULL;
250 clf = cur->data;
251 cl = g_malloc0 (sizeof (*cl));
252 cl->cfg = clf;
253 cl->ctx = stat_ctx;
254 cl->statfiles_ids = g_array_new (FALSE, FALSE, sizeof (gint));
255 cl->subrs = rspamd_stat_get_classifier (clf->classifier);
256
257 if (cl->subrs == NULL) {
258 g_free (cl);
259 msg_err_config ("cannot init classifier type %s", clf->name);
260 cur = g_list_next (cur);
261 continue;
262 }
263
264 if (!cl->subrs->init_func (cfg, ev_base, cl)) {
265 g_free (cl);
266 msg_err_config ("cannot init classifier type %s", clf->name);
267 cur = g_list_next (cur);
268 continue;
269 }
270
271 if (!(clf->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
272 bk = rspamd_stat_get_backend (clf->backend);
273
274 if (bk == NULL) {
275 msg_err_config ("cannot get backend of type %s, so disable classifier"
276 " %s completely", clf->backend, clf->name);
277 cur = g_list_next (cur);
278 continue;
279 }
280 }
281
282 /* XXX:
283 * Here we get the first classifier tokenizer config as the only one
284 * We NO LONGER support multiple tokenizers per rspamd instance
285 */
286 if (stat_ctx->tkcf == NULL) {
287 stat_ctx->tokenizer = rspamd_stat_get_tokenizer (clf->tokenizer->name);
288 g_assert (stat_ctx->tokenizer != NULL);
289 stat_ctx->tkcf = stat_ctx->tokenizer->get_config (cfg->cfg_pool,
290 clf->tokenizer, NULL);
291 }
292
293 /* Init classifier cache */
294 cache_name = NULL;
295
296 if (!bk->read_only) {
297 if (clf->opts) {
298 cache_obj = ucl_object_lookup(clf->opts, "cache");
299 cache_name_obj = NULL;
300
301 if (cache_obj && ucl_object_type(cache_obj) == UCL_NULL) {
302 skip_cache = TRUE;
303 }
304 else {
305 if (cache_obj) {
306 cache_name_obj = ucl_object_lookup_any(cache_obj,
307 "name", "type", NULL);
308 }
309
310 if (cache_name_obj) {
311 cache_name = ucl_object_tostring(cache_name_obj);
312 }
313 }
314 }
315 }
316 else {
317 skip_cache = true;
318 }
319
320 if (cache_name == NULL && !skip_cache) {
321 /* We assume that learn cache is the same as backend */
322 cache_name = clf->backend;
323 }
324
325 curst = clf->statfiles;
326
327 while (curst) {
328 stf = curst->data;
329 st = g_malloc0 (sizeof (*st));
330 st->classifier = cl;
331 st->stcf = stf;
332
333 if (!(cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
334 st->backend = bk;
335 st->bkcf = bk->init (stat_ctx, cfg, st);
336 msg_info_config ("added backend %s for symbol %s",
337 bk->name, stf->symbol);
338 }
339 else {
340 msg_debug_config ("added backend-less statfile for symbol %s",
341 stf->symbol);
342 }
343
344 /* XXX: bad hack to pass statfiles configuration to cache */
345 if (cl->cache == NULL && !skip_cache) {
346 cl->cache = rspamd_stat_get_cache (cache_name);
347 g_assert (cl->cache != NULL);
348 cl->cachecf = cl->cache->init (stat_ctx, cfg, st, cache_obj);
349
350 if (cl->cachecf == NULL) {
351 msg_err_config ("error adding cache %s for symbol %s",
352 cl->cache->name, stf->symbol);
353 cl->cache = NULL;
354 }
355 else {
356 msg_debug_config ("added cache %s for symbol %s",
357 cl->cache->name, stf->symbol);
358 }
359 }
360
361 if (st->bkcf == NULL &&
362 !(cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
363 msg_err_config ("cannot init backend %s for statfile %s",
364 clf->backend, stf->symbol);
365
366 g_free (st);
367 }
368 else {
369 st->id = stat_ctx->statfiles->len;
370 g_ptr_array_add (stat_ctx->statfiles, st);
371 g_array_append_val (cl->statfiles_ids, st->id);
372 }
373
374 curst = curst->next;
375 }
376
377 g_ptr_array_add (stat_ctx->classifiers, cl);
378
379 cur = cur->next;
380 }
381 }
382
383 void
rspamd_stat_close(void)384 rspamd_stat_close (void)
385 {
386 struct rspamd_classifier *cl;
387 struct rspamd_statfile *st;
388 struct rspamd_stat_ctx *st_ctx;
389 struct rspamd_stat_async_elt *aelt;
390 GList *cur;
391 guint i, j;
392 gint id;
393
394 st_ctx = rspamd_stat_get_ctx ();
395 g_assert (st_ctx != NULL);
396
397 for (i = 0; i < st_ctx->classifiers->len; i ++) {
398 cl = g_ptr_array_index (st_ctx->classifiers, i);
399
400 for (j = 0; j < cl->statfiles_ids->len; j ++) {
401 id = g_array_index (cl->statfiles_ids, gint, j);
402 st = g_ptr_array_index (st_ctx->statfiles, id);
403 if (!(st->classifier->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND)) {
404 st->backend->close (st->bkcf);
405 }
406
407 g_free (st);
408 }
409
410 if (cl->cache && cl->cachecf) {
411 cl->cache->close (cl->cachecf);
412 }
413
414 g_array_free (cl->statfiles_ids, TRUE);
415
416 if (cl->subrs->fin_func) {
417 cl->subrs->fin_func (cl);
418 }
419
420 g_free (cl);
421 }
422
423 cur = st_ctx->async_elts->head;
424
425 while (cur) {
426 aelt = cur->data;
427 REF_RELEASE (aelt);
428 cur = g_list_next (cur);
429 }
430
431 g_queue_free (stat_ctx->async_elts);
432 g_ptr_array_free (st_ctx->statfiles, TRUE);
433 g_ptr_array_free (st_ctx->classifiers, TRUE);
434
435 if (st_ctx->lua_stat_tokens_ref != -1) {
436 luaL_unref (st_ctx->cfg->lua_state, LUA_REGISTRYINDEX,
437 st_ctx->lua_stat_tokens_ref);
438 }
439
440 g_free (st_ctx);
441
442 /* Set global var to NULL */
443 stat_ctx = NULL;
444 }
445
446 struct rspamd_stat_ctx *
rspamd_stat_get_ctx(void)447 rspamd_stat_get_ctx (void)
448 {
449 return stat_ctx;
450 }
451
452 struct rspamd_stat_classifier *
rspamd_stat_get_classifier(const gchar * name)453 rspamd_stat_get_classifier (const gchar *name)
454 {
455 guint i;
456
457 if (name == NULL || name[0] == '\0') {
458 name = RSPAMD_DEFAULT_CLASSIFIER;
459 }
460
461 for (i = 0; i < stat_ctx->classifiers_count; i ++) {
462 if (strcmp (name, stat_ctx->classifiers_subrs[i].name) == 0) {
463 return &stat_ctx->classifiers_subrs[i];
464 }
465 }
466
467 msg_err ("cannot find classifier named %s", name);
468
469 return NULL;
470 }
471
472 struct rspamd_stat_backend *
rspamd_stat_get_backend(const gchar * name)473 rspamd_stat_get_backend (const gchar *name)
474 {
475 guint i;
476
477 if (name == NULL || name[0] == '\0') {
478 name = RSPAMD_DEFAULT_BACKEND;
479 }
480
481 for (i = 0; i < stat_ctx->backends_count; i ++) {
482 if (strcmp (name, stat_ctx->backends_subrs[i].name) == 0) {
483 return &stat_ctx->backends_subrs[i];
484 }
485 }
486
487 msg_err ("cannot find backend named %s", name);
488
489 return NULL;
490 }
491
492 struct rspamd_stat_tokenizer *
rspamd_stat_get_tokenizer(const gchar * name)493 rspamd_stat_get_tokenizer (const gchar *name)
494 {
495 guint i;
496
497 if (name == NULL || name[0] == '\0') {
498 name = RSPAMD_DEFAULT_TOKENIZER;
499 }
500
501 for (i = 0; i < stat_ctx->tokenizers_count; i ++) {
502 if (strcmp (name, stat_ctx->tokenizers_subrs[i].name) == 0) {
503 return &stat_ctx->tokenizers_subrs[i];
504 }
505 }
506
507 msg_err ("cannot find tokenizer named %s", name);
508
509 return NULL;
510 }
511
512 struct rspamd_stat_cache *
rspamd_stat_get_cache(const gchar * name)513 rspamd_stat_get_cache (const gchar *name)
514 {
515 guint i;
516
517 if (name == NULL || name[0] == '\0') {
518 name = RSPAMD_DEFAULT_CACHE;
519 }
520
521 for (i = 0; i < stat_ctx->caches_count; i++) {
522 if (strcmp (name, stat_ctx->caches_subrs[i].name) == 0) {
523 return &stat_ctx->caches_subrs[i];
524 }
525 }
526
527 msg_err ("cannot find cache named %s", name);
528
529 return NULL;
530 }
531
532 static void
rspamd_async_elt_dtor(struct rspamd_stat_async_elt * elt)533 rspamd_async_elt_dtor (struct rspamd_stat_async_elt *elt)
534 {
535 if (elt->cleanup) {
536 elt->cleanup (elt, elt->ud);
537 }
538
539 ev_timer_stop (elt->event_loop, &elt->timer_ev);
540 g_free (elt);
541 }
542
543 static void
rspamd_async_elt_on_timer(EV_P_ ev_timer * w,int revents)544 rspamd_async_elt_on_timer (EV_P_ ev_timer *w, int revents)
545 {
546 struct rspamd_stat_async_elt *elt = (struct rspamd_stat_async_elt *)w->data;
547 gdouble jittered_time;
548
549
550 if (elt->enabled) {
551 elt->handler (elt, elt->ud);
552 }
553
554 jittered_time = rspamd_time_jitter (elt->timeout, 0);
555 elt->timer_ev.repeat = jittered_time;
556 ev_timer_again (EV_A_ w);
557 }
558
559 struct rspamd_stat_async_elt*
rspamd_stat_ctx_register_async(rspamd_stat_async_handler handler,rspamd_stat_async_cleanup cleanup,gpointer d,gdouble timeout)560 rspamd_stat_ctx_register_async (rspamd_stat_async_handler handler,
561 rspamd_stat_async_cleanup cleanup,
562 gpointer d,
563 gdouble timeout)
564 {
565 struct rspamd_stat_async_elt *elt;
566 struct rspamd_stat_ctx *st_ctx;
567
568 st_ctx = rspamd_stat_get_ctx ();
569 g_assert (st_ctx != NULL);
570
571 elt = g_malloc0 (sizeof (*elt));
572 elt->handler = handler;
573 elt->cleanup = cleanup;
574 elt->ud = d;
575 elt->timeout = timeout;
576 elt->event_loop = st_ctx->event_loop;
577 REF_INIT_RETAIN (elt, rspamd_async_elt_dtor);
578 /* Enabled by default */
579
580
581 if (st_ctx->event_loop) {
582 elt->enabled = TRUE;
583 /*
584 * First we set timeval to zero as we want cb to be executed as
585 * fast as possible
586 */
587 elt->timer_ev.data = elt;
588 ev_timer_init (&elt->timer_ev, rspamd_async_elt_on_timer,
589 0.1, 0.0);
590 ev_timer_start (st_ctx->event_loop, &elt->timer_ev);
591 }
592 else {
593 elt->enabled = FALSE;
594 }
595
596 g_queue_push_tail (st_ctx->async_elts, elt);
597
598 return elt;
599 }
600