1 /*-
2 * Copyright 2016 Vsevolod Stakhov
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "libmime/message.h"
17 #include "re_cache.h"
18 #include "cryptobox.h"
19 #include "ref.h"
20 #include "libserver/url.h"
21 #include "libserver/task.h"
22 #include "libserver/cfg_file.h"
23 #include "libutil/util.h"
24 #include "libutil/regexp.h"
25 #include "lua/lua_common.h"
26 #include "libstat/stat_api.h"
27 #include "contrib/uthash/utlist.h"
28
29 #include "khash.h"
30
31 #ifdef WITH_HYPERSCAN
32 #include "hs.h"
33 #endif
34
35 #include "unix-std.h"
36 #include <signal.h>
37 #include <stdalign.h>
38 #include <math.h>
39 #include "contrib/libev/ev.h"
40
41 #ifndef WITH_PCRE2
42 #include <pcre.h>
43 #else
44 #include <pcre2.h>
45 #endif
46
47 #include "contrib/fastutf8/fastutf8.h"
48
49 #ifdef HAVE_SYS_WAIT_H
50 #include <sys/wait.h>
51 #endif
52
/*
 * Logging helpers for the "re_cache" module.
 *
 * The *_re_cache variants tag each message with `cache->hash`, so they can
 * only be used where a variable named `cache` is in scope.
 */
#define msg_err_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
		"re_cache", cache->hash, \
		G_STRFUNC, \
		__VA_ARGS__)
#define msg_warn_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
		"re_cache", cache->hash, \
		G_STRFUNC, \
		__VA_ARGS__)
#define msg_info_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \
		"re_cache", cache->hash, \
		G_STRFUNC, \
		__VA_ARGS__)

/*
 * Conditional debug helpers: the task variant tags messages with
 * `task->task_pool->tag.uid` (requires `task` in scope), the cache variant
 * with `cache->hash` (requires `cache` in scope).
 */
#define msg_debug_re_task(...)  rspamd_conditional_debug_fast (NULL, NULL, \
		rspamd_re_cache_log_id, "re_cache", task->task_pool->tag.uid, \
		G_STRFUNC, \
		__VA_ARGS__)
#define msg_debug_re_cache(...)  rspamd_conditional_debug_fast (NULL, NULL, \
		rspamd_re_cache_log_id, "re_cache", cache->hash, \
		G_STRFUNC, \
		__VA_ARGS__)

/* NOTE(review): presumably defines rspamd_re_cache_log_id used above — confirm */
INIT_LOG_MODULE(re_cache)
76
77 #ifdef WITH_HYPERSCAN
/*
 * 8-byte magic markers; the second one differs from the first only in its
 * sixth byte ('v' instead of 'e').
 * NOTE(review): presumably these prefix serialized hyperscan databases
 * (block vs. vectorized mode) — their users are outside this view, confirm.
 */
#define RSPAMD_HS_MAGIC_LEN (sizeof (rspamd_hs_magic))
static const guchar rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'},
		rspamd_hs_magic_vector[] = {'r', 's', 'h', 's', 'r', 'v', '1', '1'};
81 #endif
82
83
/*
 * A group of regexps that share the same type and type-specific data.
 */
struct rspamd_re_class {
	guint64 id; /* fast hash of (type, type_data), see rspamd_re_cache_class_id */
	enum rspamd_re_type type;
	gboolean has_utf8; /* if there are any utf8 regexps */
	gpointer type_data; /* private copy of the caller's type data, NULL if type_len == 0 */
	gsize type_len; /* length of type_data in bytes */
	GHashTable *re; /* regexp id -> regexp; the table owns one reference per value */
	rspamd_cryptobox_hash_state_t *st; /* transient hash state, only alive inside rspamd_re_cache_init */

	gchar hash[rspamd_cryptobox_HASHBYTES + 1]; /* class digest written by rspamd_re_cache_init */

#ifdef WITH_HYPERSCAN
	hs_database_t *hs_db; /* freed with hs_free_database on destroy */
	hs_scratch_t *hs_scratch; /* freed with hs_free_scratch on destroy */
	gint *hs_ids; /* cache ids of the regexps covered by hyperscan (nhs entries) */
	guint nhs; /* number of entries in hs_ids */
#endif
};
102
/*
 * How a cached regexp is matched.
 */
enum rspamd_re_cache_elt_match_type {
	/* Default for new elements (they are zero-allocated): always use PCRE */
	RSPAMD_RE_CACHE_PCRE = 0,
	/* A hyperscan match is counted as a hit directly in the scan callback */
	RSPAMD_RE_CACHE_HYPERSCAN,
	/* NOTE(review): presumably a hyperscan match only triggers a PCRE re-check — confirm */
	RSPAMD_RE_CACHE_HYPERSCAN_PRE
};
108
/*
 * One slot of the cache's regexp array; the slot index is the regexp cache id.
 */
struct rspamd_re_cache_elt {
	rspamd_regexp_t *re; /* owned reference, released by rspamd_re_cache_elt_dtor */
	gint lua_cbref; /* Lua registry ref of the condition callback, -1 if none */
	enum rspamd_re_cache_elt_match_type match_type;
};
114
/* Map: selector name (string) -> Lua registry reference of the selector function */
KHASH_INIT (lua_selectors_hash, gchar *, int, 1, kh_str_hash_func, kh_str_hash_equal);
116
/*
 * The regexp cache itself: all registered regexps grouped into classes.
 */
struct rspamd_re_cache {
	GHashTable *re_classes; /* class id (guint64) -> struct rspamd_re_class */

	GPtrArray *re; /* struct rspamd_re_cache_elt *, indexed by regexp cache id */
	khash_t (lua_selectors_hash) *selectors; /* selector name -> Lua registry ref */
	ref_entry_t ref; /* refcount; rspamd_re_cache_destroy runs on the last release */
	guint nre; /* number of regexps added so far */
	guint max_re_data; /* if > 0, caps the number of bytes passed to PCRE */
	gchar hash[rspamd_cryptobox_HASHBYTES + 1]; /* global digest written by rspamd_re_cache_init */
	lua_State *L; /* taken from the config in rspamd_re_cache_init */
#ifdef WITH_HYPERSCAN
	enum rspamd_hyperscan_status hyperscan_loaded;
	gboolean disable_hyperscan; /* copied from the config in rspamd_re_cache_init */
	gboolean vectorized_hyperscan; /* copied from the config in rspamd_re_cache_init */
	hs_platform_info_t plt; /* filled by hs_populate_platform in rspamd_re_cache_init */
#endif
};
134
/*
 * Cached output of one Lua selector call: `cnt` byte strings with their lengths.
 */
struct rspamd_re_selector_result {
	guchar **scvec; /* cnt heap-allocated copies of the selector strings */
	guint *lenvec; /* length of each entry of scvec */
	guint cnt; /* number of entries in scvec/lenvec */
};
140
/* Map: selector Lua registry reference -> cached selector result */
KHASH_INIT (selectors_results_hash, int, struct rspamd_re_selector_result, 1,
		kh_int_hash_func, kh_int_hash_equal);
143
/*
 * Per-scan state; `checked` and `results` point into the same allocation,
 * right after the structure (see rspamd_re_cache_runtime_new).
 */
struct rspamd_re_runtime {
	guchar *checked; /* bitmap, one bit per regexp cache id */
	guchar *results; /* one hit counter byte per regexp cache id */
	khash_t (selectors_results_hash) *sel_cache; /* created lazily on first selector call */
	struct rspamd_re_cache *cache; /* retained reference to the owning cache */
	struct rspamd_re_cache_stat stat;
	gboolean has_hs; /* copied from cache->hyperscan_loaded when built with hyperscan */
};
152
153 static GQuark
rspamd_re_cache_quark(void)154 rspamd_re_cache_quark (void)
155 {
156 return g_quark_from_static_string ("re_cache");
157 }
158
159 static guint64
rspamd_re_cache_class_id(enum rspamd_re_type type,gconstpointer type_data,gsize datalen)160 rspamd_re_cache_class_id (enum rspamd_re_type type,
161 gconstpointer type_data,
162 gsize datalen)
163 {
164 rspamd_cryptobox_fast_hash_state_t st;
165
166 rspamd_cryptobox_fast_hash_init (&st, 0xdeadbabe);
167 rspamd_cryptobox_fast_hash_update (&st, &type, sizeof (type));
168
169 if (datalen > 0) {
170 rspamd_cryptobox_fast_hash_update (&st, type_data, datalen);
171 }
172
173 return rspamd_cryptobox_fast_hash_final (&st);
174 }
175
176 static void
rspamd_re_cache_destroy(struct rspamd_re_cache * cache)177 rspamd_re_cache_destroy (struct rspamd_re_cache *cache)
178 {
179 GHashTableIter it;
180 gpointer k, v;
181 struct rspamd_re_class *re_class;
182 gchar *skey;
183 gint sref;
184
185 g_assert (cache != NULL);
186 g_hash_table_iter_init (&it, cache->re_classes);
187
188 while (g_hash_table_iter_next (&it, &k, &v)) {
189 re_class = v;
190 g_hash_table_iter_steal (&it);
191 g_hash_table_unref (re_class->re);
192
193 if (re_class->type_data) {
194 g_free (re_class->type_data);
195 }
196
197 #ifdef WITH_HYPERSCAN
198 if (re_class->hs_db) {
199 hs_free_database (re_class->hs_db);
200 }
201 if (re_class->hs_scratch) {
202 hs_free_scratch (re_class->hs_scratch);
203 }
204 if (re_class->hs_ids) {
205 g_free (re_class->hs_ids);
206 }
207 #endif
208 g_free (re_class);
209 }
210
211 if (cache->L) {
212 kh_foreach (cache->selectors, skey, sref, {
213 luaL_unref (cache->L, LUA_REGISTRYINDEX, sref);
214 g_free (skey);
215 });
216
217 struct rspamd_re_cache_elt *elt;
218 guint i;
219
220 PTR_ARRAY_FOREACH (cache->re, i, elt) {
221 if (elt->lua_cbref != -1) {
222 luaL_unref (cache->L, LUA_REGISTRYINDEX, elt->lua_cbref);
223 }
224 }
225 }
226
227 kh_destroy (lua_selectors_hash, cache->selectors);
228
229 g_hash_table_unref (cache->re_classes);
230 g_ptr_array_free (cache->re, TRUE);
231 g_free (cache);
232 }
233
234 static void
rspamd_re_cache_elt_dtor(gpointer e)235 rspamd_re_cache_elt_dtor (gpointer e)
236 {
237 struct rspamd_re_cache_elt *elt = e;
238
239 rspamd_regexp_unref (elt->re);
240 g_free (elt);
241 }
242
243 struct rspamd_re_cache *
rspamd_re_cache_new(void)244 rspamd_re_cache_new (void)
245 {
246 struct rspamd_re_cache *cache;
247
248 cache = g_malloc0 (sizeof (*cache));
249 cache->re_classes = g_hash_table_new (g_int64_hash, g_int64_equal);
250 cache->nre = 0;
251 cache->re = g_ptr_array_new_full (256, rspamd_re_cache_elt_dtor);
252 cache->selectors = kh_init (lua_selectors_hash);
253 #ifdef WITH_HYPERSCAN
254 cache->hyperscan_loaded = RSPAMD_HYPERSCAN_UNKNOWN;
255 #endif
256 REF_INIT_RETAIN (cache, rspamd_re_cache_destroy);
257
258 return cache;
259 }
260
261 enum rspamd_hyperscan_status
rspamd_re_cache_is_hs_loaded(struct rspamd_re_cache * cache)262 rspamd_re_cache_is_hs_loaded (struct rspamd_re_cache *cache)
263 {
264 g_assert (cache != NULL);
265
266 #ifdef WITH_HYPERSCAN
267 return cache->hyperscan_loaded;
268 #else
269 return RSPAMD_HYPERSCAN_UNSUPPORTED;
270 #endif
271 }
272
273 rspamd_regexp_t *
rspamd_re_cache_add(struct rspamd_re_cache * cache,rspamd_regexp_t * re,enum rspamd_re_type type,gconstpointer type_data,gsize datalen,gint lua_cbref)274 rspamd_re_cache_add (struct rspamd_re_cache *cache,
275 rspamd_regexp_t *re,
276 enum rspamd_re_type type,
277 gconstpointer type_data, gsize datalen,
278 gint lua_cbref)
279 {
280 guint64 class_id;
281 struct rspamd_re_class *re_class;
282 rspamd_regexp_t *nre;
283 struct rspamd_re_cache_elt *elt;
284
285 g_assert (cache != NULL);
286 g_assert (re != NULL);
287
288 class_id = rspamd_re_cache_class_id (type, type_data, datalen);
289 re_class = g_hash_table_lookup (cache->re_classes, &class_id);
290
291 if (re_class == NULL) {
292 re_class = g_malloc0 (sizeof (*re_class));
293 re_class->id = class_id;
294 re_class->type_len = datalen;
295 re_class->type = type;
296 re_class->re = g_hash_table_new_full (rspamd_regexp_hash,
297 rspamd_regexp_equal, NULL, (GDestroyNotify)rspamd_regexp_unref);
298
299 if (datalen > 0) {
300 re_class->type_data = g_malloc0 (datalen);
301 memcpy (re_class->type_data, type_data, datalen);
302 }
303
304 g_hash_table_insert (cache->re_classes, &re_class->id, re_class);
305 }
306
307 if ((nre = g_hash_table_lookup (re_class->re, rspamd_regexp_get_id (re)))
308 == NULL) {
309 /*
310 * We set re id based on the global position in the cache
311 */
312 elt = g_malloc0 (sizeof (*elt));
313 /* One ref for re_class */
314 nre = rspamd_regexp_ref (re);
315 rspamd_regexp_set_cache_id (re, cache->nre++);
316 /* One ref for cache */
317 elt->re = rspamd_regexp_ref (re);
318 g_ptr_array_add (cache->re, elt);
319 rspamd_regexp_set_class (re, re_class);
320 elt->lua_cbref = lua_cbref;
321
322 g_hash_table_insert (re_class->re, rspamd_regexp_get_id (nre), nre);
323 }
324
325 if (rspamd_regexp_get_flags (re) & RSPAMD_REGEXP_FLAG_UTF) {
326 re_class->has_utf8 = TRUE;
327 }
328
329 return nre;
330 }
331
332 void
rspamd_re_cache_replace(struct rspamd_re_cache * cache,rspamd_regexp_t * what,rspamd_regexp_t * with)333 rspamd_re_cache_replace (struct rspamd_re_cache *cache,
334 rspamd_regexp_t *what,
335 rspamd_regexp_t *with)
336 {
337 guint64 re_id;
338 struct rspamd_re_class *re_class;
339 rspamd_regexp_t *src;
340 struct rspamd_re_cache_elt *elt;
341
342 g_assert (cache != NULL);
343 g_assert (what != NULL);
344 g_assert (with != NULL);
345
346 re_class = rspamd_regexp_get_class (what);
347
348 if (re_class != NULL) {
349 re_id = rspamd_regexp_get_cache_id (what);
350
351 g_assert (re_id != RSPAMD_INVALID_ID);
352 src = g_hash_table_lookup (re_class->re, rspamd_regexp_get_id (what));
353 elt = g_ptr_array_index (cache->re, re_id);
354 g_assert (elt != NULL);
355 g_assert (src != NULL);
356
357 rspamd_regexp_set_cache_id (what, RSPAMD_INVALID_ID);
358 rspamd_regexp_set_class (what, NULL);
359 rspamd_regexp_set_cache_id (with, re_id);
360 rspamd_regexp_set_class (with, re_class);
361 /*
362 * On calling of this function, we actually unref old re (what)
363 */
364 g_hash_table_insert (re_class->re,
365 rspamd_regexp_get_id (what),
366 rspamd_regexp_ref (with));
367
368 rspamd_regexp_unref (elt->re);
369 elt->re = rspamd_regexp_ref (with);
370 /* XXX: do not touch match type here */
371 }
372 }
373
374 static gint
rspamd_re_cache_sort_func(gconstpointer a,gconstpointer b)375 rspamd_re_cache_sort_func (gconstpointer a, gconstpointer b)
376 {
377 struct rspamd_re_cache_elt * const *re1 = a, * const *re2 = b;
378
379 return rspamd_regexp_cmp (rspamd_regexp_get_id ((*re1)->re),
380 rspamd_regexp_get_id ((*re2)->re));
381 }
382
383 void
rspamd_re_cache_init(struct rspamd_re_cache * cache,struct rspamd_config * cfg)384 rspamd_re_cache_init (struct rspamd_re_cache *cache, struct rspamd_config *cfg)
385 {
386 guint i, fl;
387 GHashTableIter it;
388 gpointer k, v;
389 struct rspamd_re_class *re_class;
390 rspamd_cryptobox_hash_state_t st_global;
391 rspamd_regexp_t *re;
392 struct rspamd_re_cache_elt *elt;
393 guchar hash_out[rspamd_cryptobox_HASHBYTES];
394
395 g_assert (cache != NULL);
396
397 rspamd_cryptobox_hash_init (&st_global, NULL, 0);
398 /* Resort all regexps */
399 g_ptr_array_sort (cache->re, rspamd_re_cache_sort_func);
400
401 for (i = 0; i < cache->re->len; i ++) {
402 elt = g_ptr_array_index (cache->re, i);
403 re = elt->re;
404 re_class = rspamd_regexp_get_class (re);
405 g_assert (re_class != NULL);
406 rspamd_regexp_set_cache_id (re, i);
407
408 if (re_class->st == NULL) {
409 (void) !posix_memalign ((void **)&re_class->st, RSPAMD_ALIGNOF(rspamd_cryptobox_hash_state_t),
410 sizeof (*re_class->st));
411 g_assert (re_class->st != NULL);
412 rspamd_cryptobox_hash_init (re_class->st, NULL, 0);
413 }
414
415 /* Update hashes */
416 /* Id of re class */
417 rspamd_cryptobox_hash_update (re_class->st, (gpointer) &re_class->id,
418 sizeof (re_class->id));
419 rspamd_cryptobox_hash_update (&st_global, (gpointer) &re_class->id,
420 sizeof (re_class->id));
421 /* Id of re expression */
422 rspamd_cryptobox_hash_update (re_class->st, rspamd_regexp_get_id (re),
423 rspamd_cryptobox_HASHBYTES);
424 rspamd_cryptobox_hash_update (&st_global, rspamd_regexp_get_id (re),
425 rspamd_cryptobox_HASHBYTES);
426 /* PCRE flags */
427 fl = rspamd_regexp_get_pcre_flags (re);
428 rspamd_cryptobox_hash_update (re_class->st, (const guchar *)&fl,
429 sizeof (fl));
430 rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
431 sizeof (fl));
432 /* Rspamd flags */
433 fl = rspamd_regexp_get_flags (re);
434 rspamd_cryptobox_hash_update (re_class->st, (const guchar *) &fl,
435 sizeof (fl));
436 rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
437 sizeof (fl));
438 /* Limit of hits */
439 fl = rspamd_regexp_get_maxhits (re);
440 rspamd_cryptobox_hash_update (re_class->st, (const guchar *) &fl,
441 sizeof (fl));
442 rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
443 sizeof (fl));
444 /* Numberic order */
445 rspamd_cryptobox_hash_update (re_class->st, (const guchar *)&i,
446 sizeof (i));
447 rspamd_cryptobox_hash_update (&st_global, (const guchar *)&i,
448 sizeof (i));
449 }
450
451 rspamd_cryptobox_hash_final (&st_global, hash_out);
452 rspamd_snprintf (cache->hash, sizeof (cache->hash), "%*xs",
453 (gint) rspamd_cryptobox_HASHBYTES, hash_out);
454
455 /* Now finalize all classes */
456 g_hash_table_iter_init (&it, cache->re_classes);
457
458 while (g_hash_table_iter_next (&it, &k, &v)) {
459 re_class = v;
460
461 if (re_class->st) {
462 /*
463 * We finally update all classes with the number of expressions
464 * in the cache to ensure that if even a single re has been changed
465 * we won't be broken due to id mismatch
466 */
467 rspamd_cryptobox_hash_update (re_class->st,
468 (gpointer)&cache->re->len,
469 sizeof (cache->re->len));
470 rspamd_cryptobox_hash_final (re_class->st, hash_out);
471 rspamd_snprintf (re_class->hash, sizeof (re_class->hash), "%*xs",
472 (gint) rspamd_cryptobox_HASHBYTES, hash_out);
473 free (re_class->st); /* Due to posix_memalign */
474 re_class->st = NULL;
475 }
476 }
477
478 cache->L = cfg->lua_state;
479
480 #ifdef WITH_HYPERSCAN
481 const gchar *platform = "generic";
482 rspamd_fstring_t *features = rspamd_fstring_new ();
483
484 cache->disable_hyperscan = cfg->disable_hyperscan;
485 cache->vectorized_hyperscan = cfg->vectorized_hyperscan;
486
487 g_assert (hs_populate_platform (&cache->plt) == HS_SUCCESS);
488
489 /* Now decode what we do have */
490 switch (cache->plt.tune) {
491 case HS_TUNE_FAMILY_HSW:
492 platform = "haswell";
493 break;
494 case HS_TUNE_FAMILY_SNB:
495 platform = "sandy";
496 break;
497 case HS_TUNE_FAMILY_BDW:
498 platform = "broadwell";
499 break;
500 case HS_TUNE_FAMILY_IVB:
501 platform = "ivy";
502 break;
503 default:
504 break;
505 }
506
507 if (cache->plt.cpu_features & HS_CPU_FEATURES_AVX2) {
508 features = rspamd_fstring_append (features, "AVX2", 4);
509 }
510
511 hs_set_allocator (g_malloc, g_free);
512
513 msg_info_re_cache ("loaded hyperscan engine with cpu tune '%s' and features '%V'",
514 platform, features);
515
516 rspamd_fstring_free (features);
517 #endif
518 }
519
520 struct rspamd_re_runtime *
rspamd_re_cache_runtime_new(struct rspamd_re_cache * cache)521 rspamd_re_cache_runtime_new (struct rspamd_re_cache *cache)
522 {
523 struct rspamd_re_runtime *rt;
524 g_assert (cache != NULL);
525
526 rt = g_malloc0 (sizeof (*rt) + NBYTES (cache->nre) + cache->nre);
527 rt->cache = cache;
528 REF_RETAIN (cache);
529 rt->checked = ((guchar *)rt) + sizeof (*rt);
530 rt->results = rt->checked + NBYTES (cache->nre);
531 rt->stat.regexp_total = cache->nre;
532 #ifdef WITH_HYPERSCAN
533 rt->has_hs = cache->hyperscan_loaded;
534 #endif
535
536 return rt;
537 }
538
539 const struct rspamd_re_cache_stat *
rspamd_re_cache_get_stat(struct rspamd_re_runtime * rt)540 rspamd_re_cache_get_stat (struct rspamd_re_runtime *rt)
541 {
542 g_assert (rt != NULL);
543
544 return &rt->stat;
545 }
546
547 static gboolean
rspamd_re_cache_check_lua_condition(struct rspamd_task * task,rspamd_regexp_t * re,const guchar * in,gsize len,goffset start,goffset end,gint lua_cbref)548 rspamd_re_cache_check_lua_condition (struct rspamd_task *task,
549 rspamd_regexp_t *re,
550 const guchar *in, gsize len,
551 goffset start, goffset end,
552 gint lua_cbref)
553 {
554 lua_State *L = (lua_State *)task->cfg->lua_state;
555 GError *err = NULL;
556 struct rspamd_lua_text __attribute__ ((unused)) *t;
557 gint text_pos;
558
559 if (G_LIKELY (lua_cbref == -1)) {
560 return TRUE;
561 }
562
563 t = lua_new_text (L, in, len, FALSE);
564 text_pos = lua_gettop (L);
565
566 if (!rspamd_lua_universal_pcall (L, lua_cbref,
567 G_STRLOC, 1, "utii", &err,
568 "rspamd{task}", task,
569 text_pos, start, end)) {
570 msg_warn_task ("cannot call for re_cache_check_lua_condition for re %s: %e",
571 rspamd_regexp_get_pattern (re), err);
572 g_error_free (err);
573
574 return TRUE;
575 }
576
577 gboolean res = lua_toboolean (L, -1);
578
579 lua_settop (L, text_pos - 1);
580
581 return res;
582 }
583
584 static guint
rspamd_re_cache_process_pcre(struct rspamd_re_runtime * rt,rspamd_regexp_t * re,struct rspamd_task * task,const guchar * in,gsize len,gboolean is_raw,gint lua_cbref)585 rspamd_re_cache_process_pcre (struct rspamd_re_runtime *rt,
586 rspamd_regexp_t *re, struct rspamd_task *task,
587 const guchar *in, gsize len,
588 gboolean is_raw,
589 gint lua_cbref)
590 {
591 guint r = 0;
592 const gchar *start = NULL, *end = NULL;
593 guint max_hits = rspamd_regexp_get_maxhits (re);
594 guint64 id = rspamd_regexp_get_cache_id (re);
595 gdouble t1 = NAN, t2, pr;
596 const gdouble slow_time = 1e8;
597
598 if (in == NULL) {
599 return rt->results[id];
600 }
601
602 if (len == 0) {
603 return rt->results[id];
604 }
605
606 if (rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) {
607 len = rt->cache->max_re_data;
608 }
609
610 r = rt->results[id];
611
612 if (max_hits == 0 || r < max_hits) {
613 pr = rspamd_random_double_fast ();
614
615 if (pr > 0.9) {
616 t1 = rspamd_get_ticks (TRUE);
617 }
618
619 while (rspamd_regexp_search (re,
620 in,
621 len,
622 &start,
623 &end,
624 is_raw,
625 NULL)) {
626 if (rspamd_re_cache_check_lua_condition (task, re, in, len,
627 start - (const gchar *)in, end - (const gchar *)in, lua_cbref)) {
628 r++;
629 msg_debug_re_task ("found regexp /%s/, total hits: %d",
630 rspamd_regexp_get_pattern (re), r);
631 }
632
633 if (max_hits > 0 && r >= max_hits) {
634 break;
635 }
636 }
637
638 rt->results[id] += r;
639 rt->stat.regexp_checked++;
640 rt->stat.bytes_scanned_pcre += len;
641 rt->stat.bytes_scanned += len;
642
643 if (r > 0) {
644 rt->stat.regexp_matched += r;
645 }
646
647 if (!isnan (t1)) {
648 t2 = rspamd_get_ticks (TRUE);
649
650 if (t2 - t1 > slow_time) {
651 rspamd_symcache_enable_profile (task);
652 msg_info_task ("regexp '%16s' took %.0f ticks to execute",
653 rspamd_regexp_get_pattern (re), t2 - t1);
654 }
655 }
656 }
657
658 return r;
659 }
660
661 #ifdef WITH_HYPERSCAN
/*
 * User data handed to rspamd_re_cache_hyperscan_cb for a single scan.
 */
struct rspamd_re_hyperscan_cbdata {
	struct rspamd_re_runtime *rt; /* runtime receiving the results */
	const guchar **ins; /* scanned input buffers */
	const guint *lens; /* length of each input buffer */
	guint count; /* number of buffers the callback may walk in ins/lens */
	rspamd_regexp_t *re; /* regexp that initiated the scan */
	struct rspamd_task *task;
};
670
671 static gint
rspamd_re_cache_hyperscan_cb(unsigned int id,unsigned long long from,unsigned long long to,unsigned int flags,void * ud)672 rspamd_re_cache_hyperscan_cb (unsigned int id,
673 unsigned long long from,
674 unsigned long long to,
675 unsigned int flags,
676 void *ud)
677 {
678 struct rspamd_re_hyperscan_cbdata *cbdata = ud;
679 struct rspamd_re_runtime *rt;
680 struct rspamd_re_cache_elt *cache_elt;
681 guint ret, maxhits, i, processed;
682 struct rspamd_task *task;
683
684 rt = cbdata->rt;
685 task = cbdata->task;
686 cache_elt = g_ptr_array_index (rt->cache->re, id);
687 maxhits = rspamd_regexp_get_maxhits (cache_elt->re);
688
689 if (cache_elt->match_type == RSPAMD_RE_CACHE_HYPERSCAN) {
690 if (rspamd_re_cache_check_lua_condition (task, cache_elt->re,
691 cbdata->ins[0], cbdata->lens[0], from, to, cache_elt->lua_cbref)) {
692 ret = 1;
693 setbit (rt->checked, id);
694
695 if (maxhits == 0 || rt->results[id] < maxhits) {
696 rt->results[id] += ret;
697 rt->stat.regexp_matched++;
698 }
699 msg_debug_re_task ("found regexp /%s/ using hyperscan only, total hits: %d",
700 rspamd_regexp_get_pattern (cache_elt->re), rt->results[id]);
701 }
702 }
703 else {
704 if (!isset (rt->checked, id)) {
705
706 processed = 0;
707
708 for (i = 0; i < cbdata->count; i ++) {
709 rspamd_re_cache_process_pcre (rt,
710 cache_elt->re,
711 cbdata->task,
712 cbdata->ins[i],
713 cbdata->lens[i],
714 FALSE,
715 cache_elt->lua_cbref);
716 setbit (rt->checked, id);
717
718 processed += cbdata->lens[i];
719
720 if (processed >= to) {
721 break;
722 }
723 }
724 }
725 }
726
727 return 0;
728 }
729 #endif
730
/*
 * Executes a regexp over a vector of input buffers and records the outcome
 * in the runtime.
 *
 * Without hyperscan every buffer is processed with PCRE. With hyperscan,
 * PCRE is still used when hyperscan is disabled or not loaded, when the
 * element's match type is PCRE, or when the input is raw while the class
 * contains utf8 regexps; otherwise the whole class database is scanned and
 * the results are collected by rspamd_re_cache_hyperscan_cb.
 *
 * @param rt runtime receiving the results
 * @param re regexp to execute
 * @param task current task
 * @param in input buffers; NULL is treated as absent data
 * @param lens length of each input buffer
 * @param count number of input buffers; 0 is treated as absent data
 * @param is_raw whether the input should be handled as raw (non-utf8) data
 * @param processed_hyperscan set to TRUE after a successful hyperscan scan;
 * never written otherwise
 * @return number of hits recorded for the regexp, 0 when the data is absent
 * or the hyperscan scan fails
 */
static guint
rspamd_re_cache_process_regexp_data (struct rspamd_re_runtime *rt,
		rspamd_regexp_t *re, struct rspamd_task *task,
		const guchar **in, guint *lens,
		guint count,
		gboolean is_raw,
		gboolean *processed_hyperscan)
{

	guint64 re_id;
	guint ret = 0;
	guint i;
	struct rspamd_re_cache_elt *cache_elt;

	re_id = rspamd_regexp_get_cache_id (re);

	if (count == 0 || in == NULL) {
		/* We assume this as absence of the specified data */
		setbit (rt->checked, re_id);
		rt->results[re_id] = ret;
		return ret;
	}

	cache_elt = (struct rspamd_re_cache_elt *)g_ptr_array_index (rt->cache->re, re_id);

#ifndef WITH_HYPERSCAN
	for (i = 0; i < count; i++) {
		ret = rspamd_re_cache_process_pcre (rt,
				re,
				task,
				in[i],
				lens[i],
				is_raw,
				cache_elt->lua_cbref);
		/* Keep the cached counter equal to the running total */
		rt->results[re_id] = ret;
	}

	setbit (rt->checked, re_id);
#else
	struct rspamd_re_class *re_class;
	struct rspamd_re_hyperscan_cbdata cbdata;

	/* Same lookup as above, repeated in this branch */
	cache_elt = g_ptr_array_index (rt->cache->re, re_id);
	re_class = rspamd_regexp_get_class (re);

	if (rt->cache->disable_hyperscan || cache_elt->match_type == RSPAMD_RE_CACHE_PCRE ||
			!rt->has_hs || (is_raw && re_class->has_utf8)) {
		/* NOTE(review): unlike the non-hyperscan build, results[] is not reset to ret here — confirm intended */
		for (i = 0; i < count; i++) {
			ret = rspamd_re_cache_process_pcre (rt,
					re,
					task,
					in[i],
					lens[i],
					is_raw,
					cache_elt->lua_cbref);
		}

		setbit (rt->checked, re_id);
	}
	else {
		for (i = 0; i < count; i ++) {
			/* For Hyperscan we can probably safely disable all those limits */
#if 0
			if (rt->cache->max_re_data > 0 && lens[i] > rt->cache->max_re_data) {
				lens[i] = rt->cache->max_re_data;
			}
#endif
			rt->stat.bytes_scanned += lens[i];
		}

		g_assert (re_class->hs_scratch != NULL);
		g_assert (re_class->hs_db != NULL);

		/* Go through hyperscan API */
		if (!rt->cache->vectorized_hyperscan) {
			/* Block mode: one scan per input buffer */
			for (i = 0; i < count; i++) {
				cbdata.ins = &in[i];
				cbdata.re = re;
				cbdata.rt = rt;
				cbdata.lens = &lens[i];
				cbdata.count = 1;
				cbdata.task = task;

				if ((hs_scan (re_class->hs_db, in[i], lens[i], 0,
						re_class->hs_scratch,
						rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) {
					ret = 0;
				}
				else {
					ret = rt->results[re_id];
					*processed_hyperscan = TRUE;
				}
			}
		}
		else {
			/* Vectored mode: all input buffers in a single scan */
			cbdata.ins = in;
			cbdata.re = re;
			cbdata.rt = rt;
			cbdata.lens = lens;
			/* NOTE(review): count is 1 although `count` buffers are scanned — confirm intended */
			cbdata.count = 1;
			cbdata.task = task;

			if ((hs_scan_vector (re_class->hs_db, (const char **)in, lens, count, 0,
					re_class->hs_scratch,
					rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) {
				ret = 0;
			}
			else {
				ret = rt->results[re_id];
				*processed_hyperscan = TRUE;
			}
		}
	}
#endif

	return ret;
}
848
849 static void
rspamd_re_cache_finish_class(struct rspamd_task * task,struct rspamd_re_runtime * rt,struct rspamd_re_class * re_class,const gchar * class_name)850 rspamd_re_cache_finish_class (struct rspamd_task *task,
851 struct rspamd_re_runtime *rt,
852 struct rspamd_re_class *re_class,
853 const gchar *class_name)
854 {
855 #ifdef WITH_HYPERSCAN
856 guint i;
857 guint64 re_id;
858 guint found = 0;
859
860 /* Set all bits that are not checked and included in hyperscan to 1 */
861 for (i = 0; i < re_class->nhs; i++) {
862 re_id = re_class->hs_ids[i];
863
864 if (!isset (rt->checked, re_id)) {
865 g_assert (rt->results[re_id] == 0);
866 rt->results[re_id] = 0;
867 setbit (rt->checked, re_id);
868 }
869 else {
870 found ++;
871 }
872 }
873
874 msg_debug_re_task ("finished hyperscan for class %s; %d "
875 "matches found; %d hyperscan supported regexps; %d total regexps",
876 class_name, found, re_class->nhs, (gint)g_hash_table_size (re_class->re));
877 #endif
878 }
879
880 static gboolean
rspamd_re_cache_process_selector(struct rspamd_task * task,struct rspamd_re_runtime * rt,const gchar * name,guchar *** svec,guint ** lenvec,guint * n)881 rspamd_re_cache_process_selector (struct rspamd_task *task,
882 struct rspamd_re_runtime *rt,
883 const gchar *name,
884 guchar ***svec,
885 guint **lenvec,
886 guint *n)
887 {
888 gint ref;
889 khiter_t k;
890 lua_State *L;
891 gint err_idx, ret;
892 struct rspamd_task **ptask;
893 gboolean result = FALSE;
894 struct rspamd_re_cache *cache = rt->cache;
895 struct rspamd_re_selector_result *sr;
896
897 L = cache->L;
898 k = kh_get (lua_selectors_hash, cache->selectors, (gchar *)name);
899
900 if (k == kh_end (cache->selectors)) {
901 msg_err_task ("cannot find selector %s, not registered", name);
902
903 return FALSE;
904 }
905
906 ref = kh_value (cache->selectors, k);
907
908 /* First, search for the cached result */
909 if (rt->sel_cache) {
910 k = kh_get (selectors_results_hash, rt->sel_cache, ref);
911
912 if (k != kh_end (rt->sel_cache)) {
913 sr = &kh_value (rt->sel_cache, k);
914
915 *svec = sr->scvec;
916 *lenvec = sr->lenvec;
917 *n = sr->cnt;
918
919 return TRUE;
920 }
921 }
922 else {
923 rt->sel_cache = kh_init (selectors_results_hash);
924 }
925
926 lua_pushcfunction (L, &rspamd_lua_traceback);
927 err_idx = lua_gettop (L);
928
929 lua_rawgeti (L, LUA_REGISTRYINDEX, ref);
930 ptask = lua_newuserdata (L, sizeof (*ptask));
931 *ptask = task;
932 rspamd_lua_setclass (L, "rspamd{task}", -1);
933
934 if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) {
935 msg_err_task ("call to selector %s "
936 "failed (%d): %s", name, ret,
937 lua_tostring (L, -1));
938 }
939 else {
940 struct rspamd_lua_text *txt;
941 gsize slen;
942 const gchar *sel_data;
943
944 if (lua_type (L, -1) != LUA_TTABLE) {
945 txt = lua_check_text_or_string (L, -1);
946
947 if (txt) {
948 sel_data = txt->start;
949 slen = txt->len;
950 *n = 1;
951 *svec = g_malloc (sizeof (guchar *));
952 *lenvec = g_malloc (sizeof (guint));
953 (*svec)[0] = g_malloc (slen);
954 memcpy ((*svec)[0], sel_data, slen);
955 (*lenvec)[0] = slen;
956 result = TRUE;
957 }
958 }
959 else {
960 *n = rspamd_lua_table_size (L, -1);
961
962 if (*n > 0) {
963 *svec = g_malloc (sizeof (guchar *) * (*n));
964 *lenvec = g_malloc (sizeof (guint) * (*n));
965
966 for (guint i = 0; i < *n; i ++) {
967 lua_rawgeti (L, -1, i + 1);
968
969 txt = lua_check_text_or_string (L, -1);
970 if (txt) {
971 sel_data = txt->start;
972 slen = txt->len;
973 }
974 else {
975 sel_data = "";
976 slen = 0;
977 }
978
979 (*svec)[i] = g_malloc (slen);
980 memcpy ((*svec)[i], sel_data, slen);
981 (*lenvec)[i] = slen;
982 lua_pop (L, 1);
983 }
984
985 result = TRUE;
986 }
987 }
988 }
989
990 lua_settop (L, err_idx - 1);
991
992 if (result) {
993 k = kh_put (selectors_results_hash, rt->sel_cache, ref, &ret);
994 sr = &kh_value (rt->sel_cache, k);
995
996 sr->cnt = *n;
997 sr->scvec = *svec;
998 sr->lenvec = *lenvec;
999 }
1000
1001 return result;
1002 }
1003
1004 static inline guint
rspamd_process_words_vector(GArray * words,const guchar ** scvec,guint * lenvec,struct rspamd_re_class * re_class,guint cnt,gboolean * raw)1005 rspamd_process_words_vector (GArray *words,
1006 const guchar **scvec,
1007 guint *lenvec,
1008 struct rspamd_re_class *re_class,
1009 guint cnt,
1010 gboolean *raw)
1011 {
1012 guint j;
1013 rspamd_stat_token_t *tok;
1014
1015 if (words) {
1016 for (j = 0; j < words->len; j ++) {
1017 tok = &g_array_index (words, rspamd_stat_token_t, j);
1018
1019 if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
1020 if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
1021 if (!re_class->has_utf8) {
1022 *raw = TRUE;
1023 }
1024 else {
1025 continue; /* Skip */
1026 }
1027 }
1028 }
1029 else {
1030 continue; /* Skip non text */
1031 }
1032
1033 if (re_class->type == RSPAMD_RE_RAWWORDS) {
1034 if (tok->original.len > 0) {
1035 scvec[cnt] = tok->original.begin;
1036 lenvec[cnt++] = tok->original.len;
1037 }
1038 }
1039 else if (re_class->type == RSPAMD_RE_WORDS) {
1040 if (tok->normalized.len > 0) {
1041 scvec[cnt] = tok->normalized.begin;
1042 lenvec[cnt++] = tok->normalized.len;
1043 }
1044 }
1045 else {
1046 /* Stemmed words */
1047 if (tok->stemmed.len > 0) {
1048 scvec[cnt] = tok->stemmed.begin;
1049 lenvec[cnt++] = tok->stemmed.len;
1050 }
1051 }
1052 }
1053 }
1054
1055 return cnt;
1056 }
1057
1058 static guint
rspamd_re_cache_process_headers_list(struct rspamd_task * task,struct rspamd_re_runtime * rt,rspamd_regexp_t * re,struct rspamd_re_class * re_class,struct rspamd_mime_header * rh,gboolean is_strong,gboolean * processed_hyperscan)1059 rspamd_re_cache_process_headers_list (struct rspamd_task *task,
1060 struct rspamd_re_runtime *rt,
1061 rspamd_regexp_t *re,
1062 struct rspamd_re_class *re_class,
1063 struct rspamd_mime_header *rh,
1064 gboolean is_strong,
1065 gboolean *processed_hyperscan)
1066 {
1067 const guchar **scvec, *in;
1068 gboolean raw = FALSE;
1069 guint *lenvec;
1070 struct rspamd_mime_header *cur;
1071 guint cnt = 0, i = 0, ret = 0;
1072
1073 DL_COUNT (rh, cur, cnt);
1074
1075 scvec = g_malloc (sizeof (*scvec) * cnt);
1076 lenvec = g_malloc (sizeof (*lenvec) * cnt);
1077
1078 DL_FOREACH (rh, cur) {
1079
1080 if (is_strong && strcmp (cur->name, re_class->type_data) != 0) {
1081 /* Skip a different case */
1082 continue;
1083 }
1084
1085 if (re_class->type == RSPAMD_RE_RAWHEADER) {
1086 in = (const guchar *)cur->value;
1087 lenvec[i] = strlen (cur->value);
1088
1089 if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) {
1090 raw = TRUE;
1091 }
1092 }
1093 else {
1094 in = (const guchar *)cur->decoded;
1095 /* Validate input^W^WNo need to validate as it is already valid */
1096 if (!in) {
1097 lenvec[i] = 0;
1098 scvec[i] = (guchar *)"";
1099 continue;
1100 }
1101
1102 lenvec[i] = strlen (in);
1103 }
1104
1105 scvec[i] = in;
1106
1107 i ++;
1108 }
1109
1110 if (i > 0) {
1111 ret = rspamd_re_cache_process_regexp_data (rt, re,
1112 task, scvec, lenvec, i, raw, processed_hyperscan);
1113 msg_debug_re_task ("checking header %s regexp: %s=%*s -> %d",
1114 re_class->type_data,
1115 rspamd_regexp_get_pattern (re),
1116 (int) lenvec[0], scvec[0], ret);
1117 }
1118
1119 g_free (scvec);
1120 g_free (lenvec);
1121
1122 return ret;
1123 }
1124
1125 /*
1126 * Calculates the specified regexp for the specified class if it's not calculated
1127 */
1128 static guint
rspamd_re_cache_exec_re(struct rspamd_task * task,struct rspamd_re_runtime * rt,rspamd_regexp_t * re,struct rspamd_re_class * re_class,gboolean is_strong)1129 rspamd_re_cache_exec_re (struct rspamd_task *task,
1130 struct rspamd_re_runtime *rt,
1131 rspamd_regexp_t *re,
1132 struct rspamd_re_class *re_class,
1133 gboolean is_strong)
1134 {
1135 guint ret = 0, i, re_id;
1136 struct rspamd_mime_header *rh;
1137 const gchar *in;
1138 const guchar **scvec;
1139 guint *lenvec;
1140 gboolean raw = FALSE, processed_hyperscan = FALSE;
1141 struct rspamd_mime_text_part *text_part;
1142 struct rspamd_mime_part *mime_part;
1143 struct rspamd_url *url;
1144 guint len, cnt;
1145 const gchar *class_name;
1146
1147 class_name = rspamd_re_cache_type_to_string (re_class->type);
1148 msg_debug_re_task ("start check re type: %s: /%s/",
1149 class_name,
1150 rspamd_regexp_get_pattern (re));
1151 re_id = rspamd_regexp_get_cache_id (re);
1152
1153 switch (re_class->type) {
1154 case RSPAMD_RE_HEADER:
1155 case RSPAMD_RE_RAWHEADER:
1156 /* Get list of specified headers */
1157 rh = rspamd_message_get_header_array(task,
1158 re_class->type_data, FALSE);
1159
1160 if (rh) {
1161 ret = rspamd_re_cache_process_headers_list (task, rt, re,
1162 re_class, rh, is_strong, &processed_hyperscan);
1163 msg_debug_re_task ("checked header(%s) regexp: %s -> %d",
1164 (const char *)re_class->type_data,
1165 rspamd_regexp_get_pattern (re),
1166 ret);
1167 }
1168 break;
1169 case RSPAMD_RE_ALLHEADER:
1170 raw = TRUE;
1171 in = MESSAGE_FIELD (task, raw_headers_content).begin;
1172 len = MESSAGE_FIELD (task, raw_headers_content).len;
1173 ret = rspamd_re_cache_process_regexp_data (rt, re,
1174 task, (const guchar **)&in, &len, 1, raw, &processed_hyperscan);
1175 msg_debug_re_task ("checked allheader regexp: %s -> %d",
1176 rspamd_regexp_get_pattern (re), ret);
1177 break;
1178 case RSPAMD_RE_MIMEHEADER:
1179 PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, mime_part) {
1180 rh = rspamd_message_get_header_from_hash(mime_part->raw_headers,
1181 re_class->type_data, FALSE);
1182
1183 if (rh) {
1184 ret += rspamd_re_cache_process_headers_list (task, rt, re,
1185 re_class, rh, is_strong, &processed_hyperscan);
1186 }
1187 msg_debug_re_task ("checked mime header(%s) regexp: %s -> %d",
1188 (const char *)re_class->type_data,
1189 rspamd_regexp_get_pattern (re),
1190 ret);
1191 }
1192 break;
1193 case RSPAMD_RE_MIME:
1194 case RSPAMD_RE_RAWMIME:
1195 /* Iterate through text parts */
1196 if (MESSAGE_FIELD (task, text_parts)->len > 0) {
1197 cnt = MESSAGE_FIELD (task, text_parts)->len;
1198 scvec = g_malloc (sizeof (*scvec) * cnt);
1199 lenvec = g_malloc (sizeof (*lenvec) * cnt);
1200
1201 PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
1202 /* Select data for regexp */
1203 if (re_class->type == RSPAMD_RE_RAWMIME) {
1204 if (text_part->raw.len == 0) {
1205 len = 0;
1206 in = "";
1207 }
1208 else {
1209 in = text_part->raw.begin;
1210 len = text_part->raw.len;
1211 }
1212
1213 raw = TRUE;
1214 }
1215 else {
1216 /* Skip empty parts */
1217 if (IS_TEXT_PART_EMPTY (text_part)) {
1218 len = 0;
1219 in = "";
1220 }
1221 else {
1222 /* Check raw flags */
1223 if (!IS_TEXT_PART_UTF (text_part)) {
1224 raw = TRUE;
1225 }
1226
1227 in = text_part->utf_content.begin;
1228 len = text_part->utf_content.len;
1229 }
1230 }
1231
1232 scvec[i] = (guchar *) in;
1233 lenvec[i] = len;
1234 }
1235
1236 ret = rspamd_re_cache_process_regexp_data (rt, re,
1237 task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1238 msg_debug_re_task ("checked mime regexp: %s -> %d",
1239 rspamd_regexp_get_pattern (re), ret);
1240 g_free (scvec);
1241 g_free (lenvec);
1242 }
1243 break;
1244 case RSPAMD_RE_URL:
1245 cnt = kh_size (MESSAGE_FIELD (task, urls));
1246
1247 if (cnt > 0) {
1248 scvec = g_malloc (sizeof (*scvec) * cnt);
1249 lenvec = g_malloc (sizeof (*lenvec) * cnt);
1250 i = 0;
1251 raw = FALSE;
1252
1253 kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
1254 if ((url->protocol & PROTOCOL_MAILTO)) {
1255 continue;
1256 }
1257 in = url->string;
1258 len = url->urllen;
1259
1260 if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
1261 scvec[i] = (guchar *) in;
1262 lenvec[i++] = len;
1263 }
1264 });
1265
1266 #if 0
1267 g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
1268
1269 while (g_hash_table_iter_next (&it, &k, &v)) {
1270 url = v;
1271 in = url->string;
1272 len = url->urllen;
1273
1274 if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
1275 scvec[i] = (guchar *) in;
1276 lenvec[i++] = len;
1277 }
1278 }
1279 #endif
1280 ret = rspamd_re_cache_process_regexp_data (rt, re,
1281 task, scvec, lenvec, i, raw, &processed_hyperscan);
1282 msg_debug_re_task ("checked url regexp: %s -> %d",
1283 rspamd_regexp_get_pattern (re), ret);
1284 g_free (scvec);
1285 g_free (lenvec);
1286 }
1287 break;
1288 case RSPAMD_RE_EMAIL:
1289 cnt = kh_size (MESSAGE_FIELD (task, urls));
1290
1291 if (cnt > 0) {
1292 scvec = g_malloc (sizeof (*scvec) * cnt);
1293 lenvec = g_malloc (sizeof (*lenvec) * cnt);
1294 i = 0;
1295 raw = FALSE;
1296
1297 kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
1298
1299 if (!(url->protocol & PROTOCOL_MAILTO)) {
1300 continue;
1301 }
1302 if (url->userlen == 0 || url->hostlen == 0) {
1303 continue;
1304 }
1305
1306 in = rspamd_url_user_unsafe (url);
1307 len = url->userlen + 1 + url->hostlen;
1308 scvec[i] = (guchar *) in;
1309 lenvec[i++] = len;
1310 });
1311
1312 ret = rspamd_re_cache_process_regexp_data (rt, re,
1313 task, scvec, lenvec, i, raw, &processed_hyperscan);
1314 msg_debug_re_task ("checked email regexp: %s -> %d",
1315 rspamd_regexp_get_pattern (re), ret);
1316 g_free (scvec);
1317 g_free (lenvec);
1318 }
1319 break;
1320 case RSPAMD_RE_BODY:
1321 raw = TRUE;
1322 in = task->msg.begin;
1323 len = task->msg.len;
1324
1325 ret = rspamd_re_cache_process_regexp_data (rt, re, task,
1326 (const guchar **)&in, &len, 1, raw, &processed_hyperscan);
1327 msg_debug_re_task ("checked rawbody regexp: %s -> %d",
1328 rspamd_regexp_get_pattern (re), ret);
1329 break;
1330 case RSPAMD_RE_SABODY:
1331 /* According to SA docs:
1332 * The 'body' in this case is the textual parts of the message body;
1333 * any non-text MIME parts are stripped, and the message decoded from
1334 * Quoted-Printable or Base-64-encoded format if necessary. The message
1335 * Subject header is considered part of the body and becomes the first
1336 * paragraph when running the rules. All HTML tags and line breaks will
1337 * be removed before matching.
1338 */
1339 cnt = MESSAGE_FIELD (task, text_parts)->len + 1;
1340 scvec = g_malloc (sizeof (*scvec) * cnt);
1341 lenvec = g_malloc (sizeof (*lenvec) * cnt);
1342
1343 /*
1344 * Body rules also include the Subject as the first line
1345 * of the body content.
1346 */
1347
1348 rh = rspamd_message_get_header_array(task, "Subject", FALSE);
1349
1350 if (rh) {
1351 scvec[0] = (guchar *)rh->decoded;
1352 lenvec[0] = strlen (rh->decoded);
1353 }
1354 else {
1355 scvec[0] = (guchar *)"";
1356 lenvec[0] = 0;
1357 }
1358
1359 PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
1360 if (text_part->utf_stripped_content) {
1361 scvec[i + 1] = (guchar *)text_part->utf_stripped_content->data;
1362 lenvec[i + 1] = text_part->utf_stripped_content->len;
1363
1364 if (!IS_TEXT_PART_UTF (text_part)) {
1365 raw = TRUE;
1366 }
1367 }
1368 else {
1369 scvec[i + 1] = (guchar *)"";
1370 lenvec[i + 1] = 0;
1371 }
1372 }
1373
1374 ret = rspamd_re_cache_process_regexp_data (rt, re,
1375 task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1376 msg_debug_re_task ("checked sa body regexp: %s -> %d",
1377 rspamd_regexp_get_pattern (re), ret);
1378 g_free (scvec);
1379 g_free (lenvec);
1380 break;
1381 case RSPAMD_RE_SARAWBODY:
1382 /* According to SA docs:
1383 * The 'raw body' of a message is the raw data inside all textual
1384 * parts. The text will be decoded from base64 or quoted-printable
1385 * encoding, but HTML tags and line breaks will still be present.
1386 * Multiline expressions will need to be used to match strings that are
1387 * broken by line breaks.
1388 */
1389 if (MESSAGE_FIELD (task, text_parts)->len > 0) {
1390 cnt = MESSAGE_FIELD (task, text_parts)->len;
1391 scvec = g_malloc (sizeof (*scvec) * cnt);
1392 lenvec = g_malloc (sizeof (*lenvec) * cnt);
1393
1394 for (i = 0; i < cnt; i++) {
1395 text_part = g_ptr_array_index (MESSAGE_FIELD (task, text_parts), i);
1396
1397 if (text_part->parsed.len > 0) {
1398 scvec[i] = (guchar *)text_part->parsed.begin;
1399 lenvec[i] = text_part->parsed.len;
1400
1401 if (!IS_TEXT_PART_UTF (text_part)) {
1402 raw = TRUE;
1403 }
1404 }
1405 else {
1406 scvec[i] = (guchar *)"";
1407 lenvec[i] = 0;
1408 }
1409 }
1410
1411 ret = rspamd_re_cache_process_regexp_data (rt, re,
1412 task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1413 msg_debug_re_task ("checked sa rawbody regexp: %s -> %d",
1414 rspamd_regexp_get_pattern (re), ret);
1415 g_free (scvec);
1416 g_free (lenvec);
1417 }
1418 break;
1419 case RSPAMD_RE_WORDS:
1420 case RSPAMD_RE_STEMWORDS:
1421 case RSPAMD_RE_RAWWORDS:
1422 if (MESSAGE_FIELD (task, text_parts)->len > 0) {
1423 cnt = 0;
1424 raw = FALSE;
1425
1426 PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
1427 if (text_part->utf_words) {
1428 cnt += text_part->utf_words->len;
1429 }
1430 }
1431
1432 if (task->meta_words && task->meta_words->len > 0) {
1433 cnt += task->meta_words->len;
1434 }
1435
1436 if (cnt > 0) {
1437 scvec = g_malloc (sizeof (*scvec) * cnt);
1438 lenvec = g_malloc (sizeof (*lenvec) * cnt);
1439
1440 cnt = 0;
1441
1442 PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
1443 if (text_part->utf_words) {
1444 cnt = rspamd_process_words_vector (text_part->utf_words,
1445 scvec, lenvec, re_class, cnt, &raw);
1446 }
1447 }
1448
1449 if (task->meta_words) {
1450 cnt = rspamd_process_words_vector (task->meta_words,
1451 scvec, lenvec, re_class, cnt, &raw);
1452 }
1453
1454 ret = rspamd_re_cache_process_regexp_data (rt, re,
1455 task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1456
1457 msg_debug_re_task ("checked sa words regexp: %s -> %d",
1458 rspamd_regexp_get_pattern (re), ret);
1459 g_free (scvec);
1460 g_free (lenvec);
1461 }
1462 }
1463 break;
1464 case RSPAMD_RE_SELECTOR:
1465 if (rspamd_re_cache_process_selector (task, rt,
1466 re_class->type_data,
1467 (guchar ***)&scvec,
1468 &lenvec, &cnt)) {
1469
1470 ret = rspamd_re_cache_process_regexp_data (rt, re,
1471 task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1472 msg_debug_re_task ("checked selector(%s) regexp: %s -> %d",
1473 re_class->type_data,
1474 rspamd_regexp_get_pattern (re), ret);
1475
1476 /* Do not free vectors as they are managed by rt->sel_cache */
1477 }
1478 break;
1479 case RSPAMD_RE_MAX:
1480 msg_err_task ("regexp of class invalid has been called: %s",
1481 rspamd_regexp_get_pattern (re));
1482 break;
1483 }
1484
1485 #if WITH_HYPERSCAN
1486 if (processed_hyperscan) {
1487 rspamd_re_cache_finish_class (task, rt, re_class, class_name);
1488 }
1489 #endif
1490
1491 setbit (rt->checked, re_id);
1492
1493 return rt->results[re_id];
1494 }
1495
1496 gint
rspamd_re_cache_process(struct rspamd_task * task,rspamd_regexp_t * re,enum rspamd_re_type type,gconstpointer type_data,gsize datalen,gboolean is_strong)1497 rspamd_re_cache_process (struct rspamd_task *task,
1498 rspamd_regexp_t *re,
1499 enum rspamd_re_type type,
1500 gconstpointer type_data,
1501 gsize datalen,
1502 gboolean is_strong)
1503 {
1504 guint64 re_id;
1505 struct rspamd_re_class *re_class;
1506 struct rspamd_re_cache *cache;
1507 struct rspamd_re_runtime *rt;
1508
1509 g_assert (task != NULL);
1510 rt = task->re_rt;
1511 g_assert (rt != NULL);
1512 g_assert (re != NULL);
1513
1514 cache = rt->cache;
1515 re_id = rspamd_regexp_get_cache_id (re);
1516
1517 if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) {
1518 msg_err_task ("re '%s' has no valid id for the cache",
1519 rspamd_regexp_get_pattern (re));
1520 return 0;
1521 }
1522
1523 if (isset (rt->checked, re_id)) {
1524 /* Fast path */
1525 rt->stat.regexp_fast_cached ++;
1526 return rt->results[re_id];
1527 }
1528 else {
1529 /* Slow path */
1530 re_class = rspamd_regexp_get_class (re);
1531
1532 if (re_class == NULL) {
1533 msg_err_task ("cannot find re class for regexp '%s'",
1534 rspamd_regexp_get_pattern (re));
1535 return 0;
1536 }
1537
1538 return rspamd_re_cache_exec_re (task, rt, re, re_class,
1539 is_strong);
1540 }
1541
1542 return 0;
1543 }
1544
1545 int
rspamd_re_cache_process_ffi(void * ptask,void * pre,int type,void * type_data,int is_strong)1546 rspamd_re_cache_process_ffi (void *ptask,
1547 void *pre,
1548 int type,
1549 void *type_data,
1550 int is_strong)
1551 {
1552 struct rspamd_lua_regexp **lua_re = pre;
1553 struct rspamd_task **real_task = ptask;
1554 gsize typelen = 0;
1555
1556 if (type_data) {
1557 typelen = strlen (type_data);
1558 }
1559
1560 return rspamd_re_cache_process (*real_task, (*lua_re)->re,
1561 type, type_data, typelen, is_strong);
1562 }
1563
1564 void
rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime * rt)1565 rspamd_re_cache_runtime_destroy (struct rspamd_re_runtime *rt)
1566 {
1567 g_assert (rt != NULL);
1568
1569 if (rt->sel_cache) {
1570 struct rspamd_re_selector_result sr;
1571
1572 kh_foreach_value (rt->sel_cache, sr, {
1573 for (guint i = 0; i < sr.cnt; i ++) {
1574 g_free ((gpointer)sr.scvec[i]);
1575 }
1576
1577 g_free (sr.scvec);
1578 g_free (sr.lenvec);
1579 });
1580 kh_destroy (selectors_results_hash, rt->sel_cache);
1581 }
1582
1583 REF_RELEASE (rt->cache);
1584 g_free (rt);
1585 }
1586
1587 void
rspamd_re_cache_unref(struct rspamd_re_cache * cache)1588 rspamd_re_cache_unref (struct rspamd_re_cache *cache)
1589 {
1590 if (cache) {
1591 REF_RELEASE (cache);
1592 }
1593 }
1594
1595 struct rspamd_re_cache *
rspamd_re_cache_ref(struct rspamd_re_cache * cache)1596 rspamd_re_cache_ref (struct rspamd_re_cache *cache)
1597 {
1598 if (cache) {
1599 REF_RETAIN (cache);
1600 }
1601
1602 return cache;
1603 }
1604
1605 guint
rspamd_re_cache_set_limit(struct rspamd_re_cache * cache,guint limit)1606 rspamd_re_cache_set_limit (struct rspamd_re_cache *cache, guint limit)
1607 {
1608 guint old;
1609
1610 g_assert (cache != NULL);
1611
1612 old = cache->max_re_data;
1613 cache->max_re_data = limit;
1614
1615 return old;
1616 }
1617
1618 const gchar *
rspamd_re_cache_type_to_string(enum rspamd_re_type type)1619 rspamd_re_cache_type_to_string (enum rspamd_re_type type)
1620 {
1621 const gchar *ret = "unknown";
1622
1623 switch (type) {
1624 case RSPAMD_RE_HEADER:
1625 ret = "header";
1626 break;
1627 case RSPAMD_RE_RAWHEADER:
1628 ret = "raw header";
1629 break;
1630 case RSPAMD_RE_MIMEHEADER:
1631 ret = "mime header";
1632 break;
1633 case RSPAMD_RE_ALLHEADER:
1634 ret = "all headers";
1635 break;
1636 case RSPAMD_RE_MIME:
1637 ret = "part";
1638 break;
1639 case RSPAMD_RE_RAWMIME:
1640 ret = "raw part";
1641 break;
1642 case RSPAMD_RE_BODY:
1643 ret = "rawbody";
1644 break;
1645 case RSPAMD_RE_URL:
1646 ret = "url";
1647 break;
1648 case RSPAMD_RE_EMAIL:
1649 ret = "email";
1650 break;
1651 case RSPAMD_RE_SABODY:
1652 ret = "sa body";
1653 break;
1654 case RSPAMD_RE_SARAWBODY:
1655 ret = "sa raw body";
1656 break;
1657 case RSPAMD_RE_SELECTOR:
1658 ret = "selector";
1659 break;
1660 case RSPAMD_RE_WORDS:
1661 ret = "words";
1662 break;
1663 case RSPAMD_RE_RAWWORDS:
1664 ret = "raw_words";
1665 break;
1666 case RSPAMD_RE_STEMWORDS:
1667 ret = "stem_words";
1668 break;
1669 case RSPAMD_RE_MAX:
1670 default:
1671 ret = "invalid class";
1672 break;
1673 }
1674
1675 return ret;
1676 }
1677
1678 enum rspamd_re_type
rspamd_re_cache_type_from_string(const char * str)1679 rspamd_re_cache_type_from_string (const char *str)
1680 {
1681 enum rspamd_re_type ret;
1682 guint64 h;
1683
1684 /*
1685 * To optimize this function, we apply hash to input string and
1686 * pre-select it from the values
1687 */
1688
1689 if (str != NULL) {
1690 h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
1691 str, strlen (str), 0xdeadbabe);
1692
1693 switch (h) {
1694 case G_GUINT64_CONSTANT(0x298b9c8a58887d44): /* header */
1695 ret = RSPAMD_RE_HEADER;
1696 break;
1697 case G_GUINT64_CONSTANT(0x467bfb5cd7ddf890): /* rawheader */
1698 ret = RSPAMD_RE_RAWHEADER;
1699 break;
1700 case G_GUINT64_CONSTANT(0xda081341fb600389): /* mime */
1701 ret = RSPAMD_RE_MIME;
1702 break;
1703 case G_GUINT64_CONSTANT(0xc35831e067a8221d): /* rawmime */
1704 ret = RSPAMD_RE_RAWMIME;
1705 break;
1706 case G_GUINT64_CONSTANT(0xc625e13dbe636de2): /* body */
1707 case G_GUINT64_CONSTANT(0xCCDEBA43518F721C): /* message */
1708 ret = RSPAMD_RE_BODY;
1709 break;
1710 case G_GUINT64_CONSTANT(0x286edbe164c791d2): /* url */
1711 case G_GUINT64_CONSTANT(0x7D9ACDF6685661A1): /* uri */
1712 ret = RSPAMD_RE_URL;
1713 break;
1714 case G_GUINT64_CONSTANT (0x7e232b0f60b571be): /* email */
1715 ret = RSPAMD_RE_EMAIL;
1716 break;
1717 case G_GUINT64_CONSTANT(0x796d62205a8778c7): /* allheader */
1718 ret = RSPAMD_RE_ALLHEADER;
1719 break;
1720 case G_GUINT64_CONSTANT(0xa3c6c153b3b00a5e): /* mimeheader */
1721 ret = RSPAMD_RE_MIMEHEADER;
1722 break;
1723 case G_GUINT64_CONSTANT(0x7794501506e604e9): /* sabody */
1724 ret = RSPAMD_RE_SABODY;
1725 break;
1726 case G_GUINT64_CONSTANT(0x28828962E7D2A05F): /* sarawbody */
1727 ret = RSPAMD_RE_SARAWBODY;
1728 break;
1729 default:
1730 ret = RSPAMD_RE_MAX;
1731 break;
1732 }
1733 }
1734 else {
1735 ret = RSPAMD_RE_MAX;
1736 }
1737
1738 return ret;
1739 }
1740
1741 #ifdef WITH_HYPERSCAN
1742 static gchar *
rspamd_re_cache_hs_pattern_from_pcre(rspamd_regexp_t * re)1743 rspamd_re_cache_hs_pattern_from_pcre (rspamd_regexp_t *re)
1744 {
1745 /*
1746 * Workaroung for bug in ragel 7.0.0.11
1747 * https://github.com/intel/hyperscan/issues/133
1748 */
1749 const gchar *pat = rspamd_regexp_get_pattern (re);
1750 guint flags = rspamd_regexp_get_flags (re), esc_flags = RSPAMD_REGEXP_ESCAPE_RE;
1751 gchar *escaped;
1752 gsize esc_len;
1753
1754 if (flags & RSPAMD_REGEXP_FLAG_UTF) {
1755 esc_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
1756 }
1757
1758 escaped = rspamd_str_regexp_escape (pat, strlen (pat), &esc_len,esc_flags);
1759
1760 return escaped;
1761 }
1762
1763 static gboolean
rspamd_re_cache_is_finite(struct rspamd_re_cache * cache,rspamd_regexp_t * re,gint flags,gdouble max_time)1764 rspamd_re_cache_is_finite (struct rspamd_re_cache *cache,
1765 rspamd_regexp_t *re, gint flags, gdouble max_time)
1766 {
1767 pid_t cld;
1768 gint status;
1769 struct timespec ts;
1770 hs_compile_error_t *hs_errors;
1771 hs_database_t *test_db;
1772 gdouble wait_time;
1773 const gint max_tries = 10;
1774 gint tries = 0, rc;
1775 void (*old_hdl)(int);
1776
1777 wait_time = max_time / max_tries;
1778 /* We need to restore SIGCHLD processing */
1779 old_hdl = signal (SIGCHLD, SIG_DFL);
1780 cld = fork ();
1781
1782 if (cld == 0) {
1783 /* Try to compile pattern */
1784
1785 gchar *pat = rspamd_re_cache_hs_pattern_from_pcre (re);
1786
1787 if (hs_compile (pat,
1788 flags | HS_FLAG_PREFILTER,
1789 cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
1790 &cache->plt,
1791 &test_db,
1792 &hs_errors) != HS_SUCCESS) {
1793
1794 msg_info_re_cache ("cannot compile (prefilter mode) '%s' to hyperscan: '%s'",
1795 pat,
1796 hs_errors != NULL ? hs_errors->message : "unknown error");
1797
1798 hs_free_compile_error (hs_errors);
1799 g_free (pat);
1800
1801 exit (EXIT_FAILURE);
1802 }
1803
1804 g_free (pat);
1805 exit (EXIT_SUCCESS);
1806 }
1807 else if (cld > 0) {
1808 double_to_ts (wait_time, &ts);
1809
1810 while ((rc = waitpid (cld, &status, WNOHANG)) == 0 && tries ++ < max_tries) {
1811 (void)nanosleep (&ts, NULL);
1812 }
1813
1814 /* Child has been terminated */
1815 if (rc > 0) {
1816 /* Forget about SIGCHLD after this point */
1817 signal (SIGCHLD, old_hdl);
1818
1819 if (WIFEXITED (status) && WEXITSTATUS (status) == EXIT_SUCCESS) {
1820 return TRUE;
1821 }
1822 else {
1823 msg_err_re_cache (
1824 "cannot approximate %s to hyperscan",
1825 rspamd_regexp_get_pattern (re));
1826
1827 return FALSE;
1828 }
1829 }
1830 else {
1831 /* We consider that as timeout */
1832 kill (cld, SIGKILL);
1833 g_assert (waitpid (cld, &status, 0) != -1);
1834 msg_err_re_cache (
1835 "cannot approximate %s to hyperscan: timeout waiting",
1836 rspamd_regexp_get_pattern (re));
1837 signal (SIGCHLD, old_hdl);
1838 }
1839 }
1840 else {
1841 msg_err_re_cache (
1842 "cannot approximate %s to hyperscan: fork failed: %s",
1843 rspamd_regexp_get_pattern (re), strerror (errno));
1844 signal (SIGCHLD, old_hdl);
1845 }
1846
1847 return FALSE;
1848 }
1849 #endif
1850
1851 #ifdef WITH_HYPERSCAN
/*
 * State of an asynchronous hyperscan compilation; it is attached to the
 * compilation timer via its data pointer and freed together with the timer
 */
struct rspamd_re_cache_hs_compile_cbdata {
	GHashTableIter it; /* Iterator over cache->re_classes, one class is taken per timer tick */
	struct rspamd_re_cache *cache; /* Cache whose classes are compiled */
	const char *cache_dir; /* Directory for the .hs files; the pointer is stored, not copied */
	gdouble max_time; /* Time budget passed to the prefilter finiteness check */
	gboolean silent; /* Suppresses the "skip already valid class" log messages */
	guint total; /* Number of regexps compiled so far, reported to cb */
	void (*cb)(guint ncompiled, GError *err, void *cbd); /* Called on errors (err != NULL) and on completion (err == NULL) */
	void *cbd; /* Opaque user data passed to cb */
};
1862
1863 static void
rspamd_re_cache_compile_err(EV_P_ ev_timer * w,GError * err,struct rspamd_re_cache_hs_compile_cbdata * cbdata,bool is_fatal)1864 rspamd_re_cache_compile_err (EV_P_ ev_timer *w, GError *err,
1865 struct rspamd_re_cache_hs_compile_cbdata *cbdata, bool is_fatal)
1866 {
1867 cbdata->cb (cbdata->total, err, cbdata->cbd);
1868
1869 if (is_fatal) {
1870 ev_timer_stop(EV_A_ w);
1871 g_free(w);
1872 g_free(cbdata);
1873 }
1874 else {
1875 /* Continue compilation */
1876 ev_timer_again(EV_A_ w);
1877 }
1878 g_error_free (err);
1879 }
1880
1881 static void
rspamd_re_cache_compile_timer_cb(EV_P_ ev_timer * w,int revents)1882 rspamd_re_cache_compile_timer_cb (EV_P_ ev_timer *w, int revents )
1883 {
1884 struct rspamd_re_cache_hs_compile_cbdata *cbdata =
1885 (struct rspamd_re_cache_hs_compile_cbdata *)w->data;
1886 GHashTableIter cit;
1887 gpointer k, v;
1888 struct rspamd_re_class *re_class;
1889 gchar path[PATH_MAX], npath[PATH_MAX];
1890 hs_database_t *test_db;
1891 gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags;
1892 rspamd_cryptobox_fast_hash_state_t crc_st;
1893 guint64 crc;
1894 rspamd_regexp_t *re;
1895 hs_compile_error_t *hs_errors = NULL;
1896 guint *hs_flags = NULL;
1897 const hs_expr_ext_t **hs_exts = NULL;
1898 gchar **hs_pats = NULL;
1899 gchar *hs_serialized = NULL;
1900 gsize serialized_len;
1901 struct iovec iov[7];
1902 struct rspamd_re_cache *cache;
1903 GError *err;
1904 pid_t our_pid = getpid ();
1905
1906 cache = cbdata->cache;
1907
1908 if (!g_hash_table_iter_next (&cbdata->it, &k, &v)) {
1909 /* All done */
1910 ev_timer_stop (EV_A_ w);
1911 cbdata->cb (cbdata->total, NULL, cbdata->cbd);
1912 g_free (w);
1913 g_free (cbdata);
1914
1915 return;
1916 }
1917
1918 re_class = v;
1919 rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cbdata->cache_dir,
1920 G_DIR_SEPARATOR, re_class->hash);
1921
1922 if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, TRUE, TRUE)) {
1923
1924 fd = open (path, O_RDONLY, 00600);
1925
1926 /* Read number of regexps */
1927 g_assert (fd != -1);
1928 g_assert (lseek (fd, RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt), SEEK_SET) != -1);
1929 g_assert (read (fd, &n, sizeof (n)) == sizeof (n));
1930 close (fd);
1931
1932 if (re_class->type_len > 0) {
1933 if (!cbdata->silent) {
1934 msg_info_re_cache (
1935 "skip already valid class %s(%*s) to cache %6s, %d regexps",
1936 rspamd_re_cache_type_to_string (re_class->type),
1937 (gint) re_class->type_len - 1,
1938 re_class->type_data,
1939 re_class->hash,
1940 n);
1941 }
1942 }
1943 else {
1944 if (!cbdata->silent) {
1945 msg_info_re_cache (
1946 "skip already valid class %s to cache %6s, %d regexps",
1947 rspamd_re_cache_type_to_string (re_class->type),
1948 re_class->hash,
1949 n);
1950 }
1951 }
1952
1953 ev_timer_again (EV_A_ w);
1954 return;
1955 }
1956
1957 rspamd_snprintf (path, sizeof (path), "%s%c%s.%P.hs.new", cbdata->cache_dir,
1958 G_DIR_SEPARATOR, re_class->hash, our_pid);
1959 fd = open (path, O_CREAT|O_TRUNC|O_EXCL|O_WRONLY, 00600);
1960
1961 if (fd == -1) {
1962 err = g_error_new (rspamd_re_cache_quark (), errno,
1963 "cannot open file %s: %s", path, strerror (errno));
1964 rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
1965 return;
1966 }
1967
1968 g_hash_table_iter_init (&cit, re_class->re);
1969 n = g_hash_table_size (re_class->re);
1970 hs_flags = g_malloc0 (sizeof (*hs_flags) * n);
1971 hs_ids = g_malloc (sizeof (*hs_ids) * n);
1972 hs_pats = g_malloc (sizeof (*hs_pats) * n);
1973 hs_exts = g_malloc0 (sizeof (*hs_exts) * n);
1974 i = 0;
1975
1976 while (g_hash_table_iter_next (&cit, &k, &v)) {
1977 re = v;
1978
1979 pcre_flags = rspamd_regexp_get_pcre_flags (re);
1980 re_flags = rspamd_regexp_get_flags (re);
1981
1982 if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) {
1983 /* Do not try to compile bad regexp */
1984 msg_info_re_cache (
1985 "do not try compile %s to hyperscan as it is PCRE only",
1986 rspamd_regexp_get_pattern (re));
1987 continue;
1988 }
1989
1990 hs_flags[i] = 0;
1991 hs_exts[i] = NULL;
1992 #ifndef WITH_PCRE2
1993 if (pcre_flags & PCRE_FLAG(UTF8)) {
1994 hs_flags[i] |= HS_FLAG_UTF8;
1995 }
1996 #else
1997 if (pcre_flags & PCRE_FLAG(UTF)) {
1998 hs_flags[i] |= HS_FLAG_UTF8;
1999 }
2000 #endif
2001 if (pcre_flags & PCRE_FLAG(CASELESS)) {
2002 hs_flags[i] |= HS_FLAG_CASELESS;
2003 }
2004 if (pcre_flags & PCRE_FLAG(MULTILINE)) {
2005 hs_flags[i] |= HS_FLAG_MULTILINE;
2006 }
2007 if (pcre_flags & PCRE_FLAG(DOTALL)) {
2008 hs_flags[i] |= HS_FLAG_DOTALL;
2009 }
2010
2011
2012 if (re_flags & RSPAMD_REGEXP_FLAG_LEFTMOST) {
2013 hs_flags[i] |= HS_FLAG_SOM_LEFTMOST;
2014 }
2015 else if (rspamd_regexp_get_maxhits (re) == 1) {
2016 hs_flags[i] |= HS_FLAG_SINGLEMATCH;
2017 }
2018
2019 gchar *pat = rspamd_re_cache_hs_pattern_from_pcre (re);
2020
2021 if (hs_compile (pat,
2022 hs_flags[i],
2023 cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
2024 &cache->plt,
2025 &test_db,
2026 &hs_errors) != HS_SUCCESS) {
2027 msg_info_re_cache ("cannot compile '%s' to hyperscan: '%s', try prefilter match",
2028 pat,
2029 hs_errors != NULL ? hs_errors->message : "unknown error");
2030 hs_free_compile_error (hs_errors);
2031
2032 /* The approximation operation might take a significant
2033 * amount of time, so we need to check if it's finite
2034 */
2035 if (rspamd_re_cache_is_finite (cache, re, hs_flags[i], cbdata->max_time)) {
2036 hs_flags[i] |= HS_FLAG_PREFILTER;
2037 hs_ids[i] = rspamd_regexp_get_cache_id (re);
2038 hs_pats[i] = pat;
2039 i++;
2040 }
2041 else {
2042 g_free (pat); /* Avoid leak */
2043 }
2044 }
2045 else {
2046 hs_ids[i] = rspamd_regexp_get_cache_id (re);
2047 hs_pats[i] = pat;
2048 i ++;
2049 hs_free_database (test_db);
2050 }
2051 }
2052 /* Adjust real re number */
2053 n = i;
2054
2055 #define CLEANUP_ALLOCATED(is_err) do { \
2056 g_free (hs_flags); \
2057 g_free (hs_ids); \
2058 for (guint j = 0; j < i; j ++) { \
2059 g_free (hs_pats[j]); \
2060 } \
2061 g_free (hs_pats); \
2062 g_free (hs_exts); \
2063 if (is_err) { \
2064 close (fd); \
2065 unlink (path); \
2066 if (hs_errors) hs_free_compile_error (hs_errors); \
2067 } \
2068 } while(0)
2069
2070 if (n > 0) {
2071 /* Create the hs tree */
2072 hs_errors = NULL;
2073 if (hs_compile_ext_multi ((const char **)hs_pats,
2074 hs_flags,
2075 hs_ids,
2076 hs_exts,
2077 n,
2078 cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
2079 &cache->plt,
2080 &test_db,
2081 &hs_errors) != HS_SUCCESS) {
2082
2083 err = g_error_new (rspamd_re_cache_quark (), EINVAL,
2084 "cannot create tree of regexp when processing '%s': %s",
2085 hs_pats[hs_errors->expression], hs_errors->message);
2086 CLEANUP_ALLOCATED(true);
2087 rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2088
2089 return;
2090 }
2091
2092 if (hs_serialize_database (test_db, &hs_serialized,
2093 &serialized_len) != HS_SUCCESS) {
2094 err = g_error_new (rspamd_re_cache_quark (),
2095 errno,
2096 "cannot serialize tree of regexp for %s",
2097 re_class->hash);
2098
2099 CLEANUP_ALLOCATED(true);
2100 hs_free_database (test_db);
2101 rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2102 return;
2103 }
2104
2105 hs_free_database (test_db);
2106
2107 /*
2108 * Magic - 8 bytes
2109 * Platform - sizeof (platform)
2110 * n - number of regexps
2111 * n * <regexp ids>
2112 * n * <regexp flags>
2113 * crc - 8 bytes checksum
2114 * <hyperscan blob>
2115 */
2116 rspamd_cryptobox_fast_hash_init (&crc_st, 0xdeadbabe);
2117 /* IDs -> Flags -> Hs blob */
2118 rspamd_cryptobox_fast_hash_update (&crc_st,
2119 hs_ids, sizeof (*hs_ids) * n);
2120 rspamd_cryptobox_fast_hash_update (&crc_st,
2121 hs_flags, sizeof (*hs_flags) * n);
2122 rspamd_cryptobox_fast_hash_update (&crc_st,
2123 hs_serialized, serialized_len);
2124 crc = rspamd_cryptobox_fast_hash_final (&crc_st);
2125
2126 if (cache->vectorized_hyperscan) {
2127 iov[0].iov_base = (void *) rspamd_hs_magic_vector;
2128 }
2129 else {
2130 iov[0].iov_base = (void *) rspamd_hs_magic;
2131 }
2132
2133 iov[0].iov_len = RSPAMD_HS_MAGIC_LEN;
2134 iov[1].iov_base = &cache->plt;
2135 iov[1].iov_len = sizeof (cache->plt);
2136 iov[2].iov_base = &n;
2137 iov[2].iov_len = sizeof (n);
2138 iov[3].iov_base = hs_ids;
2139 iov[3].iov_len = sizeof (*hs_ids) * n;
2140 iov[4].iov_base = hs_flags;
2141 iov[4].iov_len = sizeof (*hs_flags) * n;
2142 iov[5].iov_base = &crc;
2143 iov[5].iov_len = sizeof (crc);
2144 iov[6].iov_base = hs_serialized;
2145 iov[6].iov_len = serialized_len;
2146
2147 if (writev (fd, iov, G_N_ELEMENTS (iov)) == -1) {
2148 err = g_error_new (rspamd_re_cache_quark (),
2149 errno,
2150 "cannot serialize tree of regexp to %s: %s",
2151 path, strerror (errno));
2152
2153 CLEANUP_ALLOCATED(true);
2154 g_free (hs_serialized);
2155
2156 rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2157 return;
2158 }
2159
2160 if (re_class->type_len > 0) {
2161 msg_info_re_cache (
2162 "compiled class %s(%*s) to cache %6s, %d/%d regexps",
2163 rspamd_re_cache_type_to_string (re_class->type),
2164 (gint) re_class->type_len - 1,
2165 re_class->type_data,
2166 re_class->hash,
2167 n,
2168 (gint)g_hash_table_size (re_class->re));
2169 }
2170 else {
2171 msg_info_re_cache (
2172 "compiled class %s to cache %6s, %d/%d regexps",
2173 rspamd_re_cache_type_to_string (re_class->type),
2174 re_class->hash,
2175 n,
2176 (gint)g_hash_table_size (re_class->re));
2177 }
2178
2179 cbdata->total += n;
2180 CLEANUP_ALLOCATED(false);
2181
2182 /* Now rename temporary file to the new .hs file */
2183 rspamd_snprintf (npath, sizeof (npath), "%s%c%s.hs", cbdata->cache_dir,
2184 G_DIR_SEPARATOR, re_class->hash);
2185
2186 if (rename (path, npath) == -1) {
2187 err = g_error_new (rspamd_re_cache_quark (),
2188 errno,
2189 "cannot rename %s to %s: %s",
2190 path, npath, strerror (errno));
2191 unlink (path);
2192 close (fd);
2193
2194 rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2195 return;
2196 }
2197
2198 close (fd);
2199 }
2200 else {
2201 err = g_error_new (rspamd_re_cache_quark (),
2202 errno,
2203 "no suitable regular expressions %s (%d original): "
2204 "remove temporary file %s",
2205 rspamd_re_cache_type_to_string (re_class->type),
2206 (gint)g_hash_table_size (re_class->re),
2207 path);
2208
2209 CLEANUP_ALLOCATED(true);
2210 rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2211
2212 return;
2213 }
2214
2215 /* Continue process */
2216 ev_timer_again (EV_A_ w);
2217 }
2218
2219 #endif
2220
2221 gint
rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache * cache,const char * cache_dir,gdouble max_time,gboolean silent,struct ev_loop * event_loop,void (* cb)(guint ncompiled,GError * err,void * cbd),void * cbd)2222 rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache,
2223 const char *cache_dir,
2224 gdouble max_time,
2225 gboolean silent,
2226 struct ev_loop *event_loop,
2227 void (*cb)(guint ncompiled, GError *err, void *cbd),
2228 void *cbd)
2229 {
2230 g_assert (cache != NULL);
2231 g_assert (cache_dir != NULL);
2232
2233 #ifndef WITH_HYPERSCAN
2234 return -1;
2235 #else
2236 static ev_timer *timer;
2237 static const ev_tstamp timer_interval = 0.1;
2238 struct rspamd_re_cache_hs_compile_cbdata *cbdata;
2239
2240 cbdata = g_malloc0 (sizeof (*cbdata));
2241 g_hash_table_iter_init (&cbdata->it, cache->re_classes);
2242 cbdata->cache = cache;
2243 cbdata->cache_dir = cache_dir;
2244 cbdata->cb = cb;
2245 cbdata->cbd = cbd;
2246 cbdata->max_time = max_time;
2247 cbdata->silent = silent;
2248 cbdata->total = 0;
2249 timer = g_malloc0 (sizeof (*timer));
2250 timer->data = (void *)cbdata; /* static */
2251
2252 ev_timer_init (timer, rspamd_re_cache_compile_timer_cb,
2253 timer_interval, timer_interval);
2254 ev_timer_start (event_loop, timer);
2255
2256 return 0;
2257 #endif
2258 }
2259
2260 gboolean
rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache * cache,const char * path,gboolean silent,gboolean try_load)2261 rspamd_re_cache_is_valid_hyperscan_file (struct rspamd_re_cache *cache,
2262 const char *path, gboolean silent, gboolean try_load)
2263 {
2264 g_assert (cache != NULL);
2265 g_assert (path != NULL);
2266
2267 #ifndef WITH_HYPERSCAN
2268 return FALSE;
2269 #else
2270 gint fd, n, ret;
2271 guchar magicbuf[RSPAMD_HS_MAGIC_LEN];
2272 const guchar *mb;
2273 GHashTableIter it;
2274 gpointer k, v;
2275 struct rspamd_re_class *re_class;
2276 gsize len;
2277 const gchar *hash_pos;
2278 hs_platform_info_t test_plt;
2279 hs_database_t *test_db = NULL;
2280 guchar *map, *p, *end;
2281 rspamd_cryptobox_fast_hash_state_t crc_st;
2282 guint64 crc, valid_crc;
2283
2284 len = strlen (path);
2285
2286 if (len < sizeof (rspamd_cryptobox_HASHBYTES + 3)) {
2287 if (!silent) {
2288 msg_err_re_cache ("cannot open hyperscan cache file %s: too short filename",
2289 path);
2290 }
2291 return FALSE;
2292 }
2293
2294 if (memcmp (path + len - 3, ".hs", 3) != 0) {
2295 if (!silent) {
2296 msg_err_re_cache ("cannot open hyperscan cache file %s: not ending with .hs",
2297 path);
2298 }
2299 return FALSE;
2300 }
2301
2302 hash_pos = path + len - 3 - (sizeof (re_class->hash) - 1);
2303 g_hash_table_iter_init (&it, cache->re_classes);
2304
2305 while (g_hash_table_iter_next (&it, &k, &v)) {
2306 re_class = v;
2307
2308 if (memcmp (hash_pos, re_class->hash, sizeof (re_class->hash) - 1) == 0) {
2309 /* Open file and check magic */
2310 gssize r;
2311
2312 fd = open (path, O_RDONLY);
2313
2314 if (fd == -1) {
2315 if (errno != ENOENT || !silent) {
2316 msg_err_re_cache ("cannot open hyperscan cache file %s: %s",
2317 path, strerror (errno));
2318 }
2319 return FALSE;
2320 }
2321
2322 if ((r = read (fd, magicbuf, sizeof (magicbuf))) != sizeof (magicbuf)) {
2323 if (r == -1) {
2324 msg_err_re_cache ("cannot read magic from hyperscan "
2325 "cache file %s: %s",
2326 path, strerror (errno));
2327 }
2328 else {
2329 msg_err_re_cache ("truncated read magic from hyperscan "
2330 "cache file %s: %z, %z wanted",
2331 path, r, (gsize)sizeof (magicbuf));
2332 }
2333 close (fd);
2334 return FALSE;
2335 }
2336
2337 if (cache->vectorized_hyperscan) {
2338 mb = rspamd_hs_magic_vector;
2339 }
2340 else {
2341 mb = rspamd_hs_magic;
2342 }
2343
2344 if (memcmp (magicbuf, mb, sizeof (magicbuf)) != 0) {
2345 msg_err_re_cache ("cannot open hyperscan cache file %s: "
2346 "bad magic ('%*xs', '%*xs' expected)",
2347 path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf,
2348 (int) RSPAMD_HS_MAGIC_LEN, mb);
2349
2350 close (fd);
2351 return FALSE;
2352 }
2353
2354 if ((r = read (fd, &test_plt, sizeof (test_plt))) != sizeof (test_plt)) {
2355 if (r == -1) {
2356 msg_err_re_cache ("cannot read platform data from hyperscan "
2357 "cache file %s: %s",
2358 path, strerror (errno));
2359 }
2360 else {
2361 msg_err_re_cache ("truncated read platform data from hyperscan "
2362 "cache file %s: %z, %z wanted",
2363 path, r, (gsize)sizeof (magicbuf));
2364 }
2365
2366 close (fd);
2367 return FALSE;
2368 }
2369
2370 if (memcmp (&test_plt, &cache->plt, sizeof (test_plt)) != 0) {
2371 msg_err_re_cache ("cannot open hyperscan cache file %s: "
2372 "compiled for a different platform",
2373 path);
2374
2375 close (fd);
2376 return FALSE;
2377 }
2378
2379 close (fd);
2380
2381 if (try_load) {
2382 map = rspamd_file_xmap (path, PROT_READ, &len, TRUE);
2383
2384 if (map == NULL) {
2385 msg_err_re_cache ("cannot mmap hyperscan cache file %s: "
2386 "%s",
2387 path, strerror (errno));
2388 return FALSE;
2389 }
2390
2391 p = map + RSPAMD_HS_MAGIC_LEN + sizeof (test_plt);
2392 end = map + len;
2393 n = *(gint *)p;
2394 p += sizeof (gint);
2395
2396 if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */
2397 sizeof (guint64) + /* crc */
2398 RSPAMD_HS_MAGIC_LEN + /* header */
2399 sizeof (cache->plt) > len) {
2400 /* Some wrong amount of regexps */
2401 msg_err_re_cache ("bad number of expressions in %s: %d",
2402 path, n);
2403 munmap (map, len);
2404 return FALSE;
2405 }
2406
2407 /*
2408 * Magic - 8 bytes
2409 * Platform - sizeof (platform)
2410 * n - number of regexps
2411 * n * <regexp ids>
2412 * n * <regexp flags>
2413 * crc - 8 bytes checksum
2414 * <hyperscan blob>
2415 */
2416
2417 memcpy (&crc, p + n * 2 * sizeof (gint), sizeof (crc));
2418 rspamd_cryptobox_fast_hash_init (&crc_st, 0xdeadbabe);
2419 /* IDs */
2420 rspamd_cryptobox_fast_hash_update (&crc_st, p, n * sizeof (gint));
2421 /* Flags */
2422 rspamd_cryptobox_fast_hash_update (&crc_st, p + n * sizeof (gint),
2423 n * sizeof (gint));
2424 /* HS database */
2425 p += n * sizeof (gint) * 2 + sizeof (guint64);
2426 rspamd_cryptobox_fast_hash_update (&crc_st, p, end - p);
2427 valid_crc = rspamd_cryptobox_fast_hash_final (&crc_st);
2428
2429 if (crc != valid_crc) {
2430 msg_warn_re_cache ("outdated or invalid hs database in %s: "
2431 "crc read %xL, crc expected %xL", path, crc, valid_crc);
2432 munmap (map, len);
2433
2434 return FALSE;
2435 }
2436
2437 if ((ret = hs_deserialize_database (p, end - p, &test_db))
2438 != HS_SUCCESS) {
2439 msg_err_re_cache ("bad hs database in %s: %d", path, ret);
2440 munmap (map, len);
2441
2442 return FALSE;
2443 }
2444
2445 hs_free_database (test_db);
2446 munmap (map, len);
2447 }
2448 /* XXX: add crc check */
2449
2450 return TRUE;
2451 }
2452 }
2453
2454 if (!silent) {
2455 msg_warn_re_cache ("unknown hyperscan cache file %s", path);
2456 }
2457
2458 return FALSE;
2459 #endif
2460 }
2461
2462
2463 enum rspamd_hyperscan_status
rspamd_re_cache_load_hyperscan(struct rspamd_re_cache * cache,const char * cache_dir,bool try_load)2464 rspamd_re_cache_load_hyperscan (struct rspamd_re_cache *cache,
2465 const char *cache_dir, bool try_load)
2466 {
2467 g_assert (cache != NULL);
2468 g_assert (cache_dir != NULL);
2469
2470 #ifndef WITH_HYPERSCAN
2471 return RSPAMD_HYPERSCAN_UNSUPPORTED;
2472 #else
2473 gchar path[PATH_MAX];
2474 gint fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret;
2475 GHashTableIter it;
2476 gpointer k, v;
2477 guint8 *map, *p, *end;
2478 struct rspamd_re_class *re_class;
2479 struct rspamd_re_cache_elt *elt;
2480 struct stat st;
2481 gboolean has_valid = FALSE, all_valid = FALSE;
2482
2483 g_hash_table_iter_init (&it, cache->re_classes);
2484
2485 while (g_hash_table_iter_next (&it, &k, &v)) {
2486 re_class = v;
2487 rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir,
2488 G_DIR_SEPARATOR, re_class->hash);
2489
2490 if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, try_load, FALSE)) {
2491 msg_debug_re_cache ("load hyperscan database from '%s'",
2492 re_class->hash);
2493
2494 fd = open (path, O_RDONLY);
2495
2496 /* Read number of regexps */
2497 g_assert (fd != -1);
2498 fstat (fd, &st);
2499
2500 map = mmap (NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
2501
2502 if (map == MAP_FAILED) {
2503 if (!try_load) {
2504 msg_err_re_cache ("cannot mmap %s: %s", path, strerror (errno));
2505 }
2506 else {
2507 msg_debug_re_cache ("cannot mmap %s: %s", path, strerror (errno));
2508 }
2509
2510 close (fd);
2511 all_valid = FALSE;
2512 continue;
2513 }
2514
2515 close (fd);
2516 end = map + st.st_size;
2517 p = map + RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt);
2518 n = *(gint *)p;
2519
2520 if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */
2521 sizeof (guint64) + /* crc */
2522 RSPAMD_HS_MAGIC_LEN + /* header */
2523 sizeof (cache->plt) > (gsize)st.st_size) {
2524 /* Some wrong amount of regexps */
2525 if (!try_load) {
2526 msg_err_re_cache ("bad number of expressions in %s: %d",
2527 path, n);
2528 }
2529 else {
2530 msg_debug_re_cache ("bad number of expressions in %s: %d",
2531 path, n);
2532 }
2533
2534 munmap (map, st.st_size);
2535 all_valid = FALSE;
2536 continue;
2537 }
2538
2539 total += n;
2540 p += sizeof (n);
2541 hs_ids = g_malloc (n * sizeof (*hs_ids));
2542 memcpy (hs_ids, p, n * sizeof (*hs_ids));
2543 p += n * sizeof (*hs_ids);
2544 hs_flags = g_malloc (n * sizeof (*hs_flags));
2545 memcpy (hs_flags, p, n * sizeof (*hs_flags));
2546
2547 /* Skip crc */
2548 p += n * sizeof (*hs_ids) + sizeof (guint64);
2549
2550 /* Cleanup */
2551 if (re_class->hs_scratch != NULL) {
2552 hs_free_scratch (re_class->hs_scratch);
2553 }
2554
2555 if (re_class->hs_db != NULL) {
2556 hs_free_database (re_class->hs_db);
2557 }
2558
2559 if (re_class->hs_ids) {
2560 g_free (re_class->hs_ids);
2561 }
2562
2563 re_class->hs_ids = NULL;
2564 re_class->hs_scratch = NULL;
2565 re_class->hs_db = NULL;
2566
2567 if ((ret = hs_deserialize_database (p, end - p, &re_class->hs_db))
2568 != HS_SUCCESS) {
2569 if (!try_load) {
2570 msg_err_re_cache ("bad hs database in %s: %d", path, ret);
2571 }
2572 else {
2573 msg_debug_re_cache ("bad hs database in %s: %d", path, ret);
2574 }
2575 munmap (map, st.st_size);
2576 g_free (hs_ids);
2577 g_free (hs_flags);
2578
2579 re_class->hs_ids = NULL;
2580 re_class->hs_scratch = NULL;
2581 re_class->hs_db = NULL;
2582 all_valid = FALSE;
2583
2584 continue;
2585 }
2586
2587 munmap (map, st.st_size);
2588
2589 g_assert (hs_alloc_scratch (re_class->hs_db,
2590 &re_class->hs_scratch) == HS_SUCCESS);
2591
2592 /*
2593 * Now find hyperscan elts that are successfully compiled and
2594 * specify that they should be matched using hyperscan
2595 */
2596 for (i = 0; i < n; i ++) {
2597 g_assert ((gint)cache->re->len > hs_ids[i] && hs_ids[i] >= 0);
2598 elt = g_ptr_array_index (cache->re, hs_ids[i]);
2599
2600 if (hs_flags[i] & HS_FLAG_PREFILTER) {
2601 elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE;
2602 }
2603 else {
2604 elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN;
2605 }
2606 }
2607
2608 re_class->hs_ids = hs_ids;
2609 g_free (hs_flags);
2610 re_class->nhs = n;
2611
2612 if (!has_valid) {
2613 has_valid = TRUE;
2614 all_valid = TRUE;
2615 }
2616 }
2617 else {
2618 if (!try_load) {
2619 msg_err_re_cache ("invalid hyperscan hash file '%s'",
2620 path);
2621 }
2622 else {
2623 msg_debug_re_cache ("invalid hyperscan hash file '%s'",
2624 path);
2625 }
2626 all_valid = FALSE;
2627 continue;
2628 }
2629 }
2630
2631 if (has_valid) {
2632 if (all_valid) {
2633 msg_info_re_cache ("full hyperscan database of %d regexps has been loaded", total);
2634 cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_FULL;
2635 }
2636 else {
2637 msg_info_re_cache ("partial hyperscan database of %d regexps has been loaded", total);
2638 cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_PARTIAL;
2639 }
2640 }
2641 else {
2642 msg_info_re_cache ("hyperscan database has NOT been loaded; no valid expressions");
2643 cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOAD_ERROR;
2644 }
2645
2646
2647
2648 return cache->hyperscan_loaded;
2649 #endif
2650 }
2651
/**
 * Bind the Lua reference `ref` to the selector name `sname` in
 * `cache->selectors`.
 *
 * A new name is inserted under a g_strdup'ed copy of `sname`. If the name is
 * already present a warning is logged, the previously stored reference is
 * released from the Lua registry (only when `cache->L` is set) and the entry
 * is overwritten with `ref`.
 *
 * @param cache regexp cache owning the selectors table
 * @param sname selector name
 * @param ref Lua registry reference to store
 */
void rspamd_re_cache_add_selector (struct rspamd_re_cache *cache,
		const gchar *sname,
		gint ref)
{
	khiter_t slot = kh_get (lua_selectors_hash, cache->selectors,
			(gchar *)sname);

	if (slot != kh_end (cache->selectors)) {
		/* Known name: drop the old reference before replacing it */
		msg_warn_re_cache ("replacing selector with name %s", sname);

		if (cache->L) {
			luaL_unref (cache->L, LUA_REGISTRYINDEX,
					kh_value (cache->selectors, slot));
		}
	}
	else {
		/* Unknown name: the table keeps its own copy of the key */
		gchar *key_copy = g_strdup (sname);
		gint put_status;

		slot = kh_put (lua_selectors_hash, cache->selectors, key_copy,
				&put_status);
	}

	kh_value (cache->selectors, slot) = ref;
}
2678