1 /*-
2  * Copyright 2016 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "libmime/message.h"
17 #include "re_cache.h"
18 #include "cryptobox.h"
19 #include "ref.h"
20 #include "libserver/url.h"
21 #include "libserver/task.h"
22 #include "libserver/cfg_file.h"
23 #include "libutil/util.h"
24 #include "libutil/regexp.h"
25 #include "lua/lua_common.h"
26 #include "libstat/stat_api.h"
27 #include "contrib/uthash/utlist.h"
28 
29 #include "khash.h"
30 
31 #ifdef WITH_HYPERSCAN
32 #include "hs.h"
33 #endif
34 
35 #include "unix-std.h"
36 #include <signal.h>
37 #include <stdalign.h>
38 #include <math.h>
39 #include "contrib/libev/ev.h"
40 
41 #ifndef WITH_PCRE2
42 #include <pcre.h>
43 #else
44 #include <pcre2.h>
45 #endif
46 
47 #include "contrib/fastutf8/fastutf8.h"
48 
49 #ifdef HAVE_SYS_WAIT_H
50 #include <sys/wait.h>
51 #endif
52 
53 #define msg_err_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
54         "re_cache", cache->hash, \
55         G_STRFUNC, \
56         __VA_ARGS__)
57 #define msg_warn_re_cache(...)   rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
58         "re_cache", cache->hash, \
59         G_STRFUNC, \
60         __VA_ARGS__)
61 #define msg_info_re_cache(...)   rspamd_default_log_function (G_LOG_LEVEL_INFO, \
62         "re_cache", cache->hash, \
63         G_STRFUNC, \
64         __VA_ARGS__)
65 
66 #define msg_debug_re_task(...)  rspamd_conditional_debug_fast (NULL, NULL, \
67         rspamd_re_cache_log_id, "re_cache", task->task_pool->tag.uid, \
68         G_STRFUNC, \
69         __VA_ARGS__)
70 #define msg_debug_re_cache(...)  rspamd_conditional_debug_fast (NULL, NULL, \
71         rspamd_re_cache_log_id, "re_cache", cache->hash, \
72         G_STRFUNC, \
73         __VA_ARGS__)
74 
75 INIT_LOG_MODULE(re_cache)
76 
#ifdef WITH_HYPERSCAN
/*
 * Eight-byte signatures for the plain ("rshsre11") and the vectorized
 * ("rshsrv11") flavour. Their users are outside this part of the file;
 * presumably they mark serialized hyperscan databases - confirm at use site.
 */
static const guchar rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'};
static const guchar rspamd_hs_magic_vector[] = {'r', 's', 'h', 's', 'r', 'v', '1', '1'};
/* Both signatures have the same length */
#define RSPAMD_HS_MAGIC_LEN (sizeof (rspamd_hs_magic))
#endif
82 
83 
84 struct rspamd_re_class {
85 	guint64 id;
86 	enum rspamd_re_type type;
87 	gboolean has_utf8; /* if there are any utf8 regexps */
88 	gpointer type_data;
89 	gsize type_len;
90 	GHashTable *re;
91 	rspamd_cryptobox_hash_state_t *st;
92 
93 	gchar hash[rspamd_cryptobox_HASHBYTES + 1];
94 
95 #ifdef WITH_HYPERSCAN
96 	hs_database_t *hs_db;
97 	hs_scratch_t *hs_scratch;
98 	gint *hs_ids;
99 	guint nhs;
100 #endif
101 };
102 
/* How a cached regexp is evaluated */
enum rspamd_re_cache_elt_match_type {
	/* Plain PCRE matching; zero, hence the default for zero-filled elements */
	RSPAMD_RE_CACHE_PCRE = 0,
	/* Hyperscan match is final: the scan callback counts the hit itself */
	RSPAMD_RE_CACHE_HYPERSCAN,
	/* Hyperscan acts as a prefilter: the scan callback re-runs PCRE */
	RSPAMD_RE_CACHE_HYPERSCAN_PRE
};
108 
109 struct rspamd_re_cache_elt {
110 	rspamd_regexp_t *re;
111 	gint lua_cbref;
112 	enum rspamd_re_cache_elt_match_type match_type;
113 };
114 
115 KHASH_INIT (lua_selectors_hash, gchar *, int, 1, kh_str_hash_func, kh_str_hash_equal);
116 
117 struct rspamd_re_cache {
118 	GHashTable *re_classes;
119 
120 	GPtrArray *re;
121 	khash_t (lua_selectors_hash) *selectors;
122 	ref_entry_t ref;
123 	guint nre;
124 	guint max_re_data;
125 	gchar hash[rspamd_cryptobox_HASHBYTES + 1];
126 	lua_State *L;
127 #ifdef WITH_HYPERSCAN
128 	enum rspamd_hyperscan_status hyperscan_loaded;
129 	gboolean disable_hyperscan;
130 	gboolean vectorized_hyperscan;
131 	hs_platform_info_t plt;
132 #endif
133 };
134 
135 struct rspamd_re_selector_result {
136 	guchar **scvec;
137 	guint *lenvec;
138 	guint cnt;
139 };
140 
141 KHASH_INIT (selectors_results_hash, int, struct rspamd_re_selector_result, 1,
142 		kh_int_hash_func, kh_int_hash_equal);
143 
144 struct rspamd_re_runtime {
145 	guchar *checked;
146 	guchar *results;
147 	khash_t (selectors_results_hash) *sel_cache;
148 	struct rspamd_re_cache *cache;
149 	struct rspamd_re_cache_stat stat;
150 	gboolean has_hs;
151 };
152 
153 static GQuark
rspamd_re_cache_quark(void)154 rspamd_re_cache_quark (void)
155 {
156 	return g_quark_from_static_string ("re_cache");
157 }
158 
159 static guint64
rspamd_re_cache_class_id(enum rspamd_re_type type,gconstpointer type_data,gsize datalen)160 rspamd_re_cache_class_id (enum rspamd_re_type type,
161 		gconstpointer type_data,
162 		gsize datalen)
163 {
164 	rspamd_cryptobox_fast_hash_state_t st;
165 
166 	rspamd_cryptobox_fast_hash_init (&st, 0xdeadbabe);
167 	rspamd_cryptobox_fast_hash_update (&st, &type, sizeof (type));
168 
169 	if (datalen > 0) {
170 		rspamd_cryptobox_fast_hash_update (&st, type_data, datalen);
171 	}
172 
173 	return rspamd_cryptobox_fast_hash_final (&st);
174 }
175 
176 static void
rspamd_re_cache_destroy(struct rspamd_re_cache * cache)177 rspamd_re_cache_destroy (struct rspamd_re_cache *cache)
178 {
179 	GHashTableIter it;
180 	gpointer k, v;
181 	struct rspamd_re_class *re_class;
182 	gchar *skey;
183 	gint sref;
184 
185 	g_assert (cache != NULL);
186 	g_hash_table_iter_init (&it, cache->re_classes);
187 
188 	while (g_hash_table_iter_next (&it, &k, &v)) {
189 		re_class = v;
190 		g_hash_table_iter_steal (&it);
191 		g_hash_table_unref (re_class->re);
192 
193 		if (re_class->type_data) {
194 			g_free (re_class->type_data);
195 		}
196 
197 #ifdef WITH_HYPERSCAN
198 		if (re_class->hs_db) {
199 			hs_free_database (re_class->hs_db);
200 		}
201 		if (re_class->hs_scratch) {
202 			hs_free_scratch (re_class->hs_scratch);
203 		}
204 		if (re_class->hs_ids) {
205 			g_free (re_class->hs_ids);
206 		}
207 #endif
208 		g_free (re_class);
209 	}
210 
211 	if (cache->L) {
212 		kh_foreach (cache->selectors, skey, sref, {
213 			luaL_unref (cache->L, LUA_REGISTRYINDEX, sref);
214 			g_free (skey);
215 		});
216 
217 		struct rspamd_re_cache_elt *elt;
218 		guint i;
219 
220 		PTR_ARRAY_FOREACH (cache->re, i, elt) {
221 			if (elt->lua_cbref != -1) {
222 				luaL_unref (cache->L, LUA_REGISTRYINDEX, elt->lua_cbref);
223 			}
224 		}
225 	}
226 
227 	kh_destroy (lua_selectors_hash, cache->selectors);
228 
229 	g_hash_table_unref (cache->re_classes);
230 	g_ptr_array_free (cache->re, TRUE);
231 	g_free (cache);
232 }
233 
234 static void
rspamd_re_cache_elt_dtor(gpointer e)235 rspamd_re_cache_elt_dtor (gpointer e)
236 {
237 	struct rspamd_re_cache_elt *elt = e;
238 
239 	rspamd_regexp_unref (elt->re);
240 	g_free (elt);
241 }
242 
243 struct rspamd_re_cache *
rspamd_re_cache_new(void)244 rspamd_re_cache_new (void)
245 {
246 	struct rspamd_re_cache *cache;
247 
248 	cache = g_malloc0 (sizeof (*cache));
249 	cache->re_classes = g_hash_table_new (g_int64_hash, g_int64_equal);
250 	cache->nre = 0;
251 	cache->re = g_ptr_array_new_full (256, rspamd_re_cache_elt_dtor);
252 	cache->selectors = kh_init (lua_selectors_hash);
253 #ifdef WITH_HYPERSCAN
254 	cache->hyperscan_loaded = RSPAMD_HYPERSCAN_UNKNOWN;
255 #endif
256 	REF_INIT_RETAIN (cache, rspamd_re_cache_destroy);
257 
258 	return cache;
259 }
260 
261 enum rspamd_hyperscan_status
rspamd_re_cache_is_hs_loaded(struct rspamd_re_cache * cache)262 rspamd_re_cache_is_hs_loaded (struct rspamd_re_cache *cache)
263 {
264 	g_assert (cache != NULL);
265 
266 #ifdef WITH_HYPERSCAN
267 	return cache->hyperscan_loaded;
268 #else
269 	return RSPAMD_HYPERSCAN_UNSUPPORTED;
270 #endif
271 }
272 
273 rspamd_regexp_t *
rspamd_re_cache_add(struct rspamd_re_cache * cache,rspamd_regexp_t * re,enum rspamd_re_type type,gconstpointer type_data,gsize datalen,gint lua_cbref)274 rspamd_re_cache_add (struct rspamd_re_cache *cache,
275 					 rspamd_regexp_t *re,
276 					 enum rspamd_re_type type,
277 					 gconstpointer type_data, gsize datalen,
278 					 gint lua_cbref)
279 {
280 	guint64 class_id;
281 	struct rspamd_re_class *re_class;
282 	rspamd_regexp_t *nre;
283 	struct rspamd_re_cache_elt *elt;
284 
285 	g_assert (cache != NULL);
286 	g_assert (re != NULL);
287 
288 	class_id = rspamd_re_cache_class_id (type, type_data, datalen);
289 	re_class = g_hash_table_lookup (cache->re_classes, &class_id);
290 
291 	if (re_class == NULL) {
292 		re_class = g_malloc0 (sizeof (*re_class));
293 		re_class->id = class_id;
294 		re_class->type_len = datalen;
295 		re_class->type = type;
296 		re_class->re = g_hash_table_new_full (rspamd_regexp_hash,
297 				rspamd_regexp_equal, NULL, (GDestroyNotify)rspamd_regexp_unref);
298 
299 		if (datalen > 0) {
300 			re_class->type_data = g_malloc0 (datalen);
301 			memcpy (re_class->type_data, type_data, datalen);
302 		}
303 
304 		g_hash_table_insert (cache->re_classes, &re_class->id, re_class);
305 	}
306 
307 	if ((nre = g_hash_table_lookup (re_class->re, rspamd_regexp_get_id (re)))
308 			== NULL) {
309 		/*
310 		 * We set re id based on the global position in the cache
311 		 */
312 		elt = g_malloc0 (sizeof (*elt));
313 		/* One ref for re_class */
314 		nre = rspamd_regexp_ref (re);
315 		rspamd_regexp_set_cache_id (re, cache->nre++);
316 		/* One ref for cache */
317 		elt->re = rspamd_regexp_ref (re);
318 		g_ptr_array_add (cache->re, elt);
319 		rspamd_regexp_set_class (re, re_class);
320 		elt->lua_cbref = lua_cbref;
321 
322 		g_hash_table_insert (re_class->re, rspamd_regexp_get_id (nre), nre);
323 	}
324 
325 	if (rspamd_regexp_get_flags (re) & RSPAMD_REGEXP_FLAG_UTF) {
326 		re_class->has_utf8 = TRUE;
327 	}
328 
329 	return nre;
330 }
331 
332 void
rspamd_re_cache_replace(struct rspamd_re_cache * cache,rspamd_regexp_t * what,rspamd_regexp_t * with)333 rspamd_re_cache_replace (struct rspamd_re_cache *cache,
334 		rspamd_regexp_t *what,
335 		rspamd_regexp_t *with)
336 {
337 	guint64 re_id;
338 	struct rspamd_re_class *re_class;
339 	rspamd_regexp_t *src;
340 	struct rspamd_re_cache_elt *elt;
341 
342 	g_assert (cache != NULL);
343 	g_assert (what != NULL);
344 	g_assert (with != NULL);
345 
346 	re_class = rspamd_regexp_get_class (what);
347 
348 	if (re_class != NULL) {
349 		re_id = rspamd_regexp_get_cache_id (what);
350 
351 		g_assert (re_id != RSPAMD_INVALID_ID);
352 		src = g_hash_table_lookup (re_class->re, rspamd_regexp_get_id (what));
353 		elt = g_ptr_array_index (cache->re, re_id);
354 		g_assert (elt != NULL);
355 		g_assert (src != NULL);
356 
357 		rspamd_regexp_set_cache_id (what, RSPAMD_INVALID_ID);
358 		rspamd_regexp_set_class (what, NULL);
359 		rspamd_regexp_set_cache_id (with, re_id);
360 		rspamd_regexp_set_class (with, re_class);
361 		/*
362 		 * On calling of this function, we actually unref old re (what)
363 		 */
364 		g_hash_table_insert (re_class->re,
365 				rspamd_regexp_get_id (what),
366 				rspamd_regexp_ref (with));
367 
368 		rspamd_regexp_unref (elt->re);
369 		elt->re = rspamd_regexp_ref (with);
370 		/* XXX: do not touch match type here */
371 	}
372 }
373 
374 static gint
rspamd_re_cache_sort_func(gconstpointer a,gconstpointer b)375 rspamd_re_cache_sort_func (gconstpointer a, gconstpointer b)
376 {
377 	struct rspamd_re_cache_elt * const *re1 = a, * const *re2 = b;
378 
379 	return rspamd_regexp_cmp (rspamd_regexp_get_id ((*re1)->re),
380 			rspamd_regexp_get_id ((*re2)->re));
381 }
382 
383 void
rspamd_re_cache_init(struct rspamd_re_cache * cache,struct rspamd_config * cfg)384 rspamd_re_cache_init (struct rspamd_re_cache *cache, struct rspamd_config *cfg)
385 {
386 	guint i, fl;
387 	GHashTableIter it;
388 	gpointer k, v;
389 	struct rspamd_re_class *re_class;
390 	rspamd_cryptobox_hash_state_t st_global;
391 	rspamd_regexp_t *re;
392 	struct rspamd_re_cache_elt *elt;
393 	guchar hash_out[rspamd_cryptobox_HASHBYTES];
394 
395 	g_assert (cache != NULL);
396 
397 	rspamd_cryptobox_hash_init (&st_global, NULL, 0);
398 	/* Resort all regexps */
399 	g_ptr_array_sort (cache->re, rspamd_re_cache_sort_func);
400 
401 	for (i = 0; i < cache->re->len; i ++) {
402 		elt = g_ptr_array_index (cache->re, i);
403 		re = elt->re;
404 		re_class = rspamd_regexp_get_class (re);
405 		g_assert (re_class != NULL);
406 		rspamd_regexp_set_cache_id (re, i);
407 
408 		if (re_class->st == NULL) {
409 			(void) !posix_memalign ((void **)&re_class->st, _Alignof (rspamd_cryptobox_hash_state_t),
410 			 		sizeof (*re_class->st));
411 			g_assert (re_class->st != NULL);
412 			rspamd_cryptobox_hash_init (re_class->st, NULL, 0);
413 		}
414 
415 		/* Update hashes */
416 		/* Id of re class */
417 		rspamd_cryptobox_hash_update (re_class->st, (gpointer) &re_class->id,
418 				sizeof (re_class->id));
419 		rspamd_cryptobox_hash_update (&st_global, (gpointer) &re_class->id,
420 				sizeof (re_class->id));
421 		/* Id of re expression */
422 		rspamd_cryptobox_hash_update (re_class->st, rspamd_regexp_get_id (re),
423 				rspamd_cryptobox_HASHBYTES);
424 		rspamd_cryptobox_hash_update (&st_global, rspamd_regexp_get_id (re),
425 				rspamd_cryptobox_HASHBYTES);
426 		/* PCRE flags */
427 		fl = rspamd_regexp_get_pcre_flags (re);
428 		rspamd_cryptobox_hash_update (re_class->st, (const guchar *)&fl,
429 				sizeof (fl));
430 		rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
431 				sizeof (fl));
432 		/* Rspamd flags */
433 		fl = rspamd_regexp_get_flags (re);
434 		rspamd_cryptobox_hash_update (re_class->st, (const guchar *) &fl,
435 				sizeof (fl));
436 		rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
437 				sizeof (fl));
438 		/* Limit of hits */
439 		fl = rspamd_regexp_get_maxhits (re);
440 		rspamd_cryptobox_hash_update (re_class->st, (const guchar *) &fl,
441 				sizeof (fl));
442 		rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
443 				sizeof (fl));
444 		/* Numberic order */
445 		rspamd_cryptobox_hash_update (re_class->st, (const guchar *)&i,
446 				sizeof (i));
447 		rspamd_cryptobox_hash_update (&st_global, (const guchar *)&i,
448 				sizeof (i));
449 	}
450 
451 	rspamd_cryptobox_hash_final (&st_global, hash_out);
452 	rspamd_snprintf (cache->hash, sizeof (cache->hash), "%*xs",
453 			(gint) rspamd_cryptobox_HASHBYTES, hash_out);
454 
455 	/* Now finalize all classes */
456 	g_hash_table_iter_init (&it, cache->re_classes);
457 
458 	while (g_hash_table_iter_next (&it, &k, &v)) {
459 		re_class = v;
460 
461 		if (re_class->st) {
462 			/*
463 			 * We finally update all classes with the number of expressions
464 			 * in the cache to ensure that if even a single re has been changed
465 			 * we won't be broken due to id mismatch
466 			 */
467 			rspamd_cryptobox_hash_update (re_class->st,
468 					(gpointer)&cache->re->len,
469 					sizeof (cache->re->len));
470 			rspamd_cryptobox_hash_final (re_class->st, hash_out);
471 			rspamd_snprintf (re_class->hash, sizeof (re_class->hash), "%*xs",
472 					(gint) rspamd_cryptobox_HASHBYTES, hash_out);
473 			free (re_class->st); /* Due to posix_memalign */
474 			re_class->st = NULL;
475 		}
476 	}
477 
478 	cache->L = cfg->lua_state;
479 
480 #ifdef WITH_HYPERSCAN
481 	const gchar *platform = "generic";
482 	rspamd_fstring_t *features = rspamd_fstring_new ();
483 
484 	cache->disable_hyperscan = cfg->disable_hyperscan;
485 	cache->vectorized_hyperscan = cfg->vectorized_hyperscan;
486 
487 	g_assert (hs_populate_platform (&cache->plt) == HS_SUCCESS);
488 
489 	/* Now decode what we do have */
490 	switch (cache->plt.tune) {
491 	case HS_TUNE_FAMILY_HSW:
492 		platform = "haswell";
493 		break;
494 	case HS_TUNE_FAMILY_SNB:
495 		platform = "sandy";
496 		break;
497 	case HS_TUNE_FAMILY_BDW:
498 		platform = "broadwell";
499 		break;
500 	case HS_TUNE_FAMILY_IVB:
501 		platform = "ivy";
502 		break;
503 	default:
504 		break;
505 	}
506 
507 	if (cache->plt.cpu_features & HS_CPU_FEATURES_AVX2) {
508 		features = rspamd_fstring_append (features, "AVX2", 4);
509 	}
510 
511 	hs_set_allocator (g_malloc, g_free);
512 
513 	msg_info_re_cache ("loaded hyperscan engine with cpu tune '%s' and features '%V'",
514 			platform, features);
515 
516 	rspamd_fstring_free (features);
517 #endif
518 }
519 
520 struct rspamd_re_runtime *
rspamd_re_cache_runtime_new(struct rspamd_re_cache * cache)521 rspamd_re_cache_runtime_new (struct rspamd_re_cache *cache)
522 {
523 	struct rspamd_re_runtime *rt;
524 	g_assert (cache != NULL);
525 
526 	rt = g_malloc0 (sizeof (*rt) + NBYTES (cache->nre) + cache->nre);
527 	rt->cache = cache;
528 	REF_RETAIN (cache);
529 	rt->checked = ((guchar *)rt) + sizeof (*rt);
530 	rt->results = rt->checked + NBYTES (cache->nre);
531 	rt->stat.regexp_total = cache->nre;
532 #ifdef WITH_HYPERSCAN
533 	rt->has_hs = cache->hyperscan_loaded;
534 #endif
535 
536 	return rt;
537 }
538 
539 const struct rspamd_re_cache_stat *
rspamd_re_cache_get_stat(struct rspamd_re_runtime * rt)540 rspamd_re_cache_get_stat (struct rspamd_re_runtime *rt)
541 {
542 	g_assert (rt != NULL);
543 
544 	return &rt->stat;
545 }
546 
547 static gboolean
rspamd_re_cache_check_lua_condition(struct rspamd_task * task,rspamd_regexp_t * re,const guchar * in,gsize len,goffset start,goffset end,gint lua_cbref)548 rspamd_re_cache_check_lua_condition (struct rspamd_task *task,
549 									 rspamd_regexp_t *re,
550 									 const guchar *in, gsize len,
551 									 goffset start, goffset end,
552 									 gint lua_cbref)
553 {
554 	lua_State *L = (lua_State *)task->cfg->lua_state;
555 	GError *err = NULL;
556 	struct rspamd_lua_text __attribute__ ((unused)) *t;
557 	gint text_pos;
558 
559 	if (G_LIKELY (lua_cbref == -1)) {
560 		return TRUE;
561 	}
562 
563 	t = lua_new_text (L, in, len, FALSE);
564 	text_pos = lua_gettop (L);
565 
566 	if (!rspamd_lua_universal_pcall (L, lua_cbref,
567 			G_STRLOC, 1, "utii", &err,
568 			"rspamd{task}", task,
569 			text_pos, start, end)) {
570 		msg_warn_task ("cannot call for re_cache_check_lua_condition for re %s: %e",
571 				rspamd_regexp_get_pattern (re), err);
572 		g_error_free (err);
573 
574 		return TRUE;
575 	}
576 
577 	gboolean res = lua_toboolean (L, -1);
578 
579 	lua_settop (L, text_pos - 1);
580 
581 	return res;
582 }
583 
584 static guint
rspamd_re_cache_process_pcre(struct rspamd_re_runtime * rt,rspamd_regexp_t * re,struct rspamd_task * task,const guchar * in,gsize len,gboolean is_raw,gint lua_cbref)585 rspamd_re_cache_process_pcre (struct rspamd_re_runtime *rt,
586 		rspamd_regexp_t *re, struct rspamd_task *task,
587 		const guchar *in, gsize len,
588 		gboolean is_raw,
589 		gint lua_cbref)
590 {
591 	guint r = 0;
592 	const gchar *start = NULL, *end = NULL;
593 	guint max_hits = rspamd_regexp_get_maxhits (re);
594 	guint64 id = rspamd_regexp_get_cache_id (re);
595 	gdouble t1 = NAN, t2, pr;
596 	const gdouble slow_time = 1e8;
597 
598 	if (in == NULL) {
599 		return rt->results[id];
600 	}
601 
602 	if (len == 0) {
603 		return rt->results[id];
604 	}
605 
606 	if (rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) {
607 		len = rt->cache->max_re_data;
608 	}
609 
610 	r = rt->results[id];
611 
612 	if (max_hits == 0 || r < max_hits) {
613 		pr = rspamd_random_double_fast ();
614 
615 		if (pr > 0.9) {
616 			t1 = rspamd_get_ticks (TRUE);
617 		}
618 
619 		while (rspamd_regexp_search (re,
620 				in,
621 				len,
622 				&start,
623 				&end,
624 				is_raw,
625 				NULL)) {
626 			if (rspamd_re_cache_check_lua_condition (task, re, in, len,
627 					start - (const gchar *)in, end - (const gchar *)in, lua_cbref)) {
628 				r++;
629 				msg_debug_re_task ("found regexp /%s/, total hits: %d",
630 						rspamd_regexp_get_pattern (re), r);
631 			}
632 
633 			if (max_hits > 0 && r >= max_hits) {
634 				break;
635 			}
636 		}
637 
638 		rt->results[id] += r;
639 		rt->stat.regexp_checked++;
640 		rt->stat.bytes_scanned_pcre += len;
641 		rt->stat.bytes_scanned += len;
642 
643 		if (r > 0) {
644 			rt->stat.regexp_matched += r;
645 		}
646 
647 		if (!isnan (t1)) {
648 			t2 = rspamd_get_ticks (TRUE);
649 
650 			if (t2 - t1 > slow_time) {
651 				rspamd_symcache_enable_profile (task);
652 				msg_info_task ("regexp '%16s' took %.0f ticks to execute",
653 						rspamd_regexp_get_pattern (re), t2 - t1);
654 			}
655 		}
656 	}
657 
658 	return r;
659 }
660 
#ifdef WITH_HYPERSCAN
/* Context handed to the hyperscan match callback */
struct rspamd_re_hyperscan_cbdata {
	struct rspamd_re_runtime *rt;
	/* Input buffers and their lengths, `count` entries each */
	const guchar **ins;
	const guint *lens;
	guint count;
	/* Regexp the scan was started for (not read by the callback itself) */
	rspamd_regexp_t *re;
	struct rspamd_task *task;
};

/*
 * Hyperscan match callback.
 *
 * For regexps matched by hyperscan alone the hit is counted here, subject
 * to the Lua condition and the hit limit. For all other match types the
 * hyperscan hit only triggers a PCRE run over the inputs, once per regexp.
 *
 * @param id cache id of the matched regexp
 * @param from match start offset reported by hyperscan
 * @param to match end offset reported by hyperscan
 * @param flags unused
 * @param ud struct rspamd_re_hyperscan_cbdata
 * @return always 0
 */
static gint
rspamd_re_cache_hyperscan_cb (unsigned int id,
		unsigned long long from,
		unsigned long long to,
		unsigned int flags,
		void *ud)
{
	struct rspamd_re_hyperscan_cbdata *cbdata = ud;
	struct rspamd_re_runtime *rt = cbdata->rt;
	struct rspamd_task *task = cbdata->task;
	struct rspamd_re_cache_elt *cache_elt =
			g_ptr_array_index (rt->cache->re, id);
	guint maxhits = rspamd_regexp_get_maxhits (cache_elt->re);
	guint i, processed;

	if (cache_elt->match_type != RSPAMD_RE_CACHE_HYPERSCAN) {
		/* Prefilter hit: confirm with PCRE unless already done */
		if (isset (rt->checked, id)) {
			return 0;
		}

		processed = 0;

		for (i = 0; i < cbdata->count; i ++) {
			rspamd_re_cache_process_pcre (rt,
					cache_elt->re,
					task,
					cbdata->ins[i],
					cbdata->lens[i],
					FALSE,
					cache_elt->lua_cbref);
			setbit (rt->checked, id);

			processed += cbdata->lens[i];

			/* Stop once the buffers up to the match end are covered */
			if (processed >= to) {
				break;
			}
		}

		return 0;
	}

	if (!rspamd_re_cache_check_lua_condition (task, cache_elt->re,
			cbdata->ins[0], cbdata->lens[0], from, to, cache_elt->lua_cbref)) {
		/* The Lua condition rejected this match */
		return 0;
	}

	setbit (rt->checked, id);

	if (maxhits == 0 || rt->results[id] < maxhits) {
		rt->results[id] += 1;
		rt->stat.regexp_matched++;
	}

	msg_debug_re_task ("found regexp /%s/ using hyperscan only, total hits: %d",
			rspamd_regexp_get_pattern (cache_elt->re), rt->results[id]);

	return 0;
}
#endif
730 
731 static guint
rspamd_re_cache_process_regexp_data(struct rspamd_re_runtime * rt,rspamd_regexp_t * re,struct rspamd_task * task,const guchar ** in,guint * lens,guint count,gboolean is_raw,gboolean * processed_hyperscan)732 rspamd_re_cache_process_regexp_data (struct rspamd_re_runtime *rt,
733 		rspamd_regexp_t *re, struct rspamd_task *task,
734 		const guchar **in, guint *lens,
735 		guint count,
736 		gboolean is_raw,
737 		gboolean *processed_hyperscan)
738 {
739 
740 	guint64 re_id;
741 	guint ret = 0;
742 	guint i;
743 	struct rspamd_re_cache_elt *cache_elt;
744 
745 	re_id = rspamd_regexp_get_cache_id (re);
746 
747 	if (count == 0 || in == NULL) {
748 		/* We assume this as absence of the specified data */
749 		setbit (rt->checked, re_id);
750 		rt->results[re_id] = ret;
751 		return ret;
752 	}
753 
754 	cache_elt = (struct rspamd_re_cache_elt *)g_ptr_array_index (rt->cache->re, re_id);
755 
756 #ifndef WITH_HYPERSCAN
757 	for (i = 0; i < count; i++) {
758 		ret = rspamd_re_cache_process_pcre (rt,
759 				re,
760 				task,
761 				in[i],
762 				lens[i],
763 				is_raw,
764 				cache_elt->lua_cbref);
765 		rt->results[re_id] = ret;
766 	}
767 
768 	setbit (rt->checked, re_id);
769 #else
770 	struct rspamd_re_class *re_class;
771 	struct rspamd_re_hyperscan_cbdata cbdata;
772 
773 	cache_elt = g_ptr_array_index (rt->cache->re, re_id);
774 	re_class = rspamd_regexp_get_class (re);
775 
776 	if (rt->cache->disable_hyperscan || cache_elt->match_type == RSPAMD_RE_CACHE_PCRE ||
777 			!rt->has_hs || (is_raw && re_class->has_utf8)) {
778 		for (i = 0; i < count; i++) {
779 			ret = rspamd_re_cache_process_pcre (rt,
780 					re,
781 					task,
782 					in[i],
783 					lens[i],
784 					is_raw,
785 					cache_elt->lua_cbref);
786 		}
787 
788 		setbit (rt->checked, re_id);
789 	}
790 	else {
791 		for (i = 0; i < count; i ++) {
792 			/* For Hyperscan we can probably safely disable all those limits */
793 #if 0
794 			if (rt->cache->max_re_data > 0 && lens[i] > rt->cache->max_re_data) {
795 				lens[i] = rt->cache->max_re_data;
796 			}
797 #endif
798 			rt->stat.bytes_scanned += lens[i];
799 		}
800 
801 		g_assert (re_class->hs_scratch != NULL);
802 		g_assert (re_class->hs_db != NULL);
803 
804 		/* Go through hyperscan API */
805 		if (!rt->cache->vectorized_hyperscan) {
806 			for (i = 0; i < count; i++) {
807 				cbdata.ins = &in[i];
808 				cbdata.re = re;
809 				cbdata.rt = rt;
810 				cbdata.lens = &lens[i];
811 				cbdata.count = 1;
812 				cbdata.task = task;
813 
814 				if ((hs_scan (re_class->hs_db, in[i], lens[i], 0,
815 						re_class->hs_scratch,
816 						rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) {
817 					ret = 0;
818 				}
819 				else {
820 					ret = rt->results[re_id];
821 					*processed_hyperscan = TRUE;
822 				}
823 			}
824 		}
825 		else {
826 			cbdata.ins = in;
827 			cbdata.re = re;
828 			cbdata.rt = rt;
829 			cbdata.lens = lens;
830 			cbdata.count = 1;
831 			cbdata.task = task;
832 
833 			if ((hs_scan_vector (re_class->hs_db, (const char **)in, lens, count, 0,
834 					re_class->hs_scratch,
835 					rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) {
836 				ret = 0;
837 			}
838 			else {
839 				ret = rt->results[re_id];
840 				*processed_hyperscan = TRUE;
841 			}
842 		}
843 	}
844 #endif
845 
846 	return ret;
847 }
848 
849 static void
rspamd_re_cache_finish_class(struct rspamd_task * task,struct rspamd_re_runtime * rt,struct rspamd_re_class * re_class,const gchar * class_name)850 rspamd_re_cache_finish_class (struct rspamd_task *task,
851 							  struct rspamd_re_runtime *rt,
852 							  struct rspamd_re_class *re_class,
853 							  const gchar *class_name)
854 {
855 #ifdef WITH_HYPERSCAN
856 	guint i;
857 	guint64 re_id;
858 	guint found = 0;
859 
860 	/* Set all bits that are not checked and included in hyperscan to 1 */
861 	for (i = 0; i < re_class->nhs; i++) {
862 		re_id = re_class->hs_ids[i];
863 
864 		if (!isset (rt->checked, re_id)) {
865 			g_assert (rt->results[re_id] == 0);
866 			rt->results[re_id] = 0;
867 			setbit (rt->checked, re_id);
868 		}
869 		else {
870 			found ++;
871 		}
872 	}
873 
874 	msg_debug_re_task ("finished hyperscan for class %s; %d "
875 					   "matches found; %d hyperscan supported regexps; %d total regexps",
876 			class_name, found, re_class->nhs, (gint)g_hash_table_size (re_class->re));
877 #endif
878 }
879 
880 static gboolean
rspamd_re_cache_process_selector(struct rspamd_task * task,struct rspamd_re_runtime * rt,const gchar * name,guchar *** svec,guint ** lenvec,guint * n)881 rspamd_re_cache_process_selector (struct rspamd_task *task,
882 								  struct rspamd_re_runtime *rt,
883 								  const gchar *name,
884 								  guchar ***svec,
885 								  guint **lenvec,
886 								  guint *n)
887 {
888 	gint ref;
889 	khiter_t k;
890 	lua_State *L;
891 	gint err_idx, ret;
892 	struct rspamd_task **ptask;
893 	gboolean result = FALSE;
894 	struct rspamd_re_cache *cache = rt->cache;
895 	struct rspamd_re_selector_result *sr;
896 
897 	L = cache->L;
898 	k = kh_get (lua_selectors_hash, cache->selectors, (gchar *)name);
899 
900 	if (k == kh_end (cache->selectors)) {
901 		msg_err_task ("cannot find selector %s, not registered", name);
902 
903 		return FALSE;
904 	}
905 
906 	ref = kh_value (cache->selectors, k);
907 
908 	/* First, search for the cached result */
909 	if (rt->sel_cache) {
910 		k = kh_get (selectors_results_hash, rt->sel_cache, ref);
911 
912 		if (k != kh_end (rt->sel_cache)) {
913 			sr = &kh_value (rt->sel_cache, k);
914 
915 			*svec = sr->scvec;
916 			*lenvec = sr->lenvec;
917 			*n = sr->cnt;
918 
919 			return TRUE;
920 		}
921 	}
922 	else {
923 		rt->sel_cache = kh_init (selectors_results_hash);
924 	}
925 
926 	lua_pushcfunction (L, &rspamd_lua_traceback);
927 	err_idx = lua_gettop (L);
928 
929 	lua_rawgeti (L, LUA_REGISTRYINDEX, ref);
930 	ptask = lua_newuserdata (L, sizeof (*ptask));
931 	*ptask = task;
932 	rspamd_lua_setclass (L, "rspamd{task}", -1);
933 
934 	if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) {
935 		msg_err_task ("call to selector %s "
936 						"failed (%d): %s", name, ret,
937 						lua_tostring (L, -1));
938 	}
939 	else {
940 		struct rspamd_lua_text *txt;
941 		gsize slen;
942 		const gchar *sel_data;
943 
944 		if (lua_type (L, -1) != LUA_TTABLE) {
945 			txt = lua_check_text_or_string (L, -1);
946 
947 			if (txt) {
948 				sel_data = txt->start;
949 				slen = txt->len;
950 				*n = 1;
951 				*svec = g_malloc (sizeof (guchar *));
952 				*lenvec = g_malloc (sizeof (guint));
953 				(*svec)[0] = g_malloc (slen);
954 				memcpy ((*svec)[0], sel_data, slen);
955 				(*lenvec)[0] = slen;
956 				result = TRUE;
957 			}
958 		}
959 		else {
960 			*n = rspamd_lua_table_size (L, -1);
961 
962 			if (*n > 0) {
963 				*svec = g_malloc (sizeof (guchar *) * (*n));
964 				*lenvec = g_malloc (sizeof (guint) * (*n));
965 
966 				for (guint i = 0; i < *n; i ++) {
967 					lua_rawgeti (L, -1, i + 1);
968 
969 					txt = lua_check_text_or_string (L, -1);
970 					if (txt) {
971 						sel_data = txt->start;
972 						slen = txt->len;
973 					}
974 					else {
975 						sel_data = "";
976 						slen = 0;
977 					}
978 
979 					(*svec)[i] = g_malloc (slen);
980 					memcpy ((*svec)[i], sel_data, slen);
981 					(*lenvec)[i] = slen;
982 					lua_pop (L, 1);
983 				}
984 
985 				result = TRUE;
986 			}
987 		}
988 	}
989 
990 	lua_settop (L, err_idx - 1);
991 
992 	if (result) {
993 		k = kh_put (selectors_results_hash, rt->sel_cache, ref, &ret);
994 		sr = &kh_value (rt->sel_cache, k);
995 
996 		sr->cnt = *n;
997 		sr->scvec = *svec;
998 		sr->lenvec = *lenvec;
999 	}
1000 
1001 	return result;
1002 }
1003 
1004 static inline guint
rspamd_process_words_vector(GArray * words,const guchar ** scvec,guint * lenvec,struct rspamd_re_class * re_class,guint cnt,gboolean * raw)1005 rspamd_process_words_vector (GArray *words,
1006 							 const guchar **scvec,
1007 							 guint *lenvec,
1008 							 struct rspamd_re_class *re_class,
1009 							 guint cnt,
1010 							 gboolean *raw)
1011 {
1012 	guint j;
1013 	rspamd_stat_token_t *tok;
1014 
1015 	if (words) {
1016 		for (j = 0; j < words->len; j ++) {
1017 			tok = &g_array_index (words, rspamd_stat_token_t, j);
1018 
1019 			if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
1020 				if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
1021 					if (!re_class->has_utf8) {
1022 						*raw = TRUE;
1023 					}
1024 					else {
1025 						continue; /* Skip */
1026 					}
1027 				}
1028 			}
1029 			else {
1030 				continue; /* Skip non text */
1031 			}
1032 
1033 			if (re_class->type == RSPAMD_RE_RAWWORDS) {
1034 				if (tok->original.len > 0) {
1035 					scvec[cnt] = tok->original.begin;
1036 					lenvec[cnt++] = tok->original.len;
1037 				}
1038 			}
1039 			else if (re_class->type == RSPAMD_RE_WORDS) {
1040 				if (tok->normalized.len > 0) {
1041 					scvec[cnt] = tok->normalized.begin;
1042 					lenvec[cnt++] = tok->normalized.len;
1043 				}
1044 			}
1045 			else {
1046 				/* Stemmed words */
1047 				if (tok->stemmed.len > 0) {
1048 					scvec[cnt] = tok->stemmed.begin;
1049 					lenvec[cnt++] = tok->stemmed.len;
1050 				}
1051 			}
1052 		}
1053 	}
1054 
1055 	return cnt;
1056 }
1057 
1058 static guint
rspamd_re_cache_process_headers_list(struct rspamd_task * task,struct rspamd_re_runtime * rt,rspamd_regexp_t * re,struct rspamd_re_class * re_class,struct rspamd_mime_header * rh,gboolean is_strong,gboolean * processed_hyperscan)1059 rspamd_re_cache_process_headers_list (struct rspamd_task *task,
1060 									  struct rspamd_re_runtime *rt,
1061 									  rspamd_regexp_t *re,
1062 									  struct rspamd_re_class *re_class,
1063 									  struct rspamd_mime_header *rh,
1064 									  gboolean is_strong,
1065 									  gboolean *processed_hyperscan)
1066 {
1067 	const guchar **scvec, *in;
1068 	gboolean raw = FALSE;
1069 	guint *lenvec;
1070 	struct rspamd_mime_header *cur;
1071 	guint cnt = 0, i = 0, ret = 0;
1072 
1073 	DL_COUNT (rh, cur, cnt);
1074 
1075 	scvec = g_malloc (sizeof (*scvec) * cnt);
1076 	lenvec = g_malloc (sizeof (*lenvec) * cnt);
1077 
1078 	DL_FOREACH (rh, cur) {
1079 
1080 		if (is_strong && strcmp (cur->name, re_class->type_data) != 0) {
1081 			/* Skip a different case */
1082 			continue;
1083 		}
1084 
1085 		if (re_class->type == RSPAMD_RE_RAWHEADER) {
1086 			in = (const guchar *)cur->value;
1087 			lenvec[i] = strlen (cur->value);
1088 
1089 			if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) {
1090 				raw = TRUE;
1091 			}
1092 		}
1093 		else {
1094 			in = (const guchar *)cur->decoded;
1095 			/* Validate input^W^WNo need to validate as it is already valid */
1096 			if (!in) {
1097 				lenvec[i] = 0;
1098 				scvec[i] = (guchar *)"";
1099 				continue;
1100 			}
1101 
1102 			lenvec[i] = strlen (in);
1103 		}
1104 
1105 		scvec[i] = in;
1106 
1107 		i ++;
1108 	}
1109 
1110 	if (i > 0) {
1111 		ret = rspamd_re_cache_process_regexp_data (rt, re,
1112 				task, scvec, lenvec, i, raw, processed_hyperscan);
1113 		msg_debug_re_task ("checking header %s regexp: %s=%*s -> %d",
1114 				re_class->type_data,
1115 				rspamd_regexp_get_pattern (re),
1116 				(int) lenvec[0], scvec[0], ret);
1117 	}
1118 
1119 	g_free (scvec);
1120 	g_free (lenvec);
1121 
1122 	return ret;
1123 }
1124 
1125 /*
1126  * Calculates the specified regexp for the specified class if it's not calculated
1127  */
1128 static guint
rspamd_re_cache_exec_re(struct rspamd_task * task,struct rspamd_re_runtime * rt,rspamd_regexp_t * re,struct rspamd_re_class * re_class,gboolean is_strong)1129 rspamd_re_cache_exec_re (struct rspamd_task *task,
1130 		struct rspamd_re_runtime *rt,
1131 		rspamd_regexp_t *re,
1132 		struct rspamd_re_class *re_class,
1133 		gboolean is_strong)
1134 {
1135 	guint ret = 0, i, re_id;
1136 	struct rspamd_mime_header *rh;
1137 	const gchar *in;
1138 	const guchar **scvec;
1139 	guint *lenvec;
1140 	gboolean raw = FALSE, processed_hyperscan = FALSE;
1141 	struct rspamd_mime_text_part *text_part;
1142 	struct rspamd_mime_part *mime_part;
1143 	struct rspamd_url *url;
1144 	guint len, cnt;
1145 	const gchar *class_name;
1146 
1147 	class_name = rspamd_re_cache_type_to_string (re_class->type);
1148 	msg_debug_re_task ("start check re type: %s: /%s/",
1149 			class_name,
1150 			rspamd_regexp_get_pattern (re));
1151 	re_id = rspamd_regexp_get_cache_id (re);
1152 
1153 	switch (re_class->type) {
1154 	case RSPAMD_RE_HEADER:
1155 	case RSPAMD_RE_RAWHEADER:
1156 		/* Get list of specified headers */
1157 		rh = rspamd_message_get_header_array(task,
1158 				re_class->type_data, FALSE);
1159 
1160 		if (rh) {
1161 			ret = rspamd_re_cache_process_headers_list (task, rt, re,
1162 					re_class, rh, is_strong, &processed_hyperscan);
1163 			msg_debug_re_task ("checked header(%s) regexp: %s -> %d",
1164 					(const char *)re_class->type_data,
1165 					rspamd_regexp_get_pattern (re),
1166 					ret);
1167 		}
1168 		break;
1169 	case RSPAMD_RE_ALLHEADER:
1170 		raw = TRUE;
1171 		in = MESSAGE_FIELD (task, raw_headers_content).begin;
1172 		len = MESSAGE_FIELD (task, raw_headers_content).len;
1173 		ret = rspamd_re_cache_process_regexp_data (rt, re,
1174 				task, (const guchar **)&in, &len, 1, raw, &processed_hyperscan);
1175 		msg_debug_re_task ("checked allheader regexp: %s -> %d",
1176 				rspamd_regexp_get_pattern (re), ret);
1177 		break;
1178 	case RSPAMD_RE_MIMEHEADER:
1179 		PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, mime_part) {
1180 			rh = rspamd_message_get_header_from_hash(mime_part->raw_headers,
1181 					re_class->type_data, FALSE);
1182 
1183 			if (rh) {
1184 				ret += rspamd_re_cache_process_headers_list (task, rt, re,
1185 						re_class, rh, is_strong, &processed_hyperscan);
1186 			}
1187 			msg_debug_re_task ("checked mime header(%s) regexp: %s -> %d",
1188 					(const char *)re_class->type_data,
1189 					rspamd_regexp_get_pattern (re),
1190 					ret);
1191 		}
1192 		break;
1193 	case RSPAMD_RE_MIME:
1194 	case RSPAMD_RE_RAWMIME:
1195 		/* Iterate through text parts */
1196 		if (MESSAGE_FIELD (task, text_parts)->len > 0) {
1197 			cnt = MESSAGE_FIELD (task, text_parts)->len;
1198 			scvec = g_malloc (sizeof (*scvec) * cnt);
1199 			lenvec = g_malloc (sizeof (*lenvec) * cnt);
1200 
1201 			PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
1202 				/* Select data for regexp */
1203 				if (re_class->type == RSPAMD_RE_RAWMIME) {
1204 					if (text_part->raw.len == 0) {
1205 						len = 0;
1206 						in = "";
1207 					}
1208 					else {
1209 						in = text_part->raw.begin;
1210 						len = text_part->raw.len;
1211 					}
1212 
1213 					raw = TRUE;
1214 				}
1215 				else {
1216 					/* Skip empty parts */
1217 					if (IS_TEXT_PART_EMPTY (text_part)) {
1218 						len = 0;
1219 						in = "";
1220 					}
1221 					else {
1222 						/* Check raw flags */
1223 						if (!IS_TEXT_PART_UTF (text_part)) {
1224 							raw = TRUE;
1225 						}
1226 
1227 						in = text_part->utf_content.begin;
1228 						len = text_part->utf_content.len;
1229 					}
1230 				}
1231 
1232 				scvec[i] = (guchar *) in;
1233 				lenvec[i] = len;
1234 			}
1235 
1236 			ret = rspamd_re_cache_process_regexp_data (rt, re,
1237 					task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1238 			msg_debug_re_task ("checked mime regexp: %s -> %d",
1239 					rspamd_regexp_get_pattern (re), ret);
1240 			g_free (scvec);
1241 			g_free (lenvec);
1242 		}
1243 		break;
1244 	case RSPAMD_RE_URL:
1245 		cnt = kh_size (MESSAGE_FIELD (task, urls));
1246 
1247 		if (cnt > 0) {
1248 			scvec = g_malloc (sizeof (*scvec) * cnt);
1249 			lenvec = g_malloc (sizeof (*lenvec) * cnt);
1250 			i = 0;
1251 			raw = FALSE;
1252 
1253 			kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
1254 				if ((url->protocol & PROTOCOL_MAILTO)) {
1255 					continue;
1256 				}
1257 				in = url->string;
1258 				len = url->urllen;
1259 
1260 				if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
1261 					scvec[i] = (guchar *) in;
1262 					lenvec[i++] = len;
1263 				}
1264 			});
1265 
1266 #if 0
1267 			g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
1268 
1269 			while (g_hash_table_iter_next (&it, &k, &v)) {
1270 				url = v;
1271 				in = url->string;
1272 				len = url->urllen;
1273 
1274 				if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
1275 					scvec[i] = (guchar *) in;
1276 					lenvec[i++] = len;
1277 				}
1278 			}
1279 #endif
1280 			ret = rspamd_re_cache_process_regexp_data (rt, re,
1281 					task, scvec, lenvec, i, raw, &processed_hyperscan);
1282 			msg_debug_re_task ("checked url regexp: %s -> %d",
1283 					rspamd_regexp_get_pattern (re), ret);
1284 			g_free (scvec);
1285 			g_free (lenvec);
1286 		}
1287 		break;
1288 	case RSPAMD_RE_EMAIL:
1289 		cnt = kh_size (MESSAGE_FIELD (task, urls));
1290 
1291 		if (cnt > 0) {
1292 			scvec = g_malloc (sizeof (*scvec) * cnt);
1293 			lenvec = g_malloc (sizeof (*lenvec) * cnt);
1294 			i = 0;
1295 			raw = FALSE;
1296 
1297 			kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
1298 
1299 				if (!(url->protocol & PROTOCOL_MAILTO)) {
1300 					continue;
1301 				}
1302 				if (url->userlen == 0 || url->hostlen == 0) {
1303 					continue;
1304 				}
1305 
1306 				in = rspamd_url_user_unsafe (url);
1307 				len = url->userlen + 1 + url->hostlen;
1308 				scvec[i] = (guchar *) in;
1309 				lenvec[i++] = len;
1310 			});
1311 
1312 			ret = rspamd_re_cache_process_regexp_data (rt, re,
1313 					task, scvec, lenvec, i, raw, &processed_hyperscan);
1314 			msg_debug_re_task ("checked email regexp: %s -> %d",
1315 					rspamd_regexp_get_pattern (re), ret);
1316 			g_free (scvec);
1317 			g_free (lenvec);
1318 		}
1319 		break;
1320 	case RSPAMD_RE_BODY:
1321 		raw = TRUE;
1322 		in = task->msg.begin;
1323 		len = task->msg.len;
1324 
1325 		ret = rspamd_re_cache_process_regexp_data (rt, re, task,
1326 				(const guchar **)&in, &len, 1, raw, &processed_hyperscan);
1327 		msg_debug_re_task ("checked rawbody regexp: %s -> %d",
1328 				rspamd_regexp_get_pattern (re), ret);
1329 		break;
1330 	case RSPAMD_RE_SABODY:
1331 		/* According to SA docs:
1332 		 * The 'body' in this case is the textual parts of the message body;
1333 		 * any non-text MIME parts are stripped, and the message decoded from
1334 		 * Quoted-Printable or Base-64-encoded format if necessary. The message
1335 		 * Subject header is considered part of the body and becomes the first
1336 		 * paragraph when running the rules. All HTML tags and line breaks will
1337 		 * be removed before matching.
1338 		 */
1339 		cnt = MESSAGE_FIELD (task, text_parts)->len + 1;
1340 		scvec = g_malloc (sizeof (*scvec) * cnt);
1341 		lenvec = g_malloc (sizeof (*lenvec) * cnt);
1342 
1343 		/*
1344 		 * Body rules also include the Subject as the first line
1345 		 * of the body content.
1346 		 */
1347 
1348 		rh = rspamd_message_get_header_array(task, "Subject", FALSE);
1349 
1350 		if (rh) {
1351 			scvec[0] = (guchar *)rh->decoded;
1352 			lenvec[0] = strlen (rh->decoded);
1353 		}
1354 		else {
1355 			scvec[0] = (guchar *)"";
1356 			lenvec[0] = 0;
1357 		}
1358 
1359 		PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
1360 			if (text_part->utf_stripped_content) {
1361 				scvec[i + 1] = (guchar *)text_part->utf_stripped_content->data;
1362 				lenvec[i + 1] = text_part->utf_stripped_content->len;
1363 
1364 				if (!IS_TEXT_PART_UTF (text_part)) {
1365 					raw = TRUE;
1366 				}
1367 			}
1368 			else {
1369 				scvec[i + 1] = (guchar *)"";
1370 				lenvec[i + 1] = 0;
1371 			}
1372 		}
1373 
1374 		ret = rspamd_re_cache_process_regexp_data (rt, re,
1375 				task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1376 		msg_debug_re_task ("checked sa body regexp: %s -> %d",
1377 				rspamd_regexp_get_pattern (re), ret);
1378 		g_free (scvec);
1379 		g_free (lenvec);
1380 		break;
1381 	case RSPAMD_RE_SARAWBODY:
1382 		/* According to SA docs:
1383 		 * The 'raw body' of a message is the raw data inside all textual
1384 		 * parts. The text will be decoded from base64 or quoted-printable
1385 		 * encoding, but HTML tags and line breaks will still be present.
1386 		 * Multiline expressions will need to be used to match strings that are
1387 		 * broken by line breaks.
1388 		 */
1389 		if (MESSAGE_FIELD (task, text_parts)->len > 0) {
1390 			cnt = MESSAGE_FIELD (task, text_parts)->len;
1391 			scvec = g_malloc (sizeof (*scvec) * cnt);
1392 			lenvec = g_malloc (sizeof (*lenvec) * cnt);
1393 
1394 			for (i = 0; i < cnt; i++) {
1395 				text_part = g_ptr_array_index (MESSAGE_FIELD (task, text_parts), i);
1396 
1397 				if (text_part->parsed.len > 0) {
1398 					scvec[i] = (guchar *)text_part->parsed.begin;
1399 					lenvec[i] = text_part->parsed.len;
1400 
1401 					if (!IS_TEXT_PART_UTF (text_part)) {
1402 						raw = TRUE;
1403 					}
1404 				}
1405 				else {
1406 					scvec[i] = (guchar *)"";
1407 					lenvec[i] = 0;
1408 				}
1409 			}
1410 
1411 			ret = rspamd_re_cache_process_regexp_data (rt, re,
1412 					task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1413 			msg_debug_re_task ("checked sa rawbody regexp: %s -> %d",
1414 					rspamd_regexp_get_pattern (re), ret);
1415 			g_free (scvec);
1416 			g_free (lenvec);
1417 		}
1418 		break;
1419 	case RSPAMD_RE_WORDS:
1420 	case RSPAMD_RE_STEMWORDS:
1421 	case RSPAMD_RE_RAWWORDS:
1422 		if (MESSAGE_FIELD (task, text_parts)->len > 0) {
1423 			cnt = 0;
1424 			raw = FALSE;
1425 
1426 			PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
1427 				if (text_part->utf_words) {
1428 					cnt += text_part->utf_words->len;
1429 				}
1430 			}
1431 
1432 			if (task->meta_words && task->meta_words->len > 0) {
1433 				cnt += task->meta_words->len;
1434 			}
1435 
1436 			if (cnt > 0) {
1437 				scvec = g_malloc (sizeof (*scvec) * cnt);
1438 				lenvec = g_malloc (sizeof (*lenvec) * cnt);
1439 
1440 				cnt = 0;
1441 
1442 				PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
1443 					if (text_part->utf_words) {
1444 						cnt = rspamd_process_words_vector (text_part->utf_words,
1445 								scvec, lenvec, re_class, cnt, &raw);
1446 					}
1447 				}
1448 
1449 				if (task->meta_words) {
1450 					cnt = rspamd_process_words_vector (task->meta_words,
1451 							scvec, lenvec, re_class, cnt, &raw);
1452 				}
1453 
1454 				ret = rspamd_re_cache_process_regexp_data (rt, re,
1455 						task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1456 
1457 				msg_debug_re_task ("checked sa words regexp: %s -> %d",
1458 						rspamd_regexp_get_pattern (re), ret);
1459 				g_free (scvec);
1460 				g_free (lenvec);
1461 			}
1462 		}
1463 		break;
1464 	case RSPAMD_RE_SELECTOR:
1465 		if (rspamd_re_cache_process_selector (task, rt,
1466 				re_class->type_data,
1467 				(guchar ***)&scvec,
1468 				&lenvec, &cnt)) {
1469 
1470 			ret = rspamd_re_cache_process_regexp_data (rt, re,
1471 					task, scvec, lenvec, cnt, raw, &processed_hyperscan);
1472 			msg_debug_re_task ("checked selector(%s) regexp: %s -> %d",
1473 					re_class->type_data,
1474 					rspamd_regexp_get_pattern (re), ret);
1475 
1476 			/* Do not free vectors as they are managed by rt->sel_cache */
1477 		}
1478 		break;
1479 	case RSPAMD_RE_MAX:
1480 		msg_err_task ("regexp of class invalid has been called: %s",
1481 				rspamd_regexp_get_pattern (re));
1482 		break;
1483 	}
1484 
1485 #if WITH_HYPERSCAN
1486 	if (processed_hyperscan) {
1487 		rspamd_re_cache_finish_class (task, rt, re_class, class_name);
1488 	}
1489 #endif
1490 
1491 	setbit (rt->checked, re_id);
1492 
1493 	return rt->results[re_id];
1494 }
1495 
1496 gint
rspamd_re_cache_process(struct rspamd_task * task,rspamd_regexp_t * re,enum rspamd_re_type type,gconstpointer type_data,gsize datalen,gboolean is_strong)1497 rspamd_re_cache_process (struct rspamd_task *task,
1498 		rspamd_regexp_t *re,
1499 		enum rspamd_re_type type,
1500 		gconstpointer type_data,
1501 		gsize datalen,
1502 		gboolean is_strong)
1503 {
1504 	guint64 re_id;
1505 	struct rspamd_re_class *re_class;
1506 	struct rspamd_re_cache *cache;
1507 	struct rspamd_re_runtime *rt;
1508 
1509 	g_assert (task != NULL);
1510 	rt = task->re_rt;
1511 	g_assert (rt != NULL);
1512 	g_assert (re != NULL);
1513 
1514 	cache = rt->cache;
1515 	re_id = rspamd_regexp_get_cache_id (re);
1516 
1517 	if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) {
1518 		msg_err_task ("re '%s' has no valid id for the cache",
1519 				rspamd_regexp_get_pattern (re));
1520 		return 0;
1521 	}
1522 
1523 	if (isset (rt->checked, re_id)) {
1524 		/* Fast path */
1525 		rt->stat.regexp_fast_cached ++;
1526 		return rt->results[re_id];
1527 	}
1528 	else {
1529 		/* Slow path */
1530 		re_class = rspamd_regexp_get_class (re);
1531 
1532 		if (re_class == NULL) {
1533 			msg_err_task ("cannot find re class for regexp '%s'",
1534 					rspamd_regexp_get_pattern (re));
1535 			return 0;
1536 		}
1537 
1538 		return rspamd_re_cache_exec_re (task, rt, re, re_class,
1539 				is_strong);
1540 	}
1541 
1542 	return 0;
1543 }
1544 
1545 int
rspamd_re_cache_process_ffi(void * ptask,void * pre,int type,void * type_data,int is_strong)1546 rspamd_re_cache_process_ffi (void *ptask,
1547 		void *pre,
1548 		int type,
1549 		void *type_data,
1550 		int is_strong)
1551 {
1552 	struct rspamd_lua_regexp **lua_re = pre;
1553 	struct rspamd_task **real_task = ptask;
1554 	gsize typelen = 0;
1555 
1556 	if (type_data) {
1557 		typelen = strlen (type_data);
1558 	}
1559 
1560 	return rspamd_re_cache_process (*real_task, (*lua_re)->re,
1561 			type, type_data, typelen, is_strong);
1562 }
1563 
1564 void
rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime * rt)1565 rspamd_re_cache_runtime_destroy (struct rspamd_re_runtime *rt)
1566 {
1567 	g_assert (rt != NULL);
1568 
1569 	if (rt->sel_cache) {
1570 		struct rspamd_re_selector_result sr;
1571 
1572 		kh_foreach_value (rt->sel_cache, sr, {
1573 			for (guint i = 0; i < sr.cnt; i ++) {
1574 				g_free ((gpointer)sr.scvec[i]);
1575 			}
1576 
1577 			g_free (sr.scvec);
1578 			g_free (sr.lenvec);
1579 		});
1580 		kh_destroy (selectors_results_hash, rt->sel_cache);
1581 	}
1582 
1583 	REF_RELEASE (rt->cache);
1584 	g_free (rt);
1585 }
1586 
1587 void
rspamd_re_cache_unref(struct rspamd_re_cache * cache)1588 rspamd_re_cache_unref (struct rspamd_re_cache *cache)
1589 {
1590 	if (cache) {
1591 		REF_RELEASE (cache);
1592 	}
1593 }
1594 
1595 struct rspamd_re_cache *
rspamd_re_cache_ref(struct rspamd_re_cache * cache)1596 rspamd_re_cache_ref (struct rspamd_re_cache *cache)
1597 {
1598 	if (cache) {
1599 		REF_RETAIN (cache);
1600 	}
1601 
1602 	return cache;
1603 }
1604 
1605 guint
rspamd_re_cache_set_limit(struct rspamd_re_cache * cache,guint limit)1606 rspamd_re_cache_set_limit (struct rspamd_re_cache *cache, guint limit)
1607 {
1608 	guint old;
1609 
1610 	g_assert (cache != NULL);
1611 
1612 	old = cache->max_re_data;
1613 	cache->max_re_data = limit;
1614 
1615 	return old;
1616 }
1617 
1618 const gchar *
rspamd_re_cache_type_to_string(enum rspamd_re_type type)1619 rspamd_re_cache_type_to_string (enum rspamd_re_type type)
1620 {
1621 	const gchar *ret = "unknown";
1622 
1623 	switch (type) {
1624 	case RSPAMD_RE_HEADER:
1625 		ret = "header";
1626 		break;
1627 	case RSPAMD_RE_RAWHEADER:
1628 		ret = "raw header";
1629 		break;
1630 	case RSPAMD_RE_MIMEHEADER:
1631 		ret = "mime header";
1632 		break;
1633 	case RSPAMD_RE_ALLHEADER:
1634 		ret = "all headers";
1635 		break;
1636 	case RSPAMD_RE_MIME:
1637 		ret = "part";
1638 		break;
1639 	case RSPAMD_RE_RAWMIME:
1640 		ret = "raw part";
1641 		break;
1642 	case RSPAMD_RE_BODY:
1643 		ret = "rawbody";
1644 		break;
1645 	case RSPAMD_RE_URL:
1646 		ret = "url";
1647 		break;
1648 	case RSPAMD_RE_EMAIL:
1649 		ret = "email";
1650 		break;
1651 	case RSPAMD_RE_SABODY:
1652 		ret = "sa body";
1653 		break;
1654 	case RSPAMD_RE_SARAWBODY:
1655 		ret = "sa raw body";
1656 		break;
1657 	case RSPAMD_RE_SELECTOR:
1658 		ret = "selector";
1659 		break;
1660 	case RSPAMD_RE_WORDS:
1661 		ret = "words";
1662 		break;
1663 	case RSPAMD_RE_RAWWORDS:
1664 		ret = "raw_words";
1665 		break;
1666 	case RSPAMD_RE_STEMWORDS:
1667 		ret = "stem_words";
1668 		break;
1669 	case RSPAMD_RE_MAX:
1670 	default:
1671 		ret = "invalid class";
1672 		break;
1673 	}
1674 
1675 	return ret;
1676 }
1677 
1678 enum rspamd_re_type
rspamd_re_cache_type_from_string(const char * str)1679 rspamd_re_cache_type_from_string (const char *str)
1680 {
1681 	enum rspamd_re_type ret;
1682 	guint64 h;
1683 
1684 	/*
1685 	 * To optimize this function, we apply hash to input string and
1686 	 * pre-select it from the values
1687 	 */
1688 
1689 	if (str != NULL) {
1690 		h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
1691 				str, strlen (str), 0xdeadbabe);
1692 
1693 		switch (h) {
1694 		case G_GUINT64_CONSTANT(0x298b9c8a58887d44): /* header */
1695 			ret = RSPAMD_RE_HEADER;
1696 			break;
1697 		case G_GUINT64_CONSTANT(0x467bfb5cd7ddf890): /* rawheader */
1698 			ret = RSPAMD_RE_RAWHEADER;
1699 			break;
1700 		case G_GUINT64_CONSTANT(0xda081341fb600389): /* mime */
1701 			ret = RSPAMD_RE_MIME;
1702 			break;
1703 		case G_GUINT64_CONSTANT(0xc35831e067a8221d): /* rawmime */
1704 			ret = RSPAMD_RE_RAWMIME;
1705 			break;
1706 		case G_GUINT64_CONSTANT(0xc625e13dbe636de2): /* body */
1707 		case G_GUINT64_CONSTANT(0xCCDEBA43518F721C): /* message */
1708 			ret = RSPAMD_RE_BODY;
1709 			break;
1710 		case G_GUINT64_CONSTANT(0x286edbe164c791d2): /* url */
1711 		case G_GUINT64_CONSTANT(0x7D9ACDF6685661A1): /* uri */
1712 			ret = RSPAMD_RE_URL;
1713 			break;
1714 		case G_GUINT64_CONSTANT (0x7e232b0f60b571be): /* email */
1715 			ret = RSPAMD_RE_EMAIL;
1716 			break;
1717 		case G_GUINT64_CONSTANT(0x796d62205a8778c7): /* allheader */
1718 			ret = RSPAMD_RE_ALLHEADER;
1719 			break;
1720 		case G_GUINT64_CONSTANT(0xa3c6c153b3b00a5e): /* mimeheader */
1721 			ret = RSPAMD_RE_MIMEHEADER;
1722 			break;
1723 		case G_GUINT64_CONSTANT(0x7794501506e604e9): /* sabody */
1724 			ret = RSPAMD_RE_SABODY;
1725 			break;
1726 		case G_GUINT64_CONSTANT(0x28828962E7D2A05F): /* sarawbody */
1727 			ret = RSPAMD_RE_SARAWBODY;
1728 			break;
1729 		default:
1730 			ret = RSPAMD_RE_MAX;
1731 			break;
1732 		}
1733 	}
1734 	else {
1735 		ret = RSPAMD_RE_MAX;
1736 	}
1737 
1738 	return ret;
1739 }
1740 
1741 #ifdef WITH_HYPERSCAN
1742 static gchar *
rspamd_re_cache_hs_pattern_from_pcre(rspamd_regexp_t * re)1743 rspamd_re_cache_hs_pattern_from_pcre (rspamd_regexp_t *re)
1744 {
1745 	/*
1746 	 * Workaroung for bug in ragel 7.0.0.11
1747 	 * https://github.com/intel/hyperscan/issues/133
1748 	 */
1749 	const gchar *pat = rspamd_regexp_get_pattern (re);
1750 	guint flags = rspamd_regexp_get_flags (re), esc_flags = RSPAMD_REGEXP_ESCAPE_RE;
1751 	gchar *escaped;
1752 	gsize esc_len;
1753 
1754 	if (flags & RSPAMD_REGEXP_FLAG_UTF) {
1755 		esc_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
1756 	}
1757 
1758 	escaped = rspamd_str_regexp_escape (pat, strlen (pat), &esc_len,esc_flags);
1759 
1760 	return escaped;
1761 }
1762 
1763 static gboolean
rspamd_re_cache_is_finite(struct rspamd_re_cache * cache,rspamd_regexp_t * re,gint flags,gdouble max_time)1764 rspamd_re_cache_is_finite (struct rspamd_re_cache *cache,
1765 		rspamd_regexp_t *re, gint flags, gdouble max_time)
1766 {
1767 	pid_t cld;
1768 	gint status;
1769 	struct timespec ts;
1770 	hs_compile_error_t *hs_errors;
1771 	hs_database_t *test_db;
1772 	gdouble wait_time;
1773 	const gint max_tries = 10;
1774 	gint tries = 0, rc;
1775 	void (*old_hdl)(int);
1776 
1777 	wait_time = max_time / max_tries;
1778 	/* We need to restore SIGCHLD processing */
1779 	old_hdl = signal (SIGCHLD, SIG_DFL);
1780 	cld = fork ();
1781 
1782 	if (cld == 0) {
1783 		/* Try to compile pattern */
1784 
1785 		gchar *pat = rspamd_re_cache_hs_pattern_from_pcre (re);
1786 
1787 		if (hs_compile (pat,
1788 				flags | HS_FLAG_PREFILTER,
1789 				cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
1790 				&cache->plt,
1791 				&test_db,
1792 				&hs_errors) != HS_SUCCESS) {
1793 
1794 			msg_info_re_cache ("cannot compile (prefilter mode) '%s' to hyperscan: '%s'",
1795 					pat,
1796 					hs_errors != NULL ? hs_errors->message : "unknown error");
1797 
1798 			hs_free_compile_error (hs_errors);
1799 			g_free (pat);
1800 
1801 			exit (EXIT_FAILURE);
1802 		}
1803 
1804 		g_free (pat);
1805 		exit (EXIT_SUCCESS);
1806 	}
1807 	else if (cld > 0) {
1808 		double_to_ts (wait_time, &ts);
1809 
1810 		while ((rc = waitpid (cld, &status, WNOHANG)) == 0 && tries ++ < max_tries) {
1811 			(void)nanosleep (&ts, NULL);
1812 		}
1813 
1814 		/* Child has been terminated */
1815 		if (rc > 0) {
1816 			/* Forget about SIGCHLD after this point */
1817 			signal (SIGCHLD, old_hdl);
1818 
1819 			if (WIFEXITED (status) && WEXITSTATUS (status) == EXIT_SUCCESS) {
1820 				return TRUE;
1821 			}
1822 			else {
1823 				msg_err_re_cache (
1824 						"cannot approximate %s to hyperscan",
1825 						rspamd_regexp_get_pattern (re));
1826 
1827 				return FALSE;
1828 			}
1829 		}
1830 		else {
1831 			/* We consider that as timeout */
1832 			kill (cld, SIGKILL);
1833 			g_assert (waitpid (cld, &status, 0) != -1);
1834 			msg_err_re_cache (
1835 					"cannot approximate %s to hyperscan: timeout waiting",
1836 					rspamd_regexp_get_pattern (re));
1837 			signal (SIGCHLD, old_hdl);
1838 		}
1839 	}
1840 	else {
1841 		msg_err_re_cache (
1842 				"cannot approximate %s to hyperscan: fork failed: %s",
1843 				rspamd_regexp_get_pattern (re), strerror (errno));
1844 		signal (SIGCHLD, old_hdl);
1845 	}
1846 
1847 	return FALSE;
1848 }
1849 #endif
1850 
1851 #ifdef WITH_HYPERSCAN
/*
 * State carried by the hyperscan compilation timer (stored in the timer's
 * `data` pointer) between its invocations.
 */
struct rspamd_re_cache_hs_compile_cbdata {
	/* Iterator over re classes; the timer callback takes one value per run */
	GHashTableIter it;
	/* Cache whose classes are being compiled */
	struct rspamd_re_cache *cache;
	/* Directory where the per-class "<hash>.hs" files are looked up and created */
	const char *cache_dir;
	/* Time budget passed to the prefilter compilation check */
	gdouble max_time;
	/* When set, "skip already valid class" info messages are not logged */
	gboolean silent;
	/* Counter reported to `cb` as its first argument (presumably the number
	 * of compiled classes - TODO confirm where it is updated) */
	guint total;
	/* Completion/error callback: err is NULL when all classes are done */
	void (*cb)(guint ncompiled, GError *err, void *cbd);
	/* Opaque pointer passed back to `cb` as its last argument */
	void *cbd;
};
1862 
1863 static void
rspamd_re_cache_compile_err(EV_P_ ev_timer * w,GError * err,struct rspamd_re_cache_hs_compile_cbdata * cbdata,bool is_fatal)1864 rspamd_re_cache_compile_err (EV_P_ ev_timer *w, GError *err,
1865 		struct rspamd_re_cache_hs_compile_cbdata *cbdata, bool is_fatal)
1866 {
1867 	cbdata->cb (cbdata->total, err, cbdata->cbd);
1868 
1869 	if (is_fatal) {
1870 		ev_timer_stop(EV_A_ w);
1871 		g_free(w);
1872 		g_free(cbdata);
1873 	}
1874 	else {
1875 		/* Continue compilation */
1876 		ev_timer_again(EV_A_ w);
1877 	}
1878 	g_error_free (err);
1879 }
1880 
1881 static void
rspamd_re_cache_compile_timer_cb(EV_P_ ev_timer * w,int revents)1882 rspamd_re_cache_compile_timer_cb (EV_P_ ev_timer *w, int revents )
1883 {
1884 	struct rspamd_re_cache_hs_compile_cbdata *cbdata =
1885 			(struct rspamd_re_cache_hs_compile_cbdata *)w->data;
1886 	GHashTableIter cit;
1887 	gpointer k, v;
1888 	struct rspamd_re_class *re_class;
1889 	gchar path[PATH_MAX], npath[PATH_MAX];
1890 	hs_database_t *test_db;
1891 	gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags;
1892 	rspamd_cryptobox_fast_hash_state_t crc_st;
1893 	guint64 crc;
1894 	rspamd_regexp_t *re;
1895 	hs_compile_error_t *hs_errors = NULL;
1896 	guint *hs_flags = NULL;
1897 	const hs_expr_ext_t **hs_exts = NULL;
1898 	gchar **hs_pats = NULL;
1899 	gchar *hs_serialized = NULL;
1900 	gsize serialized_len;
1901 	struct iovec iov[7];
1902 	struct rspamd_re_cache *cache;
1903 	GError *err;
1904 	pid_t our_pid = getpid ();
1905 
1906 	cache = cbdata->cache;
1907 
1908 	if (!g_hash_table_iter_next (&cbdata->it, &k, &v)) {
1909 		/* All done */
1910 		ev_timer_stop (EV_A_ w);
1911 		cbdata->cb (cbdata->total, NULL, cbdata->cbd);
1912 		g_free (w);
1913 		g_free (cbdata);
1914 
1915 		return;
1916 	}
1917 
1918 	re_class = v;
1919 	rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cbdata->cache_dir,
1920 			G_DIR_SEPARATOR, re_class->hash);
1921 
1922 	if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, TRUE, TRUE)) {
1923 
1924 		fd = open (path, O_RDONLY, 00600);
1925 
1926 		/* Read number of regexps */
1927 		g_assert (fd != -1);
1928 		g_assert (lseek (fd, RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt), SEEK_SET) != -1);
1929 		g_assert (read (fd, &n, sizeof (n)) == sizeof (n));
1930 		close (fd);
1931 
1932 		if (re_class->type_len > 0) {
1933 			if (!cbdata->silent) {
1934 				msg_info_re_cache (
1935 						"skip already valid class %s(%*s) to cache %6s, %d regexps",
1936 						rspamd_re_cache_type_to_string (re_class->type),
1937 						(gint) re_class->type_len - 1,
1938 						re_class->type_data,
1939 						re_class->hash,
1940 						n);
1941 			}
1942 		}
1943 		else {
1944 			if (!cbdata->silent) {
1945 				msg_info_re_cache (
1946 						"skip already valid class %s to cache %6s, %d regexps",
1947 						rspamd_re_cache_type_to_string (re_class->type),
1948 						re_class->hash,
1949 						n);
1950 			}
1951 		}
1952 
1953 		ev_timer_again (EV_A_ w);
1954 		return;
1955 	}
1956 
1957 	rspamd_snprintf (path, sizeof (path), "%s%c%s.%P.hs.new", cbdata->cache_dir,
1958 			G_DIR_SEPARATOR, re_class->hash, our_pid);
1959 	fd = open (path, O_CREAT|O_TRUNC|O_EXCL|O_WRONLY, 00600);
1960 
1961 	if (fd == -1) {
1962 		err = g_error_new (rspamd_re_cache_quark (), errno,
1963 				"cannot open file %s: %s", path, strerror (errno));
1964 		rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
1965 		return;
1966 	}
1967 
1968 	g_hash_table_iter_init (&cit, re_class->re);
1969 	n = g_hash_table_size (re_class->re);
1970 	hs_flags = g_malloc0 (sizeof (*hs_flags) * n);
1971 	hs_ids = g_malloc (sizeof (*hs_ids) * n);
1972 	hs_pats = g_malloc (sizeof (*hs_pats) * n);
1973 	hs_exts = g_malloc0 (sizeof (*hs_exts) * n);
1974 	i = 0;
1975 
1976 	while (g_hash_table_iter_next (&cit, &k, &v)) {
1977 		re = v;
1978 
1979 		pcre_flags = rspamd_regexp_get_pcre_flags (re);
1980 		re_flags = rspamd_regexp_get_flags (re);
1981 
1982 		if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) {
1983 			/* Do not try to compile bad regexp */
1984 			msg_info_re_cache (
1985 					"do not try compile %s to hyperscan as it is PCRE only",
1986 					rspamd_regexp_get_pattern (re));
1987 			continue;
1988 		}
1989 
1990 		hs_flags[i] = 0;
1991 		hs_exts[i] = NULL;
1992 #ifndef WITH_PCRE2
1993 		if (pcre_flags & PCRE_FLAG(UTF8)) {
1994 			hs_flags[i] |= HS_FLAG_UTF8;
1995 		}
1996 #else
1997 		if (pcre_flags & PCRE_FLAG(UTF)) {
1998 				hs_flags[i] |= HS_FLAG_UTF8;
1999 			}
2000 #endif
2001 		if (pcre_flags & PCRE_FLAG(CASELESS)) {
2002 			hs_flags[i] |= HS_FLAG_CASELESS;
2003 		}
2004 		if (pcre_flags & PCRE_FLAG(MULTILINE)) {
2005 			hs_flags[i] |= HS_FLAG_MULTILINE;
2006 		}
2007 		if (pcre_flags & PCRE_FLAG(DOTALL)) {
2008 			hs_flags[i] |= HS_FLAG_DOTALL;
2009 		}
2010 
2011 
2012 		if (re_flags & RSPAMD_REGEXP_FLAG_LEFTMOST) {
2013 			hs_flags[i] |= HS_FLAG_SOM_LEFTMOST;
2014 		}
2015 		else if (rspamd_regexp_get_maxhits (re) == 1) {
2016 			hs_flags[i] |= HS_FLAG_SINGLEMATCH;
2017 		}
2018 
2019 		gchar *pat = rspamd_re_cache_hs_pattern_from_pcre (re);
2020 
2021 		if (hs_compile (pat,
2022 				hs_flags[i],
2023 				cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
2024 				&cache->plt,
2025 				&test_db,
2026 				&hs_errors) != HS_SUCCESS) {
2027 			msg_info_re_cache ("cannot compile '%s' to hyperscan: '%s', try prefilter match",
2028 					pat,
2029 					hs_errors != NULL ? hs_errors->message : "unknown error");
2030 			hs_free_compile_error (hs_errors);
2031 
2032 			/* The approximation operation might take a significant
2033 			 * amount of time, so we need to check if it's finite
2034 			 */
2035 			if (rspamd_re_cache_is_finite (cache, re, hs_flags[i], cbdata->max_time)) {
2036 				hs_flags[i] |= HS_FLAG_PREFILTER;
2037 				hs_ids[i] = rspamd_regexp_get_cache_id (re);
2038 				hs_pats[i] = pat;
2039 				i++;
2040 			}
2041 			else {
2042 				g_free (pat); /* Avoid leak */
2043 			}
2044 		}
2045 		else {
2046 			hs_ids[i] = rspamd_regexp_get_cache_id (re);
2047 			hs_pats[i] = pat;
2048 			i ++;
2049 			hs_free_database (test_db);
2050 		}
2051 	}
2052 	/* Adjust real re number */
2053 	n = i;
2054 
2055 #define CLEANUP_ALLOCATED(is_err) do {    \
2056     g_free (hs_flags);                    \
2057     g_free (hs_ids);                    \
2058     for (guint j = 0; j < i; j ++) {    \
2059         g_free (hs_pats[j]);            \
2060     }                                    \
2061     g_free (hs_pats);                    \
2062     g_free (hs_exts);                    \
2063     if (is_err) {                         \
2064         close (fd);                            \
2065         unlink (path);                        \
2066         if (hs_errors) hs_free_compile_error (hs_errors); \
2067     }                                        \
2068 } while(0)
2069 
2070 	if (n > 0) {
2071 		/* Create the hs tree */
2072 		hs_errors = NULL;
2073 		if (hs_compile_ext_multi ((const char **)hs_pats,
2074 				hs_flags,
2075 				hs_ids,
2076 				hs_exts,
2077 				n,
2078 				cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
2079 				&cache->plt,
2080 				&test_db,
2081 				&hs_errors) != HS_SUCCESS) {
2082 
2083 			err = g_error_new (rspamd_re_cache_quark (), EINVAL,
2084 					"cannot create tree of regexp when processing '%s': %s",
2085 					hs_pats[hs_errors->expression], hs_errors->message);
2086 			CLEANUP_ALLOCATED(true);
2087 			rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2088 
2089 			return;
2090 		}
2091 
2092 		if (hs_serialize_database (test_db, &hs_serialized,
2093 				&serialized_len) != HS_SUCCESS) {
2094 			err = g_error_new (rspamd_re_cache_quark (),
2095 					errno,
2096 					"cannot serialize tree of regexp for %s",
2097 					re_class->hash);
2098 
2099 			CLEANUP_ALLOCATED(true);
2100 			hs_free_database (test_db);
2101 			rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2102 			return;
2103 		}
2104 
2105 		hs_free_database (test_db);
2106 
2107 		/*
2108 		 * Magic - 8 bytes
2109 		 * Platform - sizeof (platform)
2110 		 * n - number of regexps
2111 		 * n * <regexp ids>
2112 		 * n * <regexp flags>
2113 		 * crc - 8 bytes checksum
2114 		 * <hyperscan blob>
2115 		 */
2116 		rspamd_cryptobox_fast_hash_init (&crc_st, 0xdeadbabe);
2117 		/* IDs -> Flags -> Hs blob */
2118 		rspamd_cryptobox_fast_hash_update (&crc_st,
2119 				hs_ids, sizeof (*hs_ids) * n);
2120 		rspamd_cryptobox_fast_hash_update (&crc_st,
2121 				hs_flags, sizeof (*hs_flags) * n);
2122 		rspamd_cryptobox_fast_hash_update (&crc_st,
2123 				hs_serialized, serialized_len);
2124 		crc = rspamd_cryptobox_fast_hash_final (&crc_st);
2125 
2126 		if (cache->vectorized_hyperscan) {
2127 			iov[0].iov_base = (void *) rspamd_hs_magic_vector;
2128 		}
2129 		else {
2130 			iov[0].iov_base = (void *) rspamd_hs_magic;
2131 		}
2132 
2133 		iov[0].iov_len = RSPAMD_HS_MAGIC_LEN;
2134 		iov[1].iov_base = &cache->plt;
2135 		iov[1].iov_len = sizeof (cache->plt);
2136 		iov[2].iov_base = &n;
2137 		iov[2].iov_len = sizeof (n);
2138 		iov[3].iov_base = hs_ids;
2139 		iov[3].iov_len = sizeof (*hs_ids) * n;
2140 		iov[4].iov_base = hs_flags;
2141 		iov[4].iov_len = sizeof (*hs_flags) * n;
2142 		iov[5].iov_base = &crc;
2143 		iov[5].iov_len = sizeof (crc);
2144 		iov[6].iov_base = hs_serialized;
2145 		iov[6].iov_len = serialized_len;
2146 
2147 		if (writev (fd, iov, G_N_ELEMENTS (iov)) == -1) {
2148 			err = g_error_new (rspamd_re_cache_quark (),
2149 					errno,
2150 					"cannot serialize tree of regexp to %s: %s",
2151 					path, strerror (errno));
2152 
2153 			CLEANUP_ALLOCATED(true);
2154 			g_free (hs_serialized);
2155 
2156 			rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2157 			return;
2158 		}
2159 
2160 		if (re_class->type_len > 0) {
2161 			msg_info_re_cache (
2162 					"compiled class %s(%*s) to cache %6s, %d/%d regexps",
2163 					rspamd_re_cache_type_to_string (re_class->type),
2164 					(gint) re_class->type_len - 1,
2165 					re_class->type_data,
2166 					re_class->hash,
2167 					n,
2168 					(gint)g_hash_table_size (re_class->re));
2169 		}
2170 		else {
2171 			msg_info_re_cache (
2172 					"compiled class %s to cache %6s, %d/%d regexps",
2173 					rspamd_re_cache_type_to_string (re_class->type),
2174 					re_class->hash,
2175 					n,
2176 					(gint)g_hash_table_size (re_class->re));
2177 		}
2178 
2179 		cbdata->total += n;
2180 		CLEANUP_ALLOCATED(false);
2181 
2182 		/* Now rename temporary file to the new .hs file */
2183 		rspamd_snprintf (npath, sizeof (npath), "%s%c%s.hs", cbdata->cache_dir,
2184 				G_DIR_SEPARATOR, re_class->hash);
2185 
2186 		if (rename (path, npath) == -1) {
2187 			err = g_error_new (rspamd_re_cache_quark (),
2188 					errno,
2189 					"cannot rename %s to %s: %s",
2190 					path, npath, strerror (errno));
2191 			unlink (path);
2192 			close (fd);
2193 
2194 			rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2195 			return;
2196 		}
2197 
2198 		close (fd);
2199 	}
2200 	else {
2201 		err = g_error_new (rspamd_re_cache_quark (),
2202 				errno,
2203 				"no suitable regular expressions %s (%d original): "
2204 				"remove temporary file %s",
2205 				rspamd_re_cache_type_to_string (re_class->type),
2206 				(gint)g_hash_table_size (re_class->re),
2207 				path);
2208 
2209 		CLEANUP_ALLOCATED(true);
2210 		rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
2211 
2212 		return;
2213 	}
2214 
2215 	/* Continue process */
2216 	ev_timer_again (EV_A_ w);
2217 }
2218 
2219 #endif
2220 
2221 gint
rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache * cache,const char * cache_dir,gdouble max_time,gboolean silent,struct ev_loop * event_loop,void (* cb)(guint ncompiled,GError * err,void * cbd),void * cbd)2222 rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache,
2223 								   const char *cache_dir,
2224 								   gdouble max_time,
2225 								   gboolean silent,
2226 								   struct ev_loop *event_loop,
2227 								   void (*cb)(guint ncompiled, GError *err, void *cbd),
2228 								   void *cbd)
2229 {
2230 	g_assert (cache != NULL);
2231 	g_assert (cache_dir != NULL);
2232 
2233 #ifndef WITH_HYPERSCAN
2234 	return -1;
2235 #else
2236 	static ev_timer *timer;
2237 	static const ev_tstamp timer_interval = 0.1;
2238 	struct rspamd_re_cache_hs_compile_cbdata *cbdata;
2239 
2240 	cbdata = g_malloc0 (sizeof (*cbdata));
2241 	g_hash_table_iter_init (&cbdata->it, cache->re_classes);
2242 	cbdata->cache = cache;
2243 	cbdata->cache_dir = cache_dir;
2244 	cbdata->cb = cb;
2245 	cbdata->cbd = cbd;
2246 	cbdata->max_time = max_time;
2247 	cbdata->silent = silent;
2248 	cbdata->total = 0;
2249 	timer = g_malloc0 (sizeof (*timer));
2250 	timer->data = (void *)cbdata; /* static */
2251 
2252 	ev_timer_init (timer, rspamd_re_cache_compile_timer_cb,
2253 			timer_interval, timer_interval);
2254 	ev_timer_start (event_loop, timer);
2255 
2256 	return 0;
2257 #endif
2258 }
2259 
2260 gboolean
rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache * cache,const char * path,gboolean silent,gboolean try_load)2261 rspamd_re_cache_is_valid_hyperscan_file (struct rspamd_re_cache *cache,
2262 		const char *path, gboolean silent, gboolean try_load)
2263 {
2264 	g_assert (cache != NULL);
2265 	g_assert (path != NULL);
2266 
2267 #ifndef WITH_HYPERSCAN
2268 	return FALSE;
2269 #else
2270 	gint fd, n, ret;
2271 	guchar magicbuf[RSPAMD_HS_MAGIC_LEN];
2272 	const guchar *mb;
2273 	GHashTableIter it;
2274 	gpointer k, v;
2275 	struct rspamd_re_class *re_class;
2276 	gsize len;
2277 	const gchar *hash_pos;
2278 	hs_platform_info_t test_plt;
2279 	hs_database_t *test_db = NULL;
2280 	guchar *map, *p, *end;
2281 	rspamd_cryptobox_fast_hash_state_t crc_st;
2282 	guint64 crc, valid_crc;
2283 
2284 	len = strlen (path);
2285 
2286 	if (len < sizeof (rspamd_cryptobox_HASHBYTES + 3)) {
2287 		if (!silent)  {
2288 			msg_err_re_cache ("cannot open hyperscan cache file %s: too short filename",
2289 					path);
2290 		}
2291 		return FALSE;
2292 	}
2293 
2294 	if (memcmp (path + len - 3, ".hs", 3) != 0) {
2295 		if (!silent)  {
2296 			msg_err_re_cache ("cannot open hyperscan cache file %s: not ending with .hs",
2297 					path);
2298 		}
2299 		return FALSE;
2300 	}
2301 
2302 	hash_pos = path + len - 3 - (sizeof (re_class->hash) - 1);
2303 	g_hash_table_iter_init (&it, cache->re_classes);
2304 
2305 	while (g_hash_table_iter_next (&it, &k, &v)) {
2306 		re_class = v;
2307 
2308 		if (memcmp (hash_pos, re_class->hash, sizeof (re_class->hash) - 1) == 0) {
2309 			/* Open file and check magic */
2310 			gssize r;
2311 
2312 			fd = open (path, O_RDONLY);
2313 
2314 			if (fd == -1) {
2315 				if (errno != ENOENT || !silent) {
2316 					msg_err_re_cache ("cannot open hyperscan cache file %s: %s",
2317 							path, strerror (errno));
2318 				}
2319 				return FALSE;
2320 			}
2321 
2322 			if ((r = read (fd, magicbuf, sizeof (magicbuf))) != sizeof (magicbuf)) {
2323 				if (r == -1) {
2324 					msg_err_re_cache ("cannot read magic from hyperscan "
2325 									  "cache file %s: %s",
2326 							path, strerror (errno));
2327 				}
2328 				else {
2329 					msg_err_re_cache ("truncated read magic from hyperscan "
2330 									  "cache file %s: %z, %z wanted",
2331 							path, r, (gsize)sizeof (magicbuf));
2332 				}
2333 				close (fd);
2334 				return FALSE;
2335 			}
2336 
2337 			if (cache->vectorized_hyperscan) {
2338 				mb = rspamd_hs_magic_vector;
2339 			}
2340 			else {
2341 				mb = rspamd_hs_magic;
2342 			}
2343 
2344 			if (memcmp (magicbuf, mb, sizeof (magicbuf)) != 0) {
2345 				msg_err_re_cache ("cannot open hyperscan cache file %s: "
2346 						"bad magic ('%*xs', '%*xs' expected)",
2347 						path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf,
2348 						(int) RSPAMD_HS_MAGIC_LEN, mb);
2349 
2350 				close (fd);
2351 				return FALSE;
2352 			}
2353 
2354 			if ((r = read (fd, &test_plt, sizeof (test_plt))) != sizeof (test_plt)) {
2355 				if (r == -1) {
2356 					msg_err_re_cache ("cannot read platform data from hyperscan "
2357 									  "cache file %s: %s",
2358 							path, strerror (errno));
2359 				}
2360 				else {
2361 					msg_err_re_cache ("truncated read platform data from hyperscan "
2362 									  "cache file %s: %z, %z wanted",
2363 							path, r, (gsize)sizeof (magicbuf));
2364 				}
2365 
2366 				close (fd);
2367 				return FALSE;
2368 			}
2369 
2370 			if (memcmp (&test_plt, &cache->plt, sizeof (test_plt)) != 0) {
2371 				msg_err_re_cache ("cannot open hyperscan cache file %s: "
2372 						"compiled for a different platform",
2373 						path);
2374 
2375 				close (fd);
2376 				return FALSE;
2377 			}
2378 
2379 			close (fd);
2380 
2381 			if (try_load) {
2382 				map = rspamd_file_xmap (path, PROT_READ, &len, TRUE);
2383 
2384 				if (map == NULL) {
2385 					msg_err_re_cache ("cannot mmap hyperscan cache file %s: "
2386 							"%s",
2387 							path, strerror (errno));
2388 					return FALSE;
2389 				}
2390 
2391 				p = map + RSPAMD_HS_MAGIC_LEN + sizeof (test_plt);
2392 				end = map + len;
2393 				n = *(gint *)p;
2394 				p += sizeof (gint);
2395 
2396 				if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */
2397 						sizeof (guint64) + /* crc */
2398 						RSPAMD_HS_MAGIC_LEN + /* header */
2399 						sizeof (cache->plt) > len) {
2400 					/* Some wrong amount of regexps */
2401 					msg_err_re_cache ("bad number of expressions in %s: %d",
2402 							path, n);
2403 					munmap (map, len);
2404 					return FALSE;
2405 				}
2406 
2407 				/*
2408 				 * Magic - 8 bytes
2409 				 * Platform - sizeof (platform)
2410 				 * n - number of regexps
2411 				 * n * <regexp ids>
2412 				 * n * <regexp flags>
2413 				 * crc - 8 bytes checksum
2414 				 * <hyperscan blob>
2415 				 */
2416 
2417 				memcpy (&crc, p + n * 2 * sizeof (gint), sizeof (crc));
2418 				rspamd_cryptobox_fast_hash_init (&crc_st, 0xdeadbabe);
2419 				/* IDs */
2420 				rspamd_cryptobox_fast_hash_update (&crc_st, p, n * sizeof (gint));
2421 				/* Flags */
2422 				rspamd_cryptobox_fast_hash_update (&crc_st, p + n * sizeof (gint),
2423 						n * sizeof (gint));
2424 				/* HS database */
2425 				p += n * sizeof (gint) * 2 + sizeof (guint64);
2426 				rspamd_cryptobox_fast_hash_update (&crc_st, p, end - p);
2427 				valid_crc = rspamd_cryptobox_fast_hash_final (&crc_st);
2428 
2429 				if (crc != valid_crc) {
2430 					msg_warn_re_cache ("outdated or invalid hs database in %s: "
2431 							"crc read %xL, crc expected %xL", path, crc, valid_crc);
2432 					munmap (map, len);
2433 
2434 					return FALSE;
2435 				}
2436 
2437 				if ((ret = hs_deserialize_database (p, end - p, &test_db))
2438 						!= HS_SUCCESS) {
2439 					msg_err_re_cache ("bad hs database in %s: %d", path, ret);
2440 					munmap (map, len);
2441 
2442 					return FALSE;
2443 				}
2444 
2445 				hs_free_database (test_db);
2446 				munmap (map, len);
2447 			}
2448 			/* XXX: add crc check */
2449 
2450 			return TRUE;
2451 		}
2452 	}
2453 
2454 	if (!silent) {
2455 		msg_warn_re_cache ("unknown hyperscan cache file %s", path);
2456 	}
2457 
2458 	return FALSE;
2459 #endif
2460 }
2461 
2462 
2463 enum rspamd_hyperscan_status
rspamd_re_cache_load_hyperscan(struct rspamd_re_cache * cache,const char * cache_dir,bool try_load)2464 rspamd_re_cache_load_hyperscan (struct rspamd_re_cache *cache,
2465 		const char *cache_dir, bool try_load)
2466 {
2467 	g_assert (cache != NULL);
2468 	g_assert (cache_dir != NULL);
2469 
2470 #ifndef WITH_HYPERSCAN
2471 	return RSPAMD_HYPERSCAN_UNSUPPORTED;
2472 #else
2473 	gchar path[PATH_MAX];
2474 	gint fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret;
2475 	GHashTableIter it;
2476 	gpointer k, v;
2477 	guint8 *map, *p, *end;
2478 	struct rspamd_re_class *re_class;
2479 	struct rspamd_re_cache_elt *elt;
2480 	struct stat st;
2481 	gboolean has_valid = FALSE, all_valid = FALSE;
2482 
2483 	g_hash_table_iter_init (&it, cache->re_classes);
2484 
2485 	while (g_hash_table_iter_next (&it, &k, &v)) {
2486 		re_class = v;
2487 		rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir,
2488 				G_DIR_SEPARATOR, re_class->hash);
2489 
2490 		if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, try_load, FALSE)) {
2491 			msg_debug_re_cache ("load hyperscan database from '%s'",
2492 					re_class->hash);
2493 
2494 			fd = open (path, O_RDONLY);
2495 
2496 			/* Read number of regexps */
2497 			g_assert (fd != -1);
2498 			fstat (fd, &st);
2499 
2500 			map = mmap (NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
2501 
2502 			if (map == MAP_FAILED) {
2503 				if (!try_load) {
2504 					msg_err_re_cache ("cannot mmap %s: %s", path, strerror (errno));
2505 				}
2506 				else {
2507 					msg_debug_re_cache ("cannot mmap %s: %s", path, strerror (errno));
2508 				}
2509 
2510 				close (fd);
2511 				all_valid = FALSE;
2512 				continue;
2513 			}
2514 
2515 			close (fd);
2516 			end = map + st.st_size;
2517 			p = map + RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt);
2518 			n = *(gint *)p;
2519 
2520 			if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */
2521 							sizeof (guint64) + /* crc */
2522 							RSPAMD_HS_MAGIC_LEN + /* header */
2523 							sizeof (cache->plt) > (gsize)st.st_size) {
2524 				/* Some wrong amount of regexps */
2525 				if (!try_load) {
2526 					msg_err_re_cache ("bad number of expressions in %s: %d",
2527 							path, n);
2528 				}
2529 				else {
2530 					msg_debug_re_cache ("bad number of expressions in %s: %d",
2531 							path, n);
2532 				}
2533 
2534 				munmap (map, st.st_size);
2535 				all_valid = FALSE;
2536 				continue;
2537 			}
2538 
2539 			total += n;
2540 			p += sizeof (n);
2541 			hs_ids = g_malloc (n * sizeof (*hs_ids));
2542 			memcpy (hs_ids, p, n * sizeof (*hs_ids));
2543 			p += n * sizeof (*hs_ids);
2544 			hs_flags = g_malloc (n * sizeof (*hs_flags));
2545 			memcpy (hs_flags, p, n * sizeof (*hs_flags));
2546 
2547 			/* Skip crc */
2548 			p += n * sizeof (*hs_ids) + sizeof (guint64);
2549 
2550 			/* Cleanup */
2551 			if (re_class->hs_scratch != NULL) {
2552 				hs_free_scratch (re_class->hs_scratch);
2553 			}
2554 
2555 			if (re_class->hs_db != NULL) {
2556 				hs_free_database (re_class->hs_db);
2557 			}
2558 
2559 			if (re_class->hs_ids) {
2560 				g_free (re_class->hs_ids);
2561 			}
2562 
2563 			re_class->hs_ids = NULL;
2564 			re_class->hs_scratch = NULL;
2565 			re_class->hs_db = NULL;
2566 
2567 			if ((ret = hs_deserialize_database (p, end - p, &re_class->hs_db))
2568 					!= HS_SUCCESS) {
2569 				if (!try_load) {
2570 					msg_err_re_cache ("bad hs database in %s: %d", path, ret);
2571 				}
2572 				else {
2573 					msg_debug_re_cache ("bad hs database in %s: %d", path, ret);
2574 				}
2575 				munmap (map, st.st_size);
2576 				g_free (hs_ids);
2577 				g_free (hs_flags);
2578 
2579 				re_class->hs_ids = NULL;
2580 				re_class->hs_scratch = NULL;
2581 				re_class->hs_db = NULL;
2582 				all_valid = FALSE;
2583 
2584 				continue;
2585 			}
2586 
2587 			munmap (map, st.st_size);
2588 
2589 			g_assert (hs_alloc_scratch (re_class->hs_db,
2590 					&re_class->hs_scratch) == HS_SUCCESS);
2591 
2592 			/*
2593 			 * Now find hyperscan elts that are successfully compiled and
2594 			 * specify that they should be matched using hyperscan
2595 			 */
2596 			for (i = 0; i < n; i ++) {
2597 				g_assert ((gint)cache->re->len > hs_ids[i] && hs_ids[i] >= 0);
2598 				elt = g_ptr_array_index (cache->re, hs_ids[i]);
2599 
2600 				if (hs_flags[i] & HS_FLAG_PREFILTER) {
2601 					elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE;
2602 				}
2603 				else {
2604 					elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN;
2605 				}
2606 			}
2607 
2608 			re_class->hs_ids = hs_ids;
2609 			g_free (hs_flags);
2610 			re_class->nhs = n;
2611 
2612 			if (!has_valid) {
2613 				has_valid = TRUE;
2614 				all_valid = TRUE;
2615 			}
2616 		}
2617 		else {
2618 			if (!try_load) {
2619 				msg_err_re_cache ("invalid hyperscan hash file '%s'",
2620 						path);
2621 			}
2622 			else {
2623 				msg_debug_re_cache ("invalid hyperscan hash file '%s'",
2624 						path);
2625 			}
2626 			all_valid = FALSE;
2627 			continue;
2628 		}
2629 	}
2630 
2631 	if (has_valid) {
2632 		if (all_valid) {
2633 			msg_info_re_cache ("full hyperscan database of %d regexps has been loaded", total);
2634 			cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_FULL;
2635 		}
2636 		else {
2637 			msg_info_re_cache ("partial hyperscan database of %d regexps has been loaded", total);
2638 			cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_PARTIAL;
2639 		}
2640 	}
2641 	else {
2642 		msg_info_re_cache ("hyperscan database has NOT been loaded; no valid expressions");
2643 		cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOAD_ERROR;
2644 	}
2645 
2646 
2647 
2648 	return cache->hyperscan_loaded;
2649 #endif
2650 }
2651 
/**
 * Register the Lua reference `ref` under the selector name `sname`.
 *
 * A new name is stored as a private copy of `sname`. If the name is already
 * known, a warning is logged, the previously stored reference is released
 * from the Lua registry (only when `cache->L` is set) and the entry is
 * overwritten with `ref`.
 *
 * @param cache regexp cache owning the selectors table
 * @param sname selector name
 * @param ref   Lua registry reference to associate with the name
 */
void rspamd_re_cache_add_selector (struct rspamd_re_cache *cache,
								   const gchar *sname,
								   gint ref)
{
	khiter_t slot = kh_get (lua_selectors_hash, cache->selectors,
			(gchar *)sname);

	if (slot != kh_end (cache->selectors)) {
		/* Known name: release the old reference before overwriting it */
		msg_warn_re_cache ("replacing selector with name %s", sname);

		if (cache->L) {
			luaL_unref (cache->L, LUA_REGISTRYINDEX,
					kh_value (cache->selectors, slot));
		}
	}
	else {
		/* New name: the table keeps its own copy of the key */
		gint put_ret;
		gchar *key_copy = g_strdup (sname);

		slot = kh_put (lua_selectors_hash, cache->selectors, key_copy,
				&put_ret);
	}

	kh_value (cache->selectors, slot) = ref;
}
2678