1 /*-
2  * Copyright 2018 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "map_helpers.h"
18 #include "map_private.h"
19 #include "khash.h"
20 #include "radix.h"
21 #include "rspamd.h"
22 #include "cryptobox.h"
23 #include "mempool_vars_internal.h"
24 #include "contrib/fastutf8/fastutf8.h"
25 #include "contrib/cdb/cdb.h"
26 
27 #ifdef WITH_HYPERSCAN
28 #include "hs.h"
29 #endif
30 #ifndef WITH_PCRE2
31 #include <pcre.h>
32 #else
33 #include <pcre2.h>
34 #endif
35 
36 
/* Seed of the fast hash state that accumulates the keys of hash and radix maps */
static const guint64 map_hash_seed = 0xdeadbabeULL;
/* Default value handed to the insert callback for key-only lines (see rspamd_radix_read) */
static const gchar * const hash_fill = "1";
39 
/*
 * Value stored for every map key. It is allocated as a single chunk of
 * sizeof (struct) + strlen (value) + 1 bytes, the string lives in the
 * trailing flexible array.
 */
struct rspamd_map_helper_value {
	gsize hits; /* Counter reported to traverse callbacks, zeroed on traverse when reset_hits is set */
	gconstpointer key; /* Points to the key string held by the hash table entry */
	gchar value[]; /* Null terminated */
};
45 
/*
 * Hash and equality callbacks for rspamd_ftok_t keys (begin + len pairs).
 * Hashing goes through rspamd_icase_hash and comparison through rspamd_lc_cmp;
 * NOTE(review): both are presumably case-insensitive - confirm in their definitions.
 */
#define rspamd_map_ftok_hash(t) (rspamd_icase_hash((t).begin, (t).len, rspamd_hash_seed ()))
#define rspamd_map_ftok_equal(a, b) ((a).len == (b).len && rspamd_lc_cmp((a).begin, (b).begin, (a).len) == 0)

/* Hash map (the `true` argument makes it a map, not a set): rspamd_ftok_t -> struct rspamd_map_helper_value * */
KHASH_INIT (rspamd_map_hash, rspamd_ftok_t,
		struct rspamd_map_helper_value *, true,
		rspamd_map_ftok_hash, rspamd_map_ftok_equal);
52 
/* State of a radix map: a trie for lookups plus a hash of the textual keys */
struct rspamd_radix_map_helper {
	rspamd_mempool_t *pool; /* Pool that holds this structure, the trie and the copied keys/values */
	khash_t(rspamd_map_hash) *htb; /* Textual key -> value, used to detect duplicates and to traverse */
	radix_compressed_t *trie; /* Filled by rspamd_radix_add_iplist on insertion */
	struct rspamd_map *map; /* Parent map; rspamd_map_helper_new_radix accepts NULL here */
	rspamd_cryptobox_fast_hash_state_t hst; /* Running hash of inserted keys, finalised into map->digest */
};
60 
/* State of a plain key/value hash map */
struct rspamd_hash_map_helper {
	rspamd_mempool_t *pool; /* Pool that holds this structure and the copied keys/values */
	khash_t(rspamd_map_hash) *htb; /* Key -> value storage */
	struct rspamd_map *map; /* Parent map; rspamd_map_helper_new_hash accepts NULL here */
	rspamd_cryptobox_fast_hash_state_t hst; /* Running hash of inserted keys, finalised into map->digest */
};
67 
/*
 * State of a CDB backed map.
 * NOTE(review): no user of this structure is visible in this part of the
 * file, so the field notes below are assumptions to be confirmed.
 */
struct rspamd_cdb_map_helper {
	GQueue cdbs; /* presumably the queue of opened cdb files - TODO confirm */
	struct rspamd_map *map; /* Parent map */
	rspamd_cryptobox_fast_hash_state_t hst; /* Running hash state, same type as in the other helpers */
	gsize total_size; /* presumably the summed size of all cdb files - TODO confirm */
};
74 
/* State of a regexp map: compiled expressions plus the values bound to them */
struct rspamd_regexp_map_helper {
	rspamd_cryptobox_hash_state_t hst; /* Running hash updated with every accepted regexp key */
	guchar re_digest[rspamd_cryptobox_HASHBYTES]; /* Digest used to build the hyperscan cache file name */
	rspamd_mempool_t *pool; /* Pool that holds this structure and the copied keys/values */
	struct rspamd_map *map; /* Parent map */
	GPtrArray *regexps; /* rspamd_regexp_t *, one per accepted key; unreferenced on destroy */
	GPtrArray *values; /* struct rspamd_map_helper_value *, appended together with `regexps` */
	khash_t(rspamd_map_hash) *htb; /* Pattern text -> value, used for duplicate detection and traversal */
	enum rspamd_regexp_map_flags map_flags; /* Creation flags; the UTF flag is added when a regexp needs it */
#ifdef WITH_HYPERSCAN
	hs_database_t *hs_db; /* Hyperscan database, may be deserialised from the cache file */
	hs_scratch_t *hs_scratch; /* Hyperscan scratch space, released on destroy */
	gchar **patterns; /* regexps->len strings, each released with g_free on destroy */
	gint *flags; /* Released with g_free on destroy */
	gint *ids; /* Released with g_free on destroy */
#endif
};
92 
93 /**
94  * FSM for parsing lists
95  */
96 
/*
 * Copies the token [c, p) into a freshly g_malloc'ed buffer `key` and sets
 * `stripped_key` to the same buffer with surrounding whitespace removed.
 * Leading whitespace is skipped by advancing `c` itself. Expects the locals
 * `c`, `p`, `key` and `stripped_key` to exist at the expansion site; the
 * caller must g_free (key), not stripped_key.
 */
#define MAP_STORE_KEY do { \
	while (g_ascii_isspace (*c) && p > c) { c ++; } \
	key = g_malloc (p - c + 1); \
	rspamd_strlcpy (key, c, p - c + 1); \
	stripped_key = g_strstrip (key); \
} while (0)

/*
 * Same as MAP_STORE_KEY but fills `value` / `stripped_value`; the caller
 * must g_free (value).
 */
#define MAP_STORE_VALUE do { \
	while (g_ascii_isspace (*c) && p > c) { c ++; } \
	value = g_malloc (p - c + 1); \
	rspamd_strlcpy (value, c, p - c + 1); \
	stripped_value = g_strstrip (value); \
} while (0)
110 
111 gchar *
rspamd_parse_kv_list(gchar * chunk,gint len,struct map_cb_data * data,rspamd_map_insert_func func,const gchar * default_value,gboolean final)112 rspamd_parse_kv_list (
113 		gchar * chunk,
114 		gint len,
115 		struct map_cb_data *data,
116 		rspamd_map_insert_func func,
117 		const gchar *default_value,
118 		gboolean final)
119 {
120 	enum {
121 		map_skip_spaces_before_key = 0,
122 		map_read_key,
123 		map_read_key_quoted,
124 		map_read_key_slashed,
125 		map_skip_spaces_after_key,
126 		map_backslash_quoted,
127 		map_backslash_slashed,
128 		map_read_key_after_slash,
129 		map_read_value,
130 		map_read_comment_start,
131 		map_skip_comment,
132 		map_read_eol,
133 	};
134 
135 	gchar *c, *p, *key = NULL, *value = NULL, *stripped_key, *stripped_value, *end;
136 	struct rspamd_map *map = data->map;
137 	guint line_number = 0;
138 
139 	p = chunk;
140 	c = p;
141 	end = p + len;
142 
143 	while (p < end) {
144 		switch (data->state) {
145 		case map_skip_spaces_before_key:
146 			if (g_ascii_isspace (*p)) {
147 				p ++;
148 			}
149 			else {
150 				if (*p == '"') {
151 					p++;
152 					c = p;
153 					data->state = map_read_key_quoted;
154 				}
155 				else if (*p == '/') {
156 					/* Note that c is on '/' here as '/' is a part of key */
157 					c = p;
158 					p++;
159 					data->state = map_read_key_slashed;
160 				}
161 				else {
162 					c = p;
163 					data->state = map_read_key;
164 				}
165 			}
166 			break;
167 		case map_read_key:
168 			/* read key */
169 			/* Check here comments, eol and end of buffer */
170 			if (*p == '#' && (p == c || *(p - 1) != '\\')) {
171 				if (p - c > 0) {
172 					/* Store a single key */
173 					MAP_STORE_KEY;
174 					func (data->cur_data, stripped_key, default_value);
175 					msg_debug_map ("insert key only pair: %s -> %s; line: %d",
176 							stripped_key, default_value, line_number);
177 					g_free (key);
178 				}
179 
180 				key = NULL;
181 				data->state = map_read_comment_start;
182 			}
183 			else if (*p == '\r' || *p == '\n') {
184 				if (p - c > 0) {
185 					/* Store a single key */
186 					MAP_STORE_KEY;
187 					func (data->cur_data, stripped_key, default_value);
188 					msg_debug_map ("insert key only pair: %s -> %s; line: %d",
189 							stripped_key, default_value, line_number);
190 					g_free (key);
191 				}
192 
193 				data->state = map_read_eol;
194 				key = NULL;
195 			}
196 			else if (g_ascii_isspace (*p)) {
197 				if (p - c > 0) {
198 					MAP_STORE_KEY;
199 					data->state = map_skip_spaces_after_key;
200 				}
201 				else {
202 					msg_err_map ("empty or invalid key found on line %d", line_number);
203 					data->state = map_skip_comment;
204 				}
205 			}
206 			else {
207 				p++;
208 			}
209 			break;
210 		case map_read_key_quoted:
211 			if (*p == '\\') {
212 				data->state = map_backslash_quoted;
213 				p ++;
214 			}
215 			else if (*p == '"') {
216 				/* Allow empty keys in this case */
217 				if (p - c >= 0) {
218 					MAP_STORE_KEY;
219 					data->state = map_skip_spaces_after_key;
220 				}
221 				else {
222 					g_assert_not_reached ();
223 				}
224 				p ++;
225 			}
226 			else {
227 				p ++;
228 			}
229 			break;
230 		case map_read_key_slashed:
231 			if (*p == '\\') {
232 				data->state = map_backslash_slashed;
233 				p ++;
234 			}
235 			else if (*p == '/') {
236 				/* Allow empty keys in this case */
237 				if (p - c >= 0) {
238 					data->state = map_read_key_after_slash;
239 				}
240 				else {
241 					g_assert_not_reached ();
242 				}
243 			}
244 			else {
245 				p ++;
246 			}
247 			break;
248 		case map_read_key_after_slash:
249 			/*
250 			 * This state is equal to reading of key but '/' is not
251 			 * treated specially
252 			 */
253 			if (*p == '#') {
254 				if (p - c > 0) {
255 					/* Store a single key */
256 					MAP_STORE_KEY;
257 					func (data->cur_data, stripped_key, default_value);
258 					msg_debug_map ("insert key only pair: %s -> %s; line: %d",
259 							stripped_key, default_value, line_number);
260 					g_free (key);
261 					key = NULL;
262 				}
263 
264 				data->state = map_read_comment_start;
265 			}
266 			else if (*p == '\r' || *p == '\n') {
267 				if (p - c > 0) {
268 					/* Store a single key */
269 					MAP_STORE_KEY;
270 					func (data->cur_data, stripped_key, default_value);
271 
272 					msg_debug_map ("insert key only pair: %s -> %s; line: %d",
273 							stripped_key, default_value, line_number);
274 					g_free (key);
275 					key = NULL;
276 				}
277 
278 				data->state = map_read_eol;
279 				key = NULL;
280 			}
281 			else if (g_ascii_isspace (*p)) {
282 				if (p - c > 0) {
283 					MAP_STORE_KEY;
284 					data->state = map_skip_spaces_after_key;
285 				}
286 				else {
287 					msg_err_map ("empty or invalid key found on line %d", line_number);
288 					data->state = map_skip_comment;
289 				}
290 			}
291 			else {
292 				p ++;
293 			}
294 			break;
295 		case map_backslash_quoted:
296 			p ++;
297 			data->state = map_read_key_quoted;
298 			break;
299 		case map_backslash_slashed:
300 			p ++;
301 			data->state = map_read_key_slashed;
302 			break;
303 		case map_skip_spaces_after_key:
304 			if (*p == ' ' || *p == '\t') {
305 				p ++;
306 			}
307 			else {
308 				c = p;
309 				data->state = map_read_value;
310 			}
311 			break;
312 		case map_read_value:
313 			if (key == NULL) {
314 				/* Ignore line */
315 				msg_err_map ("empty or invalid key found on line %d", line_number);
316 				data->state = map_skip_comment;
317 			}
318 			else {
319 				if (*p == '#') {
320 					if (p - c > 0) {
321 						/* Store a single key */
322 						MAP_STORE_VALUE;
323 						func (data->cur_data, stripped_key, stripped_value);
324 						msg_debug_map ("insert key value pair: %s -> %s; line: %d",
325 								stripped_key, stripped_value, line_number);
326 						g_free (key);
327 						g_free (value);
328 						key = NULL;
329 						value = NULL;
330 					} else {
331 						func (data->cur_data, stripped_key, default_value);
332 						msg_debug_map ("insert key only pair: %s -> %s; line: %d",
333 								stripped_key, default_value, line_number);
334 						g_free (key);
335 						key = NULL;
336 					}
337 
338 					data->state = map_read_comment_start;
339 				} else if (*p == '\r' || *p == '\n') {
340 					if (p - c > 0) {
341 						/* Store a single key */
342 						MAP_STORE_VALUE;
343 						func (data->cur_data, stripped_key, stripped_value);
344 						msg_debug_map ("insert key value pair: %s -> %s",
345 								stripped_key, stripped_value);
346 						g_free (key);
347 						g_free (value);
348 						key = NULL;
349 						value = NULL;
350 					} else {
351 						func (data->cur_data, stripped_key, default_value);
352 						msg_debug_map ("insert key only pair: %s -> %s",
353 								stripped_key, default_value);
354 						g_free (key);
355 						key = NULL;
356 					}
357 
358 					data->state = map_read_eol;
359 					key = NULL;
360 				}
361 				else {
362 					p++;
363 				}
364 			}
365 			break;
366 		case map_read_comment_start:
367 			if (*p == '#') {
368 				data->state = map_skip_comment;
369 				p ++;
370 				key = NULL;
371 				value = NULL;
372 			}
373 			else {
374 				g_assert_not_reached ();
375 			}
376 			break;
377 		case map_skip_comment:
378 			if (*p == '\r' || *p == '\n') {
379 				data->state = map_read_eol;
380 			}
381 			else {
382 				p ++;
383 			}
384 			break;
385 		case map_read_eol:
386 			/* Skip \r\n and whitespaces */
387 			if (*p == '\r' || *p == '\n') {
388 				if (*p == '\n') {
389 					/* We don't care about \r only line separators, they are too rare */
390 					line_number ++;
391 				}
392 				p++;
393 			}
394 			else {
395 				data->state = map_skip_spaces_before_key;
396 			}
397 			break;
398 		default:
399 			g_assert_not_reached ();
400 			break;
401 		}
402 	}
403 
404 	if (final) {
405 		/* Examine the state */
406 		switch (data->state) {
407 		case map_read_key:
408 			if (p - c > 0) {
409 				/* Store a single key */
410 				MAP_STORE_KEY;
411 				func (data->cur_data, stripped_key, default_value);
412 				msg_debug_map ("insert key only pair: %s -> %s",
413 						stripped_key, default_value);
414 				g_free (key);
415 				key = NULL;
416 			}
417 			break;
418 		case map_read_value:
419 			if (key == NULL) {
420 				/* Ignore line */
421 				msg_err_map ("empty or invalid key found on line %d", line_number);
422 				data->state = map_skip_comment;
423 			}
424 			else {
425 				if (p - c > 0) {
426 					/* Store a single key */
427 					MAP_STORE_VALUE;
428 					func (data->cur_data, stripped_key, stripped_value);
429 					msg_debug_map ("insert key value pair: %s -> %s",
430 							stripped_key, stripped_value);
431 					g_free (key);
432 					g_free (value);
433 					key = NULL;
434 					value = NULL;
435 				} else {
436 					func (data->cur_data, stripped_key, default_value);
437 					msg_debug_map ("insert key only pair: %s -> %s",
438 							stripped_key, default_value);
439 					g_free (key);
440 					key = NULL;
441 				}
442 			}
443 			break;
444 		}
445 
446 		data->state = map_skip_spaces_before_key;
447 	}
448 
449 	return c;
450 }
451 
452 /**
453  * Radix tree helper function
454  */
455 void
rspamd_map_helper_insert_radix(gpointer st,gconstpointer key,gconstpointer value)456 rspamd_map_helper_insert_radix (gpointer st, gconstpointer key, gconstpointer value)
457 {
458 	struct rspamd_radix_map_helper *r = (struct rspamd_radix_map_helper *)st;
459 	struct rspamd_map_helper_value *val;
460 	gsize vlen;
461 	khiter_t k;
462 	gconstpointer nk;
463 	rspamd_ftok_t tok;
464 	gint res;
465 	struct rspamd_map *map;
466 
467 	map = r->map;
468 	tok.begin = key;
469 	tok.len = strlen (key);
470 
471 	k = kh_get (rspamd_map_hash, r->htb, tok);
472 
473 	if (k == kh_end (r->htb)) {
474 		nk = rspamd_mempool_strdup (r->pool, key);
475 		tok.begin = nk;
476 		k = kh_put (rspamd_map_hash, r->htb, tok, &res);
477 	}
478 	else {
479 		val = kh_value (r->htb, k);
480 
481 		if (strcmp (value, val->value) == 0) {
482 			/* Same element, skip */
483 			return;
484 		}
485 		else {
486 			msg_warn_map ("duplicate radix entry found for map %s: %s (old value: '%s', new: '%s')",
487 					map->name, key, val->value, value);
488 		}
489 
490 		nk = kh_key (r->htb, k).begin;
491 		val->key = nk;
492 		kh_value (r->htb, k) = val;
493 
494 		return; /* do not touch radix in case of exact duplicate */
495 	}
496 
497 	vlen = strlen (value);
498 	val = rspamd_mempool_alloc0 (r->pool, sizeof (*val) +
499 										  vlen + 1);
500 	memcpy (val->value, value, vlen);
501 
502 	nk = kh_key (r->htb, k).begin;
503 	val->key = nk;
504 	kh_value (r->htb, k) = val;
505 	rspamd_radix_add_iplist (key, ",", r->trie, val, FALSE,
506 			r->map->name);
507 	rspamd_cryptobox_fast_hash_update (&r->hst, nk, tok.len);
508 }
509 
510 void
rspamd_map_helper_insert_radix_resolve(gpointer st,gconstpointer key,gconstpointer value)511 rspamd_map_helper_insert_radix_resolve (gpointer st, gconstpointer key, gconstpointer value)
512 {
513 	struct rspamd_radix_map_helper *r = (struct rspamd_radix_map_helper *)st;
514 	struct rspamd_map_helper_value *val;
515 	gsize vlen;
516 	khiter_t k;
517 	gconstpointer nk;
518 	rspamd_ftok_t tok;
519 	gint res;
520 	struct rspamd_map *map;
521 
522 	map = r->map;
523 	tok.begin = key;
524 	tok.len = strlen (key);
525 
526 	k = kh_get (rspamd_map_hash, r->htb, tok);
527 
528 	if (k == kh_end (r->htb)) {
529 		nk = rspamd_mempool_strdup (r->pool, key);
530 		tok.begin = nk;
531 		k = kh_put (rspamd_map_hash, r->htb, tok, &res);
532 	}
533 	else {
534 		val = kh_value (r->htb, k);
535 
536 		if (strcmp (value, val->value) == 0) {
537 			/* Same element, skip */
538 			return;
539 		}
540 		else {
541 			msg_warn_map ("duplicate radix entry found for map %s: %s (old value: '%s', new: '%s')",
542 					map->name, key, val->value, value);
543 		}
544 
545 		nk = kh_key (r->htb, k).begin;
546 		val->key = nk;
547 		kh_value (r->htb, k) = val;
548 
549 		return; /* do not touch radix in case of exact duplicate */
550 	}
551 
552 	vlen = strlen (value);
553 	val = rspamd_mempool_alloc0 (r->pool, sizeof (*val) +
554 										  vlen + 1);
555 	memcpy (val->value, value, vlen);
556 	nk = kh_key (r->htb, k).begin;
557 	val->key = nk;
558 	kh_value (r->htb, k) = val;
559 	rspamd_radix_add_iplist (key, ",", r->trie, val, TRUE,
560 			r->map->name);
561 	rspamd_cryptobox_fast_hash_update (&r->hst, nk, tok.len);
562 }
563 
564 void
rspamd_map_helper_insert_hash(gpointer st,gconstpointer key,gconstpointer value)565 rspamd_map_helper_insert_hash (gpointer st, gconstpointer key, gconstpointer value)
566 {
567 	struct rspamd_hash_map_helper *ht = st;
568 	struct rspamd_map_helper_value *val;
569 	khiter_t k;
570 	gconstpointer nk;
571 	gsize vlen;
572 	gint r;
573 	rspamd_ftok_t tok;
574 	struct rspamd_map *map;
575 
576 	tok.begin = key;
577 	tok.len = strlen (key);
578 	map = ht->map;
579 
580 	k = kh_get (rspamd_map_hash, ht->htb, tok);
581 
582 	if (k == kh_end (ht->htb)) {
583 		nk = rspamd_mempool_strdup (ht->pool, key);
584 		tok.begin = nk;
585 		k = kh_put (rspamd_map_hash, ht->htb, tok, &r);
586 	}
587 	else {
588 		val = kh_value (ht->htb, k);
589 
590 		if (strcmp (value, val->value) == 0) {
591 			/* Same element, skip */
592 			return;
593 		}
594 		else {
595 			msg_warn_map ("duplicate hash entry found for map %s: %s (old value: '%s', new: '%s')",
596 					map->name, key, val->value, value);
597 		}
598 	}
599 
600 	/* Null termination due to alloc0 */
601 	vlen = strlen (value);
602 	val = rspamd_mempool_alloc0 (ht->pool, sizeof (*val) + vlen + 1);
603 	memcpy (val->value, value, vlen);
604 
605 	tok = kh_key (ht->htb, k);
606 	nk = tok.begin;
607 	val->key = nk;
608 	kh_value (ht->htb, k) = val;
609 
610 	rspamd_cryptobox_fast_hash_update (&ht->hst, nk, tok.len);
611 }
612 
613 void
rspamd_map_helper_insert_re(gpointer st,gconstpointer key,gconstpointer value)614 rspamd_map_helper_insert_re (gpointer st, gconstpointer key, gconstpointer value)
615 {
616 	struct rspamd_regexp_map_helper *re_map = st;
617 	struct rspamd_map *map;
618 	rspamd_regexp_t *re;
619 	gchar *escaped;
620 	GError *err = NULL;
621 	gint pcre_flags;
622 	gsize escaped_len;
623 	struct rspamd_map_helper_value *val;
624 	khiter_t k;
625 	rspamd_ftok_t tok;
626 	gconstpointer nk;
627 	gsize vlen;
628 	gint r;
629 
630 	map = re_map->map;
631 
632 	tok.begin = key;
633 	tok.len = strlen (key);
634 
635 	k = kh_get (rspamd_map_hash, re_map->htb, tok);
636 
637 	if (k == kh_end (re_map->htb)) {
638 		nk = rspamd_mempool_strdup (re_map->pool, key);
639 		tok.begin = nk;
640 		k = kh_put (rspamd_map_hash, re_map->htb, tok, &r);
641 	}
642 	else {
643 		val = kh_value (re_map->htb, k);
644 
645 		/* Always warn about regexp duplicate as it's likely a bad mistake */
646 		msg_warn_map ("duplicate re entry found for map %s: %s (old value: '%s', new: '%s')",
647 				map->name, key, val->value, value);
648 
649 		if (strcmp (val->value, value) == 0) {
650 			/* Same value, skip */
651 			return;
652 		}
653 
654 		/* Replace value but do not touch regexp */
655 		nk = kh_key (re_map->htb, k).begin;
656 		val->key = nk;
657 		kh_value (re_map->htb, k) = val;
658 
659 		return;
660 	}
661 
662 	/* Check regexp stuff */
663 	if (re_map->map_flags & RSPAMD_REGEXP_MAP_FLAG_GLOB) {
664 		escaped = rspamd_str_regexp_escape (key, strlen (key), &escaped_len,
665 				RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF);
666 		re = rspamd_regexp_new (escaped, NULL, &err);
667 		g_free (escaped);
668 	}
669 	else {
670 		re = rspamd_regexp_new (key, NULL, &err);
671 	}
672 
673 	if (re == NULL) {
674 		msg_err_map ("cannot parse regexp %s: %e", key, err);
675 
676 		if (err) {
677 			g_error_free (err);
678 		}
679 
680 		return;
681 	}
682 
683 	vlen = strlen (value);
684 	val = rspamd_mempool_alloc0 (re_map->pool, sizeof (*val) +
685 											   vlen + 1);
686 	memcpy (val->value, value, vlen); /* Null terminated due to alloc0 previously */
687 	nk = kh_key (re_map->htb, k).begin;
688 	val->key = nk;
689 	kh_value (re_map->htb, k) = val;
690 	rspamd_cryptobox_hash_update (&re_map->hst, nk, tok.len);
691 
692 	pcre_flags = rspamd_regexp_get_pcre_flags (re);
693 
694 #ifndef WITH_PCRE2
695 	if (pcre_flags & PCRE_FLAG(UTF8)) {
696 		re_map->map_flags |= RSPAMD_REGEXP_MAP_FLAG_UTF;
697 	}
698 #else
699 	if (pcre_flags & PCRE_FLAG(UTF)) {
700 		re_map->map_flags |= RSPAMD_REGEXP_MAP_FLAG_UTF;
701 	}
702 #endif
703 
704 	g_ptr_array_add (re_map->regexps, re);
705 	g_ptr_array_add (re_map->values, val);
706 }
707 
708 static void
rspamd_map_helper_traverse_regexp(void * data,rspamd_map_traverse_cb cb,gpointer cbdata,gboolean reset_hits)709 rspamd_map_helper_traverse_regexp (void *data,
710 		rspamd_map_traverse_cb cb,
711 		gpointer cbdata,
712 		gboolean reset_hits)
713 {
714 	rspamd_ftok_t tok;
715 	struct rspamd_map_helper_value *val;
716 	struct rspamd_regexp_map_helper *re_map = data;
717 
718 	kh_foreach (re_map->htb, tok, val, {
719 		if (!cb (tok.begin, val->value, val->hits, cbdata)) {
720 			break;
721 		}
722 
723 		if (reset_hits) {
724 			val->hits = 0;
725 		}
726 	});
727 }
728 
729 struct rspamd_hash_map_helper *
rspamd_map_helper_new_hash(struct rspamd_map * map)730 rspamd_map_helper_new_hash (struct rspamd_map *map)
731 {
732 	struct rspamd_hash_map_helper *htb;
733 	rspamd_mempool_t *pool;
734 
735 	if (map) {
736 		pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
737 				map->tag, 0);
738 	}
739 	else {
740 		pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
741 				NULL, 0);
742 	}
743 
744 	htb = rspamd_mempool_alloc0 (pool, sizeof (*htb));
745 	htb->htb = kh_init (rspamd_map_hash);
746 	htb->pool = pool;
747 	htb->map = map;
748 	rspamd_cryptobox_fast_hash_init (&htb->hst, map_hash_seed);
749 
750 	return htb;
751 }
752 
753 void
rspamd_map_helper_destroy_hash(struct rspamd_hash_map_helper * r)754 rspamd_map_helper_destroy_hash (struct rspamd_hash_map_helper *r)
755 {
756 	if (r == NULL || r->pool == NULL) {
757 		return;
758 	}
759 
760 	rspamd_mempool_t *pool = r->pool;
761 	kh_destroy (rspamd_map_hash, r->htb);
762 	memset (r, 0, sizeof (*r));
763 	rspamd_mempool_delete (pool);
764 }
765 
766 static void
rspamd_map_helper_traverse_hash(void * data,rspamd_map_traverse_cb cb,gpointer cbdata,gboolean reset_hits)767 rspamd_map_helper_traverse_hash (void *data,
768 		rspamd_map_traverse_cb cb,
769 		gpointer cbdata,
770 		gboolean reset_hits)
771 {
772 	rspamd_ftok_t tok;
773 	struct rspamd_map_helper_value *val;
774 	struct rspamd_hash_map_helper *ht = data;
775 
776 	kh_foreach (ht->htb, tok, val, {
777 		if (!cb (tok.begin, val->value, val->hits, cbdata)) {
778 			break;
779 		}
780 
781 		if (reset_hits) {
782 			val->hits = 0;
783 		}
784 	});
785 }
786 
787 struct rspamd_radix_map_helper *
rspamd_map_helper_new_radix(struct rspamd_map * map)788 rspamd_map_helper_new_radix (struct rspamd_map *map)
789 {
790 	struct rspamd_radix_map_helper *r;
791 	rspamd_mempool_t *pool;
792 	const gchar *name = "unnamed";
793 
794 	if (map) {
795 		pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
796 				map->tag, 0);
797 		name = map->name;
798 	}
799 	else {
800 		pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
801 				NULL, 0);
802 	}
803 
804 	r = rspamd_mempool_alloc0 (pool, sizeof (*r));
805 	r->trie = radix_create_compressed_with_pool (pool, name);
806 	r->htb = kh_init (rspamd_map_hash);
807 	r->pool = pool;
808 	r->map = map;
809 	rspamd_cryptobox_fast_hash_init (&r->hst, map_hash_seed);
810 
811 	return r;
812 }
813 
814 void
rspamd_map_helper_destroy_radix(struct rspamd_radix_map_helper * r)815 rspamd_map_helper_destroy_radix (struct rspamd_radix_map_helper *r)
816 {
817 	if (r == NULL || !r->pool) {
818 		return;
819 	}
820 
821 	kh_destroy (rspamd_map_hash, r->htb);
822 	rspamd_mempool_t *pool = r->pool;
823 	memset (r, 0, sizeof (*r));
824 	rspamd_mempool_delete (pool);
825 }
826 
827 static void
rspamd_map_helper_traverse_radix(void * data,rspamd_map_traverse_cb cb,gpointer cbdata,gboolean reset_hits)828 rspamd_map_helper_traverse_radix (void *data,
829 		rspamd_map_traverse_cb cb,
830 		gpointer cbdata,
831 		gboolean reset_hits)
832 {
833 	rspamd_ftok_t tok;
834 	struct rspamd_map_helper_value *val;
835 	struct rspamd_radix_map_helper *r = data;
836 
837 	kh_foreach (r->htb, tok, val, {
838 		if (!cb (tok.begin, val->value, val->hits, cbdata)) {
839 			break;
840 		}
841 
842 		if (reset_hits) {
843 			val->hits = 0;
844 		}
845 	});
846 }
847 
848 struct rspamd_regexp_map_helper *
rspamd_map_helper_new_regexp(struct rspamd_map * map,enum rspamd_regexp_map_flags flags)849 rspamd_map_helper_new_regexp (struct rspamd_map *map,
850 		enum rspamd_regexp_map_flags flags)
851 {
852 	struct rspamd_regexp_map_helper *re_map;
853 	rspamd_mempool_t *pool;
854 
855 	pool = rspamd_mempool_new (rspamd_mempool_suggest_size (),
856 			map->tag, 0);
857 
858 	re_map = rspamd_mempool_alloc0 (pool, sizeof (*re_map));
859 	re_map->pool = pool;
860 	re_map->values = g_ptr_array_new ();
861 	re_map->regexps = g_ptr_array_new ();
862 	re_map->map = map;
863 	re_map->map_flags = flags;
864 	re_map->htb = kh_init (rspamd_map_hash);
865 	rspamd_cryptobox_hash_init (&re_map->hst, NULL, 0);
866 
867 	return re_map;
868 }
869 
870 
871 void
rspamd_map_helper_destroy_regexp(struct rspamd_regexp_map_helper * re_map)872 rspamd_map_helper_destroy_regexp (struct rspamd_regexp_map_helper *re_map)
873 {
874 	rspamd_regexp_t *re;
875 	guint i;
876 
877 	if (!re_map || !re_map->regexps) {
878 		return;
879 	}
880 
881 #ifdef WITH_HYPERSCAN
882 	if (re_map->hs_scratch) {
883 		hs_free_scratch (re_map->hs_scratch);
884 	}
885 	if (re_map->hs_db) {
886 		hs_free_database (re_map->hs_db);
887 	}
888 	if (re_map->patterns) {
889 		for (i = 0; i < re_map->regexps->len; i ++) {
890 			g_free (re_map->patterns[i]);
891 		}
892 
893 		g_free (re_map->patterns);
894 	}
895 	if (re_map->flags) {
896 		g_free (re_map->flags);
897 	}
898 	if (re_map->ids) {
899 		g_free (re_map->ids);
900 	}
901 #endif
902 
903 	for (i = 0; i < re_map->regexps->len; i ++) {
904 		re = g_ptr_array_index (re_map->regexps, i);
905 		rspamd_regexp_unref (re);
906 	}
907 
908 	g_ptr_array_free (re_map->regexps, TRUE);
909 	g_ptr_array_free (re_map->values, TRUE);
910 	kh_destroy (rspamd_map_hash, re_map->htb);
911 
912 	rspamd_mempool_t *pool = re_map->pool;
913 	memset (re_map, 0, sizeof (*re_map));
914 	rspamd_mempool_delete (pool);
915 }
916 
917 gchar *
rspamd_kv_list_read(gchar * chunk,gint len,struct map_cb_data * data,gboolean final)918 rspamd_kv_list_read (
919 		gchar * chunk,
920 		gint len,
921 		struct map_cb_data *data,
922 		gboolean final)
923 {
924 	if (data->cur_data == NULL) {
925 		data->cur_data = rspamd_map_helper_new_hash (data->map);
926 	}
927 
928 	return rspamd_parse_kv_list (
929 			chunk,
930 			len,
931 			data,
932 			rspamd_map_helper_insert_hash,
933 			"",
934 			final);
935 }
936 
937 void
rspamd_kv_list_fin(struct map_cb_data * data,void ** target)938 rspamd_kv_list_fin (struct map_cb_data *data, void **target)
939 {
940 	struct rspamd_map *map = data->map;
941 	struct rspamd_hash_map_helper *htb;
942 
943 	if (data->cur_data) {
944 		htb = (struct rspamd_hash_map_helper *)data->cur_data;
945 		msg_info_map ("read hash of %d elements from %s", kh_size (htb->htb),
946 				map->name);
947 		data->map->traverse_function = rspamd_map_helper_traverse_hash;
948 		data->map->nelts = kh_size (htb->htb);
949 		data->map->digest = rspamd_cryptobox_fast_hash_final (&htb->hst);
950 	}
951 
952 	if (target) {
953 		*target = data->cur_data;
954 	}
955 
956 	if (data->prev_data) {
957 		htb = (struct rspamd_hash_map_helper *)data->prev_data;
958 		rspamd_map_helper_destroy_hash (htb);
959 	}
960 }
961 
962 void
rspamd_kv_list_dtor(struct map_cb_data * data)963 rspamd_kv_list_dtor (struct map_cb_data *data)
964 {
965 	struct rspamd_hash_map_helper *htb;
966 
967 	if (data->cur_data) {
968 		htb = (struct rspamd_hash_map_helper *)data->cur_data;
969 		rspamd_map_helper_destroy_hash (htb);
970 	}
971 }
972 
973 gchar *
rspamd_radix_read(gchar * chunk,gint len,struct map_cb_data * data,gboolean final)974 rspamd_radix_read (
975 		gchar * chunk,
976 		gint len,
977 		struct map_cb_data *data,
978 		gboolean final)
979 {
980 	struct rspamd_radix_map_helper *r;
981 	struct rspamd_map *map = data->map;
982 
983 	if (data->cur_data == NULL) {
984 		r = rspamd_map_helper_new_radix (map);
985 		data->cur_data = r;
986 	}
987 
988 	return rspamd_parse_kv_list (
989 			chunk,
990 			len,
991 			data,
992 			rspamd_map_helper_insert_radix,
993 			hash_fill,
994 			final);
995 }
996 
997 void
rspamd_radix_fin(struct map_cb_data * data,void ** target)998 rspamd_radix_fin (struct map_cb_data *data, void **target)
999 {
1000 	struct rspamd_map *map = data->map;
1001 	struct rspamd_radix_map_helper *r;
1002 
1003 	if (data->cur_data) {
1004 		r = (struct rspamd_radix_map_helper *)data->cur_data;
1005 		msg_info_map ("read radix trie of %z elements: %s",
1006 				radix_get_size (r->trie), radix_get_info (r->trie));
1007 		data->map->traverse_function = rspamd_map_helper_traverse_radix;
1008 		data->map->nelts = kh_size (r->htb);
1009 		data->map->digest = rspamd_cryptobox_fast_hash_final (&r->hst);
1010 	}
1011 
1012 	if (target) {
1013 		*target = data->cur_data;
1014 	}
1015 
1016 	if (data->prev_data) {
1017 		r = (struct rspamd_radix_map_helper *)data->prev_data;
1018 		rspamd_map_helper_destroy_radix (r);
1019 	}
1020 }
1021 
1022 void
rspamd_radix_dtor(struct map_cb_data * data)1023 rspamd_radix_dtor (struct map_cb_data *data)
1024 {
1025 	struct rspamd_radix_map_helper *r;
1026 
1027 	if (data->cur_data) {
1028 		r = (struct rspamd_radix_map_helper *)data->cur_data;
1029 		rspamd_map_helper_destroy_radix (r);
1030 	}
1031 }
1032 
1033 #ifdef WITH_HYPERSCAN
/* Context of the destructor that removes obsolete regexp map cache files */
struct rspamd_re_maps_cache_dtor_cbdata {
	struct rspamd_config *cfg; /* Config the destructor is registered for */
	GHashTable *valid_re_hashes; /* Own reference to the set of cache file basenames in use */
	gchar *dirname; /* Directory of cache files, owned (released with g_free) */
};
1039 
1040 static void
rspamd_re_maps_cache_cleanup_dtor(gpointer ud)1041 rspamd_re_maps_cache_cleanup_dtor (gpointer ud)
1042 {
1043 	struct rspamd_re_maps_cache_dtor_cbdata *cbd =
1044 			(struct rspamd_re_maps_cache_dtor_cbdata *)ud;
1045 	GPtrArray *cache_files;
1046 	GError *err = NULL;
1047 	struct rspamd_config *cfg;
1048 
1049 	cfg = cbd->cfg;
1050 
1051 	if (cfg->cur_worker != NULL) {
1052 		/* Skip dtor, limit it to main process only */
1053 		return;
1054 	}
1055 
1056 	cache_files = rspamd_glob_path (cbd->dirname, "*.hsmc", FALSE, &err);
1057 
1058 	if (!cache_files) {
1059 		msg_err_config ("cannot glob files in %s: %e", cbd->dirname, err);
1060 		g_error_free (err);
1061 	}
1062 	else {
1063 		const gchar *fname;
1064 		guint i;
1065 
1066 		PTR_ARRAY_FOREACH (cache_files, i, fname) {
1067 			gchar *basename = g_path_get_basename (fname);
1068 
1069 			if (g_hash_table_lookup (cbd->valid_re_hashes, basename) == NULL) {
1070 				gchar *dir;
1071 
1072 				dir = g_path_get_dirname (fname);
1073 
1074 				/* Sanity check to avoid removal of something bad */
1075 				if (strcmp (dir, cbd->dirname) != 0) {
1076 					msg_err_config ("bogus file found: %s in %s, skip deleting",
1077 							fname, dir);
1078 				}
1079 				else {
1080 					if (unlink (fname) == -1) {
1081 						msg_err_config ("cannot delete obsolete file %s in %s: %s",
1082 								fname, dir, strerror (errno));
1083 					}
1084 					else {
1085 						msg_info_config ("deleted obsolete file %s in %s",
1086 								fname, dir);
1087 					}
1088 				}
1089 
1090 				g_free (dir);
1091 			}
1092 			else {
1093 				msg_debug_config ("valid re cache file %s", fname);
1094 			}
1095 
1096 			g_free (basename);
1097 		}
1098 
1099 		g_ptr_array_free (cache_files, TRUE);
1100 	}
1101 
1102 	g_hash_table_unref (cbd->valid_re_hashes);
1103 	g_free (cbd->dirname);
1104 }
1105 
1106 static void
rspamd_re_map_cache_update(const gchar * fname,struct rspamd_config * cfg)1107 rspamd_re_map_cache_update (const gchar *fname, struct rspamd_config *cfg)
1108 {
1109 	GHashTable *valid_re_hashes;
1110 
1111 	valid_re_hashes = rspamd_mempool_get_variable (cfg->cfg_pool,
1112 			RSPAMD_MEMPOOL_RE_MAPS_CACHE);
1113 
1114 	if (!valid_re_hashes) {
1115 		valid_re_hashes = g_hash_table_new_full (g_str_hash, g_str_equal,
1116 				g_free, NULL);
1117 		rspamd_mempool_set_variable (cfg->cfg_pool,
1118 				RSPAMD_MEMPOOL_RE_MAPS_CACHE,
1119 				valid_re_hashes, (rspamd_mempool_destruct_t)g_hash_table_unref);
1120 
1121 		/* We also add a cleanup dtor for all hashes */
1122 		static struct rspamd_re_maps_cache_dtor_cbdata cbd;
1123 
1124 		cbd.valid_re_hashes = g_hash_table_ref (valid_re_hashes);
1125 		cbd.cfg = cfg;
1126 		cbd.dirname = g_path_get_dirname (fname);
1127 		rspamd_mempool_add_destructor (cfg->cfg_pool,
1128 				rspamd_re_maps_cache_cleanup_dtor, &cbd);
1129 	}
1130 
1131 	g_hash_table_insert (valid_re_hashes, g_path_get_basename (fname), "1");
1132 }
1133 
1134 static gboolean
rspamd_try_load_re_map_cache(struct rspamd_regexp_map_helper * re_map)1135 rspamd_try_load_re_map_cache (struct rspamd_regexp_map_helper *re_map)
1136 {
1137 	gchar fp[PATH_MAX];
1138 	gpointer data;
1139 	gsize len;
1140 	struct rspamd_map *map;
1141 
1142 	map = re_map->map;
1143 
1144 	if (!map->cfg->hs_cache_dir) {
1145 		return FALSE;
1146 	}
1147 
1148 	rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmc",
1149 			map->cfg->hs_cache_dir,
1150 			(gint)rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest);
1151 
1152 	if ((data = rspamd_file_xmap (fp, PROT_READ, &len, TRUE)) != NULL) {
1153 		if (hs_deserialize_database (data, len, &re_map->hs_db) == HS_SUCCESS) {
1154 			rspamd_re_map_cache_update (fp, map->cfg);
1155 			munmap (data, len);
1156 
1157 			msg_info_map ("loaded hypersan cache from %s (%Hz length) for %s",
1158 					fp, len, map->name);
1159 
1160 			return TRUE;
1161 		}
1162 
1163 		msg_info_map ("invalid hypersan cache in %s (%Hz length) for %s, removing file",
1164 				fp, len, map->name);
1165 		munmap (data, len);
1166 		/* Remove stale file */
1167 		(void)unlink (fp);
1168 	}
1169 
1170 	return FALSE;
1171 }
1172 
1173 static gboolean
rspamd_try_save_re_map_cache(struct rspamd_regexp_map_helper * re_map)1174 rspamd_try_save_re_map_cache (struct rspamd_regexp_map_helper *re_map)
1175 {
1176 	gchar fp[PATH_MAX], np[PATH_MAX];
1177 	gsize len;
1178 	gint fd;
1179 	char *bytes = NULL;
1180 	struct rspamd_map *map;
1181 
1182 	map = re_map->map;
1183 
1184 	if (!map->cfg->hs_cache_dir) {
1185 		return FALSE;
1186 	}
1187 
1188 	rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmc.tmp",
1189 			re_map->map->cfg->hs_cache_dir,
1190 			(gint)rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest);
1191 
1192 	if ((fd = rspamd_file_xopen (fp, O_WRONLY | O_CREAT | O_EXCL, 00644, 0)) != -1) {
1193 		if (hs_serialize_database (re_map->hs_db, &bytes, &len) == HS_SUCCESS) {
1194 			if (write (fd, bytes, len) == -1) {
1195 				msg_warn_map ("cannot write hyperscan cache to %s: %s",
1196 						fp, strerror (errno));
1197 				unlink (fp);
1198 				free (bytes);
1199 			}
1200 			else {
1201 				free (bytes);
1202 				fsync (fd);
1203 
1204 				rspamd_snprintf (np, sizeof (np), "%s/%*xs.hsmc",
1205 						re_map->map->cfg->hs_cache_dir,
1206 						(gint)rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest);
1207 
1208 				if (rename (fp, np) == -1) {
1209 					msg_warn_map ("cannot rename hyperscan cache from %s to %s: %s",
1210 							fp, np, strerror (errno));
1211 					unlink (fp);
1212 				}
1213 				else {
1214 					msg_info_map ("written cached hyperscan data for %s to %s (%Hz length)",
1215 							map->name, np, len);
1216 
1217 					rspamd_re_map_cache_update (np, map->cfg);
1218 				}
1219 			}
1220 		}
1221 		else {
1222 			msg_warn_map ("cannot serialize hyperscan cache to %s: %s",
1223 					fp, strerror (errno));
1224 			unlink (fp);
1225 		}
1226 
1227 
1228 		close (fd);
1229 	}
1230 
1231 	return FALSE;
1232 }
1233 
1234 static gboolean
rspamd_re_map_cache_cleanup_old(struct rspamd_regexp_map_helper * old_re_map)1235 rspamd_re_map_cache_cleanup_old (struct rspamd_regexp_map_helper *old_re_map)
1236 {
1237 	gchar fp[PATH_MAX];
1238 	struct rspamd_map *map;
1239 
1240 	map = old_re_map->map;
1241 
1242 	if (!map->cfg->hs_cache_dir) {
1243 		return FALSE;
1244 	}
1245 
1246 	rspamd_snprintf (fp, sizeof (fp), "%s/%*xs.hsmc",
1247 			map->cfg->hs_cache_dir,
1248 			(gint)rspamd_cryptobox_HASHBYTES / 2, old_re_map->re_digest);
1249 
1250 	msg_info_map ("unlink stale cache file for %s: %s", map->name, fp);
1251 
1252 	if (unlink (fp) == -1) {
1253 		msg_warn_map ("cannot unlink stale cache file for %s (%s): %s",
1254 				map->name, fp, strerror (errno));
1255 		return FALSE;
1256 	}
1257 
1258 	GHashTable *valid_re_hashes;
1259 
1260 	valid_re_hashes = rspamd_mempool_get_variable (map->cfg->cfg_pool,
1261 			RSPAMD_MEMPOOL_RE_MAPS_CACHE);
1262 
1263 	if (valid_re_hashes) {
1264 		g_hash_table_remove (valid_re_hashes, fp);
1265 	}
1266 
1267 	return TRUE;
1268 }
1269 
1270 #endif
1271 
/**
 * Prepares the hyperscan database for a fully read regexp map.
 *
 * For every regexp in re_map->regexps an escaped pattern, a set of hyperscan
 * flags and an id (the index of the regexp) are stored in re_map->patterns,
 * re_map->flags and re_map->ids. Then the database is either restored from
 * the on-disk cache or compiled with hs_compile_multi () and saved to the
 * cache, and scratch space is allocated for it.
 *
 * On any failure re_map->hs_db is left NULL (or untouched); the matching
 * functions check hs_db/hs_scratch before using hyperscan.
 * Without WITH_HYPERSCAN this function does nothing.
 *
 * @param re_map regexp map helper to finalize
 */
static void
rspamd_re_map_finalize (struct rspamd_regexp_map_helper *re_map)
{
#ifdef WITH_HYPERSCAN
	guint i;
	hs_platform_info_t plt;
	hs_compile_error_t *err;
	struct rspamd_map *map;
	rspamd_regexp_t *re;
	gint pcre_flags;

	map = re_map->map;

#ifndef __aarch64__
	if (!(map->cfg->libs_ctx->crypto_ctx->cpu_config & CPUID_SSSE3)) {
		msg_info_map ("disable hyperscan for map %s, ssse3 instructons are not supported by CPU",
				map->name);
		return;
	}
#endif

	if (hs_populate_platform (&plt) != HS_SUCCESS) {
		msg_err_map ("cannot populate hyperscan platform");
		return;
	}

	re_map->patterns = g_new (gchar *, re_map->regexps->len);
	re_map->flags = g_new (gint, re_map->regexps->len);
	re_map->ids = g_new (gint, re_map->regexps->len);

	for (i = 0; i < re_map->regexps->len; i ++) {
		const gchar *pat;
		gchar *escaped;
		gint pat_flags;

		re = g_ptr_array_index (re_map->regexps, i);
		pcre_flags = rspamd_regexp_get_pcre_flags (re);
		pat = rspamd_regexp_get_pattern (re);
		pat_flags = rspamd_regexp_get_flags (re);

		/*
		 * g_new () does not zero memory, so give the flags a defined base
		 * value before anything is or-ed into them. Doing this first also
		 * keeps the UTF8 flag set below from being overwritten.
		 */
		re_map->flags[i] = HS_FLAG_SINGLEMATCH;

		if (pat_flags & RSPAMD_REGEXP_FLAG_UTF) {
			escaped = rspamd_str_regexp_escape (pat, strlen (pat), NULL,
					RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_UTF);
			re_map->flags[i] |= HS_FLAG_UTF8;
		}
		else {
			escaped = rspamd_str_regexp_escape (pat, strlen (pat), NULL,
					RSPAMD_REGEXP_ESCAPE_RE);
		}

		re_map->patterns[i] = escaped;

		/* Translate PCRE compile flags to their hyperscan counterparts */
#ifndef WITH_PCRE2
		if (pcre_flags & PCRE_FLAG(UTF8)) {
			re_map->flags[i] |= HS_FLAG_UTF8;
		}
#else
		if (pcre_flags & PCRE_FLAG(UTF)) {
			re_map->flags[i] |= HS_FLAG_UTF8;
		}
#endif
		if (pcre_flags & PCRE_FLAG(CASELESS)) {
			re_map->flags[i] |= HS_FLAG_CASELESS;
		}
		if (pcre_flags & PCRE_FLAG(MULTILINE)) {
			re_map->flags[i] |= HS_FLAG_MULTILINE;
		}
		if (pcre_flags & PCRE_FLAG(DOTALL)) {
			re_map->flags[i] |= HS_FLAG_DOTALL;
		}
		if (rspamd_regexp_get_maxhits (re) == 1) {
			re_map->flags[i] |= HS_FLAG_SINGLEMATCH;
		}

		/* The hyperscan id is the index into regexps/values */
		re_map->ids[i] = i;
	}

	if (re_map->regexps->len > 0 && re_map->patterns) {

		if (!rspamd_try_load_re_map_cache (re_map)) {
			gdouble ts1 = rspamd_get_ticks (FALSE);

			if (hs_compile_multi ((const gchar **) re_map->patterns,
					re_map->flags,
					re_map->ids,
					re_map->regexps->len,
					HS_MODE_BLOCK,
					&plt,
					&re_map->hs_db,
					&err) != HS_SUCCESS) {

				msg_err_map ("cannot create tree of regexp when processing '%s': %s",
						err->expression >= 0 ?
						re_map->patterns[err->expression] :
						"unknown regexp", err->message);
				re_map->hs_db = NULL;
				hs_free_compile_error (err);

				return;
			}

			ts1 = (rspamd_get_ticks (FALSE) - ts1) * 1000.0;
			msg_info_map ("hyperscan compiled %d regular expressions from %s in %.1f ms",
					re_map->regexps->len, re_map->map->name, ts1);
			/* Best effort: a failure to save the cache is not fatal */
			rspamd_try_save_re_map_cache (re_map);
		}
		else {
			msg_info_map ("hyperscan read %d cached regular expressions from %s",
					re_map->regexps->len, re_map->map->name);
		}

		if (hs_alloc_scratch (re_map->hs_db, &re_map->hs_scratch) != HS_SUCCESS) {
			msg_err_map ("cannot allocate scratch space for hyperscan");
			hs_free_database (re_map->hs_db);
			re_map->hs_db = NULL;
		}
	}
	else {
		msg_err_map ("regexp map is empty");
	}
#endif
}
1395 
1396 gchar *
rspamd_regexp_list_read_single(gchar * chunk,gint len,struct map_cb_data * data,gboolean final)1397 rspamd_regexp_list_read_single (
1398 		gchar *chunk,
1399 		gint len,
1400 		struct map_cb_data *data,
1401 		gboolean final)
1402 {
1403 	struct rspamd_regexp_map_helper *re_map;
1404 
1405 	if (data->cur_data == NULL) {
1406 		re_map = rspamd_map_helper_new_regexp (data->map, 0);
1407 		data->cur_data = re_map;
1408 	}
1409 
1410 	return rspamd_parse_kv_list (
1411 			chunk,
1412 			len,
1413 			data,
1414 			rspamd_map_helper_insert_re,
1415 			hash_fill,
1416 			final);
1417 }
1418 
1419 gchar *
rspamd_glob_list_read_single(gchar * chunk,gint len,struct map_cb_data * data,gboolean final)1420 rspamd_glob_list_read_single (
1421 		gchar *chunk,
1422 		gint len,
1423 		struct map_cb_data *data,
1424 		gboolean final)
1425 {
1426 	struct rspamd_regexp_map_helper *re_map;
1427 
1428 	if (data->cur_data == NULL) {
1429 		re_map = rspamd_map_helper_new_regexp (data->map, RSPAMD_REGEXP_MAP_FLAG_GLOB);
1430 		data->cur_data = re_map;
1431 	}
1432 
1433 	return rspamd_parse_kv_list (
1434 			chunk,
1435 			len,
1436 			data,
1437 			rspamd_map_helper_insert_re,
1438 			hash_fill,
1439 			final);
1440 }
1441 
1442 gchar *
rspamd_regexp_list_read_multiple(gchar * chunk,gint len,struct map_cb_data * data,gboolean final)1443 rspamd_regexp_list_read_multiple (
1444 		gchar *chunk,
1445 		gint len,
1446 		struct map_cb_data *data,
1447 		gboolean final)
1448 {
1449 	struct rspamd_regexp_map_helper *re_map;
1450 
1451 	if (data->cur_data == NULL) {
1452 		re_map = rspamd_map_helper_new_regexp (data->map,
1453 				RSPAMD_REGEXP_MAP_FLAG_MULTIPLE);
1454 		data->cur_data = re_map;
1455 	}
1456 
1457 	return rspamd_parse_kv_list (
1458 			chunk,
1459 			len,
1460 			data,
1461 			rspamd_map_helper_insert_re,
1462 			hash_fill,
1463 			final);
1464 }
1465 
1466 gchar *
rspamd_glob_list_read_multiple(gchar * chunk,gint len,struct map_cb_data * data,gboolean final)1467 rspamd_glob_list_read_multiple (
1468 		gchar *chunk,
1469 		gint len,
1470 		struct map_cb_data *data,
1471 		gboolean final)
1472 {
1473 	struct rspamd_regexp_map_helper *re_map;
1474 
1475 	if (data->cur_data == NULL) {
1476 		re_map = rspamd_map_helper_new_regexp (data->map,
1477 				RSPAMD_REGEXP_MAP_FLAG_GLOB|RSPAMD_REGEXP_MAP_FLAG_MULTIPLE);
1478 		data->cur_data = re_map;
1479 	}
1480 
1481 	return rspamd_parse_kv_list (
1482 			chunk,
1483 			len,
1484 			data,
1485 			rspamd_map_helper_insert_re,
1486 			hash_fill,
1487 			final);
1488 }
1489 
1490 
1491 void
rspamd_regexp_list_fin(struct map_cb_data * data,void ** target)1492 rspamd_regexp_list_fin (struct map_cb_data *data, void **target)
1493 {
1494 	struct rspamd_regexp_map_helper *re_map = NULL, *old_re_map;
1495 	struct rspamd_map *map = data->map;
1496 
1497 	if (data->cur_data) {
1498 		re_map = data->cur_data;
1499 		rspamd_cryptobox_hash_final (&re_map->hst, re_map->re_digest);
1500 		memcpy (&data->map->digest, re_map->re_digest, sizeof (data->map->digest));
1501 		rspamd_re_map_finalize (re_map);
1502 		msg_info_map ("read regexp list of %ud elements",
1503 				re_map->regexps->len);
1504 		data->map->traverse_function = rspamd_map_helper_traverse_regexp;
1505 		data->map->nelts = kh_size (re_map->htb);
1506 	}
1507 
1508 	if (target) {
1509 		*target = data->cur_data;
1510 	}
1511 
1512 	if (data->prev_data) {
1513 		old_re_map = data->prev_data;
1514 
1515 #ifdef WITH_HYPERSCAN
1516 		if (re_map && memcmp (re_map->re_digest, old_re_map->re_digest,
1517 				sizeof (re_map->re_digest)) != 0) {
1518 			/* Cleanup old stuff */
1519 			rspamd_re_map_cache_cleanup_old (old_re_map);
1520 		}
1521 #endif
1522 
1523 		rspamd_map_helper_destroy_regexp (old_re_map);
1524 	}
1525 }
1526 void
rspamd_regexp_list_dtor(struct map_cb_data * data)1527 rspamd_regexp_list_dtor (struct map_cb_data *data)
1528 {
1529 	if (data->cur_data) {
1530 		rspamd_map_helper_destroy_regexp (data->cur_data);
1531 	}
1532 }
1533 
1534 #ifdef WITH_HYPERSCAN
1535 static int
rspamd_match_hs_single_handler(unsigned int id,unsigned long long from,unsigned long long to,unsigned int flags,void * context)1536 rspamd_match_hs_single_handler (unsigned int id, unsigned long long from,
1537 		unsigned long long to,
1538 		unsigned int flags, void *context)
1539 {
1540 	guint *i = context;
1541 	/* Always return non-zero as we need a single match here */
1542 
1543 	*i = id;
1544 
1545 	return 1;
1546 }
1547 #endif
1548 
1549 gconstpointer
rspamd_match_regexp_map_single(struct rspamd_regexp_map_helper * map,const gchar * in,gsize len)1550 rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map,
1551 		const gchar *in, gsize len)
1552 {
1553 	guint i;
1554 	rspamd_regexp_t *re;
1555 	gint res = 0;
1556 	gpointer ret = NULL;
1557 	struct rspamd_map_helper_value *val;
1558 	gboolean validated = FALSE;
1559 
1560 	g_assert (in != NULL);
1561 
1562 	if (map == NULL || len == 0 || map->regexps == NULL) {
1563 		return NULL;
1564 	}
1565 
1566 	if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
1567 		if (rspamd_fast_utf8_validate (in, len) == 0) {
1568 			validated = TRUE;
1569 		}
1570 	}
1571 	else {
1572 		validated = TRUE;
1573 	}
1574 
1575 #ifdef WITH_HYPERSCAN
1576 	if (map->hs_db && map->hs_scratch) {
1577 
1578 		if (validated) {
1579 
1580 			res = hs_scan (map->hs_db, in, len, 0, map->hs_scratch,
1581 					rspamd_match_hs_single_handler, (void *)&i);
1582 
1583 			if (res == HS_SCAN_TERMINATED) {
1584 				res = 1;
1585 				val = g_ptr_array_index (map->values, i);
1586 
1587 				ret = val->value;
1588 				val->hits ++;
1589 			}
1590 
1591 			return ret;
1592 		}
1593 	}
1594 #endif
1595 
1596 	if (!res) {
1597 		/* PCRE version */
1598 		for (i = 0; i < map->regexps->len; i ++) {
1599 			re = g_ptr_array_index (map->regexps, i);
1600 
1601 			if (rspamd_regexp_search (re, in, len, NULL, NULL, !validated, NULL)) {
1602 				val = g_ptr_array_index (map->values, i);
1603 
1604 				ret = val->value;
1605 				val->hits ++;
1606 				break;
1607 			}
1608 		}
1609 	}
1610 
1611 	return ret;
1612 }
1613 
1614 #ifdef WITH_HYPERSCAN
/* Context passed to rspamd_match_hs_multiple_handler */
struct rspamd_multiple_cbdata {
	GPtrArray *ar; /* Receives the value string of every matched expression */
	struct rspamd_regexp_map_helper *map; /* Map whose `values` array is indexed by match id */
};
1619 
1620 static int
rspamd_match_hs_multiple_handler(unsigned int id,unsigned long long from,unsigned long long to,unsigned int flags,void * context)1621 rspamd_match_hs_multiple_handler (unsigned int id, unsigned long long from,
1622 		unsigned long long to,
1623 		unsigned int flags, void *context)
1624 {
1625 	struct rspamd_multiple_cbdata *cbd = context;
1626 	struct rspamd_map_helper_value *val;
1627 
1628 
1629 	if (id < cbd->map->values->len) {
1630 		val = g_ptr_array_index (cbd->map->values, id);
1631 		val->hits ++;
1632 		g_ptr_array_add (cbd->ar, val->value);
1633 	}
1634 
1635 	/* Always return zero as we need all matches here */
1636 	return 0;
1637 }
1638 #endif
1639 
1640 GPtrArray*
rspamd_match_regexp_map_all(struct rspamd_regexp_map_helper * map,const gchar * in,gsize len)1641 rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map,
1642 		const gchar *in, gsize len)
1643 {
1644 	guint i;
1645 	rspamd_regexp_t *re;
1646 	GPtrArray *ret;
1647 	gint res = 0;
1648 	gboolean validated = FALSE;
1649 	struct rspamd_map_helper_value *val;
1650 
1651 	if (map == NULL || map->regexps == NULL || len == 0) {
1652 		return NULL;
1653 	}
1654 
1655 	g_assert (in != NULL);
1656 
1657 	if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
1658 		if (rspamd_fast_utf8_validate (in, len) == 0) {
1659 			validated = TRUE;
1660 		}
1661 	}
1662 	else {
1663 		validated = TRUE;
1664 	}
1665 
1666 	ret = g_ptr_array_new ();
1667 
1668 #ifdef WITH_HYPERSCAN
1669 	if (map->hs_db && map->hs_scratch) {
1670 
1671 		if (validated) {
1672 			struct rspamd_multiple_cbdata cbd;
1673 
1674 			cbd.ar = ret;
1675 			cbd.map = map;
1676 
1677 			if (hs_scan (map->hs_db, in, len, 0, map->hs_scratch,
1678 					rspamd_match_hs_multiple_handler, &cbd) == HS_SUCCESS) {
1679 				res = 1;
1680 			}
1681 		}
1682 	}
1683 #endif
1684 
1685 	if (!res) {
1686 		/* PCRE version */
1687 		for (i = 0; i < map->regexps->len; i ++) {
1688 			re = g_ptr_array_index (map->regexps, i);
1689 
1690 			if (rspamd_regexp_search (re, in, len, NULL, NULL,
1691 					!validated, NULL)) {
1692 				val = g_ptr_array_index (map->values, i);
1693 				val->hits ++;
1694 				g_ptr_array_add (ret, val->value);
1695 			}
1696 		}
1697 	}
1698 
1699 	if (ret->len > 0) {
1700 		return ret;
1701 	}
1702 
1703 	g_ptr_array_free (ret, TRUE);
1704 
1705 	return NULL;
1706 }
1707 
1708 gconstpointer
rspamd_match_hash_map(struct rspamd_hash_map_helper * map,const gchar * in,gsize len)1709 rspamd_match_hash_map (struct rspamd_hash_map_helper *map, const gchar *in,
1710 		gsize len)
1711 {
1712 	khiter_t k;
1713 	struct rspamd_map_helper_value *val;
1714 	rspamd_ftok_t tok;
1715 
1716 	if (map == NULL || map->htb == NULL) {
1717 		return NULL;
1718 	}
1719 
1720 	tok.begin = in;
1721 	tok.len = len;
1722 
1723 	k = kh_get (rspamd_map_hash, map->htb, tok);
1724 
1725 	if (k != kh_end (map->htb)) {
1726 		val = kh_value (map->htb, k);
1727 		val->hits ++;
1728 
1729 		return val->value;
1730 	}
1731 
1732 	return NULL;
1733 }
1734 
1735 gconstpointer
rspamd_match_radix_map(struct rspamd_radix_map_helper * map,const guchar * in,gsize inlen)1736 rspamd_match_radix_map (struct rspamd_radix_map_helper *map,
1737 		const guchar *in, gsize inlen)
1738 {
1739 	struct rspamd_map_helper_value *val;
1740 
1741 	if (map == NULL || map->trie == NULL) {
1742 		return NULL;
1743 	}
1744 
1745 	val = (struct rspamd_map_helper_value *)radix_find_compressed (map->trie,
1746 			in, inlen);
1747 
1748 	if (val != (gconstpointer)RADIX_NO_VALUE) {
1749 		val->hits ++;
1750 
1751 		return val->value;
1752 	}
1753 
1754 	return NULL;
1755 }
1756 
1757 gconstpointer
rspamd_match_radix_map_addr(struct rspamd_radix_map_helper * map,const rspamd_inet_addr_t * addr)1758 rspamd_match_radix_map_addr (struct rspamd_radix_map_helper *map,
1759 		const rspamd_inet_addr_t *addr)
1760 {
1761 	struct rspamd_map_helper_value *val;
1762 
1763 	if (map == NULL || map->trie == NULL) {
1764 		return NULL;
1765 	}
1766 
1767 	val = (struct rspamd_map_helper_value *)radix_find_compressed_addr (map->trie, addr);
1768 
1769 	if (val != (gconstpointer)RADIX_NO_VALUE) {
1770 		val->hits ++;
1771 
1772 		return val->value;
1773 	}
1774 
1775 	return NULL;
1776 }
1777 
1778 
/*
 * CDB stuff
 */
1782 
1783 struct rspamd_cdb_map_helper *
rspamd_map_helper_new_cdb(struct rspamd_map * map)1784 rspamd_map_helper_new_cdb (struct rspamd_map *map)
1785 {
1786 	struct rspamd_cdb_map_helper *n;
1787 
1788 	n = g_malloc0 (sizeof (*n));
1789 	n->cdbs = (GQueue)G_QUEUE_INIT;
1790 	n->map = map;
1791 
1792 	rspamd_cryptobox_fast_hash_init (&n->hst, map_hash_seed);
1793 
1794 	return n;
1795 }
1796 
1797 void
rspamd_map_helper_destroy_cdb(struct rspamd_cdb_map_helper * c)1798 rspamd_map_helper_destroy_cdb (struct rspamd_cdb_map_helper *c)
1799 {
1800 	if (c == NULL) {
1801 		return;
1802 	}
1803 
1804 	GList *cur = c->cdbs.head;
1805 
1806 	while (cur) {
1807 		struct cdb *cdb = (struct cdb *)cur->data;
1808 
1809 		cdb_free (cdb);
1810 		g_free (cdb->filename);
1811 		close (cdb->cdb_fd);
1812 		g_free (cdb);
1813 
1814 		cur = g_list_next (cur);
1815 	}
1816 
1817 	g_queue_clear (&c->cdbs);
1818 
1819 	g_free (c);
1820 }
1821 
1822 gchar *
rspamd_cdb_list_read(gchar * chunk,gint len,struct map_cb_data * data,gboolean final)1823 rspamd_cdb_list_read (gchar *chunk,
1824 					  gint len,
1825 					  struct map_cb_data *data,
1826 					  gboolean final)
1827 {
1828 	struct rspamd_cdb_map_helper *cdb_data;
1829 	struct cdb *found = NULL;
1830 	struct rspamd_map *map = data->map;
1831 
1832 	g_assert (map->no_file_read);
1833 
1834 	if (data->cur_data == NULL) {
1835 		cdb_data = rspamd_map_helper_new_cdb (data->map);
1836 		data->cur_data = cdb_data;
1837 	}
1838 	else {
1839 		cdb_data = (struct rspamd_cdb_map_helper *)data->cur_data;
1840 	}
1841 
1842 	GList *cur = cdb_data->cdbs.head;
1843 
1844 	while (cur) {
1845 		struct cdb *elt = (struct cdb *)cur->data;
1846 
1847 		if (strcmp (elt->filename, chunk) == 0) {
1848 			found = elt;
1849 			break;
1850 		}
1851 
1852 		cur = g_list_next (cur);
1853 	}
1854 
1855 	if (found == NULL) {
1856 		/* New cdb */
1857 		gint fd;
1858 		struct cdb *cdb;
1859 
1860 		fd = rspamd_file_xopen (chunk, O_RDONLY, 0, TRUE);
1861 
1862 		if (fd == -1) {
1863 			msg_err_map ("cannot open cdb map from %s: %s", chunk, strerror (errno));
1864 
1865 			return NULL;
1866 		}
1867 
1868 		cdb = g_malloc0 (sizeof (struct cdb));
1869 
1870 		if (cdb_init (cdb, fd) == -1) {
1871 			g_free (cdb);
1872 			msg_err_map ("cannot init cdb map from %s: %s", chunk, strerror (errno));
1873 
1874 			return NULL;
1875 		}
1876 
1877 		cdb->filename = g_strdup (chunk);
1878 		g_queue_push_tail (&cdb_data->cdbs, cdb);
1879 		cdb_data->total_size += cdb->cdb_fsize;
1880 		rspamd_cryptobox_fast_hash_update (&cdb_data->hst, chunk, len);
1881 	}
1882 
1883 	return chunk + len;
1884 }
1885 
1886 void
rspamd_cdb_list_fin(struct map_cb_data * data,void ** target)1887 rspamd_cdb_list_fin (struct map_cb_data *data, void **target)
1888 {
1889 	struct rspamd_map *map = data->map;
1890 	struct rspamd_cdb_map_helper *cdb_data;
1891 
1892 	if (data->cur_data) {
1893 		cdb_data = (struct rspamd_cdb_map_helper *)data->cur_data;
1894 		msg_info_map ("read cdb of %Hz size", cdb_data->total_size);
1895 		data->map->traverse_function = NULL;
1896 		data->map->nelts = 0;
1897 		data->map->digest = rspamd_cryptobox_fast_hash_final (&cdb_data->hst);
1898 	}
1899 
1900 	if (target) {
1901 		*target = data->cur_data;
1902 	}
1903 
1904 	if (data->prev_data) {
1905 		cdb_data = (struct rspamd_cdb_map_helper *)data->prev_data;
1906 		rspamd_map_helper_destroy_cdb (cdb_data);
1907 	}
1908 }
1909 void
rspamd_cdb_list_dtor(struct map_cb_data * data)1910 rspamd_cdb_list_dtor (struct map_cb_data *data)
1911 {
1912 	if (data->cur_data) {
1913 		rspamd_map_helper_destroy_cdb (data->cur_data);
1914 	}
1915 }
1916 
1917 gconstpointer
rspamd_match_cdb_map(struct rspamd_cdb_map_helper * map,const gchar * in,gsize inlen)1918 rspamd_match_cdb_map (struct rspamd_cdb_map_helper *map,
1919 					  const gchar *in, gsize inlen)
1920 {
1921 	if (map == NULL || map->cdbs.head == NULL) {
1922 		return NULL;
1923 	}
1924 
1925 	GList *cur = map->cdbs.head;
1926 	static rspamd_ftok_t found;
1927 
1928 	while (cur) {
1929 		struct cdb *cdb = (struct cdb *)cur->data;
1930 
1931 		if (cdb_find (cdb, in, inlen) > 0) {
1932 			/* Extract and push value to lua as string */
1933 			unsigned vlen;
1934 			gconstpointer vpos;
1935 
1936 			vpos = cdb->cdb_mem + cdb_datapos (cdb);
1937 			vlen = cdb_datalen (cdb);
1938 			found.len = vlen;
1939 			found.begin = vpos;
1940 
1941 			return &found; /* Do not reuse! */
1942 		}
1943 
1944 		cur = g_list_next (cur);
1945 	}
1946 
1947 	return NULL;
1948 }
1949