1 /*-
2  * Copyright 2016 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 /*
17  * Common tokenization functions
18  */
19 
20 #include "rspamd.h"
21 #include "tokenizers.h"
22 #include "stat_internal.h"
23 #include "contrib/mumhash/mum.h"
24 #include "libmime/lang_detection.h"
25 #include "libstemmer.h"
26 
27 #include <unicode/utf8.h>
28 #include <unicode/uchar.h>
29 #include <unicode/uiter.h>
30 #include <unicode/ubrk.h>
31 #include <unicode/ucnv.h>
32 #if U_ICU_VERSION_MAJOR_NUM >= 44
33 #include <unicode/unorm2.h>
34 #endif
35 
36 #include <math.h>
37 
/*
 * Signature of a "get next token" callback.
 * rspamd_tokenizer_get_word_raw() below has exactly this parameter list and
 * return type.
 */
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
		rspamd_stat_token_t * token,
		GList **exceptions, gsize *rl, gboolean check_signature);
41 
/*
 * Lookup table indexed by byte value: a non-zero entry marks the byte as a
 * word delimiter for the raw tokenizer below.
 *
 * Marked bytes are TAB (9), LF (10), CR (13), space (32) and the ASCII
 * punctuation characters
 *     " # $ % & ( ) * + , - . / : ; < = > ? [ \ ] ^ _ ` { | } ~
 * The characters '!', '\'' and '@' are NOT marked, and neither is any byte
 * with a value of 127 or above.
 */
const gchar t_delimiters[256] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
	1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
	1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0
};
70 
/*
 * Get the next word from the raw (byte oriented) buffer `buf`, using the
 * t_delimiters table to find word boundaries.
 *
 * buf        - whole text being tokenized; FALSE is returned when it is NULL
 * cur        - in/out scan cursor (asserted to be non-NULL); scanning is
 *              (re)started when *cur or token->original.begin is NULL
 * token      - receives the word in token->original and the TEXT flag
 * exceptions - optional in/out list of struct rspamd_process_exception;
 *              the list head is moved forward when an exception is hit
 * rl         - optional; receives the word length on the normal return path
 * unused     - ignored
 *
 * Returns TRUE when the caller should call again, FALSE when the cursor is at
 * or past the end of the buffer or the scan ran into the end of the buffer.
 */
static gboolean
rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
		gchar const **cur, rspamd_stat_token_t * token,
		GList **exceptions, gsize *rl, gboolean unused)
{
	gsize remain, pos;
	const gchar *p;
	struct rspamd_process_exception *ex = NULL;

	if (buf == NULL) {
		return FALSE;
	}

	g_assert (cur != NULL);

	/* Only the current head of the exceptions list is examined in this call */
	if (exceptions != NULL && *exceptions != NULL) {
		ex = (*exceptions)->data;
	}

	if (token->original.begin == NULL || *cur == NULL) {
		/* First call: pick the starting position */
		if (ex != NULL) {
			if (ex->pos == 0) {
				/*
				 * The text starts with an exception: begin right after it.
				 * NOTE(review): the list head is not advanced on this path,
				 * so this exception stays the head for the following calls -
				 * confirm that this is intended.
				 */
				token->original.begin = buf->original.begin + ex->len;
				token->original.len = ex->len;
				token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
			}
			else {
				token->original.begin = buf->original.begin;
				token->original.len = 0;
			}
		}
		else {
			token->original.begin = buf->original.begin;
			token->original.len = 0;
		}
		*cur = token->original.begin;
	}

	/* Any length assigned above is discarded here */
	token->original.len = 0;

	pos = *cur - buf->original.begin;
	if (pos >= buf->original.len) {
		return FALSE;
	}

	remain = buf->original.len - pos;
	p = *cur;

	/*
	 * Step over the byte under the cursor and then over any run of
	 * delimiters that follows it.
	 * NOTE(review): being a do/while, this always skips the byte at *cur
	 * without looking at it; on the very first call that is the first byte
	 * of the buffer - confirm that this is intended.
	 */
	do {
		if (ex != NULL && ex->pos == pos) {
			/*
			 * Go to the next exception and resume after the excluded span.
			 * NOTE(review): neither *rl nor token->flags is updated on this
			 * path, the caller sees the values they had before the call.
			 */
			*exceptions = g_list_next (*exceptions);
			*cur = p + ex->len;
			return TRUE;
		}
		pos++;
		p++;
		remain--;
	} while (remain > 0 && t_delimiters[(guchar)*p]);

	token->original.begin = p;

	/* Collect bytes up to the next delimiter */
	while (remain > 0 && !t_delimiters[(guchar)*p]) {
		if (ex != NULL && ex->pos == pos) {
			/*
			 * Exception in the middle of a word.
			 * NOTE(review): token->original.len holds the partial length
			 * here, but *rl and token->flags are left untouched.
			 */
			*exceptions = g_list_next (*exceptions);
			*cur = p + ex->len;
			return TRUE;
		}
		token->original.len++;
		pos++;
		remain--;
		p++;
	}

	/*
	 * NOTE(review): a word that runs up to the very end of the buffer is not
	 * reported, FALSE is returned instead.
	 */
	if (remain == 0) {
		return FALSE;
	}

	if (rl) {
		*rl = token->original.len;
	}

	token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;

	/* The cursor is left on the delimiter that terminated the word */
	*cur = p;

	return TRUE;
}
161 
162 static inline gboolean
rspamd_tokenize_check_limit(gboolean decay,guint word_decay,guint nwords,guint64 * hv,guint64 * prob,const rspamd_stat_token_t * token,gssize remain,gssize total)163 rspamd_tokenize_check_limit (gboolean decay,
164 							 guint word_decay,
165 							 guint nwords,
166 							 guint64 *hv,
167 							 guint64 *prob,
168 							 const rspamd_stat_token_t *token,
169 							 gssize remain,
170 							 gssize total)
171 {
172 	static const gdouble avg_word_len = 6.0;
173 
174 	if (!decay) {
175 		if (token->original.len >= sizeof (guint64)) {
176 			guint64 tmp;
177 			memcpy (&tmp, token->original.begin, sizeof (tmp));
178 			*hv = mum_hash_step (*hv, tmp);
179 		}
180 
181 		/* Check for decay */
182 		if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) {
183 			/* Start decay */
184 			gdouble decay_prob;
185 
186 			*hv = mum_hash_finish (*hv);
187 
188 			/* We assume that word is 6 symbols length in average */
189 			decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len) * 10;
190 			decay_prob = floor (decay_prob) / 10.0;
191 
192 			if (decay_prob >= 1.0) {
193 				*prob = G_MAXUINT64;
194 			}
195 			else {
196 				*prob = decay_prob * G_MAXUINT64;
197 			}
198 
199 			return TRUE;
200 		}
201 	}
202 	else {
203 		/* Decaying probability */
204 		/* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
205 		*hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
206 
207 		if (*hv > *prob) {
208 			return TRUE;
209 		}
210 	}
211 
212 	return FALSE;
213 }
214 
215 static inline gboolean
rspamd_utf_word_valid(const guchar * text,const guchar * end,gint32 start,gint32 finish)216 rspamd_utf_word_valid (const guchar *text, const guchar *end,
217 		gint32 start, gint32 finish)
218 {
219 	const guchar *st = text + start, *fin = text + finish;
220 	UChar32 c;
221 
222 	if (st >= end || fin > end || st >= fin) {
223 		return FALSE;
224 	}
225 
226 	U8_NEXT (text, start, finish, c);
227 
228 	if (u_isJavaIDPart (c)) {
229 		return TRUE;
230 	}
231 
232 	return FALSE;
233 }
/*
 * Move `cur` to the next node of the exceptions list and reload `ex` from it;
 * `ex` becomes NULL when the list is exhausted. Both names are taken from the
 * scope where the macro is expanded.
 */
#define SHIFT_EX do { \
	cur = g_list_next (cur); \
	ex = (cur != NULL) ? (struct rspamd_process_exception *) cur->data : NULL; \
} while (0)
243 
244 static inline void
rspamd_tokenize_exception(struct rspamd_process_exception * ex,GArray * res)245 rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
246 {
247 	rspamd_stat_token_t token;
248 
249 	memset (&token, 0, sizeof (token));
250 
251 	if (ex->type == RSPAMD_EXCEPTION_GENERIC) {
252 		token.original.begin = "!!EX!!";
253 		token.original.len = sizeof ("!!EX!!") - 1;
254 		token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
255 
256 		g_array_append_val (res, token);
257 		token.flags = 0;
258 	}
259 	else if (ex->type == RSPAMD_EXCEPTION_URL) {
260 		struct rspamd_url *uri;
261 
262 		uri = ex->ptr;
263 
264 		if (uri && uri->tldlen > 0) {
265 			token.original.begin = rspamd_url_tld_unsafe (uri);
266 			token.original.len = uri->tldlen;
267 
268 		}
269 		else {
270 			token.original.begin = "!!EX!!";
271 			token.original.len = sizeof ("!!EX!!") - 1;
272 		}
273 
274 		token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
275 		g_array_append_val (res, token);
276 		token.flags = 0;
277 	}
278 }
279 
280 
281 GArray *
rspamd_tokenize_text(const gchar * text,gsize len,const UText * utxt,enum rspamd_tokenize_type how,struct rspamd_config * cfg,GList * exceptions,guint64 * hash,GArray * cur_words,rspamd_mempool_t * pool)282 rspamd_tokenize_text (const gchar *text, gsize len,
283 					  const UText *utxt,
284 					  enum rspamd_tokenize_type how,
285 					  struct rspamd_config *cfg,
286 					  GList *exceptions,
287 					  guint64 *hash,
288 					  GArray *cur_words,
289 					  rspamd_mempool_t *pool)
290 {
291 	rspamd_stat_token_t token, buf;
292 	const gchar *pos = NULL;
293 	gsize l = 0;
294 	GArray *res;
295 	GList *cur = exceptions;
296 	guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
297 	guint64 hv = 0;
298 	gboolean decay = FALSE, long_text_mode = FALSE;
299 	guint64 prob = 0;
300 	static UBreakIterator* bi = NULL;
301 	static const gsize long_text_limit = 1 * 1024 * 1024;
302 	static const ev_tstamp max_exec_time = 0.2; /* 200 ms */
303 	ev_tstamp start;
304 
305 	if (text == NULL) {
306 		return cur_words;
307 	}
308 
309 	if (len > long_text_limit) {
310 		/*
311 		 * In this mode we do additional checks to avoid performance issues
312 		 */
313 		long_text_mode = TRUE;
314 		start = ev_time ();
315 	}
316 
317 	buf.original.begin = text;
318 	buf.original.len = len;
319 	buf.flags = 0;
320 
321 	memset (&token, 0, sizeof (token));
322 
323 	if (cfg != NULL) {
324 		min_len = cfg->min_word_len;
325 		max_len = cfg->max_word_len;
326 		word_decay = cfg->words_decay;
327 		initial_size = word_decay * 2;
328 	}
329 
330 	if (!cur_words) {
331 		res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
332 				initial_size);
333 	}
334 	else {
335 		res = cur_words;
336 	}
337 
338 	if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
339 		while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
340 			if (l == 0 || (min_len > 0 && l < min_len) ||
341 				(max_len > 0 && l > max_len)) {
342 				token.original.begin = pos;
343 				continue;
344 			}
345 
346 			if (token.original.len > 0 &&
347 				rspamd_tokenize_check_limit (decay, word_decay, res->len,
348 					&hv, &prob, &token, pos - text, len)) {
349 				if (!decay) {
350 					decay = TRUE;
351 				}
352 				else {
353 					token.original.begin = pos;
354 					continue;
355 				}
356 			}
357 
358 			if (long_text_mode) {
359 				if ((res->len + 1) % 16 == 0) {
360 					ev_tstamp now = ev_time ();
361 
362 					if (now - start > max_exec_time) {
363 						msg_warn_pool_check (
364 								"too long time has been spent on tokenization:"
365 								  " %.1f ms, limit is %.1f ms; %d words added so far",
366 								(now - start) * 1e3, max_exec_time * 1e3,
367 								res->len);
368 
369 						goto end;
370 					}
371 				}
372 			}
373 
374 			g_array_append_val (res, token);
375 
376 			if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
377 				/* Due to bug in glib ! */
378 				msg_err_pool_check (
379 						"too many words found: %d, stop tokenization to avoid DoS",
380 						res->len);
381 
382 				goto end;
383 			}
384 
385 			token.original.begin = pos;
386 		}
387 	}
388 	else {
389 		/* UTF8 boundaries */
390 		UErrorCode uc_err = U_ZERO_ERROR;
391 		int32_t last, p;
392 		struct rspamd_process_exception *ex = NULL;
393 
394 		if (bi == NULL) {
395 			bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err);
396 
397 			g_assert (U_SUCCESS (uc_err));
398 		}
399 
400 		ubrk_setUText (bi, (UText*)utxt, &uc_err);
401 		last = ubrk_first (bi);
402 		p = last;
403 
404 		if (cur) {
405 			ex = (struct rspamd_process_exception *)cur->data;
406 		}
407 
408 		while (p != UBRK_DONE) {
409 start_over:
410 			token.original.len = 0;
411 
412 			if (p > last) {
413 				if (ex && cur) {
414 					/* Check exception */
415 					if (ex->pos >= last && ex->pos <= p) {
416 						/* We have an exception within boundary */
417 						/* First, start to drain exceptions from the start */
418 						while (cur && ex->pos <= last) {
419 							/* We have an exception at the beginning, skip those */
420 							last += ex->len;
421 							rspamd_tokenize_exception (ex, res);
422 
423 							if (last > p) {
424 								/* Exception spread over the boundaries */
425 								while (last > p && p != UBRK_DONE) {
426 									gint32 old_p = p;
427 									p = ubrk_next (bi);
428 
429 									if (p != UBRK_DONE && p <= old_p) {
430 										msg_warn_pool_check (
431 												"tokenization reversed back on position %d,"
432 												"%d new position (%d backward), likely libicu bug!",
433 												(gint)(p), (gint)(old_p), old_p - p);
434 
435 										goto end;
436 									}
437 								}
438 
439 								/* We need to reset our scan with new p and last */
440 								SHIFT_EX;
441 								goto start_over;
442 							}
443 
444 							SHIFT_EX;
445 						}
446 
447 						/* Now, we can have an exception within boundary again */
448 						if (cur && ex->pos >= last && ex->pos <= p) {
449 							/* Append the first part */
450 							if (rspamd_utf_word_valid (text, text + len, last,
451 									ex->pos)) {
452 								token.original.begin = text + last;
453 								token.original.len = ex->pos - last;
454 								token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
455 											  RSPAMD_STAT_TOKEN_FLAG_UTF;
456 							}
457 
458 							/* Process the current exception */
459 							last += ex->len + (ex->pos - last);
460 
461 							rspamd_tokenize_exception (ex, res);
462 
463 							if (last > p) {
464 								/* Exception spread over the boundaries */
465 								while (last > p && p != UBRK_DONE) {
466 									gint32 old_p = p;
467 									p = ubrk_next (bi);
468 									if (p != UBRK_DONE && p <= old_p) {
469 										msg_warn_pool_check (
470 												"tokenization reversed back on position %d,"
471 												"%d new position (%d backward), likely libicu bug!",
472 												(gint)(p), (gint)(old_p), old_p - p);
473 
474 										goto end;
475 									}
476 								}
477 								/* We need to reset our scan with new p and last */
478 								SHIFT_EX;
479 								goto start_over;
480 							}
481 
482 							SHIFT_EX;
483 						}
484 						else if (p > last) {
485 							if (rspamd_utf_word_valid (text, text + len, last, p)) {
486 								token.original.begin = text + last;
487 								token.original.len = p - last;
488 								token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
489 											  RSPAMD_STAT_TOKEN_FLAG_UTF;
490 							}
491 						}
492 					}
493 					else if (ex->pos < last) {
494 						/* Forward exceptions list */
495 						while (cur && ex->pos <= last) {
496 							/* We have an exception at the beginning, skip those */
497 							SHIFT_EX;
498 						}
499 
500 						if (rspamd_utf_word_valid (text, text + len, last, p)) {
501 							token.original.begin = text + last;
502 							token.original.len = p - last;
503 							token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
504 										  RSPAMD_STAT_TOKEN_FLAG_UTF;
505 						}
506 					}
507 					else {
508 						/* No exceptions within boundary */
509 						if (rspamd_utf_word_valid (text, text + len, last, p)) {
510 							token.original.begin = text + last;
511 							token.original.len = p - last;
512 							token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
513 										  RSPAMD_STAT_TOKEN_FLAG_UTF;
514 						}
515 					}
516 				}
517 				else {
518 					if (rspamd_utf_word_valid (text, text + len, last, p)) {
519 						token.original.begin = text + last;
520 						token.original.len = p - last;
521 						token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
522 									  RSPAMD_STAT_TOKEN_FLAG_UTF;
523 					}
524 				}
525 
526 				if (token.original.len > 0 &&
527 					rspamd_tokenize_check_limit (decay, word_decay, res->len,
528 						&hv, &prob, &token, p, len)) {
529 					if (!decay) {
530 						decay = TRUE;
531 					} else {
532 						token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED;
533 					}
534 				}
535 			}
536 
537 			if (token.original.len > 0) {
538 				/* Additional check for number of words */
539 				if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
540 					/* Due to bug in glib ! */
541 					msg_err ("too many words found: %d, stop tokenization to avoid DoS",
542 							res->len);
543 
544 					goto end;
545 				}
546 
547 				g_array_append_val (res, token);
548 			}
549 
550 			/* Also check for long text mode */
551 			if (long_text_mode) {
552 				/* Check time each 128 words added */
553 				const int words_check_mask = 0x7F;
554 
555 				if ((res->len & words_check_mask) == words_check_mask) {
556 					ev_tstamp now = ev_time ();
557 
558 					if (now - start > max_exec_time) {
559 						msg_warn_pool_check (
560 								"too long time has been spent on tokenization:"
561 								  " %.1f ms, limit is %.1f ms; %d words added so far",
562 								(now - start) * 1e3, max_exec_time * 1e3,
563 								res->len);
564 
565 						goto end;
566 					}
567 				}
568 			}
569 
570 			last = p;
571 			p = ubrk_next (bi);
572 
573 			if (p != UBRK_DONE && p <= last) {
574 				msg_warn_pool_check ("tokenization reversed back on position %d,"
575 						 "%d new position (%d backward), likely libicu bug!",
576 						(gint)(p), (gint)(last), last - p);
577 
578 				goto end;
579 			}
580 		}
581 	}
582 
583 end:
584 	if (!decay) {
585 		hv = mum_hash_finish (hv);
586 	}
587 
588 	if (hash) {
589 		*hash = hv;
590 	}
591 
592 	return res;
593 }
594 
595 #undef SHIFT_EX
596 
597 static void
rspamd_add_metawords_from_str(const gchar * beg,gsize len,struct rspamd_task * task)598 rspamd_add_metawords_from_str (const gchar *beg, gsize len,
599 								struct rspamd_task *task)
600 {
601 	UText utxt = UTEXT_INITIALIZER;
602 	UErrorCode uc_err = U_ZERO_ERROR;
603 	guint i = 0;
604 	UChar32 uc;
605 	gboolean valid_utf = TRUE;
606 
607 	while (i < len) {
608 		U8_NEXT (beg, i, len, uc);
609 
610 		if (((gint32) uc) < 0) {
611 			valid_utf = FALSE;
612 			break;
613 		}
614 
615 #if U_ICU_VERSION_MAJOR_NUM < 50
616 		if (u_isalpha (uc)) {
617 			gint32 sc = ublock_getCode (uc);
618 
619 			if (sc == UBLOCK_THAI) {
620 				valid_utf = FALSE;
621 				msg_info_task ("enable workaround for Thai characters for old libicu");
622 				break;
623 			}
624 		}
625 #endif
626 	}
627 
628 	if (valid_utf) {
629 		utext_openUTF8 (&utxt,
630 				beg,
631 				len,
632 				&uc_err);
633 
634 		task->meta_words = rspamd_tokenize_text (beg, len,
635 				&utxt, RSPAMD_TOKENIZE_UTF,
636 				task->cfg, NULL, NULL,
637 				task->meta_words,
638 				task->task_pool);
639 
640 		utext_close (&utxt);
641 	}
642 	else {
643 		task->meta_words = rspamd_tokenize_text (beg, len,
644 				NULL, RSPAMD_TOKENIZE_RAW,
645 				task->cfg, NULL, NULL, task->meta_words,
646 				task->task_pool);
647 	}
648 }
649 
650 void
rspamd_tokenize_meta_words(struct rspamd_task * task)651 rspamd_tokenize_meta_words (struct rspamd_task *task)
652 {
653 	guint i = 0;
654 	rspamd_stat_token_t *tok;
655 
656 	if (MESSAGE_FIELD (task, subject)) {
657 		rspamd_add_metawords_from_str (MESSAGE_FIELD (task, subject),
658 				strlen (MESSAGE_FIELD (task, subject)), task);
659 	}
660 
661 	if (MESSAGE_FIELD (task, from_mime) && MESSAGE_FIELD (task, from_mime)->len > 0) {
662 		struct rspamd_email_address *addr;
663 
664 		addr = g_ptr_array_index (MESSAGE_FIELD (task, from_mime), 0);
665 
666 		if (addr->name) {
667 			rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task);
668 		}
669 	}
670 
671 	if (task->meta_words != NULL) {
672 		const gchar *language = NULL;
673 
674 		if (MESSAGE_FIELD (task, text_parts) &&
675 				MESSAGE_FIELD (task, text_parts)->len > 0) {
676 			struct rspamd_mime_text_part *tp = g_ptr_array_index (
677 					MESSAGE_FIELD (task, text_parts), 0);
678 
679 			if (tp->language) {
680 				language = tp->language;
681 			}
682 		}
683 
684 		rspamd_normalize_words (task->meta_words, task->task_pool);
685 		rspamd_stem_words (task->meta_words, task->task_pool, language,
686 				task->lang_det);
687 
688 		for (i = 0; i < task->meta_words->len; i++) {
689 			tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
690 			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
691 		}
692 	}
693 }
694 
695 static inline void
rspamd_uchars_to_ucs32(const UChar * src,gsize srclen,rspamd_stat_token_t * tok,rspamd_mempool_t * pool)696 rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
697 						rspamd_stat_token_t *tok,
698 						rspamd_mempool_t *pool)
699 {
700 	UChar32 *dest, t, *d;
701 	gint32 i = 0;
702 
703 	dest = rspamd_mempool_alloc (pool, srclen * sizeof (UChar32));
704 	d = dest;
705 
706 	while (i < srclen) {
707 		U16_NEXT_UNSAFE (src, i, t);
708 
709 		if (u_isgraph (t)) {
710 			UCharCategory cat;
711 
712 			cat = u_charType (t);
713 #if U_ICU_VERSION_MAJOR_NUM >= 57
714 			if (u_hasBinaryProperty (t, UCHAR_EMOJI)) {
715 				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
716 			}
717 #endif
718 
719 			if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
720 					cat == U_CONNECTOR_PUNCTUATION ||
721 					cat == U_MATH_SYMBOL ||
722 					cat == U_CURRENCY_SYMBOL) {
723 				*d++ = u_tolower (t);
724 			}
725 		}
726 		else {
727 			/* Invisible spaces ! */
728 			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
729 		}
730 	}
731 
732 	tok->unicode.begin = dest;
733 	tok->unicode.len = d - dest;
734 }
735 
736 static inline void
rspamd_ucs32_to_normalised(rspamd_stat_token_t * tok,rspamd_mempool_t * pool)737 rspamd_ucs32_to_normalised (rspamd_stat_token_t *tok,
738 							rspamd_mempool_t *pool)
739 {
740 	guint i, doff = 0;
741 	gsize utflen = 0;
742 	gchar *dest;
743 	UChar32 t;
744 
745 	for (i = 0; i < tok->unicode.len; i ++) {
746 		utflen += U8_LENGTH (tok->unicode.begin[i]);
747 	}
748 
749 	dest = rspamd_mempool_alloc (pool, utflen + 1);
750 
751 	for (i = 0; i < tok->unicode.len; i ++) {
752 		t = tok->unicode.begin[i];
753 		U8_APPEND_UNSAFE (dest, doff, t);
754 	}
755 
756 	g_assert (doff <= utflen);
757 	dest[doff] = '\0';
758 
759 	tok->normalized.len = doff;
760 	tok->normalized.begin = dest;
761 }
762 
763 void
rspamd_normalize_single_word(rspamd_stat_token_t * tok,rspamd_mempool_t * pool)764 rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
765 {
766 	UErrorCode uc_err = U_ZERO_ERROR;
767 	UConverter *utf8_converter;
768 	UChar tmpbuf[1024]; /* Assume that we have no longer words... */
769 	gsize ulen;
770 
771 	utf8_converter = rspamd_get_utf8_converter ();
772 
773 	if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
774 		ulen = ucnv_toUChars (utf8_converter,
775 				tmpbuf,
776 				G_N_ELEMENTS (tmpbuf),
777 				tok->original.begin,
778 				tok->original.len,
779 				&uc_err);
780 
781 		/* Now, we need to understand if we need to normalise the word */
782 		if (!U_SUCCESS (uc_err)) {
783 			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
784 			tok->unicode.begin = NULL;
785 			tok->unicode.len = 0;
786 			tok->normalized.begin = NULL;
787 			tok->normalized.len = 0;
788 		}
789 		else {
790 #if U_ICU_VERSION_MAJOR_NUM >= 44
791 			const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
792 			gint32 end;
793 
794 			/* We can now check if we need to decompose */
795 			end = unorm2_spanQuickCheckYes (norm, tmpbuf, ulen, &uc_err);
796 
797 			if (!U_SUCCESS (uc_err)) {
798 				rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
799 				tok->normalized.begin = NULL;
800 				tok->normalized.len = 0;
801 				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
802 			}
803 			else {
804 				if (end == ulen) {
805 					/* Already normalised, just lowercase */
806 					rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
807 					rspamd_ucs32_to_normalised (tok, pool);
808 				}
809 				else {
810 					/* Perform normalization */
811 					UChar normbuf[1024];
812 
813 					g_assert (end < G_N_ELEMENTS (normbuf));
814 					/* First part */
815 					memcpy (normbuf, tmpbuf, end * sizeof (UChar));
816 					/* Second part */
817 					ulen = unorm2_normalizeSecondAndAppend (norm,
818 							normbuf, end,
819 							G_N_ELEMENTS (normbuf),
820 							tmpbuf + end,
821 							ulen - end,
822 							&uc_err);
823 
824 					if (!U_SUCCESS (uc_err)) {
825 						if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
826 							msg_warn_pool_check ("cannot normalise text '%*s': %s",
827 									(gint)tok->original.len, tok->original.begin,
828 									u_errorName (uc_err));
829 							rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
830 							rspamd_ucs32_to_normalised (tok, pool);
831 							tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
832 						}
833 					}
834 					else {
835 						/* Copy normalised back */
836 						rspamd_uchars_to_ucs32 (normbuf, ulen, tok, pool);
837 						tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
838 						rspamd_ucs32_to_normalised (tok, pool);
839 					}
840 				}
841 			}
842 #else
843 			/* Legacy version with no unorm2 interface */
844 			rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
845 			rspamd_ucs32_to_normalised (tok, pool);
846 #endif
847 		}
848 	}
849 	else {
850 		if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
851 			/* Simple lowercase */
852 			gchar *dest;
853 
854 			dest = rspamd_mempool_alloc (pool, tok->original.len + 1);
855 			rspamd_strlcpy (dest, tok->original.begin, tok->original.len + 1);
856 			rspamd_str_lc (dest, tok->original.len);
857 			tok->normalized.len = tok->original.len;
858 			tok->normalized.begin = dest;
859 		}
860 	}
861 }
862 
863 void
rspamd_normalize_words(GArray * words,rspamd_mempool_t * pool)864 rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
865 {
866 	rspamd_stat_token_t *tok;
867 	guint i;
868 
869 	for (i = 0; i < words->len; i++) {
870 		tok = &g_array_index (words, rspamd_stat_token_t, i);
871 		rspamd_normalize_single_word (tok, pool);
872 	}
873 }
874 
875 void
rspamd_stem_words(GArray * words,rspamd_mempool_t * pool,const gchar * language,struct rspamd_lang_detector * d)876 rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
877 				   const gchar *language,
878 				   struct rspamd_lang_detector *d)
879 {
880 	static GHashTable *stemmers = NULL;
881 	struct sb_stemmer *stem = NULL;
882 	guint i;
883 	rspamd_stat_token_t *tok;
884 	gchar *dest;
885 	gsize dlen;
886 
887 	if (!stemmers) {
888 		stemmers = g_hash_table_new (rspamd_strcase_hash,
889 				rspamd_strcase_equal);
890 	}
891 
892 	if (language && language[0] != '\0') {
893 		stem = g_hash_table_lookup (stemmers, language);
894 
895 		if (stem == NULL) {
896 
897 			stem = sb_stemmer_new (language, "UTF_8");
898 
899 			if (stem == NULL) {
900 				msg_debug_pool (
901 						"<%s> cannot create lemmatizer for %s language",
902 						language);
903 				g_hash_table_insert (stemmers, g_strdup (language),
904 						GINT_TO_POINTER (-1));
905 			}
906 			else {
907 				g_hash_table_insert (stemmers, g_strdup (language),
908 						stem);
909 			}
910 		}
911 		else if (stem == GINT_TO_POINTER (-1)) {
912 			/* Negative cache */
913 			stem = NULL;
914 		}
915 	}
916 	for (i = 0; i < words->len; i++) {
917 		tok = &g_array_index (words, rspamd_stat_token_t, i);
918 
919 		if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
920 			if (stem) {
921 				const gchar *stemmed = NULL;
922 
923 				stemmed = sb_stemmer_stem (stem,
924 						tok->normalized.begin, tok->normalized.len);
925 
926 				dlen = stemmed ? strlen (stemmed) : 0;
927 
928 				if (dlen > 0) {
929 					dest = rspamd_mempool_alloc (pool, dlen + 1);
930 					memcpy (dest, stemmed, dlen);
931 					dest[dlen] = '\0';
932 					tok->stemmed.len = dlen;
933 					tok->stemmed.begin = dest;
934 					tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
935 				}
936 				else {
937 					/* Fallback */
938 					tok->stemmed.len = tok->normalized.len;
939 					tok->stemmed.begin = tok->normalized.begin;
940 				}
941 			}
942 			else {
943 				tok->stemmed.len = tok->normalized.len;
944 				tok->stemmed.begin = tok->normalized.begin;
945 			}
946 
947 			if (tok->stemmed.len > 0 && d != NULL &&
948 				rspamd_language_detector_is_stop_word (d, tok->stemmed.begin, tok->stemmed.len)) {
949 				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
950 			}
951 		}
952 		else {
953 			if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
954 				/* Raw text, lowercase */
955 				tok->stemmed.len = tok->normalized.len;
956 				tok->stemmed.begin = tok->normalized.begin;
957 			}
958 		}
959 	}
960 }