1 /*-
2 * Copyright 2016 Vsevolod Stakhov
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 /*
17 * Common tokenization functions
18 */
19
20 #include "rspamd.h"
21 #include "tokenizers.h"
22 #include "stat_internal.h"
23 #include "contrib/mumhash/mum.h"
24 #include "libmime/lang_detection.h"
25 #include "libstemmer.h"
26
27 #include <unicode/utf8.h>
28 #include <unicode/uchar.h>
29 #include <unicode/uiter.h>
30 #include <unicode/ubrk.h>
31 #include <unicode/ucnv.h>
32 #if U_ICU_VERSION_MAJOR_NUM >= 44
33 #include <unicode/unorm2.h>
34 #endif
35
36 #include <math.h>
37
38 typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
39 rspamd_stat_token_t * token,
40 GList **exceptions, gsize *rl, gboolean check_signature);
41
42 const gchar t_delimiters[256] = {
43 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
44 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
47 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
48 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
49 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
50 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
51 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
52 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68 0, 0, 0, 0, 0, 0
69 };
70
71 /* Get next word from specified f_str_t buf */
72 static gboolean
rspamd_tokenizer_get_word_raw(rspamd_stat_token_t * buf,gchar const ** cur,rspamd_stat_token_t * token,GList ** exceptions,gsize * rl,gboolean unused)73 rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
74 gchar const **cur, rspamd_stat_token_t * token,
75 GList **exceptions, gsize *rl, gboolean unused)
76 {
77 gsize remain, pos;
78 const gchar *p;
79 struct rspamd_process_exception *ex = NULL;
80
81 if (buf == NULL) {
82 return FALSE;
83 }
84
85 g_assert (cur != NULL);
86
87 if (exceptions != NULL && *exceptions != NULL) {
88 ex = (*exceptions)->data;
89 }
90
91 if (token->original.begin == NULL || *cur == NULL) {
92 if (ex != NULL) {
93 if (ex->pos == 0) {
94 token->original.begin = buf->original.begin + ex->len;
95 token->original.len = ex->len;
96 token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
97 }
98 else {
99 token->original.begin = buf->original.begin;
100 token->original.len = 0;
101 }
102 }
103 else {
104 token->original.begin = buf->original.begin;
105 token->original.len = 0;
106 }
107 *cur = token->original.begin;
108 }
109
110 token->original.len = 0;
111
112 pos = *cur - buf->original.begin;
113 if (pos >= buf->original.len) {
114 return FALSE;
115 }
116
117 remain = buf->original.len - pos;
118 p = *cur;
119
120 /* Skip non delimiters symbols */
121 do {
122 if (ex != NULL && ex->pos == pos) {
123 /* Go to the next exception */
124 *exceptions = g_list_next (*exceptions);
125 *cur = p + ex->len;
126 return TRUE;
127 }
128 pos++;
129 p++;
130 remain--;
131 } while (remain > 0 && t_delimiters[(guchar)*p]);
132
133 token->original.begin = p;
134
135 while (remain > 0 && !t_delimiters[(guchar)*p]) {
136 if (ex != NULL && ex->pos == pos) {
137 *exceptions = g_list_next (*exceptions);
138 *cur = p + ex->len;
139 return TRUE;
140 }
141 token->original.len++;
142 pos++;
143 remain--;
144 p++;
145 }
146
147 if (remain == 0) {
148 return FALSE;
149 }
150
151 if (rl) {
152 *rl = token->original.len;
153 }
154
155 token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
156
157 *cur = p;
158
159 return TRUE;
160 }
161
162 static inline gboolean
rspamd_tokenize_check_limit(gboolean decay,guint word_decay,guint nwords,guint64 * hv,guint64 * prob,const rspamd_stat_token_t * token,gssize remain,gssize total)163 rspamd_tokenize_check_limit (gboolean decay,
164 guint word_decay,
165 guint nwords,
166 guint64 *hv,
167 guint64 *prob,
168 const rspamd_stat_token_t *token,
169 gssize remain,
170 gssize total)
171 {
172 static const gdouble avg_word_len = 6.0;
173
174 if (!decay) {
175 if (token->original.len >= sizeof (guint64)) {
176 guint64 tmp;
177 memcpy (&tmp, token->original.begin, sizeof (tmp));
178 *hv = mum_hash_step (*hv, tmp);
179 }
180
181 /* Check for decay */
182 if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) {
183 /* Start decay */
184 gdouble decay_prob;
185
186 *hv = mum_hash_finish (*hv);
187
188 /* We assume that word is 6 symbols length in average */
189 decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len) * 10;
190 decay_prob = floor (decay_prob) / 10.0;
191
192 if (decay_prob >= 1.0) {
193 *prob = G_MAXUINT64;
194 }
195 else {
196 *prob = decay_prob * G_MAXUINT64;
197 }
198
199 return TRUE;
200 }
201 }
202 else {
203 /* Decaying probability */
204 /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
205 *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
206
207 if (*hv > *prob) {
208 return TRUE;
209 }
210 }
211
212 return FALSE;
213 }
214
215 static inline gboolean
rspamd_utf_word_valid(const guchar * text,const guchar * end,gint32 start,gint32 finish)216 rspamd_utf_word_valid (const guchar *text, const guchar *end,
217 gint32 start, gint32 finish)
218 {
219 const guchar *st = text + start, *fin = text + finish;
220 UChar32 c;
221
222 if (st >= end || fin > end || st >= fin) {
223 return FALSE;
224 }
225
226 U8_NEXT (text, start, finish, c);
227
228 if (u_isJavaIDPart (c)) {
229 return TRUE;
230 }
231
232 return FALSE;
233 }
/*
 * Moves the `cur` list cursor of the expanding scope to the next exception
 * and reloads `ex` from it (NULL when the list is exhausted).
 */
#define SHIFT_EX do { \
	cur = g_list_next (cur); \
	ex = (cur != NULL) ? \
		(struct rspamd_process_exception *) cur->data : NULL; \
} while(0)
243
244 static inline void
rspamd_tokenize_exception(struct rspamd_process_exception * ex,GArray * res)245 rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
246 {
247 rspamd_stat_token_t token;
248
249 memset (&token, 0, sizeof (token));
250
251 if (ex->type == RSPAMD_EXCEPTION_GENERIC) {
252 token.original.begin = "!!EX!!";
253 token.original.len = sizeof ("!!EX!!") - 1;
254 token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
255
256 g_array_append_val (res, token);
257 token.flags = 0;
258 }
259 else if (ex->type == RSPAMD_EXCEPTION_URL) {
260 struct rspamd_url *uri;
261
262 uri = ex->ptr;
263
264 if (uri && uri->tldlen > 0) {
265 token.original.begin = rspamd_url_tld_unsafe (uri);
266 token.original.len = uri->tldlen;
267
268 }
269 else {
270 token.original.begin = "!!EX!!";
271 token.original.len = sizeof ("!!EX!!") - 1;
272 }
273
274 token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
275 g_array_append_val (res, token);
276 token.flags = 0;
277 }
278 }
279
280
281 GArray *
rspamd_tokenize_text(const gchar * text,gsize len,const UText * utxt,enum rspamd_tokenize_type how,struct rspamd_config * cfg,GList * exceptions,guint64 * hash,GArray * cur_words,rspamd_mempool_t * pool)282 rspamd_tokenize_text (const gchar *text, gsize len,
283 const UText *utxt,
284 enum rspamd_tokenize_type how,
285 struct rspamd_config *cfg,
286 GList *exceptions,
287 guint64 *hash,
288 GArray *cur_words,
289 rspamd_mempool_t *pool)
290 {
291 rspamd_stat_token_t token, buf;
292 const gchar *pos = NULL;
293 gsize l = 0;
294 GArray *res;
295 GList *cur = exceptions;
296 guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
297 guint64 hv = 0;
298 gboolean decay = FALSE, long_text_mode = FALSE;
299 guint64 prob = 0;
300 static UBreakIterator* bi = NULL;
301 static const gsize long_text_limit = 1 * 1024 * 1024;
302 static const ev_tstamp max_exec_time = 0.2; /* 200 ms */
303 ev_tstamp start;
304
305 if (text == NULL) {
306 return cur_words;
307 }
308
309 if (len > long_text_limit) {
310 /*
311 * In this mode we do additional checks to avoid performance issues
312 */
313 long_text_mode = TRUE;
314 start = ev_time ();
315 }
316
317 buf.original.begin = text;
318 buf.original.len = len;
319 buf.flags = 0;
320
321 memset (&token, 0, sizeof (token));
322
323 if (cfg != NULL) {
324 min_len = cfg->min_word_len;
325 max_len = cfg->max_word_len;
326 word_decay = cfg->words_decay;
327 initial_size = word_decay * 2;
328 }
329
330 if (!cur_words) {
331 res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
332 initial_size);
333 }
334 else {
335 res = cur_words;
336 }
337
338 if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
339 while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
340 if (l == 0 || (min_len > 0 && l < min_len) ||
341 (max_len > 0 && l > max_len)) {
342 token.original.begin = pos;
343 continue;
344 }
345
346 if (token.original.len > 0 &&
347 rspamd_tokenize_check_limit (decay, word_decay, res->len,
348 &hv, &prob, &token, pos - text, len)) {
349 if (!decay) {
350 decay = TRUE;
351 }
352 else {
353 token.original.begin = pos;
354 continue;
355 }
356 }
357
358 if (long_text_mode) {
359 if ((res->len + 1) % 16 == 0) {
360 ev_tstamp now = ev_time ();
361
362 if (now - start > max_exec_time) {
363 msg_warn_pool_check (
364 "too long time has been spent on tokenization:"
365 " %.1f ms, limit is %.1f ms; %d words added so far",
366 (now - start) * 1e3, max_exec_time * 1e3,
367 res->len);
368
369 goto end;
370 }
371 }
372 }
373
374 g_array_append_val (res, token);
375
376 if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
377 /* Due to bug in glib ! */
378 msg_err_pool_check (
379 "too many words found: %d, stop tokenization to avoid DoS",
380 res->len);
381
382 goto end;
383 }
384
385 token.original.begin = pos;
386 }
387 }
388 else {
389 /* UTF8 boundaries */
390 UErrorCode uc_err = U_ZERO_ERROR;
391 int32_t last, p;
392 struct rspamd_process_exception *ex = NULL;
393
394 if (bi == NULL) {
395 bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err);
396
397 g_assert (U_SUCCESS (uc_err));
398 }
399
400 ubrk_setUText (bi, (UText*)utxt, &uc_err);
401 last = ubrk_first (bi);
402 p = last;
403
404 if (cur) {
405 ex = (struct rspamd_process_exception *)cur->data;
406 }
407
408 while (p != UBRK_DONE) {
409 start_over:
410 token.original.len = 0;
411
412 if (p > last) {
413 if (ex && cur) {
414 /* Check exception */
415 if (ex->pos >= last && ex->pos <= p) {
416 /* We have an exception within boundary */
417 /* First, start to drain exceptions from the start */
418 while (cur && ex->pos <= last) {
419 /* We have an exception at the beginning, skip those */
420 last += ex->len;
421 rspamd_tokenize_exception (ex, res);
422
423 if (last > p) {
424 /* Exception spread over the boundaries */
425 while (last > p && p != UBRK_DONE) {
426 gint32 old_p = p;
427 p = ubrk_next (bi);
428
429 if (p != UBRK_DONE && p <= old_p) {
430 msg_warn_pool_check (
431 "tokenization reversed back on position %d,"
432 "%d new position (%d backward), likely libicu bug!",
433 (gint)(p), (gint)(old_p), old_p - p);
434
435 goto end;
436 }
437 }
438
439 /* We need to reset our scan with new p and last */
440 SHIFT_EX;
441 goto start_over;
442 }
443
444 SHIFT_EX;
445 }
446
447 /* Now, we can have an exception within boundary again */
448 if (cur && ex->pos >= last && ex->pos <= p) {
449 /* Append the first part */
450 if (rspamd_utf_word_valid (text, text + len, last,
451 ex->pos)) {
452 token.original.begin = text + last;
453 token.original.len = ex->pos - last;
454 token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
455 RSPAMD_STAT_TOKEN_FLAG_UTF;
456 }
457
458 /* Process the current exception */
459 last += ex->len + (ex->pos - last);
460
461 rspamd_tokenize_exception (ex, res);
462
463 if (last > p) {
464 /* Exception spread over the boundaries */
465 while (last > p && p != UBRK_DONE) {
466 gint32 old_p = p;
467 p = ubrk_next (bi);
468 if (p != UBRK_DONE && p <= old_p) {
469 msg_warn_pool_check (
470 "tokenization reversed back on position %d,"
471 "%d new position (%d backward), likely libicu bug!",
472 (gint)(p), (gint)(old_p), old_p - p);
473
474 goto end;
475 }
476 }
477 /* We need to reset our scan with new p and last */
478 SHIFT_EX;
479 goto start_over;
480 }
481
482 SHIFT_EX;
483 }
484 else if (p > last) {
485 if (rspamd_utf_word_valid (text, text + len, last, p)) {
486 token.original.begin = text + last;
487 token.original.len = p - last;
488 token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
489 RSPAMD_STAT_TOKEN_FLAG_UTF;
490 }
491 }
492 }
493 else if (ex->pos < last) {
494 /* Forward exceptions list */
495 while (cur && ex->pos <= last) {
496 /* We have an exception at the beginning, skip those */
497 SHIFT_EX;
498 }
499
500 if (rspamd_utf_word_valid (text, text + len, last, p)) {
501 token.original.begin = text + last;
502 token.original.len = p - last;
503 token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
504 RSPAMD_STAT_TOKEN_FLAG_UTF;
505 }
506 }
507 else {
508 /* No exceptions within boundary */
509 if (rspamd_utf_word_valid (text, text + len, last, p)) {
510 token.original.begin = text + last;
511 token.original.len = p - last;
512 token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
513 RSPAMD_STAT_TOKEN_FLAG_UTF;
514 }
515 }
516 }
517 else {
518 if (rspamd_utf_word_valid (text, text + len, last, p)) {
519 token.original.begin = text + last;
520 token.original.len = p - last;
521 token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
522 RSPAMD_STAT_TOKEN_FLAG_UTF;
523 }
524 }
525
526 if (token.original.len > 0 &&
527 rspamd_tokenize_check_limit (decay, word_decay, res->len,
528 &hv, &prob, &token, p, len)) {
529 if (!decay) {
530 decay = TRUE;
531 } else {
532 token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED;
533 }
534 }
535 }
536
537 if (token.original.len > 0) {
538 /* Additional check for number of words */
539 if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
540 /* Due to bug in glib ! */
541 msg_err ("too many words found: %d, stop tokenization to avoid DoS",
542 res->len);
543
544 goto end;
545 }
546
547 g_array_append_val (res, token);
548 }
549
550 /* Also check for long text mode */
551 if (long_text_mode) {
552 /* Check time each 128 words added */
553 const int words_check_mask = 0x7F;
554
555 if ((res->len & words_check_mask) == words_check_mask) {
556 ev_tstamp now = ev_time ();
557
558 if (now - start > max_exec_time) {
559 msg_warn_pool_check (
560 "too long time has been spent on tokenization:"
561 " %.1f ms, limit is %.1f ms; %d words added so far",
562 (now - start) * 1e3, max_exec_time * 1e3,
563 res->len);
564
565 goto end;
566 }
567 }
568 }
569
570 last = p;
571 p = ubrk_next (bi);
572
573 if (p != UBRK_DONE && p <= last) {
574 msg_warn_pool_check ("tokenization reversed back on position %d,"
575 "%d new position (%d backward), likely libicu bug!",
576 (gint)(p), (gint)(last), last - p);
577
578 goto end;
579 }
580 }
581 }
582
583 end:
584 if (!decay) {
585 hv = mum_hash_finish (hv);
586 }
587
588 if (hash) {
589 *hash = hv;
590 }
591
592 return res;
593 }
594
595 #undef SHIFT_EX
596
597 static void
rspamd_add_metawords_from_str(const gchar * beg,gsize len,struct rspamd_task * task)598 rspamd_add_metawords_from_str (const gchar *beg, gsize len,
599 struct rspamd_task *task)
600 {
601 UText utxt = UTEXT_INITIALIZER;
602 UErrorCode uc_err = U_ZERO_ERROR;
603 guint i = 0;
604 UChar32 uc;
605 gboolean valid_utf = TRUE;
606
607 while (i < len) {
608 U8_NEXT (beg, i, len, uc);
609
610 if (((gint32) uc) < 0) {
611 valid_utf = FALSE;
612 break;
613 }
614
615 #if U_ICU_VERSION_MAJOR_NUM < 50
616 if (u_isalpha (uc)) {
617 gint32 sc = ublock_getCode (uc);
618
619 if (sc == UBLOCK_THAI) {
620 valid_utf = FALSE;
621 msg_info_task ("enable workaround for Thai characters for old libicu");
622 break;
623 }
624 }
625 #endif
626 }
627
628 if (valid_utf) {
629 utext_openUTF8 (&utxt,
630 beg,
631 len,
632 &uc_err);
633
634 task->meta_words = rspamd_tokenize_text (beg, len,
635 &utxt, RSPAMD_TOKENIZE_UTF,
636 task->cfg, NULL, NULL,
637 task->meta_words,
638 task->task_pool);
639
640 utext_close (&utxt);
641 }
642 else {
643 task->meta_words = rspamd_tokenize_text (beg, len,
644 NULL, RSPAMD_TOKENIZE_RAW,
645 task->cfg, NULL, NULL, task->meta_words,
646 task->task_pool);
647 }
648 }
649
650 void
rspamd_tokenize_meta_words(struct rspamd_task * task)651 rspamd_tokenize_meta_words (struct rspamd_task *task)
652 {
653 guint i = 0;
654 rspamd_stat_token_t *tok;
655
656 if (MESSAGE_FIELD (task, subject)) {
657 rspamd_add_metawords_from_str (MESSAGE_FIELD (task, subject),
658 strlen (MESSAGE_FIELD (task, subject)), task);
659 }
660
661 if (MESSAGE_FIELD (task, from_mime) && MESSAGE_FIELD (task, from_mime)->len > 0) {
662 struct rspamd_email_address *addr;
663
664 addr = g_ptr_array_index (MESSAGE_FIELD (task, from_mime), 0);
665
666 if (addr->name) {
667 rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task);
668 }
669 }
670
671 if (task->meta_words != NULL) {
672 const gchar *language = NULL;
673
674 if (MESSAGE_FIELD (task, text_parts) &&
675 MESSAGE_FIELD (task, text_parts)->len > 0) {
676 struct rspamd_mime_text_part *tp = g_ptr_array_index (
677 MESSAGE_FIELD (task, text_parts), 0);
678
679 if (tp->language) {
680 language = tp->language;
681 }
682 }
683
684 rspamd_normalize_words (task->meta_words, task->task_pool);
685 rspamd_stem_words (task->meta_words, task->task_pool, language,
686 task->lang_det);
687
688 for (i = 0; i < task->meta_words->len; i++) {
689 tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
690 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
691 }
692 }
693 }
694
695 static inline void
rspamd_uchars_to_ucs32(const UChar * src,gsize srclen,rspamd_stat_token_t * tok,rspamd_mempool_t * pool)696 rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
697 rspamd_stat_token_t *tok,
698 rspamd_mempool_t *pool)
699 {
700 UChar32 *dest, t, *d;
701 gint32 i = 0;
702
703 dest = rspamd_mempool_alloc (pool, srclen * sizeof (UChar32));
704 d = dest;
705
706 while (i < srclen) {
707 U16_NEXT_UNSAFE (src, i, t);
708
709 if (u_isgraph (t)) {
710 UCharCategory cat;
711
712 cat = u_charType (t);
713 #if U_ICU_VERSION_MAJOR_NUM >= 57
714 if (u_hasBinaryProperty (t, UCHAR_EMOJI)) {
715 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
716 }
717 #endif
718
719 if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
720 cat == U_CONNECTOR_PUNCTUATION ||
721 cat == U_MATH_SYMBOL ||
722 cat == U_CURRENCY_SYMBOL) {
723 *d++ = u_tolower (t);
724 }
725 }
726 else {
727 /* Invisible spaces ! */
728 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
729 }
730 }
731
732 tok->unicode.begin = dest;
733 tok->unicode.len = d - dest;
734 }
735
736 static inline void
rspamd_ucs32_to_normalised(rspamd_stat_token_t * tok,rspamd_mempool_t * pool)737 rspamd_ucs32_to_normalised (rspamd_stat_token_t *tok,
738 rspamd_mempool_t *pool)
739 {
740 guint i, doff = 0;
741 gsize utflen = 0;
742 gchar *dest;
743 UChar32 t;
744
745 for (i = 0; i < tok->unicode.len; i ++) {
746 utflen += U8_LENGTH (tok->unicode.begin[i]);
747 }
748
749 dest = rspamd_mempool_alloc (pool, utflen + 1);
750
751 for (i = 0; i < tok->unicode.len; i ++) {
752 t = tok->unicode.begin[i];
753 U8_APPEND_UNSAFE (dest, doff, t);
754 }
755
756 g_assert (doff <= utflen);
757 dest[doff] = '\0';
758
759 tok->normalized.len = doff;
760 tok->normalized.begin = dest;
761 }
762
763 void
rspamd_normalize_single_word(rspamd_stat_token_t * tok,rspamd_mempool_t * pool)764 rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
765 {
766 UErrorCode uc_err = U_ZERO_ERROR;
767 UConverter *utf8_converter;
768 UChar tmpbuf[1024]; /* Assume that we have no longer words... */
769 gsize ulen;
770
771 utf8_converter = rspamd_get_utf8_converter ();
772
773 if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
774 ulen = ucnv_toUChars (utf8_converter,
775 tmpbuf,
776 G_N_ELEMENTS (tmpbuf),
777 tok->original.begin,
778 tok->original.len,
779 &uc_err);
780
781 /* Now, we need to understand if we need to normalise the word */
782 if (!U_SUCCESS (uc_err)) {
783 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
784 tok->unicode.begin = NULL;
785 tok->unicode.len = 0;
786 tok->normalized.begin = NULL;
787 tok->normalized.len = 0;
788 }
789 else {
790 #if U_ICU_VERSION_MAJOR_NUM >= 44
791 const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
792 gint32 end;
793
794 /* We can now check if we need to decompose */
795 end = unorm2_spanQuickCheckYes (norm, tmpbuf, ulen, &uc_err);
796
797 if (!U_SUCCESS (uc_err)) {
798 rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
799 tok->normalized.begin = NULL;
800 tok->normalized.len = 0;
801 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
802 }
803 else {
804 if (end == ulen) {
805 /* Already normalised, just lowercase */
806 rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
807 rspamd_ucs32_to_normalised (tok, pool);
808 }
809 else {
810 /* Perform normalization */
811 UChar normbuf[1024];
812
813 g_assert (end < G_N_ELEMENTS (normbuf));
814 /* First part */
815 memcpy (normbuf, tmpbuf, end * sizeof (UChar));
816 /* Second part */
817 ulen = unorm2_normalizeSecondAndAppend (norm,
818 normbuf, end,
819 G_N_ELEMENTS (normbuf),
820 tmpbuf + end,
821 ulen - end,
822 &uc_err);
823
824 if (!U_SUCCESS (uc_err)) {
825 if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
826 msg_warn_pool_check ("cannot normalise text '%*s': %s",
827 (gint)tok->original.len, tok->original.begin,
828 u_errorName (uc_err));
829 rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
830 rspamd_ucs32_to_normalised (tok, pool);
831 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
832 }
833 }
834 else {
835 /* Copy normalised back */
836 rspamd_uchars_to_ucs32 (normbuf, ulen, tok, pool);
837 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
838 rspamd_ucs32_to_normalised (tok, pool);
839 }
840 }
841 }
842 #else
843 /* Legacy version with no unorm2 interface */
844 rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
845 rspamd_ucs32_to_normalised (tok, pool);
846 #endif
847 }
848 }
849 else {
850 if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
851 /* Simple lowercase */
852 gchar *dest;
853
854 dest = rspamd_mempool_alloc (pool, tok->original.len + 1);
855 rspamd_strlcpy (dest, tok->original.begin, tok->original.len + 1);
856 rspamd_str_lc (dest, tok->original.len);
857 tok->normalized.len = tok->original.len;
858 tok->normalized.begin = dest;
859 }
860 }
861 }
862
863 void
rspamd_normalize_words(GArray * words,rspamd_mempool_t * pool)864 rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
865 {
866 rspamd_stat_token_t *tok;
867 guint i;
868
869 for (i = 0; i < words->len; i++) {
870 tok = &g_array_index (words, rspamd_stat_token_t, i);
871 rspamd_normalize_single_word (tok, pool);
872 }
873 }
874
875 void
rspamd_stem_words(GArray * words,rspamd_mempool_t * pool,const gchar * language,struct rspamd_lang_detector * d)876 rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
877 const gchar *language,
878 struct rspamd_lang_detector *d)
879 {
880 static GHashTable *stemmers = NULL;
881 struct sb_stemmer *stem = NULL;
882 guint i;
883 rspamd_stat_token_t *tok;
884 gchar *dest;
885 gsize dlen;
886
887 if (!stemmers) {
888 stemmers = g_hash_table_new (rspamd_strcase_hash,
889 rspamd_strcase_equal);
890 }
891
892 if (language && language[0] != '\0') {
893 stem = g_hash_table_lookup (stemmers, language);
894
895 if (stem == NULL) {
896
897 stem = sb_stemmer_new (language, "UTF_8");
898
899 if (stem == NULL) {
900 msg_debug_pool (
901 "<%s> cannot create lemmatizer for %s language",
902 language);
903 g_hash_table_insert (stemmers, g_strdup (language),
904 GINT_TO_POINTER (-1));
905 }
906 else {
907 g_hash_table_insert (stemmers, g_strdup (language),
908 stem);
909 }
910 }
911 else if (stem == GINT_TO_POINTER (-1)) {
912 /* Negative cache */
913 stem = NULL;
914 }
915 }
916 for (i = 0; i < words->len; i++) {
917 tok = &g_array_index (words, rspamd_stat_token_t, i);
918
919 if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
920 if (stem) {
921 const gchar *stemmed = NULL;
922
923 stemmed = sb_stemmer_stem (stem,
924 tok->normalized.begin, tok->normalized.len);
925
926 dlen = stemmed ? strlen (stemmed) : 0;
927
928 if (dlen > 0) {
929 dest = rspamd_mempool_alloc (pool, dlen + 1);
930 memcpy (dest, stemmed, dlen);
931 dest[dlen] = '\0';
932 tok->stemmed.len = dlen;
933 tok->stemmed.begin = dest;
934 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
935 }
936 else {
937 /* Fallback */
938 tok->stemmed.len = tok->normalized.len;
939 tok->stemmed.begin = tok->normalized.begin;
940 }
941 }
942 else {
943 tok->stemmed.len = tok->normalized.len;
944 tok->stemmed.begin = tok->normalized.begin;
945 }
946
947 if (tok->stemmed.len > 0 && d != NULL &&
948 rspamd_language_detector_is_stop_word (d, tok->stemmed.begin, tok->stemmed.len)) {
949 tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
950 }
951 }
952 else {
953 if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
954 /* Raw text, lowercase */
955 tok->stemmed.len = tok->normalized.len;
956 tok->stemmed.begin = tok->normalized.begin;
957 }
958 }
959 }
960 }