1 /*-
2 * Copyright 2016 Vsevolod Stakhov
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "config.h"
18 #include "libutil/mem_pool.h"
19 #include "libutil/regexp.h"
20 #include "libutil/hash.h"
21 #include "libserver/cfg_file.h"
22 #include "libserver/task.h"
23 #include "mime_encoding.h"
24 #include "message.h"
25 #include "contrib/fastutf8/fastutf8.h"
26 #include "contrib/google-ced/ced_c.h"
27 #include <unicode/ucnv.h>
28 #if U_ICU_VERSION_MAJOR_NUM >= 44
29 #include <unicode/unorm2.h>
30 #endif
31 #include <math.h>
32
/* Charset name used in this file whenever data is (or is treated as) UTF-8 */
#define UTF8_CHARSET "UTF-8"

/*
 * Bit flags; not referenced anywhere in the visible part of this file.
 * NOTE(review): presumably consumed by the substitution table in
 * mime_encoding_list.h - confirm.
 */
#define RSPAMD_CHARSET_FLAG_UTF (1 << 0)
#define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)

/* Size passed to the LRU cache that keeps opened converters */
#define RSPAMD_CHARSET_CACHE_SIZE 32
/*
 * Length in bytes of each sample given to the content based detector;
 * inputs shorter than three times this value are examined as a whole
 */
#define RSPAMD_CHARSET_MAX_CONTENT 512

/* Clear / set the UTF flag in the `flags` field of a text part */
#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)

/* Regexp matching UTF-8 compatible charset names; compiled on first use */
static rspamd_regexp_t *utf_compatible_re = NULL;
45
/*
 * One entry of the charset substitution table: maps a charset name as it
 * may be announced to the name that should be used in its place.
 */
struct rspamd_charset_substitution {
	const gchar *input; /* announced name; key of the substitution hash */
	const gchar *canon; /* replacement name handed to the ICU alias lookup */
	gint flags; /* not read in this file; NOTE(review): presumably RSPAMD_CHARSET_FLAG_* - confirm */
};
51
52 #include "mime_encoding_list.h"
53
/*
 * Case-insensitive map from announced charset name to its substitution
 * entry; filled lazily from the `sub` table on first charset detection.
 */
static GHashTable *sub_hash = NULL;

/*
 * Conversion table for ISO-8859-16: entry [b - 128] is the UTF-16 code unit
 * for the byte b in the range 0x80..0xFF (128 entries, 8 per row). Bytes
 * below 0x80 are not listed here, the converter maps them to themselves.
 */
static const UChar iso_8859_16_map[] = {
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
	0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
	0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
	0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
	0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
	0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
	0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
	0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
	0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
	0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
	0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
	0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
	0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF
};
74
/*
 * A cached charset converter: either a handle to an ICU converter or a
 * pointer to an internal single-byte lookup table.
 */
struct rspamd_charset_converter {
	gchar *canon_name; /* heap copy of the charset name; also used as the cache key */
	union {
		UConverter *conv; /* ICU handle; used when is_internal is FALSE */
		const UChar *cnv_table; /* table for bytes 0x80..0xFF; used when is_internal is TRUE */
	} d;
	gboolean is_internal; /* selects the active member of `d` */
};
83
84 static GQuark
rspamd_charset_conv_error_quark(void)85 rspamd_charset_conv_error_quark (void)
86 {
87 return g_quark_from_static_string ("charset conversion error");
88 }
89
90 static void
rspamd_converter_dtor(gpointer p)91 rspamd_converter_dtor (gpointer p)
92 {
93 struct rspamd_charset_converter *c = (struct rspamd_charset_converter *)p;
94
95 if (!c->is_internal) {
96 ucnv_close (c->d.conv);
97 }
98
99 g_free (c->canon_name);
100 g_free (c);
101 }
102
103 int32_t
rspamd_converter_to_uchars(struct rspamd_charset_converter * cnv,UChar * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)104 rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
105 UChar *dest,
106 int32_t destCapacity,
107 const char *src,
108 int32_t srcLength,
109 UErrorCode *pErrorCode)
110 {
111 if (!cnv->is_internal) {
112 return ucnv_toUChars (cnv->d.conv,
113 dest, destCapacity,
114 src, srcLength,
115 pErrorCode);
116 }
117 else {
118 UChar *d = dest, *dend = dest + destCapacity;
119 const guchar *p = src, *end = src + srcLength;
120
121 while (p < end && d < dend) {
122 if (*p <= 127) {
123 *d++ = (UChar)*p;
124 }
125 else {
126 *d++ = cnv->d.cnv_table[*p - 128];
127 }
128
129 p ++;
130 }
131
132 return d - dest;
133 }
134 }
135
136
137 struct rspamd_charset_converter *
rspamd_mime_get_converter_cached(const gchar * enc,rspamd_mempool_t * pool,gboolean is_canon,UErrorCode * err)138 rspamd_mime_get_converter_cached (const gchar *enc,
139 rspamd_mempool_t *pool,
140 gboolean is_canon,
141 UErrorCode *err)
142 {
143 const gchar *canon_name;
144 static rspamd_lru_hash_t *cache;
145 struct rspamd_charset_converter *conv;
146
147 if (cache == NULL) {
148 cache = rspamd_lru_hash_new_full (RSPAMD_CHARSET_CACHE_SIZE, NULL,
149 rspamd_converter_dtor, rspamd_str_hash,
150 rspamd_str_equal);
151 }
152
153 if (enc == NULL) {
154 return NULL;
155 }
156
157 if (!is_canon) {
158 rspamd_ftok_t cset_tok;
159
160 RSPAMD_FTOK_FROM_STR (&cset_tok, enc);
161 canon_name = rspamd_mime_detect_charset (&cset_tok, pool);
162 }
163 else {
164 canon_name = enc;
165 }
166
167 if (canon_name == NULL) {
168 return NULL;
169 }
170
171 conv = rspamd_lru_hash_lookup (cache, (gpointer)canon_name, 0);
172
173 if (conv == NULL) {
174 if (!(strcmp (canon_name, "ISO-8859-16") == 0 ||
175 strcmp (canon_name, "latin10") == 0 ||
176 strcmp (canon_name, "iso-ir-226") == 0)) {
177 conv = g_malloc0 (sizeof (*conv));
178 conv->d.conv = ucnv_open (canon_name, err);
179 conv->canon_name = g_strdup (canon_name);
180
181 if (conv->d.conv != NULL) {
182 ucnv_setToUCallBack (conv->d.conv,
183 UCNV_TO_U_CALLBACK_SUBSTITUTE,
184 NULL,
185 NULL,
186 NULL,
187 err);
188 rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
189 }
190 else {
191 g_free (conv);
192 conv = NULL;
193 }
194 }
195 else {
196 /* ISO-8859-16 */
197 conv = g_malloc0 (sizeof (*conv));
198 conv->is_internal = TRUE;
199 conv->d.cnv_table = iso_8859_16_map;
200 conv->canon_name = g_strdup (canon_name);
201
202 rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
203 }
204 }
205
206 return conv;
207 }
208
209 static void
rspamd_mime_encoding_substitute_init(void)210 rspamd_mime_encoding_substitute_init (void)
211 {
212 guint i;
213
214 sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal);
215
216 for (i = 0; i < G_N_ELEMENTS (sub); i ++) {
217 g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]);
218 }
219 }
220
221 static void
rspamd_charset_normalize(gchar * in)222 rspamd_charset_normalize (gchar *in)
223 {
224 /*
225 * This is a simple routine to validate input charset
226 * we just check that charset starts with alphanumeric and ends
227 * with alphanumeric
228 */
229 gchar *begin, *end;
230 gboolean changed = FALSE;
231
232 begin = in;
233
234 while (*begin && !g_ascii_isalnum (*begin)) {
235 begin ++;
236 changed = TRUE;
237 }
238
239 end = begin + strlen (begin) - 1;
240
241 while (end > begin && !g_ascii_isalnum (*end)) {
242 end --;
243 changed = TRUE;
244 }
245
246 if (changed) {
247 memmove (in, begin, end - begin + 2);
248 *(end + 1) = '\0';
249 }
250 }
251
252 const gchar *
rspamd_mime_detect_charset(const rspamd_ftok_t * in,rspamd_mempool_t * pool)253 rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
254 {
255 gchar *ret = NULL, *h, *t;
256 struct rspamd_charset_substitution *s;
257 const gchar *cset;
258 rspamd_ftok_t utf8_tok;
259 UErrorCode uc_err = U_ZERO_ERROR;
260
261 if (sub_hash == NULL) {
262 rspamd_mime_encoding_substitute_init ();
263 }
264
265 /* Fast path */
266 RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf-8");
267
268 if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
269 return UTF8_CHARSET;
270 }
271
272 RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf8");
273
274 if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
275 return UTF8_CHARSET;
276 }
277
278 ret = rspamd_mempool_ftokdup (pool, in);
279 rspamd_charset_normalize (ret);
280
281 if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) ||
282 (in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0))) {
283 /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
284 h = ret;
285 t = ret;
286
287 while (*h != '\0') {
288 if (*h != '-') {
289 *t++ = *h;
290 }
291
292 h ++;
293 }
294
295 *t = '\0';
296 }
297
298 s = g_hash_table_lookup (sub_hash, ret);
299
300 if (s) {
301 ret = (char *)s->canon;
302 }
303
304 /* Try different aliases */
305 cset = ucnv_getCanonicalName (ret, "MIME", &uc_err);
306
307 if (cset == NULL) {
308 uc_err = U_ZERO_ERROR;
309 cset = ucnv_getCanonicalName (ret, "IANA", &uc_err);
310 }
311
312 if (cset == NULL) {
313 uc_err = U_ZERO_ERROR;
314 cset = ucnv_getCanonicalName (ret, "", &uc_err);
315 }
316
317 if (cset == NULL) {
318 uc_err = U_ZERO_ERROR;
319 cset = ucnv_getAlias (ret, 0, &uc_err);
320 }
321
322 return cset;
323 }
324
325 gchar *
rspamd_mime_text_to_utf8(rspamd_mempool_t * pool,gchar * input,gsize len,const gchar * in_enc,gsize * olen,GError ** err)326 rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
327 gchar *input, gsize len, const gchar *in_enc,
328 gsize *olen, GError **err)
329 {
330 gchar *d;
331 gint32 r, clen, dlen;
332 UChar *tmp_buf;
333
334 UErrorCode uc_err = U_ZERO_ERROR;
335 UConverter *utf8_converter;
336 struct rspamd_charset_converter *conv;
337 rspamd_ftok_t cset_tok;
338
339 /* Check if already utf8 */
340 RSPAMD_FTOK_FROM_STR (&cset_tok, in_enc);
341
342 if (rspamd_mime_charset_utf_check (&cset_tok, input, len,
343 FALSE)) {
344 d = rspamd_mempool_alloc (pool, len);
345 memcpy (d, input, len);
346 if (olen) {
347 *olen = len;
348 }
349
350 return d;
351 }
352
353 conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err);
354 utf8_converter = rspamd_get_utf8_converter ();
355
356 if (conv == NULL) {
357 g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
358 "cannot open converter for %s: %s",
359 in_enc, u_errorName (uc_err));
360
361 return NULL;
362 }
363
364 tmp_buf = g_new (UChar, len + 1);
365 uc_err = U_ZERO_ERROR;
366 r = rspamd_converter_to_uchars (conv, tmp_buf, len + 1, input, len, &uc_err);
367
368 if (!U_SUCCESS (uc_err)) {
369 g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
370 "cannot convert data to unicode from %s: %s",
371 in_enc, u_errorName (uc_err));
372 g_free (tmp_buf);
373
374 return NULL;
375 }
376
377 /* Now, convert to utf8 */
378 clen = ucnv_getMaxCharSize (utf8_converter);
379 dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
380 d = rspamd_mempool_alloc (pool, dlen);
381 r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);
382
383 if (!U_SUCCESS (uc_err)) {
384 g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
385 "cannot convert data from unicode from %s: %s",
386 in_enc, u_errorName (uc_err));
387 g_free (tmp_buf);
388
389 return NULL;
390 }
391
392 msg_debug_pool ("converted from %s to UTF-8 inlen: %z, outlen: %d",
393 in_enc, len, r);
394 g_free (tmp_buf);
395
396 if (olen) {
397 *olen = r;
398 }
399
400 return d;
401 }
402
403 static gboolean
rspamd_mime_text_part_utf8_convert(struct rspamd_task * task,struct rspamd_mime_text_part * text_part,GByteArray * input,const gchar * charset,GError ** err)404 rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
405 struct rspamd_mime_text_part *text_part,
406 GByteArray *input,
407 const gchar *charset,
408 GError **err)
409 {
410 gchar *d;
411 gint32 r, clen, dlen, uc_len;
412 UChar *tmp_buf;
413 UErrorCode uc_err = U_ZERO_ERROR;
414 UConverter *utf8_converter;
415 struct rspamd_charset_converter *conv;
416
417 conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
418 TRUE, &uc_err);
419 utf8_converter = rspamd_get_utf8_converter ();
420
421 if (conv == NULL) {
422 g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
423 "cannot open converter for %s: %s",
424 charset, u_errorName (uc_err));
425
426 return FALSE;
427 }
428
429 tmp_buf = g_new (UChar, input->len + 1);
430 uc_err = U_ZERO_ERROR;
431 uc_len = rspamd_converter_to_uchars (conv,
432 tmp_buf,
433 input->len + 1,
434 input->data,
435 input->len,
436 &uc_err);
437
438 if (!U_SUCCESS (uc_err)) {
439 g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
440 "cannot convert data to unicode from %s: %s",
441 charset, u_errorName (uc_err));
442 g_free (tmp_buf);
443
444 return FALSE;
445 }
446
447 /* Now, convert to utf8 */
448 clen = ucnv_getMaxCharSize (utf8_converter);
449 dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
450 d = rspamd_mempool_alloc (task->task_pool, dlen);
451 r = ucnv_fromUChars (utf8_converter, d, dlen,
452 tmp_buf, uc_len, &uc_err);
453
454 if (!U_SUCCESS (uc_err)) {
455 g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
456 "cannot convert data from unicode from %s: %s",
457 charset, u_errorName (uc_err));
458 g_free (tmp_buf);
459
460 return FALSE;
461 }
462
463 if (text_part->mime_part && text_part->mime_part->ct) {
464 msg_info_task ("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
465 charset, &text_part->mime_part->ct->charset, input->len, r, uc_len);
466 }
467 else {
468 msg_info_task ("converted text part from %s (no charset announced) to UTF-8 inlen: %d, "
469 "outlen: %d (%d UTF16 chars)",
470 charset, input->len, r, uc_len);
471 }
472
473 text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
474 sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
475 text_part->utf_raw_content->data = d;
476 text_part->utf_raw_content->len = r;
477 g_free (tmp_buf);
478
479 return TRUE;
480 }
481
482 gboolean
rspamd_mime_to_utf8_byte_array(GByteArray * in,GByteArray * out,rspamd_mempool_t * pool,const gchar * enc)483 rspamd_mime_to_utf8_byte_array (GByteArray *in,
484 GByteArray *out,
485 rspamd_mempool_t *pool,
486 const gchar *enc)
487 {
488 gint32 r, clen, dlen;
489 UChar *tmp_buf;
490 UErrorCode uc_err = U_ZERO_ERROR;
491 UConverter *utf8_converter;
492 struct rspamd_charset_converter *conv;
493 rspamd_ftok_t charset_tok;
494
495 if (in == NULL || in->len == 0) {
496 return FALSE;
497 }
498
499 if (enc == NULL) {
500 /* Assume utf ? */
501 if (rspamd_fast_utf8_validate (in->data, in->len) == 0) {
502 g_byte_array_set_size (out, in->len);
503 memcpy (out->data, in->data, out->len);
504
505 return TRUE;
506 }
507 else {
508 /* Bad stuff, keep out */
509 return FALSE;
510 }
511 }
512
513 RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
514
515 if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
516 FALSE)) {
517 g_byte_array_set_size (out, in->len);
518 memcpy (out->data, in->data, out->len);
519
520 return TRUE;
521 }
522
523 utf8_converter = rspamd_get_utf8_converter ();
524 conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err);
525
526 if (conv == NULL) {
527 return FALSE;
528 }
529
530 tmp_buf = g_new (UChar, in->len + 1);
531 uc_err = U_ZERO_ERROR;
532 r = rspamd_converter_to_uchars (conv,
533 tmp_buf, in->len + 1,
534 in->data, in->len, &uc_err);
535
536 if (!U_SUCCESS (uc_err)) {
537 g_free (tmp_buf);
538
539 return FALSE;
540 }
541
542 /* Now, convert to utf8 */
543 clen = ucnv_getMaxCharSize (utf8_converter);
544 dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
545 g_byte_array_set_size (out, dlen);
546 r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
547
548 if (!U_SUCCESS (uc_err)) {
549 g_free (tmp_buf);
550
551 return FALSE;
552 }
553
554 g_free (tmp_buf);
555 out->len = r;
556
557 return TRUE;
558 }
559
560 void
rspamd_mime_charset_utf_enforce(gchar * in,gsize len)561 rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
562 {
563 gchar *p, *end;
564 goffset err_offset;
565 UChar32 uc = 0;
566
567 /* Now we validate input and replace bad characters with '?' symbol */
568 p = in;
569 end = in + len;
570
571 while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
572 err_offset --; /* As it returns it 1 indexed */
573 gint32 cur_offset = err_offset;
574
575 while (cur_offset < len) {
576 gint32 tmp = cur_offset;
577
578 U8_NEXT (p, cur_offset, len, uc);
579
580 if (uc > 0) {
581 /* Fill string between err_offset and tmp with `?` character */
582 memset (p + err_offset, '?', tmp - err_offset);
583 break;
584 }
585 }
586
587 if (uc < 0) {
588 /* Fill till the end */
589 memset (p + err_offset, '?', len - err_offset);
590 break;
591 }
592
593 p += cur_offset;
594 len = end - p;
595 }
596 }
597
598 const char *
rspamd_mime_charset_find_by_content(const gchar * in,gsize inlen,bool check_utf8)599 rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen,
600 bool check_utf8)
601 {
602 int nconsumed;
603 bool is_reliable;
604 const gchar *ced_name;
605
606 if (check_utf8) {
607 if (rspamd_fast_utf8_validate (in, inlen) == 0) {
608 return UTF8_CHARSET;
609 }
610 }
611
612
613 ced_name = ced_encoding_detect (in, inlen, NULL, NULL,
614 NULL, 0, CED_EMAIL_CORPUS,
615 false, &nconsumed, &is_reliable);
616
617 if (ced_name) {
618
619 return ced_name;
620 }
621
622 return NULL;
623 }
624
625 static const char *
rspamd_mime_charset_find_by_content_maybe_split(const gchar * in,gsize inlen)626 rspamd_mime_charset_find_by_content_maybe_split (const gchar *in, gsize inlen)
627 {
628 if (inlen < RSPAMD_CHARSET_MAX_CONTENT * 3) {
629 return rspamd_mime_charset_find_by_content (in, inlen, false);
630 }
631 else {
632 const gchar *c1, *c2, *c3;
633
634 c1 = rspamd_mime_charset_find_by_content (in, RSPAMD_CHARSET_MAX_CONTENT, false);
635 c2 = rspamd_mime_charset_find_by_content (in + inlen / 2,
636 RSPAMD_CHARSET_MAX_CONTENT, false);
637 c3 = rspamd_mime_charset_find_by_content (in + inlen - RSPAMD_CHARSET_MAX_CONTENT,
638 RSPAMD_CHARSET_MAX_CONTENT, false);
639
640 /* 7bit stuff */
641 if (c1 && strcmp (c1, "US-ASCII") == 0) {
642 c1 = NULL; /* Invalid - we have 8 bit there */
643 }
644 if (c2 && strcmp (c2, "US-ASCII") == 0) {
645 c2 = NULL; /* Invalid - we have 8 bit there */
646 }
647 if (c3 && strcmp (c3, "US-ASCII") == 0) {
648 c3 = NULL; /* Invalid - we have 8 bit there */
649 }
650
651 if (!c1) {
652 c1 = c2 ? c2 : c3;
653 }
654 if (!c2) {
655 c2 = c3 ? c3 : c1;
656 }
657 if (!c3) {
658 c3 = c1 ? c2 : c1;
659 }
660
661 if (c1 && c2 && c3) {
662 /* Quorum */
663 if (c1 == c2) {
664 return c1;
665 }
666 else if (c2 == c3) {
667 return c2;
668 }
669 else if (c1 == c3) {
670 return c3;
671 }
672
673 /* All charsets are distinct. Use the one from the top */
674 return c1;
675 }
676
677 return NULL;
678 }
679 }
680
681 gboolean
rspamd_mime_charset_utf_check(rspamd_ftok_t * charset,gchar * in,gsize len,gboolean content_check)682 rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
683 gchar *in, gsize len, gboolean content_check)
684 {
685 const gchar *real_charset;
686
687 if (utf_compatible_re == NULL) {
688 utf_compatible_re = rspamd_regexp_new (
689 "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
690 "i", NULL);
691 }
692
693 if (charset->len == 0 ||
694 rspamd_regexp_match (utf_compatible_re,
695 charset->begin, charset->len, TRUE)) {
696 /*
697 * In case of UTF8 charset we still can check the content to find
698 * corner cases
699 */
700 if (content_check) {
701 if (rspamd_fast_utf8_validate (in, len) != 0) {
702 real_charset = rspamd_mime_charset_find_by_content_maybe_split(in, len);
703
704 if (real_charset) {
705
706 if (rspamd_regexp_match (utf_compatible_re,
707 real_charset, strlen (real_charset), TRUE)) {
708 RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);
709
710 return TRUE;
711 }
712 else {
713 charset->begin = real_charset;
714 charset->len = strlen (real_charset);
715
716 return FALSE;
717 }
718 }
719
720 rspamd_mime_charset_utf_enforce (in, len);
721 }
722 }
723
724 return TRUE;
725 }
726
727 return FALSE;
728 }
729
730 void
rspamd_mime_text_part_maybe_convert(struct rspamd_task * task,struct rspamd_mime_text_part * text_part)731 rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
732 struct rspamd_mime_text_part *text_part)
733 {
734 GError *err = NULL;
735 const gchar *charset = NULL;
736 gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE;
737 GByteArray *part_content;
738 rspamd_ftok_t charset_tok;
739 struct rspamd_mime_part *part = text_part->mime_part;
740
741 if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
742 text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW;
743 }
744
745 /* Allocate copy storage */
746 part_content = g_byte_array_sized_new (text_part->parsed.len);
747 memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
748 part_content->len = text_part->parsed.len;
749 rspamd_mempool_notify_alloc (task->task_pool,
750 part_content->len);
751 rspamd_mempool_add_destructor (task->task_pool,
752 (rspamd_mempool_destruct_t)g_byte_array_unref, part_content);
753
754 if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
755 if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) {
756 /* Valid UTF, likely all good */
757 need_charset_heuristic = FALSE;
758 valid_utf8 = TRUE;
759 checked = TRUE;
760 }
761
762 text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
763 }
764 else {
765 /* All 7bit characters, assume it valid utf */
766 need_charset_heuristic = FALSE;
767 valid_utf8 = TRUE;
768 checked = TRUE; /* Already valid utf, no need in further checks */
769 }
770
771 if (part->ct->charset.len == 0) {
772 if (need_charset_heuristic) {
773 charset = rspamd_mime_charset_find_by_content_maybe_split (text_part->parsed.begin,
774 text_part->parsed.len);
775
776 if (charset != NULL) {
777 msg_info_task ("detected charset %s", charset);
778 }
779
780 checked = TRUE;
781 text_part->real_charset = charset;
782 }
783 else if (valid_utf8) {
784 SET_PART_UTF (text_part);
785 text_part->utf_raw_content = part_content;
786 text_part->real_charset = UTF8_CHARSET;
787
788 return;
789 }
790 }
791 else {
792 charset = rspamd_mime_detect_charset (&part->ct->charset,
793 task->task_pool);
794
795 if (charset == NULL) {
796 /* We don't know the real charset but can try heuristic */
797 if (need_charset_heuristic) {
798 charset = rspamd_mime_charset_find_by_content_maybe_split (part_content->data,
799 part_content->len);
800 msg_info_task ("detected charset: %s", charset);
801 checked = TRUE;
802 text_part->real_charset = charset;
803 }
804 else if (valid_utf8) {
805 /* We already know that the input is valid utf, so skip heuristic */
806 text_part->real_charset = UTF8_CHARSET;
807 }
808 }
809 else {
810 text_part->real_charset = charset;
811
812 if (strcmp (charset, UTF8_CHARSET) != 0) {
813 /*
814 * We have detected some charset, but we don't know which one,
815 * so we need to reset valid utf8 flag and enforce it later
816 */
817 valid_utf8 = FALSE;
818 }
819 }
820 }
821
822 if (text_part->real_charset == NULL) {
823 msg_info_task ("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"",
824 MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset,
825 part->ct->cpy);
826 SET_PART_RAW (text_part);
827 text_part->utf_raw_content = part_content;
828
829 return;
830 }
831
832 RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
833
834 if (!valid_utf8) {
835 if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
836 part_content->len, !checked)) {
837 SET_PART_UTF (text_part);
838 text_part->utf_raw_content = part_content;
839 text_part->real_charset = UTF8_CHARSET;
840
841 return;
842 }
843 else {
844 charset = charset_tok.begin;
845
846 if (!rspamd_mime_text_part_utf8_convert (task, text_part,
847 part_content, charset, &err)) {
848 msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
849 MESSAGE_FIELD (task, message_id),
850 charset,
851 err ? err->message : "unknown problem");
852 SET_PART_RAW (text_part);
853 g_error_free (err);
854
855 text_part->utf_raw_content = part_content;
856 return;
857 }
858
859 SET_PART_UTF (text_part);
860 text_part->real_charset = charset;
861 }
862 }
863 else {
864 SET_PART_UTF (text_part);
865 text_part->utf_raw_content = part_content;
866 }
867 }
868