1 /*-
2  * Copyright 2016 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "config.h"
18 #include "libutil/mem_pool.h"
19 #include "libutil/regexp.h"
20 #include "libutil/hash.h"
21 #include "libserver/cfg_file.h"
22 #include "libserver/task.h"
23 #include "mime_encoding.h"
24 #include "message.h"
25 #include "contrib/fastutf8/fastutf8.h"
26 #include "contrib/google-ced/ced_c.h"
27 #include <unicode/ucnv.h>
28 #if U_ICU_VERSION_MAJOR_NUM >= 44
29 #include <unicode/unorm2.h>
30 #endif
31 #include <math.h>
32 
/* Canonical charset name reported for UTF-8 (and UTF-8 compatible) input */
#define UTF8_CHARSET "UTF-8"

/*
 * Bit flags for charset substitutions.
 * NOTE(review): presumably stored in rspamd_charset_substitution.flags by
 * the table in mime_encoding_list.h - confirm against that header.
 */
#define RSPAMD_CHARSET_FLAG_UTF (1 << 0)
#define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)

/* Size passed to the LRU cache of converters in rspamd_mime_get_converter_cached */
#define RSPAMD_CHARSET_CACHE_SIZE 32
/*
 * Length in bytes of each sample given to the content based detector when
 * a long text is probed at its start, middle and end
 */
#define RSPAMD_CHARSET_MAX_CONTENT 512

/* Clear or set the flag that marks a text part as holding UTF-8 content */
#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)

/*
 * Regexp matching charset names treated as UTF-8 compatible; compiled
 * lazily on the first call of rspamd_mime_charset_utf_check
 */
static rspamd_regexp_t *utf_compatible_re = NULL;
45 
/*
 * One entry of the charset substitution table: maps a charset name as it
 * may appear in a message to the name that is used for further lookups.
 */
struct rspamd_charset_substitution {
	const gchar *input; /* name to match; used as the key in sub_hash */
	const gchar *canon; /* replacement name used instead of input */
	gint flags; /* NOTE(review): presumably RSPAMD_CHARSET_FLAG_* bits - confirm */
};

/* Provides the `sub` array that is loaded into sub_hash */
#include "mime_encoding_list.h"

/*
 * Case insensitive map from rspamd_charset_substitution.input to the
 * corresponding table entry; built lazily by
 * rspamd_mime_encoding_substitute_init
 */
static GHashTable *sub_hash = NULL;
55 
/*
 * Mapping table used by the built-in ISO-8859-16 converter: entry `i` is
 * the UTF-16 code unit for the input byte `i + 128` (bytes below 128 are
 * copied verbatim by rspamd_converter_to_uchars and need no table).
 */
static const UChar iso_8859_16_map[] = {
		0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
		0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
		0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
		0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
		0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
		0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
		0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
		0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
		0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
		0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
		0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
		0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
		0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
		0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
		0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
		0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF
};
74 
/*
 * Converter from some charset to UTF-16. It is either backed by an ICU
 * UConverter or by a built-in single byte mapping table, depending on
 * is_internal.
 */
struct rspamd_charset_converter {
	gchar *canon_name; /* heap copy of the charset name; also the cache key */
	union {
		UConverter *conv; /* valid when is_internal is FALSE */
		const UChar *cnv_table; /* valid when is_internal is TRUE */
	} d;
	gboolean is_internal; /* selects which member of `d` is in use */
};
83 
84 static GQuark
rspamd_charset_conv_error_quark(void)85 rspamd_charset_conv_error_quark (void)
86 {
87 	return g_quark_from_static_string ("charset conversion error");
88 }
89 
90 static void
rspamd_converter_dtor(gpointer p)91 rspamd_converter_dtor (gpointer p)
92 {
93 	struct rspamd_charset_converter *c = (struct rspamd_charset_converter *)p;
94 
95 	if (!c->is_internal) {
96 		ucnv_close (c->d.conv);
97 	}
98 
99 	g_free (c->canon_name);
100 	g_free (c);
101 }
102 
103 int32_t
rspamd_converter_to_uchars(struct rspamd_charset_converter * cnv,UChar * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)104 rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
105 							UChar *dest,
106 							int32_t destCapacity,
107 							const char *src,
108 							int32_t srcLength,
109 							UErrorCode *pErrorCode)
110 {
111 	if (!cnv->is_internal) {
112 		return ucnv_toUChars (cnv->d.conv,
113 				dest, destCapacity,
114 				src, srcLength,
115 				pErrorCode);
116 	}
117 	else {
118 		UChar *d = dest, *dend = dest + destCapacity;
119 		const guchar *p = src, *end = src + srcLength;
120 
121 		while (p < end && d < dend) {
122 			if (*p <= 127) {
123 				*d++ = (UChar)*p;
124 			}
125 			else {
126 				*d++ = cnv->d.cnv_table[*p - 128];
127 			}
128 
129 			p ++;
130 		}
131 
132 		return d - dest;
133 	}
134 }
135 
136 
137 struct rspamd_charset_converter *
rspamd_mime_get_converter_cached(const gchar * enc,rspamd_mempool_t * pool,gboolean is_canon,UErrorCode * err)138 rspamd_mime_get_converter_cached (const gchar *enc,
139 								  rspamd_mempool_t *pool,
140 								  gboolean is_canon,
141 								  UErrorCode *err)
142 {
143 	const gchar *canon_name;
144 	static rspamd_lru_hash_t *cache;
145 	struct rspamd_charset_converter *conv;
146 
147 	if (cache == NULL) {
148 		cache = rspamd_lru_hash_new_full (RSPAMD_CHARSET_CACHE_SIZE, NULL,
149 				rspamd_converter_dtor, rspamd_str_hash,
150 				rspamd_str_equal);
151 	}
152 
153 	if (enc == NULL) {
154 		return NULL;
155 	}
156 
157 	if (!is_canon) {
158 		rspamd_ftok_t cset_tok;
159 
160 		RSPAMD_FTOK_FROM_STR (&cset_tok, enc);
161 		canon_name = rspamd_mime_detect_charset (&cset_tok, pool);
162 	}
163 	else {
164 		canon_name = enc;
165 	}
166 
167 	if (canon_name == NULL) {
168 		return NULL;
169 	}
170 
171 	conv = rspamd_lru_hash_lookup (cache, (gpointer)canon_name, 0);
172 
173 	if (conv == NULL) {
174 		if (!(strcmp (canon_name, "ISO-8859-16") == 0 ||
175 				strcmp (canon_name, "latin10") == 0 ||
176 				strcmp (canon_name, "iso-ir-226") == 0)) {
177 			conv = g_malloc0 (sizeof (*conv));
178 			conv->d.conv = ucnv_open (canon_name, err);
179 			conv->canon_name = g_strdup (canon_name);
180 
181 			if (conv->d.conv != NULL) {
182 				ucnv_setToUCallBack (conv->d.conv,
183 						UCNV_TO_U_CALLBACK_SUBSTITUTE,
184 						NULL,
185 						NULL,
186 						NULL,
187 						err);
188 				rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
189 			}
190 			else {
191 				g_free (conv);
192 				conv = NULL;
193 			}
194 		}
195 		else {
196 			/* ISO-8859-16 */
197 			conv = g_malloc0 (sizeof (*conv));
198 			conv->is_internal = TRUE;
199 			conv->d.cnv_table = iso_8859_16_map;
200 			conv->canon_name = g_strdup (canon_name);
201 
202 			rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
203 		}
204 	}
205 
206 	return conv;
207 }
208 
209 static void
rspamd_mime_encoding_substitute_init(void)210 rspamd_mime_encoding_substitute_init (void)
211 {
212 	guint i;
213 
214 	sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal);
215 
216 	for (i = 0; i < G_N_ELEMENTS (sub); i ++) {
217 		g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]);
218 	}
219 }
220 
221 static void
rspamd_charset_normalize(gchar * in)222 rspamd_charset_normalize (gchar *in)
223 {
224 	/*
225 	 * This is a simple routine to validate input charset
226 	 * we just check that charset starts with alphanumeric and ends
227 	 * with alphanumeric
228 	 */
229 	gchar *begin, *end;
230 	gboolean changed = FALSE;
231 
232 	begin = in;
233 
234 	while (*begin && !g_ascii_isalnum (*begin)) {
235 		begin ++;
236 		changed = TRUE;
237 	}
238 
239 	end = begin + strlen (begin) - 1;
240 
241 	while (end > begin && !g_ascii_isalnum (*end)) {
242 		end --;
243 		changed = TRUE;
244 	}
245 
246 	if (changed) {
247 		memmove (in, begin, end - begin + 2);
248 		*(end + 1) = '\0';
249 	}
250 }
251 
252 const gchar *
rspamd_mime_detect_charset(const rspamd_ftok_t * in,rspamd_mempool_t * pool)253 rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
254 {
255 	gchar *ret = NULL, *h, *t;
256 	struct rspamd_charset_substitution *s;
257 	const gchar *cset;
258 	rspamd_ftok_t utf8_tok;
259 	UErrorCode uc_err = U_ZERO_ERROR;
260 
261 	if (sub_hash == NULL) {
262 		rspamd_mime_encoding_substitute_init ();
263 	}
264 
265 	/* Fast path */
266 	RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf-8");
267 
268 	if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
269 		return UTF8_CHARSET;
270 	}
271 
272 	RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf8");
273 
274 	if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
275 		return UTF8_CHARSET;
276 	}
277 
278 	ret = rspamd_mempool_ftokdup (pool, in);
279 	rspamd_charset_normalize (ret);
280 
281 	if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) ||
282 			(in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0))) {
283 		/* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
284 		h = ret;
285 		t = ret;
286 
287 		while (*h != '\0') {
288 			if (*h != '-') {
289 				*t++ = *h;
290 			}
291 
292 			h ++;
293 		}
294 
295 		*t = '\0';
296 	}
297 
298 	s = g_hash_table_lookup (sub_hash, ret);
299 
300 	if (s) {
301 		ret = (char *)s->canon;
302 	}
303 
304 	/* Try different aliases */
305 	cset = ucnv_getCanonicalName (ret, "MIME", &uc_err);
306 
307 	if (cset == NULL) {
308 		uc_err = U_ZERO_ERROR;
309 		cset = ucnv_getCanonicalName (ret, "IANA", &uc_err);
310 	}
311 
312 	if (cset == NULL) {
313 		uc_err = U_ZERO_ERROR;
314 		cset = ucnv_getCanonicalName (ret, "", &uc_err);
315 	}
316 
317 	if (cset == NULL) {
318 		uc_err = U_ZERO_ERROR;
319 		cset = ucnv_getAlias (ret, 0, &uc_err);
320 	}
321 
322 	return cset;
323 }
324 
325 gchar *
rspamd_mime_text_to_utf8(rspamd_mempool_t * pool,gchar * input,gsize len,const gchar * in_enc,gsize * olen,GError ** err)326 rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
327 		gchar *input, gsize len, const gchar *in_enc,
328 		gsize *olen, GError **err)
329 {
330 	gchar *d;
331 	gint32 r, clen, dlen;
332 	UChar *tmp_buf;
333 
334 	UErrorCode uc_err = U_ZERO_ERROR;
335 	UConverter *utf8_converter;
336 	struct rspamd_charset_converter *conv;
337 	rspamd_ftok_t cset_tok;
338 
339 	/* Check if already utf8 */
340 	RSPAMD_FTOK_FROM_STR (&cset_tok, in_enc);
341 
342 	if (rspamd_mime_charset_utf_check (&cset_tok, input, len,
343 			FALSE)) {
344 		d = rspamd_mempool_alloc (pool, len);
345 		memcpy (d, input, len);
346 		if (olen) {
347 			*olen = len;
348 		}
349 
350 		return d;
351 	}
352 
353 	conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err);
354 	utf8_converter = rspamd_get_utf8_converter ();
355 
356 	if (conv == NULL) {
357 		g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
358 				"cannot open converter for %s: %s",
359 				in_enc, u_errorName (uc_err));
360 
361 		return NULL;
362 	}
363 
364 	tmp_buf = g_new (UChar, len + 1);
365 	uc_err = U_ZERO_ERROR;
366 	r = rspamd_converter_to_uchars (conv, tmp_buf, len + 1, input, len, &uc_err);
367 
368 	if (!U_SUCCESS (uc_err)) {
369 		g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
370 					"cannot convert data to unicode from %s: %s",
371 					in_enc, u_errorName (uc_err));
372 		g_free (tmp_buf);
373 
374 		return NULL;
375 	}
376 
377 	/* Now, convert to utf8 */
378 	clen = ucnv_getMaxCharSize (utf8_converter);
379 	dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
380 	d = rspamd_mempool_alloc (pool, dlen);
381 	r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);
382 
383 	if (!U_SUCCESS (uc_err)) {
384 		g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
385 				"cannot convert data from unicode from %s: %s",
386 				in_enc, u_errorName (uc_err));
387 		g_free (tmp_buf);
388 
389 		return NULL;
390 	}
391 
392 	msg_debug_pool ("converted from %s to UTF-8 inlen: %z, outlen: %d",
393 			in_enc, len, r);
394 	g_free (tmp_buf);
395 
396 	if (olen) {
397 		*olen = r;
398 	}
399 
400 	return d;
401 }
402 
403 static gboolean
rspamd_mime_text_part_utf8_convert(struct rspamd_task * task,struct rspamd_mime_text_part * text_part,GByteArray * input,const gchar * charset,GError ** err)404 rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
405 									struct rspamd_mime_text_part *text_part,
406 									GByteArray *input,
407 									const gchar *charset,
408 									GError **err)
409 {
410 	gchar *d;
411 	gint32 r, clen, dlen, uc_len;
412 	UChar *tmp_buf;
413 	UErrorCode uc_err = U_ZERO_ERROR;
414 	UConverter *utf8_converter;
415 	struct rspamd_charset_converter *conv;
416 
417 	conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
418 			TRUE, &uc_err);
419 	utf8_converter = rspamd_get_utf8_converter ();
420 
421 	if (conv == NULL) {
422 		g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
423 				"cannot open converter for %s: %s",
424 				charset, u_errorName (uc_err));
425 
426 		return FALSE;
427 	}
428 
429 	tmp_buf = g_new (UChar, input->len + 1);
430 	uc_err = U_ZERO_ERROR;
431 	uc_len = rspamd_converter_to_uchars (conv,
432 			tmp_buf,
433 			input->len + 1,
434 			input->data,
435 			input->len,
436 			&uc_err);
437 
438 	if (!U_SUCCESS (uc_err)) {
439 		g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
440 				"cannot convert data to unicode from %s: %s",
441 				charset, u_errorName (uc_err));
442 		g_free (tmp_buf);
443 
444 		return FALSE;
445 	}
446 
447 	/* Now, convert to utf8 */
448 	clen = ucnv_getMaxCharSize (utf8_converter);
449 	dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
450 	d = rspamd_mempool_alloc (task->task_pool, dlen);
451 	r = ucnv_fromUChars (utf8_converter, d, dlen,
452 			tmp_buf, uc_len, &uc_err);
453 
454 	if (!U_SUCCESS (uc_err)) {
455 		g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
456 				"cannot convert data from unicode from %s: %s",
457 				charset, u_errorName (uc_err));
458 		g_free (tmp_buf);
459 
460 		return FALSE;
461 	}
462 
463 	if (text_part->mime_part && text_part->mime_part->ct) {
464 		msg_info_task ("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
465 				charset, &text_part->mime_part->ct->charset, input->len, r, uc_len);
466 	}
467 	else {
468 		msg_info_task ("converted text part from %s (no charset announced) to UTF-8 inlen: %d, "
469 				 "outlen: %d (%d UTF16 chars)",
470 				charset, input->len, r, uc_len);
471 	}
472 
473 	text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
474 			sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
475 	text_part->utf_raw_content->data = d;
476 	text_part->utf_raw_content->len = r;
477 	g_free (tmp_buf);
478 
479 	return TRUE;
480 }
481 
482 gboolean
rspamd_mime_to_utf8_byte_array(GByteArray * in,GByteArray * out,rspamd_mempool_t * pool,const gchar * enc)483 rspamd_mime_to_utf8_byte_array (GByteArray *in,
484 		GByteArray *out,
485 		rspamd_mempool_t *pool,
486 		const gchar *enc)
487 {
488 	gint32 r, clen, dlen;
489 	UChar *tmp_buf;
490 	UErrorCode uc_err = U_ZERO_ERROR;
491 	UConverter *utf8_converter;
492 	struct rspamd_charset_converter *conv;
493 	rspamd_ftok_t charset_tok;
494 
495 	if (in == NULL || in->len == 0) {
496 		return FALSE;
497 	}
498 
499 	if (enc == NULL) {
500 		/* Assume utf ? */
501 		if (rspamd_fast_utf8_validate (in->data, in->len) == 0) {
502 			g_byte_array_set_size (out, in->len);
503 			memcpy (out->data, in->data, out->len);
504 
505 			return TRUE;
506 		}
507 		else {
508 			/* Bad stuff, keep out */
509 			return FALSE;
510 		}
511 	}
512 
513 	RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
514 
515 	if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
516 			FALSE)) {
517 		g_byte_array_set_size (out, in->len);
518 		memcpy (out->data, in->data, out->len);
519 
520 		return TRUE;
521 	}
522 
523 	utf8_converter = rspamd_get_utf8_converter ();
524 	conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err);
525 
526 	if (conv == NULL) {
527 		return FALSE;
528 	}
529 
530 	tmp_buf = g_new (UChar, in->len + 1);
531 	uc_err = U_ZERO_ERROR;
532 	r = rspamd_converter_to_uchars (conv,
533 			tmp_buf, in->len + 1,
534 			in->data, in->len, &uc_err);
535 
536 	if (!U_SUCCESS (uc_err)) {
537 		g_free (tmp_buf);
538 
539 		return FALSE;
540 	}
541 
542 	/* Now, convert to utf8 */
543 	clen = ucnv_getMaxCharSize (utf8_converter);
544 	dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
545 	g_byte_array_set_size (out, dlen);
546 	r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
547 
548 	if (!U_SUCCESS (uc_err)) {
549 		g_free (tmp_buf);
550 
551 		return FALSE;
552 	}
553 
554 	g_free (tmp_buf);
555 	out->len = r;
556 
557 	return TRUE;
558 }
559 
560 void
rspamd_mime_charset_utf_enforce(gchar * in,gsize len)561 rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
562 {
563 	gchar *p, *end;
564 	goffset err_offset;
565 	UChar32 uc = 0;
566 
567 	/* Now we validate input and replace bad characters with '?' symbol */
568 	p = in;
569 	end = in + len;
570 
571 	while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
572 		err_offset --; /* As it returns it 1 indexed */
573 		gint32 cur_offset = err_offset;
574 
575 		while (cur_offset < len) {
576 			gint32 tmp = cur_offset;
577 
578 			U8_NEXT (p, cur_offset, len, uc);
579 
580 			if (uc > 0) {
581 				/* Fill string between err_offset and tmp with `?` character */
582 				memset (p + err_offset, '?', tmp - err_offset);
583 				break;
584 			}
585 		}
586 
587 		if (uc < 0) {
588 			/* Fill till the end */
589 			memset (p + err_offset, '?', len - err_offset);
590 			break;
591 		}
592 
593 		p += cur_offset;
594 		len = end - p;
595 	}
596 }
597 
598 const char *
rspamd_mime_charset_find_by_content(const gchar * in,gsize inlen,bool check_utf8)599 rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen,
600 									 bool check_utf8)
601 {
602 	int nconsumed;
603 	bool is_reliable;
604 	const gchar *ced_name;
605 
606 	if (check_utf8) {
607 		if (rspamd_fast_utf8_validate (in, inlen) == 0) {
608 			return UTF8_CHARSET;
609 		}
610 	}
611 
612 
613 	ced_name = ced_encoding_detect (in, inlen, NULL, NULL,
614 			NULL, 0, CED_EMAIL_CORPUS,
615 			false, &nconsumed, &is_reliable);
616 
617 	if (ced_name) {
618 
619 		return ced_name;
620 	}
621 
622 	return NULL;
623 }
624 
625 static const char *
rspamd_mime_charset_find_by_content_maybe_split(const gchar * in,gsize inlen)626 rspamd_mime_charset_find_by_content_maybe_split (const gchar *in, gsize inlen)
627 {
628 	if (inlen < RSPAMD_CHARSET_MAX_CONTENT * 3) {
629 		return rspamd_mime_charset_find_by_content (in, inlen, false);
630 	}
631 	else {
632 		const gchar *c1, *c2, *c3;
633 
634 		c1 = rspamd_mime_charset_find_by_content (in, RSPAMD_CHARSET_MAX_CONTENT, false);
635 		c2 = rspamd_mime_charset_find_by_content (in + inlen / 2,
636 				RSPAMD_CHARSET_MAX_CONTENT, false);
637 		c3 = rspamd_mime_charset_find_by_content (in + inlen - RSPAMD_CHARSET_MAX_CONTENT,
638 				RSPAMD_CHARSET_MAX_CONTENT, false);
639 
640 		/* 7bit stuff */
641 		if (c1 && strcmp (c1, "US-ASCII") == 0) {
642 			c1 = NULL; /* Invalid - we have 8 bit there */
643 		}
644 		if (c2 && strcmp (c2, "US-ASCII") == 0) {
645 			c2 = NULL; /* Invalid - we have 8 bit there */
646 		}
647 		if (c3 && strcmp (c3, "US-ASCII") == 0) {
648 			c3 = NULL; /* Invalid - we have 8 bit there */
649 		}
650 
651 		if (!c1) {
652 			c1 = c2 ? c2 : c3;
653 		}
654 		if (!c2) {
655 			c2 = c3 ? c3 : c1;
656 		}
657 		if (!c3) {
658 			c3 = c1 ? c2 : c1;
659 		}
660 
661 		if (c1 && c2 && c3) {
662 			/* Quorum */
663 			if (c1 == c2) {
664 				return c1;
665 			}
666 			else if (c2 == c3) {
667 				return c2;
668 			}
669 			else if (c1 == c3) {
670 				return c3;
671 			}
672 
673 			/* All charsets are distinct. Use the one from the top */
674 			return c1;
675 		}
676 
677 		return NULL;
678 	}
679 }
680 
681 gboolean
rspamd_mime_charset_utf_check(rspamd_ftok_t * charset,gchar * in,gsize len,gboolean content_check)682 rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
683 		gchar *in, gsize len, gboolean content_check)
684 {
685 	const gchar *real_charset;
686 
687 	if (utf_compatible_re == NULL) {
688 		utf_compatible_re = rspamd_regexp_new (
689 				"^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
690 				"i", NULL);
691 	}
692 
693 	if (charset->len == 0 ||
694 			rspamd_regexp_match (utf_compatible_re,
695 					charset->begin, charset->len, TRUE)) {
696 		/*
697 		 * In case of UTF8 charset we still can check the content to find
698 		 * corner cases
699 		 */
700 		if (content_check) {
701 			if (rspamd_fast_utf8_validate (in, len) != 0) {
702 				real_charset = rspamd_mime_charset_find_by_content_maybe_split(in, len);
703 
704 				if (real_charset) {
705 
706 					if (rspamd_regexp_match (utf_compatible_re,
707 							real_charset, strlen (real_charset), TRUE)) {
708 						RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);
709 
710 						return TRUE;
711 					}
712 					else {
713 						charset->begin = real_charset;
714 						charset->len = strlen (real_charset);
715 
716 						return FALSE;
717 					}
718 				}
719 
720 				rspamd_mime_charset_utf_enforce (in, len);
721 			}
722 		}
723 
724 		return TRUE;
725 	}
726 
727 	return FALSE;
728 }
729 
730 void
rspamd_mime_text_part_maybe_convert(struct rspamd_task * task,struct rspamd_mime_text_part * text_part)731 rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
732 		struct rspamd_mime_text_part *text_part)
733 {
734 	GError *err = NULL;
735 	const gchar *charset = NULL;
736 	gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE;
737 	GByteArray *part_content;
738 	rspamd_ftok_t charset_tok;
739 	struct rspamd_mime_part *part = text_part->mime_part;
740 
741 	if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
742 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW;
743 	}
744 
745 	/* Allocate copy storage */
746 	part_content = g_byte_array_sized_new (text_part->parsed.len);
747 	memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
748 	part_content->len = text_part->parsed.len;
749 	rspamd_mempool_notify_alloc (task->task_pool,
750 			part_content->len);
751 	rspamd_mempool_add_destructor (task->task_pool,
752 			(rspamd_mempool_destruct_t)g_byte_array_unref, part_content);
753 
754 	if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
755 		if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) {
756 			/* Valid UTF, likely all good */
757 			need_charset_heuristic = FALSE;
758 			valid_utf8 = TRUE;
759 			checked = TRUE;
760 		}
761 
762 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
763 	}
764 	else {
765 		/* All 7bit characters, assume it valid utf */
766 		need_charset_heuristic = FALSE;
767 		valid_utf8 = TRUE;
768 		checked = TRUE; /* Already valid utf, no need in further checks */
769 	}
770 
771 	if (part->ct->charset.len == 0) {
772 		if (need_charset_heuristic) {
773 			charset = rspamd_mime_charset_find_by_content_maybe_split (text_part->parsed.begin,
774 					text_part->parsed.len);
775 
776 			if (charset != NULL) {
777 				msg_info_task ("detected charset %s", charset);
778 			}
779 
780 			checked = TRUE;
781 			text_part->real_charset = charset;
782 		}
783 		else if (valid_utf8) {
784 			SET_PART_UTF (text_part);
785 			text_part->utf_raw_content = part_content;
786 			text_part->real_charset = UTF8_CHARSET;
787 
788 			return;
789 		}
790 	}
791 	else {
792 		charset = rspamd_mime_detect_charset (&part->ct->charset,
793 				task->task_pool);
794 
795 		if (charset == NULL) {
796 			/* We don't know the real charset but can try heuristic */
797 			if (need_charset_heuristic) {
798 				charset = rspamd_mime_charset_find_by_content_maybe_split (part_content->data,
799 						part_content->len);
800 				msg_info_task ("detected charset: %s", charset);
801 				checked = TRUE;
802 				text_part->real_charset = charset;
803 			}
804 			else if (valid_utf8) {
805 				/* We already know that the input is valid utf, so skip heuristic */
806 				text_part->real_charset = UTF8_CHARSET;
807 			}
808 		}
809 		else {
810 			text_part->real_charset = charset;
811 
812 			if (strcmp (charset, UTF8_CHARSET) != 0) {
813 				/*
814 				 * We have detected some charset, but we don't know which one,
815 				 * so we need to reset valid utf8 flag and enforce it later
816 				 */
817 				valid_utf8 = FALSE;
818 			}
819 		}
820 	}
821 
822 	if (text_part->real_charset == NULL) {
823 		msg_info_task ("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"",
824 				MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset,
825 				part->ct->cpy);
826 		SET_PART_RAW (text_part);
827 		text_part->utf_raw_content = part_content;
828 
829 		return;
830 	}
831 
832 	RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
833 
834 	if (!valid_utf8) {
835 		if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
836 				part_content->len, !checked)) {
837 			SET_PART_UTF (text_part);
838 			text_part->utf_raw_content = part_content;
839 			text_part->real_charset = UTF8_CHARSET;
840 
841 			return;
842 		}
843 		else {
844 			charset = charset_tok.begin;
845 
846 			if (!rspamd_mime_text_part_utf8_convert (task, text_part,
847 					part_content, charset, &err)) {
848 				msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
849 						MESSAGE_FIELD (task, message_id),
850 						charset,
851 						err ? err->message : "unknown problem");
852 				SET_PART_RAW (text_part);
853 				g_error_free (err);
854 
855 				text_part->utf_raw_content = part_content;
856 				return;
857 			}
858 
859 			SET_PART_UTF (text_part);
860 			text_part->real_charset = charset;
861 		}
862 	}
863 	else {
864 		SET_PART_UTF (text_part);
865 		text_part->utf_raw_content = part_content;
866 	}
867 }
868