1 /*-
2  * Copyright 2016 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "config.h"
18 #include "message.h"
19 #include "task.h"
20 #include "archives.h"
21 #include "libmime/mime_encoding.h"
22 #include <unicode/uchar.h>
23 #include <unicode/utf8.h>
24 #include <unicode/utf16.h>
25 #include <unicode/ucnv.h>
26 
/*
 * Debug logging helper of this module: forwards its arguments to
 * rspamd_conditional_debug_fast together with the "archive" module name,
 * the module log id, the uid tag of the task memory pool and the name of
 * the calling function.
 * A variable named `task` must be in scope wherever the macro is expanded.
 */
#define msg_debug_archive(...)  rspamd_conditional_debug_fast (NULL, NULL, \
        rspamd_archive_log_id, "archive", task->task_pool->tag.uid, \
        G_STRFUNC, \
        __VA_ARGS__)

/* NOTE(review): presumably this defines rspamd_archive_log_id used above - confirm */
INIT_LOG_MODULE(archive)
33 
34 static void
35 rspamd_archive_dtor (gpointer p)
36 {
37 	struct rspamd_archive *arch = p;
38 	struct rspamd_archive_file *f;
39 	guint i;
40 
41 	for (i = 0; i < arch->files->len; i ++) {
42 		f = g_ptr_array_index (arch->files, i);
43 
44 		if (f->fname) {
45 			g_string_free (f->fname, TRUE);
46 		}
47 
48 		g_free (f);
49 	}
50 
51 	g_ptr_array_free (arch->files, TRUE);
52 }
53 
54 static GString *
rspamd_archive_file_try_utf(struct rspamd_task * task,const gchar * in,gsize inlen)55 rspamd_archive_file_try_utf (struct rspamd_task *task,
56 		const gchar *in, gsize inlen)
57 {
58 	const gchar *charset = NULL, *p, *end;
59 	GString *res;
60 
61 	charset = rspamd_mime_charset_find_by_content (in, inlen, TRUE);
62 
63 	if (charset) {
64 		UChar *tmp;
65 		UErrorCode uc_err = U_ZERO_ERROR;
66 		gint32 r, clen, dlen;
67 		struct rspamd_charset_converter *conv;
68 		UConverter *utf8_converter;
69 
70 		conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
71 				TRUE, &uc_err);
72 		utf8_converter = rspamd_get_utf8_converter ();
73 
74 		if (conv == NULL) {
75 			msg_info_task ("cannot open converter for %s: %s",
76 					charset, u_errorName (uc_err));
77 
78 			return NULL;
79 		}
80 
81 		tmp = g_malloc (sizeof (*tmp) * (inlen + 1));
82 		r = rspamd_converter_to_uchars (conv, tmp, inlen + 1,
83 				in, inlen, &uc_err);
84 		if (!U_SUCCESS (uc_err)) {
85 			msg_info_task ("cannot convert data to unicode from %s: %s",
86 					charset, u_errorName (uc_err));
87 			g_free (tmp);
88 
89 			return NULL;
90 		}
91 
92 		clen = ucnv_getMaxCharSize (utf8_converter);
93 		dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
94 		res = g_string_sized_new (dlen);
95 		r = ucnv_fromUChars (utf8_converter, res->str, dlen, tmp, r, &uc_err);
96 
97 		if (!U_SUCCESS (uc_err)) {
98 			msg_info_task ("cannot convert data from unicode from %s: %s",
99 					charset, u_errorName (uc_err));
100 			g_free (tmp);
101 			g_string_free (res, TRUE);
102 
103 			return NULL;
104 		}
105 
106 		g_free (tmp);
107 		res->len = r;
108 
109 		msg_debug_archive ("converted from %s to UTF-8 inlen: %z, outlen: %d",
110 				charset, inlen, r);
111 	}
112 	else {
113 		/* Convert unsafe characters to '?' */
114 		res = g_string_sized_new (inlen);
115 		p = in;
116 		end = in + inlen;
117 
118 		while (p < end) {
119 			if (g_ascii_isgraph (*p)) {
120 				g_string_append_c (res, *p);
121 			}
122 			else {
123 				g_string_append_c (res, '?');
124 			}
125 
126 			p ++;
127 		}
128 	}
129 
130 	return res;
131 }
132 
133 static void
rspamd_archive_process_zip(struct rspamd_task * task,struct rspamd_mime_part * part)134 rspamd_archive_process_zip (struct rspamd_task *task,
135 		struct rspamd_mime_part *part)
136 {
137 	const guchar *p, *start, *end, *eocd = NULL, *cd;
138 	const guint32 eocd_magic = 0x06054b50, cd_basic_len = 46;
139 	const guchar cd_magic[] = {0x50, 0x4b, 0x01, 0x02};
140 	const guint max_processed = 1024;
141 	guint32 cd_offset, cd_size, comp_size, uncomp_size, processed = 0;
142 	guint16 extra_len, fname_len, comment_len;
143 	struct rspamd_archive *arch;
144 	struct rspamd_archive_file *f = NULL;
145 
146 	/* Zip files have interesting data at the end of archive */
147 	p = part->parsed_data.begin + part->parsed_data.len - 1;
148 	start = part->parsed_data.begin;
149 	end = p;
150 
151 	/* Search for EOCD:
152 	 * 22 bytes is a typical size of eocd without a comment and
153 	 * end points one byte after the last character
154 	 */
155 	p -= 21;
156 
157 	while (p > start + sizeof (guint32)) {
158 		guint32 t;
159 
160 		if (processed > max_processed) {
161 			break;
162 		}
163 
164 		/* XXX: not an efficient approach */
165 		memcpy (&t, p, sizeof (t));
166 
167 		if (GUINT32_FROM_LE (t) == eocd_magic) {
168 			eocd = p;
169 			break;
170 		}
171 
172 		p --;
173 		processed ++;
174 	}
175 
176 
177 	if (eocd == NULL) {
178 		/* Not a zip file */
179 		msg_info_task ("zip archive is invalid (no EOCD)");
180 
181 		return;
182 	}
183 
184 	if (end - eocd < 21) {
185 		msg_info_task ("zip archive is invalid (short EOCD)");
186 
187 		return;
188 	}
189 
190 
191 	memcpy (&cd_size, eocd + 12, sizeof (cd_size));
192 	cd_size = GUINT32_FROM_LE (cd_size);
193 	memcpy (&cd_offset, eocd + 16, sizeof (cd_offset));
194 	cd_offset = GUINT32_FROM_LE (cd_offset);
195 
196 	/* We need to check sanity as well */
197 	if (cd_offset + cd_size > (guint)(eocd - start)) {
198 		msg_info_task ("zip archive is invalid (bad size/offset for CD)");
199 
200 		return;
201 	}
202 
203 	cd = start + cd_offset;
204 
205 	arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
206 	arch->files = g_ptr_array_new ();
207 	arch->type = RSPAMD_ARCHIVE_ZIP;
208 	rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
209 			arch);
210 
211 	while (cd < start + cd_offset + cd_size) {
212 		guint16 flags;
213 
214 		/* Read central directory record */
215 		if (eocd - cd < cd_basic_len ||
216 				memcmp (cd, cd_magic, sizeof (cd_magic)) != 0) {
217 			msg_info_task ("zip archive is invalid (bad cd record)");
218 
219 			return;
220 		}
221 
222 		memcpy (&flags, cd + 8, sizeof (guint16));
223 		flags = GUINT16_FROM_LE (flags);
224 		memcpy (&comp_size, cd + 20, sizeof (guint32));
225 		comp_size = GUINT32_FROM_LE (comp_size);
226 		memcpy (&uncomp_size, cd + 24, sizeof (guint32));
227 		uncomp_size = GUINT32_FROM_LE (uncomp_size);
228 		memcpy (&fname_len, cd + 28, sizeof (fname_len));
229 		fname_len = GUINT16_FROM_LE (fname_len);
230 		memcpy (&extra_len, cd + 30, sizeof (extra_len));
231 		extra_len = GUINT16_FROM_LE (extra_len);
232 		memcpy (&comment_len, cd + 32, sizeof (comment_len));
233 		comment_len = GUINT16_FROM_LE (comment_len);
234 
235 		if (cd + fname_len + comment_len + extra_len + cd_basic_len > eocd) {
236 			msg_info_task ("zip archive is invalid (too large cd record)");
237 
238 			return;
239 		}
240 
241 		f = g_malloc0 (sizeof (*f));
242 		f->fname = rspamd_archive_file_try_utf (task,
243 				cd + cd_basic_len, fname_len);
244 		f->compressed_size = comp_size;
245 		f->uncompressed_size = uncomp_size;
246 
247 		if (flags & 0x41u) {
248 			f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
249 		}
250 
251 		if (f->fname) {
252 			g_ptr_array_add (arch->files, f);
253 			msg_debug_archive ("found file in zip archive: %v", f->fname);
254 		}
255 		else {
256 			g_free (f);
257 
258 			return;
259 		}
260 
261 		/* Process extra fields */
262 		const guchar *extra = cd + fname_len + cd_basic_len;
263 		p = extra;
264 
265 		while (p + sizeof (guint16) * 2 < extra + extra_len) {
266 			guint16 hid, hlen;
267 
268 			memcpy (&hid, p, sizeof (guint16));
269 			hid = GUINT16_FROM_LE (hid);
270 			memcpy (&hlen, p + sizeof (guint16), sizeof (guint16));
271 			hlen = GUINT16_FROM_LE (hlen);
272 
273 			if (hid == 0x0017) {
274 				f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
275 			}
276 
277 			p += hlen + sizeof (guint16) * 2;
278 		}
279 
280 		cd += fname_len + comment_len + extra_len + cd_basic_len;
281 	}
282 
283 	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
284 	part->specific.arch = arch;
285 
286 	if (part->cd) {
287 		arch->archive_name = &part->cd->filename;
288 	}
289 
290 	arch->size = part->parsed_data.len;
291 }
292 
293 static inline gint
rspamd_archive_rar_read_vint(const guchar * start,gsize remain,guint64 * res)294 rspamd_archive_rar_read_vint (const guchar *start, gsize remain, guint64 *res)
295 {
296 	/*
297 	 * From http://www.rarlab.com/technote.htm:
298 	 * Variable length integer. Can include one or more bytes, where
299 	 * lower 7 bits of every byte contain integer data and highest bit
300 	 * in every byte is the continuation flag.
301 	 * If highest bit is 0, this is the last byte in sequence.
302 	 * So first byte contains 7 least significant bits of integer and
303 	 * continuation flag. Second byte, if present, contains next 7 bits and so on.
304 	 */
305 	guint64 t = 0;
306 	guint shift = 0;
307 	const guchar *p = start;
308 
309 	while (remain > 0 && shift <= 57) {
310 		if (*p & 0x80) {
311 			t |= ((guint64)(*p & 0x7f)) << shift;
312 		}
313 		else {
314 			t |= ((guint64)(*p & 0x7f)) << shift;
315 			p ++;
316 			break;
317 		}
318 
319 		shift += 7;
320 		p++;
321 		remain --;
322 	}
323 
324 	if (remain == 0 || shift > 64) {
325 		return -1;
326 	}
327 
328 	*res = GUINT64_FROM_LE (t);
329 
330 	return p - start;
331 }
332 
/*
 * Helpers for the RAR parsers. All of them expect `p` (cursor), `end`
 * (end of data) and `task` (for logging) to be in scope, and they execute
 * a plain `return;` on malformed input, so they can only be used inside
 * functions returning void.
 */

/* Advance `p` by n bytes; n must be positive and not exceed the remaining data */
#define RAR_SKIP_BYTES(n) do { \
	if ((n) <= 0) { \
		msg_debug_archive ("rar archive is invalid (bad skip value)"); \
		return; \
	} \
	if ((gsize)(end - p) < (n)) { \
		msg_debug_archive ("rar archive is invalid (truncated)"); \
		return; \
	} \
	p += (n); \
} while (0)

/*
 * Decode a variable length integer at `p` into `vint` and store its encoded
 * length in `r`; `p` is NOT advanced (both `vint` and `r` must be in scope)
 */
#define RAR_READ_VINT() do { \
	r = rspamd_archive_rar_read_vint (p, end - p, &vint); \
	if (r == -1) { \
		msg_debug_archive ("rar archive is invalid (bad vint)"); \
		return; \
	} \
	else if (r == 0) { \
		msg_debug_archive ("rar archive is invalid (BAD vint offset)"); \
		return; \
	}\
} while (0)

/* Same as RAR_READ_VINT, but also moves `p` past the decoded integer */
#define RAR_READ_VINT_SKIP() do { \
	r = rspamd_archive_rar_read_vint (p, end - p, &vint); \
	if (r == -1) { \
		msg_debug_archive ("rar archive is invalid (bad vint)"); \
		return; \
	} \
	p += r; \
} while (0)

/* Read a little endian 16 bit unsigned integer at `p` into n and advance `p` */
#define RAR_READ_UINT16(n) do { \
	if (end - p < (glong)sizeof (guint16)) { \
		msg_debug_archive ("rar archive is invalid (bad int16)"); \
		return; \
	} \
	n = p[0] + (p[1] << 8); \
	p += sizeof (guint16); \
} while (0)

/* Read a little endian 32 bit unsigned integer at `p` into n and advance `p` */
#define RAR_READ_UINT32(n) do { \
	if (end - p < (glong)sizeof (guint32)) { \
		msg_debug_archive ("rar archive is invalid (bad int32)"); \
		return; \
	} \
	n = (guint)p[0] + ((guint)p[1] << 8) + ((guint)p[2] << 16) + ((guint)p[3] << 24); \
	p += sizeof (guint32); \
} while (0)
383 
384 static void
rspamd_archive_process_rar_v4(struct rspamd_task * task,const guchar * start,const guchar * end,struct rspamd_mime_part * part)385 rspamd_archive_process_rar_v4 (struct rspamd_task *task, const guchar *start,
386 		const guchar *end, struct rspamd_mime_part *part)
387 {
388 	const guchar *p = start, *start_section;
389 	guint8 type;
390 	guint flags;
391 	guint64 sz, comp_sz = 0, uncomp_sz = 0;
392 	struct rspamd_archive *arch;
393 	struct rspamd_archive_file *f;
394 
395 	arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
396 	arch->files = g_ptr_array_new ();
397 	arch->type = RSPAMD_ARCHIVE_RAR;
398 	rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
399 			arch);
400 
401 	while (p < end) {
402 		/* Crc16 */
403 		start_section = p;
404 		RAR_SKIP_BYTES (sizeof (guint16));
405 		type = *p;
406 		p ++;
407 		RAR_READ_UINT16 (flags);
408 
409 		if (type == 0x73) {
410 			/* Main header, check for encryption */
411 			if (flags & 0x80) {
412 				arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
413 				goto end;
414 			}
415 		}
416 
417 		RAR_READ_UINT16 (sz);
418 
419 		if (flags & 0x8000) {
420 			/* We also need to read ADD_SIZE element */
421 			guint32 tmp;
422 
423 			RAR_READ_UINT32 (tmp);
424 			sz += tmp;
425 			/* This is also used as PACK_SIZE */
426 			comp_sz = tmp;
427 		}
428 
429 		if (sz == 0) {
430 			/* Zero sized block - error */
431 			msg_debug_archive ("rar archive is invalid (zero size block)");
432 
433 			return;
434 		}
435 
436 		if (type == 0x74) {
437 			guint fname_len;
438 
439 			/* File header */
440 			/* Uncompressed size */
441 			RAR_READ_UINT32 (uncomp_sz);
442 			/* Skip to NAME_SIZE element */
443 			RAR_SKIP_BYTES (11);
444 			RAR_READ_UINT16 (fname_len);
445 
446 			if (fname_len == 0 || fname_len > (gsize)(end - p)) {
447 				msg_debug_archive ("rar archive is invalid (bad filename size: %d)",
448 						fname_len);
449 
450 				return;
451 			}
452 
453 			/* Attrs */
454 			RAR_SKIP_BYTES (4);
455 
456 			if (flags & 0x100) {
457 				/* We also need to read HIGH_PACK_SIZE */
458 				guint32 tmp;
459 
460 				RAR_READ_UINT32 (tmp);
461 				sz += tmp;
462 				comp_sz += tmp;
463 				/* HIGH_UNP_SIZE  */
464 				RAR_READ_UINT32 (tmp);
465 				uncomp_sz += tmp;
466 			}
467 
468 			f = g_malloc0 (sizeof (*f));
469 
470 			if (flags & 0x200) {
471 				/* We have unicode + normal version */
472 				guchar *tmp;
473 
474 				tmp = memchr (p, '\0', fname_len);
475 
476 				if (tmp != NULL) {
477 					/* Just use ASCII version */
478 					f->fname = rspamd_archive_file_try_utf (task, p, tmp - p);
479 					msg_debug_archive ("found ascii filename in rarv4 archive: %v",
480 							f->fname);
481 				}
482 				else {
483 					/* We have UTF8 filename, use it as is */
484 					f->fname = rspamd_archive_file_try_utf (task, p, fname_len);
485 					msg_debug_archive ("found utf filename in rarv4 archive: %v",
486 							f->fname);
487 				}
488 			}
489 			else {
490 				f->fname = rspamd_archive_file_try_utf (task, p, fname_len);
491 				msg_debug_archive ("found ascii (old) filename in rarv4 archive: %v",
492 						f->fname);
493 			}
494 
495 			f->compressed_size = comp_sz;
496 			f->uncompressed_size = uncomp_sz;
497 
498 			if (flags & 0x4) {
499 				f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
500 			}
501 
502 			if (f->fname) {
503 				g_ptr_array_add (arch->files, f);
504 			}
505 			else {
506 				g_free (f);
507 			}
508 		}
509 
510 		p = start_section;
511 		RAR_SKIP_BYTES (sz);
512 	}
513 
514 end:
515 	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
516 	part->specific.arch = arch;
517 	arch->archive_name = &part->cd->filename;
518 	arch->size = part->parsed_data.len;
519 }
520 
521 static void
rspamd_archive_process_rar(struct rspamd_task * task,struct rspamd_mime_part * part)522 rspamd_archive_process_rar (struct rspamd_task *task,
523 		struct rspamd_mime_part *part)
524 {
525 	const guchar *p, *end, *section_start;
526 	const guchar rar_v5_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00},
527 			rar_v4_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00};
528 	const guint rar_encrypted_header = 4, rar_main_header = 1,
529 			rar_file_header = 2;
530 	guint64 vint, sz, comp_sz = 0, uncomp_sz = 0, flags = 0, type = 0,
531 			extra_sz = 0;
532 	struct rspamd_archive *arch;
533 	struct rspamd_archive_file *f;
534 	gint r;
535 
536 	p = part->parsed_data.begin;
537 	end = p + part->parsed_data.len;
538 
539 	if ((gsize)(end - p) <= sizeof (rar_v5_magic)) {
540 		msg_debug_archive ("rar archive is invalid (too small)");
541 
542 		return;
543 	}
544 
545 	if (memcmp (p, rar_v5_magic, sizeof (rar_v5_magic)) == 0) {
546 		p += sizeof (rar_v5_magic);
547 	}
548 	else if (memcmp (p, rar_v4_magic, sizeof (rar_v4_magic)) == 0) {
549 		p += sizeof (rar_v4_magic);
550 
551 		rspamd_archive_process_rar_v4 (task, p, end, part);
552 		return;
553 	}
554 	else {
555 		msg_debug_archive ("rar archive is invalid (no rar magic)");
556 
557 		return;
558 	}
559 
560 	/* Rar v5 format */
561 	arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
562 	arch->files = g_ptr_array_new ();
563 	arch->type = RSPAMD_ARCHIVE_RAR;
564 	rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
565 			arch);
566 
567 	/* Now we can have either encryption header or archive header */
568 	/* Crc 32 */
569 	RAR_SKIP_BYTES (sizeof (guint32));
570 	/* Size */
571 	RAR_READ_VINT_SKIP ();
572 	sz = vint;
573 	/* Type */
574 	section_start = p;
575 	RAR_READ_VINT_SKIP ();
576 	type = vint;
577 	/* Header flags */
578 	RAR_READ_VINT_SKIP ();
579 	flags = vint;
580 
581 	if (flags & 0x1) {
582 		/* Have extra zone */
583 		RAR_READ_VINT_SKIP ();
584 	}
585 	if (flags & 0x2) {
586 		/* Data zone is presented */
587 		RAR_READ_VINT_SKIP ();
588 		sz += vint;
589 	}
590 
591 	if (type == rar_encrypted_header) {
592 		/* We can't read any further information as archive is encrypted */
593 		arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
594 		goto end;
595 	}
596 	else if (type != rar_main_header) {
597 		msg_debug_archive ("rar archive is invalid (bad main header)");
598 
599 		return;
600 	}
601 
602 	/* Nothing useful in main header */
603 	p = section_start;
604 	RAR_SKIP_BYTES (sz);
605 
606 	while (p < end) {
607 		gboolean has_extra = FALSE;
608 		/* Read the next header */
609 		/* Crc 32 */
610 		RAR_SKIP_BYTES (sizeof (guint32));
611 		/* Size */
612 		RAR_READ_VINT_SKIP ();
613 
614 		sz = vint;
615 		if (sz == 0) {
616 			/* Zero sized block - error */
617 			msg_debug_archive ("rar archive is invalid (zero size block)");
618 
619 			return;
620 		}
621 
622 		section_start = p;
623 		/* Type */
624 		RAR_READ_VINT_SKIP ();
625 		type = vint;
626 		/* Header flags */
627 		RAR_READ_VINT_SKIP ();
628 		flags = vint;
629 
630 		if (flags & 0x1) {
631 			/* Have extra zone */
632 			RAR_READ_VINT_SKIP ();
633 			extra_sz = vint;
634 			has_extra = TRUE;
635 		}
636 
637 		if (flags & 0x2) {
638 			/* Data zone is presented */
639 			RAR_READ_VINT_SKIP ();
640 			sz += vint;
641 			comp_sz = vint;
642 		}
643 
644 		if (type != rar_file_header) {
645 			p = section_start;
646 			RAR_SKIP_BYTES (sz);
647 		}
648 		else {
649 			/* We have a file header, go forward */
650 			guint64 fname_len;
651 
652 			/* File header specific flags */
653 			RAR_READ_VINT_SKIP ();
654 			flags = vint;
655 
656 			/* Unpacked size */
657 			RAR_READ_VINT_SKIP ();
658 			uncomp_sz = vint;
659 			/* Attributes */
660 			RAR_READ_VINT_SKIP ();
661 
662 			if (flags & 0x2) {
663 				/* Unix mtime */
664 				RAR_SKIP_BYTES (sizeof (guint32));
665 			}
666 			if (flags & 0x4) {
667 				/* Crc32 */
668 				RAR_SKIP_BYTES (sizeof (guint32));
669 			}
670 
671 			/* Compression */
672 			RAR_READ_VINT_SKIP ();
673 			/* Host OS */
674 			RAR_READ_VINT_SKIP ();
675 			/* Filename length (finally!) */
676 			RAR_READ_VINT_SKIP ();
677 			fname_len = vint;
678 
679 			if (fname_len == 0 || fname_len > (gsize)(end - p)) {
680 				msg_debug_archive ("rar archive is invalid (bad filename size)");
681 
682 				return;
683 			}
684 
685 			f = g_malloc0 (sizeof (*f));
686 			f->uncompressed_size = uncomp_sz;
687 			f->compressed_size = comp_sz;
688 			f->fname = rspamd_archive_file_try_utf (task, p, fname_len);
689 
690 			if (f->fname) {
691 				msg_debug_archive ("added rarv5 file: %v", f->fname);
692 				g_ptr_array_add (arch->files, f);
693 			}
694 			else {
695 				g_free (f);
696 				f = NULL;
697 			}
698 
699 			if (f && has_extra && extra_sz > 0 &&
700 				p + fname_len + extra_sz < end) {
701 				/* Try to find encryption record in extra field */
702 				const guchar *ex = p + fname_len;
703 
704 				while (ex < p + extra_sz) {
705 					const guchar *t;
706 					gint64 cur_sz = 0, sec_type = 0;
707 
708 					r = rspamd_archive_rar_read_vint (ex, extra_sz, &cur_sz);
709 					if (r == -1) {
710 						msg_debug_archive ("rar archive is invalid (bad vint)");
711 						return;
712 					}
713 
714 					t = ex + r;
715 
716 					r = rspamd_archive_rar_read_vint (t, extra_sz - r, &sec_type);
717 					if (r == -1) {
718 						msg_debug_archive ("rar archive is invalid (bad vint)");
719 						return;
720 					}
721 
722 					if (sec_type == 0x01) {
723 						f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
724 						arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
725 						break;
726 					}
727 
728 					ex += cur_sz;
729 				}
730 			}
731 
732 			/* Restore p to the beginning of the header */
733 			p = section_start;
734 			RAR_SKIP_BYTES (sz);
735 		}
736 	}
737 
738 end:
739 	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
740 	part->specific.arch = arch;
741 	if (part->cd != NULL) {
742 		arch->archive_name = &part->cd->filename;
743 	}
744 	arch->size = part->parsed_data.len;
745 }
746 
747 static inline gint
rspamd_archive_7zip_read_vint(const guchar * start,gsize remain,guint64 * res)748 rspamd_archive_7zip_read_vint (const guchar *start, gsize remain, guint64 *res)
749 {
750 	/*
751 	 * REAL_UINT64 means real UINT64.
752 	 * UINT64 means real UINT64 encoded with the following scheme:
753 	 *
754 	 * Size of encoding sequence depends from first byte:
755 	 * First_Byte  Extra_Bytes        Value
756 	 * (binary)
757 	 * 0xxxxxxx               : ( xxxxxxx           )
758 	 * 10xxxxxx    BYTE y[1]  : (  xxxxxx << (8 * 1)) + y
759 	 * 110xxxxx    BYTE y[2]  : (   xxxxx << (8 * 2)) + y
760 	 * ...
761 	 * 1111110x    BYTE y[6]  : (       x << (8 * 6)) + y
762 	 * 11111110    BYTE y[7]  :                         y
763 	 * 11111111    BYTE y[8]  :                         y
764 	 */
765 	guchar t;
766 
767 	if (remain == 0) {
768 		return -1;
769 	}
770 
771 	t = *start;
772 
773 	if (!isset (&t, 7)) {
774 		/* Trivial case */
775 		*res = t;
776 		return 1;
777 	}
778 	else if (t == 0xFF) {
779 		if (remain >= sizeof (guint64) + 1) {
780 			memcpy (res, start + 1, sizeof (guint64));
781 			*res = GUINT64_FROM_LE (*res);
782 
783 			return sizeof (guint64) + 1;
784 		}
785 	}
786 	else {
787 		gint cur_bit = 6, intlen = 1;
788 		const guchar bmask = 0xFF;
789 		guint64 tgt;
790 
791 		while (cur_bit > 0) {
792 			if (!isset (&t, cur_bit)) {
793 				if (remain >= intlen + 1) {
794 					memcpy (&tgt, start + 1, intlen);
795 					tgt = GUINT64_FROM_LE (tgt);
796 					/* Shift back */
797 					tgt >>= sizeof (tgt) - NBBY * intlen;
798 					/* Add masked value */
799 					tgt += (guint64)(t & (bmask >> (NBBY - cur_bit)))
800 							<< (NBBY * intlen);
801 					*res = tgt;
802 
803 					return intlen + 1;
804 				}
805 			}
806 			cur_bit --;
807 			intlen ++;
808 		}
809 	}
810 
811 	return -1;
812 }
813 
/*
 * Helpers for the 7zip parsers. All of them expect `p` (cursor), `end`
 * (end of data) and `task` (for logging) to be in scope. Note that they
 * differ in how they leave the enclosing function on malformed input:
 * SZ_READ_VINT_SKIP and SZ_READ_UINT64 execute `return;`, while
 * SZ_READ_VINT and SZ_SKIP_BYTES execute `return NULL;`.
 */

/* Decode a 7zip integer at `p` into `vint` (using the caller's `r`) and advance `p` */
#define SZ_READ_VINT_SKIP() do { \
	r = rspamd_archive_7zip_read_vint (p, end - p, &vint); \
	if (r == -1) { \
		msg_debug_archive ("7z archive is invalid (bad vint)"); \
		return; \
	} \
	p += r; \
} while (0)
/* Decode a 7zip integer at `p` into var and advance `p`; returns NULL on error */
#define SZ_READ_VINT(var) do { \
	int r; \
	r = rspamd_archive_7zip_read_vint (p, end - p, &(var)); \
	if (r == -1) { \
		msg_debug_archive ("7z archive is invalid (bad vint): %s", G_STRLOC); \
		return NULL; \
	} \
	p += r; \
} while (0)

/* Read a fixed size little endian 64 bit integer at `p` into n and advance `p` */
#define SZ_READ_UINT64(n) do { \
	if (end - p < (goffset)sizeof (guint64)) { \
		msg_debug_archive ("7zip archive is invalid (bad uint64): %s", G_STRLOC); \
		return; \
	} \
	memcpy (&(n), p, sizeof (guint64)); \
	n = GUINT64_FROM_LE(n); \
	p += sizeof (guint64); \
} while (0)
/* Advance `p` by n bytes if that many bytes are left; returns NULL otherwise */
#define SZ_SKIP_BYTES(n) do { \
	if (end - p >= (n)) { \
		p += (n); \
	} \
	else { \
		msg_debug_archive ("7zip archive is invalid (truncated); wanted to read %d bytes, %d avail: %s", (gint)(n), (gint)(end - p), G_STRLOC); \
		return NULL; \
	} \
} while (0)
850 
/*
 * One byte identifiers that introduce the sections and properties of a
 * 7zip header; the 7zip readers in this file compare the byte they have
 * just read against these values.
 */
enum rspamd_7zip_header_mark {
	kEnd = 0x00,
	kHeader = 0x01,
	kArchiveProperties = 0x02,
	kAdditionalStreamsInfo = 0x03,
	kMainStreamsInfo = 0x04,
	kFilesInfo = 0x05,
	kPackInfo = 0x06,
	kUnPackInfo = 0x07,
	kSubStreamsInfo = 0x08,
	kSize = 0x09,
	kCRC = 0x0A,
	kFolder = 0x0B,
	kCodersUnPackSize = 0x0C,
	kNumUnPackStream = 0x0D,
	kEmptyStream = 0x0E,
	kEmptyFile = 0x0F,
	kAnti = 0x10,
	kName = 0x11,
	kCTime = 0x12,
	kATime = 0x13,
	kMTime = 0x14,
	kWinAttributes = 0x15,
	kComment = 0x16,
	kEncodedHeader = 0x17,
	kStartPos = 0x18,
	kDummy = 0x19,
};
879 
880 
/* Codec ids of the encryption methods recognised by this parser */
#define _7Z_CRYPTO_MAIN_ZIP			0x06F10101 /* Main Zip crypto algo */
#define _7Z_CRYPTO_RAR_29			0x06F10303 /* Rar29 AES-128 + (modified SHA-1) */
#define _7Z_CRYPTO_AES_256_SHA_256	0x06F10701 /* AES-256 + SHA-256 */

/* True if codec_id is equal to one of the encryption codec ids above */
#define IS_SZ_ENCRYPTED(codec_id) (((codec_id) == _7Z_CRYPTO_MAIN_ZIP) || \
	((codec_id) == _7Z_CRYPTO_RAR_29) || \
	((codec_id) == _7Z_CRYPTO_AES_256_SHA_256))
888 
889 static const guchar *
rspamd_7zip_read_bits(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch,guint nbits,guint * pbits_set)890 rspamd_7zip_read_bits (struct rspamd_task *task,
891 		const guchar *p, const guchar *end,
892 		struct rspamd_archive *arch, guint nbits,
893 		guint *pbits_set)
894 {
895 	unsigned mask = 0, avail = 0, i;
896 	gboolean bit_set = 0;
897 
898 	for (i = 0; i < nbits; i++) {
899 		if (mask == 0) {
900 			avail = *p;
901 			SZ_SKIP_BYTES(1);
902 			mask = 0x80;
903 		}
904 
905 		bit_set = (avail & mask) ? 1 : 0;
906 
907 		if (bit_set && pbits_set) {
908 			(*pbits_set) ++;
909 		}
910 
911 		mask >>= 1;
912 	}
913 
914 	return p;
915 }
916 
917 static const guchar *
rspamd_7zip_read_digest(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch,guint64 num_streams,guint * pdigest_read)918 rspamd_7zip_read_digest (struct rspamd_task *task,
919 		const guchar *p, const guchar *end,
920 		struct rspamd_archive *arch,
921 		guint64 num_streams,
922 		guint *pdigest_read)
923 {
924 	guchar all_defined = *p;
925 	guint64 i;
926 	guint num_defined = 0;
927 	/*
928 	 * BYTE AllAreDefined
929 	 *  if (AllAreDefined == 0)
930 	 *  {
931 	 *    for(NumStreams)
932 	 *    BIT Defined
933 	 *  }
934 	 *  UINT32 CRCs[NumDefined]
935 	 */
936 	SZ_SKIP_BYTES(1);
937 
938 	if (all_defined) {
939 		num_defined = num_streams;
940 	}
941 	else {
942 		if (num_streams > 8192) {
943 			/* Gah */
944 			return NULL;
945 		}
946 
947 		p = rspamd_7zip_read_bits (task, p, end, arch, num_streams, &num_defined);
948 
949 		if (p == NULL) {
950 			return NULL;
951 		}
952 	}
953 
954 	for (i = 0; i < num_defined; i ++) {
955 		SZ_SKIP_BYTES(sizeof(guint32));
956 	}
957 
958 	if (pdigest_read) {
959 		*pdigest_read = num_defined;
960 	}
961 
962 	return p;
963 }
964 
965 static const guchar *
rspamd_7zip_read_pack_info(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch)966 rspamd_7zip_read_pack_info (struct rspamd_task *task,
967 		const guchar *p, const guchar *end,
968 		struct rspamd_archive *arch)
969 {
970 	guint64 pack_pos = 0, pack_streams = 0, i, cur_sz;
971 	guint num_digests = 0;
972 	guchar t;
973 	/*
974 	 *  UINT64 PackPos
975 	 *  UINT64 NumPackStreams
976 	 *
977 	 *  []
978 	 *  BYTE NID::kSize    (0x09)
979 	 *  UINT64 PackSizes[NumPackStreams]
980 	 *  []
981 	 *
982 	 *  []
983 	 *  BYTE NID::kCRC      (0x0A)
984 	 *  PackStreamDigests[NumPackStreams]
985 	 *  []
986 	 *  BYTE NID::kEnd
987 	 */
988 
989 	SZ_READ_VINT(pack_pos);
990 	SZ_READ_VINT(pack_streams);
991 
992 	while (p != NULL && p < end) {
993 		t = *p;
994 		SZ_SKIP_BYTES(1);
995 		msg_debug_archive ("7zip: read pack info %xc", t);
996 
997 		switch (t) {
998 		case kSize:
999 			/* We need to skip pack_streams VINTS */
1000 			for (i = 0; i < pack_streams; i++) {
1001 				SZ_READ_VINT(cur_sz);
1002 			}
1003 			break;
1004 		case kCRC:
1005 			/* CRCs are more complicated */
1006 			p = rspamd_7zip_read_digest (task, p, end, arch, pack_streams,
1007 					&num_digests);
1008 			break;
1009 		case kEnd:
1010 			goto end;
1011 			break;
1012 		default:
1013 			p = NULL;
1014 			msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
1015 			goto end;
1016 			break;
1017 		}
1018 	}
1019 
1020 end:
1021 
1022 	return p;
1023 }
1024 
1025 static const guchar *
rspamd_7zip_read_folder(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch,guint * pnstreams,guint * ndigests)1026 rspamd_7zip_read_folder (struct rspamd_task *task,
1027 		const guchar *p, const guchar *end,
1028 		struct rspamd_archive *arch, guint *pnstreams, guint *ndigests)
1029 {
1030 	guint64 ncoders = 0, i, j, noutstreams = 0, ninstreams = 0;
1031 
1032 	SZ_READ_VINT (ncoders);
1033 
1034 	for (i = 0; i < ncoders && p != NULL && p < end; i ++) {
1035 		guint64 sz, tmp;
1036 		guchar t;
1037 		/*
1038 		 * BYTE
1039 		 * {
1040 		 *   0:3 CodecIdSize
1041 		 *   4:  Is Complex Coder
1042 		 *   5:  There Are Attributes
1043 		 *   6:  Reserved
1044 		 *   7:  There are more alternative methods. (Not used anymore, must be 0).
1045 		 * }
1046 		 * BYTE CodecId[CodecIdSize]
1047 		 * if (Is Complex Coder)
1048 		 * {
1049 		 *   UINT64 NumInStreams;
1050 		 *   UINT64 NumOutStreams;
1051 		 * }
1052 		 * if (There Are Attributes)
1053 		 * {
1054 		 *   UINT64 PropertiesSize
1055 		 *   BYTE Properties[PropertiesSize]
1056 		 * }
1057 		 */
1058 		t = *p;
1059 		SZ_SKIP_BYTES (1);
1060 		sz = t & 0xF;
1061 		/* Codec ID */
1062 		tmp = 0;
1063 		for (j = 0; j < sz; j++) {
1064 			tmp <<= 8;
1065 			tmp += p[j];
1066 		}
1067 
1068 		msg_debug_archive ("7zip: read codec id: %L", tmp);
1069 
1070 		if (IS_SZ_ENCRYPTED (tmp)) {
1071 			arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
1072 		}
1073 
1074 		SZ_SKIP_BYTES (sz);
1075 
1076 		if (t & (1u << 4)) {
1077 			/* Complex */
1078 			SZ_READ_VINT (tmp); /* InStreams */
1079 			ninstreams += tmp;
1080 			SZ_READ_VINT (tmp); /* OutStreams */
1081 			noutstreams += tmp;
1082 		}
1083 		else {
1084 			/* XXX: is it correct ? */
1085 			noutstreams ++;
1086 			ninstreams ++;
1087 		}
1088 		if (t & (1u << 5)) {
1089 			/* Attributes ... */
1090 			SZ_READ_VINT (tmp); /* Size of attrs */
1091 			SZ_SKIP_BYTES (tmp);
1092 		}
1093 	}
1094 
1095 	if (noutstreams > 1) {
1096 		/* BindPairs, WTF, huh */
1097 		for (i = 0; i < noutstreams - 1; i ++) {
1098 			guint64 tmp;
1099 
1100 			SZ_READ_VINT (tmp);
1101 			SZ_READ_VINT (tmp);
1102 		}
1103 	}
1104 
1105 	gint64 npacked = (gint64)ninstreams - (gint64)noutstreams + 1;
1106 	msg_debug_archive ("7zip: instreams=%L, outstreams=%L, packed=%L",
1107 			ninstreams, noutstreams, npacked);
1108 
1109 	if (npacked > 1) {
1110 		/* Gah... */
1111 		for (i = 0; i < npacked; i ++) {
1112 			guint64 tmp;
1113 
1114 			SZ_READ_VINT (tmp);
1115 		}
1116 	}
1117 
1118 	*pnstreams = noutstreams;
1119 	(*ndigests) += npacked;
1120 
1121 	return p;
1122 }
1123 
1124 static const guchar *
rspamd_7zip_read_coders_info(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch,guint * pnum_folders,guint * pnum_nodigest)1125 rspamd_7zip_read_coders_info (struct rspamd_task *task,
1126 		const guchar *p, const guchar *end,
1127 		struct rspamd_archive *arch,
1128 		guint *pnum_folders, guint *pnum_nodigest)
1129 {
1130 	guint64 num_folders = 0, i, tmp;
1131 	guchar t;
1132 	guint *folder_nstreams = NULL, num_digests = 0, digests_read = 0;
1133 
1134 	while (p != NULL && p < end) {
1135 		/*
1136 		 * BYTE NID::kFolder  (0x0B)
1137 		 *  UINT64 NumFolders
1138 		 *  BYTE External
1139 		 *  switch(External)
1140 		 *  {
1141 		 * 	case 0:
1142 		 * 	  Folders[NumFolders]
1143 		 * 	case 1:
1144 		 * 	  UINT64 DataStreamIndex
1145 		 *   }
1146 		 *   BYTE ID::kCodersUnPackSize  (0x0C)
1147 		 *   for(Folders)
1148 		 * 	for(Folder.NumOutStreams)
1149 		 * 	 UINT64 UnPackSize;
1150 		 *   []
1151 		 *   BYTE NID::kCRC   (0x0A)
1152 		 *   UnPackDigests[NumFolders]
1153 		 *   []
1154 		 *   BYTE NID::kEnd
1155 		 */
1156 
1157 		t = *p;
1158 		SZ_SKIP_BYTES(1);
1159 		msg_debug_archive ("7zip: read coders info %xc", t);
1160 
1161 		switch (t) {
1162 		case kFolder:
1163 			SZ_READ_VINT (num_folders);
1164 			msg_debug_archive ("7zip: nfolders=%L", num_folders);
1165 
1166 			if (*p != 0) {
1167 				/* External folders */
1168 				SZ_SKIP_BYTES(1);
1169 				SZ_READ_VINT (tmp);
1170 			}
1171 			else {
1172 				SZ_SKIP_BYTES(1);
1173 
1174 				if (num_folders > 8192) {
1175 					/* Gah */
1176 					return NULL;
1177 				}
1178 
1179 				if (folder_nstreams) {
1180 					g_free (folder_nstreams);
1181 				}
1182 
1183 				folder_nstreams = g_malloc (sizeof (int) * num_folders);
1184 
1185 				for (i = 0; i < num_folders && p != NULL && p < end; i++) {
1186 					p = rspamd_7zip_read_folder (task, p, end, arch,
1187 							&folder_nstreams[i], &num_digests);
1188 				}
1189 			}
1190 			break;
1191 		case kCodersUnPackSize:
1192 			for (i = 0; i < num_folders && p != NULL && p < end; i++) {
1193 				if (folder_nstreams) {
1194 					for (guint j = 0; j < folder_nstreams[i]; j++) {
1195 						SZ_READ_VINT (tmp); /* Unpacked size */
1196 						msg_debug_archive ("7zip: unpacked size "
1197 										   "(folder=%d, stream=%d) = %L",
1198 								(gint)i, j, tmp);
1199 					}
1200 				}
1201 				else {
1202 					msg_err_task ("internal 7zip error");
1203 				}
1204 			}
1205 			break;
1206 		case kCRC:
1207 			/*
1208 			 * Here are dragons. Spec tells that here there could be up
1209 			 * to nfolders digests. However, according to the actual source
1210 			 * code, in case of multiple out streams there should be digests
1211 			 * for all out streams.
1212 			 *
1213 			 * In the real life (tm) it is even more idiotic: all these digests
1214 			 * are in another section! But that section needs number of digests
1215 			 * that are absent here. It is the most stupid thing I've ever seen
1216 			 * in any file format.
1217 			 *
1218 			 * I hope there *WAS* some reason to do such shit...
1219 			 */
1220 			p = rspamd_7zip_read_digest (task, p, end, arch, num_digests,
1221 					&digests_read);
1222 			break;
1223 		case kEnd:
1224 			goto end;
1225 			break;
1226 		default:
1227 			p = NULL;
1228 			msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
1229 			goto end;
1230 			break;
1231 		}
1232 	}
1233 
1234 end:
1235 
1236 	if (pnum_nodigest) {
1237 		*pnum_nodigest = num_digests - digests_read;
1238 	}
1239 	if (pnum_folders) {
1240 		*pnum_folders = num_folders;
1241 	}
1242 
1243 	if (folder_nstreams) {
1244 		g_free (folder_nstreams);
1245 	}
1246 
1247 	return p;
1248 }
1249 
1250 static const guchar *
rspamd_7zip_read_substreams_info(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch,guint num_folders,guint num_nodigest)1251 rspamd_7zip_read_substreams_info (struct rspamd_task *task,
1252 		const guchar *p, const guchar *end,
1253 		struct rspamd_archive *arch,
1254 		guint num_folders, guint num_nodigest)
1255 {
1256 	guchar t;
1257 	guint i;
1258 	guint64 *folder_nstreams;
1259 
1260 	if (num_folders > 8192) {
1261 		/* Gah */
1262 		return NULL;
1263 	}
1264 
1265 	folder_nstreams = g_alloca (sizeof (guint64) * num_folders);
1266 	memset (folder_nstreams, 0, sizeof (guint64) * num_folders);
1267 
1268 	while (p != NULL && p < end) {
1269 		/*
1270 		 * []
1271 		 *  BYTE NID::kNumUnPackStream; (0x0D)
1272 		 *  UINT64 NumUnPackStreamsInFolders[NumFolders];
1273 		 *  []
1274 		 *
1275 		 *  []
1276 		 *  BYTE NID::kSize  (0x09)
1277 		 *  UINT64 UnPackSizes[??]
1278 		 *  []
1279 		 *
1280 		 *
1281 		 *  []
1282 		 *  BYTE NID::kCRC  (0x0A)
1283 		 *  Digests[Number of streams with unknown CRC]
1284 		 *  []
1285 
1286 		 */
1287 		t = *p;
1288 		SZ_SKIP_BYTES(1);
1289 
1290 		msg_debug_archive ("7zip: read substream info %xc", t);
1291 
1292 		switch (t) {
1293 		case kNumUnPackStream:
1294 			for (i = 0; i < num_folders; i ++) {
1295 				guint64 tmp;
1296 
1297 				SZ_READ_VINT (tmp);
1298 				folder_nstreams[i] = tmp;
1299 			}
1300 			break;
1301 		case kCRC:
1302 			/*
1303 			 * Read the comment in the rspamd_7zip_read_coders_info
1304 			 */
1305 			p = rspamd_7zip_read_digest (task, p, end, arch, num_nodigest,
1306 					NULL);
1307 			break;
1308 		case kSize:
1309 			/*
1310 			 * Another brain damaged logic, but we have to support it
1311 			 * as there are no ways to proceed without it.
1312 			 * In fact, it is just absent in the real life...
1313 			 */
1314 			for (i = 0; i < num_folders; i ++) {
1315 				for (guint j = 0; j < folder_nstreams[i]; j++) {
1316 					guint64 tmp;
1317 
1318 					SZ_READ_VINT (tmp); /* Who cares indeed */
1319 				}
1320 			}
1321 			break;
1322 		case kEnd:
1323 			goto end;
1324 			break;
1325 		default:
1326 			p = NULL;
1327 			msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
1328 			goto end;
1329 			break;
1330 		}
1331 	}
1332 
1333 end:
1334 	return p;
1335 }
1336 
1337 static const guchar *
rspamd_7zip_read_main_streams_info(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch)1338 rspamd_7zip_read_main_streams_info (struct rspamd_task *task,
1339 		const guchar *p, const guchar *end,
1340 		struct rspamd_archive *arch)
1341 {
1342 	guchar t;
1343 	guint num_folders = 0, unknown_digests = 0;
1344 
1345 	while (p != NULL && p < end) {
1346 		t = *p;
1347 		SZ_SKIP_BYTES(1);
1348 		msg_debug_archive ("7zip: read main streams info %xc", t);
1349 
1350 		/*
1351 		 *
1352 		 *  []
1353 		 *  PackInfo
1354 		 *  []
1355 
1356 		 *  []
1357 		 *  CodersInfo
1358 		 *  []
1359 		 *
1360 		 *  []
1361 		 *  SubStreamsInfo
1362 		 *  []
1363 		 *
1364 		 *  BYTE NID::kEnd
1365 		 */
1366 		switch (t) {
1367 		case kPackInfo:
1368 			p = rspamd_7zip_read_pack_info (task, p, end, arch);
1369 			break;
1370 		case kUnPackInfo:
1371 			p = rspamd_7zip_read_coders_info (task, p, end, arch, &num_folders,
1372 					&unknown_digests);
1373 			break;
1374 		case kSubStreamsInfo:
1375 			p = rspamd_7zip_read_substreams_info (task, p, end, arch, num_folders,
1376 					unknown_digests);
1377 			break;
1378 			break;
1379 		case kEnd:
1380 			goto end;
1381 			break;
1382 		default:
1383 			p = NULL;
1384 			msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
1385 			goto end;
1386 			break;
1387 		}
1388 	}
1389 
1390 end:
1391 	return p;
1392 }
1393 
1394 static const guchar *
rspamd_7zip_read_archive_props(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch)1395 rspamd_7zip_read_archive_props (struct rspamd_task *task,
1396 		const guchar *p, const guchar *end,
1397 		struct rspamd_archive *arch)
1398 {
1399 	guchar proptype;
1400 	guint64 proplen;
1401 
1402 	/*
1403 	 * for (;;)
1404 	 * {
1405 	 *   BYTE PropertyType;
1406 	 *   if (aType == 0)
1407 	 *     break;
1408 	 *   UINT64 PropertySize;
1409 	 *   BYTE PropertyData[PropertySize];
1410 	 * }
1411 	 */
1412 
1413 	if (p != NULL) {
1414 		proptype = *p;
1415 		SZ_SKIP_BYTES(1);
1416 
1417 		while (proptype != 0) {
1418 			SZ_READ_VINT(proplen);
1419 
1420 			if (p + proplen < end) {
1421 				p += proplen;
1422 			}
1423 			else {
1424 				return NULL;
1425 			}
1426 
1427 			proptype = *p;
1428 			SZ_SKIP_BYTES(1);
1429 		}
1430 	}
1431 
1432 	return p;
1433 }
1434 
1435 static GString *
rspamd_7zip_ucs2_to_utf8(struct rspamd_task * task,const guchar * p,const guchar * end)1436 rspamd_7zip_ucs2_to_utf8 (struct rspamd_task *task, const guchar *p,
1437 		const guchar *end)
1438 {
1439 	GString *res;
1440 	goffset dest_pos = 0, src_pos = 0;
1441 	const gsize len = (end - p) / sizeof (guint16);
1442 	guint16 *up;
1443 	UChar32 wc;
1444 	UBool is_error = 0;
1445 
1446 	res = g_string_sized_new ((end - p) * 3 / 2 + sizeof (wc) + 1);
1447 	up = (guint16 *)p;
1448 
1449 	while (src_pos < len) {
1450 		U16_NEXT (up, src_pos, len, wc);
1451 
1452 		if (wc > 0) {
1453 			U8_APPEND (res->str, dest_pos,
1454 					res->allocated_len - 1,
1455 					wc, is_error);
1456 		}
1457 
1458 		if (is_error) {
1459 			g_string_free (res, TRUE);
1460 
1461 			return NULL;
1462 		}
1463 	}
1464 
1465 	g_assert (dest_pos < res->allocated_len);
1466 
1467 	res->len = dest_pos;
1468 	res->str[dest_pos] = '\0';
1469 
1470 	return res;
1471 }
1472 
1473 static const guchar *
rspamd_7zip_read_files_info(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch)1474 rspamd_7zip_read_files_info (struct rspamd_task *task,
1475 		const guchar *p, const guchar *end,
1476 		struct rspamd_archive *arch)
1477 {
1478 	guint64 nfiles = 0, sz, i;
1479 	guchar t, b;
1480 	struct rspamd_archive_file *fentry;
1481 
1482 	SZ_READ_VINT (nfiles);
1483 
1484 	for (;p != NULL && p < end;) {
1485 		t = *p;
1486 		SZ_SKIP_BYTES (1);
1487 
1488 		msg_debug_archive ("7zip: read file data type %xc", t);
1489 
1490 		if (t == kEnd) {
1491 			goto end;
1492 		}
1493 
1494 		/* This is SO SPECIAL, gah */
1495 		SZ_READ_VINT (sz);
1496 
1497 		switch (t) {
1498 		case kEmptyStream:
1499 		case kEmptyFile:
1500 		case kAnti: /* AntiFile, OMFG */
1501 			/* We don't care about these bits */
1502 		case kCTime:
1503 		case kATime:
1504 		case kMTime:
1505 			/* We don't care of these guys, but we still have to parse them, gah */
1506 			if (sz > 0) {
1507 				SZ_SKIP_BYTES (sz);
1508 			}
1509 			break;
1510 		case kName:
1511 			/* The most useful part in this whole bloody format */
1512 			b = *p; /* External flag */
1513 			SZ_SKIP_BYTES (1);
1514 
1515 			if (b) {
1516 				/* TODO: for the god sake, do something about external
1517 				 * filenames...
1518 				 */
1519 				guint64 tmp;
1520 
1521 				SZ_READ_VINT (tmp);
1522 			}
1523 			else {
1524 				for (i = 0; i < nfiles; i ++) {
1525 					/* Zero terminated wchar_t: happy converting... */
1526 					/* First, find terminator */
1527 					const guchar *fend = NULL, *tp = p;
1528 					GString *res;
1529 
1530 					while (tp < end - 1) {
1531 						if (*tp == 0 && *(tp + 1) == 0) {
1532 							fend = tp;
1533 							break;
1534 						}
1535 
1536 						tp += 2;
1537 					}
1538 
1539 					if (fend == NULL || fend - p == 0) {
1540 						/* Crap instead of fname */
1541 						msg_debug_archive ("bad 7zip name; %s", G_STRLOC);
1542 						goto end;
1543 					}
1544 
1545 					res = rspamd_7zip_ucs2_to_utf8 (task, p, fend);
1546 
1547 					if (res != NULL) {
1548 						fentry = g_malloc0 (sizeof (*fentry));
1549 						fentry->fname = res;
1550 						g_ptr_array_add (arch->files, fentry);
1551 						msg_debug_archive ("7zip: found file %v", res);
1552 					}
1553 					else {
1554 						msg_debug_archive ("bad 7zip name; %s", G_STRLOC);
1555 					}
1556 					/* Skip zero terminating character */
1557 					p = fend + 2;
1558 				}
1559 			}
1560 			break;
1561 		case kDummy:
1562 		case kWinAttributes:
1563 			if (sz > 0) {
1564 				SZ_SKIP_BYTES (sz);
1565 			}
1566 			break;
1567 		default:
1568 			p = NULL;
1569 			msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
1570 			goto end;
1571 			break;
1572 		}
1573 	}
1574 
1575 end:
1576 	return p;
1577 }
1578 
1579 static const guchar *
rspamd_7zip_read_next_section(struct rspamd_task * task,const guchar * p,const guchar * end,struct rspamd_archive * arch)1580 rspamd_7zip_read_next_section (struct rspamd_task *task,
1581 		const guchar *p, const guchar *end,
1582 		struct rspamd_archive *arch)
1583 {
1584 	guchar t = *p;
1585 
1586 	SZ_SKIP_BYTES(1);
1587 
1588 	msg_debug_archive ("7zip: read section %xc", t);
1589 
1590 	switch (t) {
1591 	case kHeader:
1592 		/* We just skip byte and go further */
1593 		break;
1594 	case kEncodedHeader:
1595 		/*
1596 		 * In fact, headers are just packed, but we assume it as
1597 		 * encrypted to distinguish from the normal archives
1598 		 */
1599 		msg_debug_archive ("7zip: encoded header, needs to be uncompressed");
1600 		arch->flags |= RSPAMD_ARCHIVE_CANNOT_READ;
1601 		p = NULL; /* Cannot get anything useful */
1602 		break;
1603 	case kArchiveProperties:
1604 		p = rspamd_7zip_read_archive_props (task, p, end, arch);
1605 		break;
1606 	case kMainStreamsInfo:
1607 		p = rspamd_7zip_read_main_streams_info (task, p, end, arch);
1608 		break;
1609 	case kAdditionalStreamsInfo:
1610 		p = rspamd_7zip_read_main_streams_info (task, p, end, arch);
1611 		break;
1612 	case kFilesInfo:
1613 		p = rspamd_7zip_read_files_info (task, p, end, arch);
1614 		break;
1615 	case kEnd:
1616 		p = NULL;
1617 		msg_debug_archive ("7zip: read final section");
1618 		break;
1619 	default:
1620 		p = NULL;
1621 		msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
1622 		break;
1623 	}
1624 
1625 	return p;
1626 }
1627 
1628 static void
rspamd_archive_process_7zip(struct rspamd_task * task,struct rspamd_mime_part * part)1629 rspamd_archive_process_7zip (struct rspamd_task *task,
1630 		struct rspamd_mime_part *part)
1631 {
1632 	struct rspamd_archive *arch;
1633 	const guchar *start, *p, *end;
1634 	const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
1635 	guint64 section_offset = 0, section_length = 0;
1636 
1637 	start = part->parsed_data.begin;
1638 	p = start;
1639 	end = p + part->parsed_data.len;
1640 
1641 	if (end - p <= sizeof (guint64) + sizeof (guint32) ||
1642 			memcmp (p, sz_magic, sizeof (sz_magic)) != 0) {
1643 		msg_debug_archive ("7z archive is invalid (no 7z magic)");
1644 
1645 		return;
1646 	}
1647 
1648 	arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
1649 	arch->files = g_ptr_array_new ();
1650 	arch->type = RSPAMD_ARCHIVE_7ZIP;
1651 	rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
1652 			arch);
1653 
1654 	/* Magic (6 bytes) + version (2 bytes) + crc32 (4 bytes) */
1655 	p += sizeof (guint64) + sizeof (guint32);
1656 
1657 	SZ_READ_UINT64(section_offset);
1658 	SZ_READ_UINT64(section_length);
1659 
1660 	if (end - p > sizeof (guint32)) {
1661 		p += sizeof (guint32);
1662 	}
1663 	else {
1664 		msg_debug_archive ("7z archive is invalid (truncated crc)");
1665 
1666 		return;
1667 	}
1668 
1669 	if (end - p > section_offset) {
1670 		p += section_offset;
1671 	}
1672 	else {
1673 		msg_debug_archive ("7z archive is invalid (incorrect section offset)");
1674 
1675 		return;
1676 	}
1677 
1678 	while ((p = rspamd_7zip_read_next_section (task, p, end, arch)) != NULL);
1679 
1680 	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
1681 	part->specific.arch = arch;
1682 	if (part->cd != NULL) {
1683 		arch->archive_name = &part->cd->filename;
1684 	}
1685 	arch->size = part->parsed_data.len;
1686 }
1687 
1688 static void
rspamd_archive_process_gzip(struct rspamd_task * task,struct rspamd_mime_part * part)1689 rspamd_archive_process_gzip (struct rspamd_task *task,
1690 							 struct rspamd_mime_part *part) {
1691 	struct rspamd_archive *arch;
1692 	const guchar *start, *p, *end;
1693 	const guchar gz_magic[] = {0x1F, 0x8B};
1694 	guchar flags;
1695 
1696 	start = part->parsed_data.begin;
1697 	p = start;
1698 	end = p + part->parsed_data.len;
1699 
1700 	if (end - p <= 10 || memcmp (p, gz_magic, sizeof (gz_magic)) != 0) {
1701 		msg_debug_archive ("gzip archive is invalid (no gzip magic)");
1702 
1703 		return;
1704 	}
1705 
1706 	arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
1707 	arch->files = g_ptr_array_sized_new (1);
1708 	arch->type = RSPAMD_ARCHIVE_GZIP;
1709 	rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
1710 			arch);
1711 
1712 	flags = p[3];
1713 
1714 	if (flags & (1u << 5)) {
1715 		arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
1716 	}
1717 
1718 	if (flags & (1u << 3)) {
1719 		/* We have file name presented in archive, try to use it */
1720 		if (flags & (1u << 1)) {
1721 			/* Multipart */
1722 			p += 12;
1723 		}
1724 		else {
1725 			p += 10;
1726 		}
1727 
1728 		if (flags & (1u << 2)) {
1729 			/* Optional section */
1730 			guint16 optlen = 0;
1731 
1732 			RAR_READ_UINT16 (optlen);
1733 
1734 			if (end <= p + optlen) {
1735 				msg_debug_archive ("gzip archive is invalid, bad extra length: %d",
1736 						(int)optlen);
1737 
1738 				return;
1739 			}
1740 
1741 			p += optlen;
1742 		}
1743 
1744 		/* Read file name */
1745 		const guchar *fname_start = p;
1746 
1747 		while (p < end) {
1748 			if (*p == '\0') {
1749 				if (p > fname_start) {
1750 					struct rspamd_archive_file *f;
1751 
1752 					f = g_malloc0 (sizeof (*f));
1753 					f->fname = rspamd_archive_file_try_utf (task, fname_start,
1754 							p - fname_start);
1755 
1756 					if (f->fname) {
1757 						g_ptr_array_add (arch->files, f);
1758 					}
1759 					else {
1760 						/* Invalid filename, skip */
1761 						g_free (f);
1762 					}
1763 
1764 					goto set;
1765 				}
1766 			}
1767 
1768 			p ++;
1769 		}
1770 
1771 		/* Wrong filename, not zero terminated */
1772 		msg_debug_archive ("gzip archive is invalid, bad filename at pos %d",
1773 				(int)(p - start));
1774 
1775 		return;
1776 	}
1777 
1778 	/* Fallback, we need to extract file name from archive name if possible */
1779 	if (part->cd && part->cd->filename.len > 0) {
1780 		const gchar *dot_pos, *slash_pos;
1781 
1782 		dot_pos = rspamd_memrchr (part->cd->filename.begin, '.',
1783 				part->cd->filename.len);
1784 
1785 		if (dot_pos) {
1786 			struct rspamd_archive_file *f;
1787 
1788 			slash_pos = rspamd_memrchr (part->cd->filename.begin, '/',
1789 					part->cd->filename.len);
1790 
1791 			if (slash_pos && slash_pos < dot_pos) {
1792 				f = g_malloc0 (sizeof (*f));
1793 				f->fname = g_string_sized_new (dot_pos - slash_pos);
1794 				g_string_append_len (f->fname, slash_pos + 1,
1795 						dot_pos - slash_pos - 1);
1796 
1797 				msg_debug_archive ("fallback to gzip filename based on cd: %v",
1798 						f->fname);
1799 
1800 				g_ptr_array_add (arch->files, f);
1801 
1802 				goto set;
1803 			}
1804 			else {
1805 				const gchar *fname_start = part->cd->filename.begin;
1806 
1807 				f = g_malloc0 (sizeof (*f));
1808 
1809 				if (memchr (fname_start, '.', part->cd->filename.len) != dot_pos) {
1810 					/* Double dots, something like foo.exe.gz */
1811 					f->fname = g_string_sized_new (dot_pos - fname_start);
1812 					g_string_append_len (f->fname, fname_start,
1813 							dot_pos - fname_start);
1814 				}
1815 				else {
1816 					/* Single dot, something like foo.gzz */
1817 					f->fname = g_string_sized_new (part->cd->filename.len);
1818 					g_string_append_len (f->fname, fname_start,
1819 							part->cd->filename.len);
1820 				}
1821 
1822 				msg_debug_archive ("fallback to gzip filename based on cd: %v",
1823 						f->fname);
1824 
1825 				g_ptr_array_add (arch->files, f);
1826 
1827 				goto set;
1828 			}
1829 		}
1830 	}
1831 
1832 	return;
1833 
1834 set:
1835 	/* Set archive data */
1836 	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
1837 	part->specific.arch = arch;
1838 
1839 	if (part->cd) {
1840 		arch->archive_name = &part->cd->filename;
1841 	}
1842 
1843 	arch->size = part->parsed_data.len;
1844 }
1845 
1846 static gboolean
rspamd_archive_cheat_detect(struct rspamd_mime_part * part,const gchar * str,const guchar * magic_start,gsize magic_len)1847 rspamd_archive_cheat_detect (struct rspamd_mime_part *part, const gchar *str,
1848 		const guchar *magic_start, gsize magic_len)
1849 {
1850 	struct rspamd_content_type *ct;
1851 	const gchar *p;
1852 	rspamd_ftok_t srch, *fname;
1853 
1854 	ct = part->ct;
1855 	RSPAMD_FTOK_ASSIGN (&srch, "application");
1856 
1857 	if (ct && ct->type.len && ct->subtype.len > 0 && rspamd_ftok_cmp (&ct->type,
1858 			&srch) == 0) {
1859 		if (rspamd_substring_search_caseless (ct->subtype.begin, ct->subtype.len,
1860 				str, strlen (str)) != -1) {
1861 			/* We still need to check magic, see #1848 */
1862 			if (magic_start != NULL) {
1863 				if (part->parsed_data.len > magic_len &&
1864 						memcmp (part->parsed_data.begin,
1865 								magic_start, magic_len) == 0) {
1866 					return TRUE;
1867 				}
1868 				/* No magic, refuse this type of archive */
1869 				return FALSE;
1870 			}
1871 			else {
1872 				return TRUE;
1873 			}
1874 		}
1875 	}
1876 
1877 	if (part->cd) {
1878 		fname = &part->cd->filename;
1879 
1880 		if (fname && fname->len > strlen (str)) {
1881 			p = fname->begin + fname->len - strlen (str);
1882 
1883 			if (rspamd_lc_cmp (p, str, strlen (str)) == 0) {
1884 				if (*(p - 1) == '.') {
1885 					if (magic_start != NULL) {
1886 						if (part->parsed_data.len > magic_len &&
1887 								memcmp (part->parsed_data.begin,
1888 										magic_start, magic_len) == 0) {
1889 							return TRUE;
1890 						}
1891 						/* No magic, refuse this type of archive */
1892 						return FALSE;
1893 					}
1894 
1895 					return TRUE;
1896 				}
1897 			}
1898 		}
1899 
1900 		if (magic_start != NULL) {
1901 			if (part->parsed_data.len > magic_len &&
1902 				memcmp (part->parsed_data.begin, magic_start, magic_len) == 0) {
1903 				return TRUE;
1904 			}
1905 		}
1906 	}
1907 	else {
1908 		if (magic_start != NULL) {
1909 			if (part->parsed_data.len > magic_len &&
1910 				memcmp (part->parsed_data.begin, magic_start, magic_len) == 0) {
1911 				return TRUE;
1912 			}
1913 		}
1914 	}
1915 
1916 	return FALSE;
1917 }
1918 
1919 void
rspamd_archives_process(struct rspamd_task * task)1920 rspamd_archives_process (struct rspamd_task *task)
1921 {
1922 	guint i;
1923 	struct rspamd_mime_part *part;
1924 	const guchar rar_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07};
1925 	const guchar zip_magic[] = {0x50, 0x4b, 0x03, 0x04};
1926 	const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
1927 	const guchar gz_magic[] = {0x1F, 0x8B, 0x08};
1928 
1929 	PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, part) {
1930 		if (part->part_type == RSPAMD_MIME_PART_UNDEFINED) {
1931 			if (part->parsed_data.len > 0) {
1932 				if (rspamd_archive_cheat_detect (part, "zip",
1933 						zip_magic, sizeof (zip_magic))) {
1934 					rspamd_archive_process_zip (task, part);
1935 				}
1936 				else if (rspamd_archive_cheat_detect (part, "rar",
1937 						rar_magic, sizeof (rar_magic))) {
1938 					rspamd_archive_process_rar (task, part);
1939 				}
1940 				else if (rspamd_archive_cheat_detect (part, "7z",
1941 						sz_magic, sizeof (sz_magic))) {
1942 					rspamd_archive_process_7zip (task, part);
1943 				}
1944 				else if (rspamd_archive_cheat_detect (part, "gz",
1945 						gz_magic, sizeof (gz_magic))) {
1946 					rspamd_archive_process_gzip (task, part);
1947 				}
1948 
1949 				if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) &&
1950 						part->part_type == RSPAMD_MIME_PART_ARCHIVE &&
1951 						part->specific.arch) {
1952 					struct rspamd_archive *arch = part->specific.arch;
1953 
1954 					msg_info_task ("found %s archive with incorrect content-type: %T/%T",
1955 							rspamd_archive_type_str (arch->type),
1956 							&part->ct->type, &part->ct->subtype);
1957 
1958 					if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
1959 						part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
1960 					}
1961 				}
1962 			}
1963 		}
1964 	}
1965 }
1966 
1967 
1968 const gchar *
rspamd_archive_type_str(enum rspamd_archive_type type)1969 rspamd_archive_type_str (enum rspamd_archive_type type)
1970 {
1971 	const gchar *ret = "unknown";
1972 
1973 	switch (type) {
1974 	case RSPAMD_ARCHIVE_ZIP:
1975 		ret = "zip";
1976 		break;
1977 	case RSPAMD_ARCHIVE_RAR:
1978 		ret = "rar";
1979 		break;
1980 	case RSPAMD_ARCHIVE_7ZIP:
1981 		ret = "7z";
1982 		break;
1983 	case RSPAMD_ARCHIVE_GZIP:
1984 		ret = "gz";
1985 		break;
1986 	}
1987 
1988 	return ret;
1989 }
1990