xref: /netbsd/usr.bin/mail/mime_header.c (revision 6550d01e)
1 /*	$NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2006 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Anon Ymous.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * This module contains the core MIME header decoding routines.
35  * Please refer to RFC 2047 and RFC 2822.
36  */
37 
38 #ifdef MIME_SUPPORT
39 
40 #include <sys/cdefs.h>
41 #ifndef __lint__
42 __RCSID("$NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $");
43 #endif /* not __lint__ */
44 
45 #include <assert.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 
50 #include "def.h"
51 #include "extern.h"
52 #include "mime.h"
53 #include "mime_header.h"
54 #include "mime_codecs.h"
55 
/*
 * Bounds-checked front end to mime_b64tobin().
 *
 * Every (possibly partial) group of 4 input characters is budgeted 3
 * output bytes.  If outbuf (outlen bytes) is smaller than that budget
 * nothing is decoded and -1 is returned; otherwise the result of
 * mime_b64tobin() is returned unchanged.
 *
 * XXX - This should move to mime_codecs.c.
 */
static ssize_t
mime_B64_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const size_t needed = 3 * roundup(inlen, 4) / 4;

	if (needed > outlen)
		return -1;
	return mime_b64tobin(outbuf, inbuf, inlen);
}
69 
70 
/*
 * Header specific "quoted-printable" decode!
 * Differences with body QP decoding (see rfc 2047, sec 4.2):
 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed).
 * 2) Spaces can be encoded as '_' in headers for readability.
 *
 * Decodes the inlen bytes at inbuf into outbuf (capacity outlen bytes).
 * Returns the number of bytes written, or -1 if the output buffer is
 * too small or an '=' is not followed by exactly two hex digits.  No
 * NUL terminator is written.
 *
 * XXX - This should move to mime_codecs.c.
 */

/*
 * Value of a single hexadecimal digit (either case), or -1 if c is not
 * a hexadecimal digit.
 */
static int
qph_hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

static ssize_t
mime_QPh_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const char *p, *inend;
	char *outend;
	char *q;

	outend = outbuf + outlen;
	inend = inbuf + inlen;
	q = outbuf;
	for (p = inbuf; p < inend; p++) {
		if (q >= outend)
			return -1;
		if (*p == '=') {
			int hi, lo;

			/* need the '=' plus two more characters */
			if (inend - p < 3)
				return -1;
			/*
			 * Validate each digit explicitly: strtol() would
			 * also accept leading white space or a sign
			 * (e.g. "= 5" or "=+5"), which are not valid here.
			 */
			hi = qph_hexval((unsigned char)p[1]);
			lo = qph_hexval((unsigned char)p[2]);
			if (hi < 0 || lo < 0)
				return -1;
			*q++ = (char)(hi * 16 + lo);
			p += 2;	/* the loop increment skips the second digit */
		}
		else if (*p == '_')  /* headers may encode ' ' as '_' */
			*q++ = ' ';
		else
			*q++ = *p;
	}
	return q - outbuf;
}
117 
/*
 * Copy the charset name that starts at p, and runs up to the next '?',
 * into from_cs (capacity from_cs_len bytes, NUL terminator included).
 *
 * Returns a pointer to the character just after the '?', or NULL if
 * the string ends before a '?' is seen or the name does not fit.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	char *dst = from_cs;

	while (*p != '?') {
		/* always keep one byte free for the terminator */
		if (*p == '\0' || dst >= from_cs + from_cs_len - 1)
			return NULL;
		*dst++ = *p++;
	}
	*dst = '\0';
	return p + 1;	/* step over the '?' that ended the name */
}
131 
132 /*
133  * An encoded word is a string of at most 75 non-white space
134  * characters of the following form:
135  *
136  *  =?charset?X?encoding?=
137  *
138  * where:
139  *   'charset'	is the original character set of the unencoded string.
140  *
141  *   'X'	is the encoding type 'B' or 'Q' for "base64" or
142  *              "quoted-printable", respectively,
143  *   'encoding'	is the encoded string.
144  *
145  * Both 'charset' and 'X' are case independent and 'encoding' cannot
146  * contain any whitespace or '?' characters.  The 'encoding' must also
147  * be fully contained within the encoded words, i.e., it cannot be
148  * split between encoded words.
149  *
150  * Note: the 'B' encoding is a slightly modified "quoted-printable"
151  * encoding.  In particular, spaces (' ') may be encoded as '_' to
152  * improve undecoded readability.
153  */
154 static int
155 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
156 {
157 	ssize_t declen;
158 	size_t enclen, dstlen;
159 	char decword[LINESIZE];
160 	char from_cs[LINESIZE];
161 	const char *encword, *iend, *p;
162 	char *dstend;
163 	char enctype;
164 
165 	p = *ibuf;
166 	if (p[0] != '=' && p[1] != '?')
167 		return -1;
168 	if (strlen(p) <  2 + 1 + 3 + 1 + 2)
169 		return -1;
170 	p = grab_charset(from_cs, sizeof(from_cs), p + 2);
171 	if (p == NULL)
172 		return -1;
173 	enctype = *p++;
174 	if (*p++ != '?')
175 		return -1;
176 	encword = p;
177 	p = strchr(p, '?');
178 	if (p == NULL || p[1] != '=')
179 		return -1;
180 	enclen = p - encword;	/* length of encoded substring */
181 	iend = p + 2;
182 	/* encoded words are at most 75 characters (RFC 2047, sec 2) */
183 	if (iend > *ibuf + 75)
184 		return -1;
185 
186 	if (oend < *obuf + 1) {
187 		assert(/*CONSTCOND*/ 0);	/* We have a coding error! */
188 		return -1;
189 	}
190 	dstend = to_cs ? decword : *obuf;
191 	dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1;
192 
193 	if (enctype == 'B' || enctype == 'b')
194 		declen = mime_B64_decode(dstend, dstlen, encword, enclen);
195 	else if (enctype == 'Q' || enctype == 'q')
196 		declen = mime_QPh_decode(dstend, dstlen, encword, enclen);
197 	else
198 		return -1;
199 
200 	if (declen == -1)
201 		return -1;
202 
203 	dstend += declen;
204 #ifdef CHARSET_SUPPORT
205 	if (to_cs != NULL) {
206 		iconv_t cd;
207 		const char *src;
208 		size_t srclen;
209 		size_t cnt;
210 
211 		cd = iconv_open(to_cs, from_cs);
212 		if (cd == (iconv_t)-1)
213 			return -1;
214 
215 		src = decword;
216 		srclen = declen;
217 		dstend = *obuf;
218 		dstlen = oend - *obuf - 1;
219 		cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);
220 
221 		(void)iconv_close(cd);
222 		if (cnt == (size_t)-1)
223 			return -1;
224 	}
225 #endif /* CHARSET_SUPPORT */
226 	*dstend = '\0';
227 	*ibuf = iend;
228 	*obuf = dstend;
229 	return 0;
230 }
231 
232 
233 /*
234  * Folding White Space.  See RFC 2822.
235  *
236  * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
237  * pairs (i.e., "\r\n") and never separately.  However, by the time
238  * mail(1) sees the messages, all CRLF pairs have been converted to
239  * '\n' characters.
240  *
241  * XXX - pull is_FWS() and skip_FWS() up to def.h?
242  */
/* Return 1 if c is a space, tab or newline, otherwise 0. */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
248 
/*
 * Return a pointer to the first character at or after p that is not
 * folding white space.  A NUL byte is not white space, so the scan
 * stops at the end of the string.
 */
static inline const char *
skip_FWS(const char *p)
{
	for (;;) {
		if (!is_FWS(*p))
			return p;
		p++;
	}
}
256 
/*
 * Flush a pending run of skipped white space to the output.
 *
 * If *src is not NULL, the bytes in [*src, srcend) are copied to *dst
 * (never writing at or beyond dstend), *dst is advanced past what was
 * copied and *src is reset to NULL to mark the run as consumed.
 * If *src is NULL nothing happens.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *from = *src;
	char *to;

	if (from == NULL)
		return;		/* nothing pending */

	for (to = *dst; from < srcend && to < dstend; from++, to++)
		*to = *from;

	*dst = to;
	*src = NULL;
}
275 
276 /*
277  * Decode an unstructured field.
278  *
279  * See RFC 2822 Sec 2.2.1 and 3.6.5.
280  * Encoded words may occur anywhere in unstructured fields provided
281  * they are separated from any other text or encoded words by at least
282  * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
283  * encoded words occur sequentially (separated by only FWS) then the
284  * separating FWS is removed.
285  *
286  * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
287  * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
288  * (or any non-whitespace character) immediately before an
289  * encoded-word will prevent it from being decoded.
290  *
291  * hstring should be a NULL terminated string.
292  * outbuf should be sufficiently large to hold the result.
293  */
294 static void
295 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
296 {
297 	const char *p, *p0;
298 	char *q, *qend;
299 	int lastc;
300 	const char *charset;
301 
302 	charset = value(ENAME_MIME_CHARSET);
303 	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */
304 	q = outbuf;
305 	p = hstring;
306 	p0 = NULL;
307 	lastc = (unsigned char)' ';
308 	while (*p && q < qend) {
309 		const char *p1;
310 		char *q1;
311 		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
312 		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
313 		    (*p1 == '\0' || is_FWS(*p1))) {
314 			p0 = p1;  /* pointer to first character after encoded word */
315 			q = q1;
316 			p = skip_FWS(p1);
317 			lastc = (unsigned char)*p0;
318 		}
319 		else {
320 			copy_skipped_FWS(&q, qend, &p0, p);
321 			lastc = (unsigned char)*p;
322 			if (q < qend)
323 				*q++ = *p++;
324 		}
325 	}
326 	copy_skipped_FWS(&q, qend, &p0, p);
327 	*q = '\0';
328 }
329 
/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * On entry *ibuf points just past the opening '(' (which the caller
 * has already copied) and *obuf at the next free output byte.  Input
 * is consumed up to iend and output is never written at or beyond
 * oend.  On return *ibuf and *obuf are advanced past what was consumed
 * and produced, including the closing ')' if one was found.
 *
 * Returns 0.  There is currently no failure path of its own; -1 is
 * only propagated from a nested call.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *p, *pend, *p0;
	char *q, *qend;
	int lastc;

	p = *ibuf;
	q = *obuf;
	pend = iend;
	qend = oend;
	lastc = ' ';
	p0 = NULL;	/* start of FWS skipped after an encoded word */
	while (p < pend && q < qend) {
		const char *p1;
		char *q1;

		/*
		 * Inside a comment an encoded word must follow white space
		 * and be followed by white space or the closing ')'.
		 */
		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
		    (*p1 == ')' || is_FWS(*p1))) {
			lastc = (unsigned char)*p1;
			p0 = p1;
			q = q1;
			p = skip_FWS(p1);
			/*
			 * XXX - this check should be unnecessary as *pend should
			 * be '\0' which will stop skip_FWS()
			 */
			if (p > pend)
				p = pend;
		}
		else {
			copy_skipped_FWS(&q, qend, &p0, p);
			if (q >= qend)	/* XXX - q > qend cannot happen */
				break;

			if (*p == ')') {
				*q++ = *p++;	/* copy the closing ')' */
				break;		/* and get out of here! */
			}

			if (*p == '(') {
				*q++ = *p++;	/* copy the opening '(' */
				if (decode_comment(&q, qend, &p, pend, charset) == -1)
					return -1;	/* is this right or should we update? */
				lastc = ')';
			}
			else if (*p == '\\' && p + 1 < pend) {	/* quoted-pair */
				/* keep the '\\' only where it is still needed */
				if (p[1] == '(' || p[1] == ')' || p[1] == '\\')
					*q++ = *p;
				p++;
				lastc = (unsigned char)*p;
				if (q < qend)
					*q++ = *p++;
			}
			else {
				lastc = (unsigned char)*p;
				*q++ = *p++;
			}
		}
	}
	/*
	 * If the input ran out right after an encoded word (unterminated
	 * comment), the white space skipped behind it is still pending.
	 * Emit it, as the field decoders do after their own loops.
	 */
	copy_skipped_FWS(&q, qend, &p0, p);
	*ibuf = p;
	*obuf = q;
	return 0;
}
403 
/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '"'.  Characters are
 * copied from [*ibuf, iend) to [*obuf, oend) up to and including the
 * closing '"'.  The '\\' of a quoted-pair is kept only in front of
 * '"' and '\\'; elsewhere it is dropped.  Both pointers are advanced
 * on return.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == '"') {
			*out++ = *in++;	/* closing quote ends the string */
			break;
		}
		if (c == '\\' && in + 1 < iend) {	/* quoted-pair */
			const char escaped = in[1];

			if (escaped == '"' || escaped == '\\') {
				*out++ = c;	/* the '\\' is still required */
				if (out >= oend)
					break;
			}
			in++;	/* step onto the escaped character */
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
439 
/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Characters are
 * copied from [*ibuf, iend) to [*obuf, oend) up to and including the
 * closing ']'.  The '\\' of a quoted-pair is kept only in front of
 * '[', ']' and '\\'; elsewhere it is dropped.  Both pointers are
 * advanced on return.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == ']') {
			*out++ = *in++;	/* closing bracket ends the literal */
			break;
		}
		if (c == '\\' && in + 1 < iend) {	/* quoted-pair */
			const char escaped = in[1];

			if (escaped == '[' || escaped == ']' || escaped == '\\') {
				*out++ = c;	/* the '\\' is still required */
				if (out >= oend)
					break;
			}
			in++;	/* step onto the escaped character */
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
475 
/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of
 *	( ) < > [ ] : ; @ \ , . "
 * and 0 for every other value, including anything outside 0..127.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '(': case ')':
	case '<': case '>':
	case '[': case ']':
	case ':': case ';':
	case '@': case '\\':
	case ',': case '.':
	case '"':
		return 1;
	default:
		return 0;
	}
}
495 
496 /*
497  * Decode a structured field.
498  *
499  * At the top level, structured fields can only contain encoded-words
500  * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
501  */
502 static void
503 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
504 {
505 	const char *p, *pend, *p0;
506 	char *q, *qend;
507 	const char *charset;
508 	int lastc;
509 
510 	charset = value(ENAME_MIME_CHARSET);
511 
512 	p = hstring;
513 	q = linebuf;
514 	pend = hstring + strlen(hstring);
515 	qend = linebuf + bufsize - 1;	/* save room for the NULL terminator */
516 	lastc = (unsigned char)' ';
517 	p0 = NULL;
518 	while (p < pend && q < qend) {
519 		const char *p1;
520 		char *q1;
521 
522 		if (*p != '=') {
523 			copy_skipped_FWS(&q, qend, &p0, p);
524 			if (q >= qend)
525 				break;
526 		}
527 
528 		switch (*p) {
529 		case '(':	/* start of comment */
530 			*q++ = *p++;	/* copy the opening '(' */
531 			(void)decode_comment(&q, qend, &p, pend, charset);
532 			lastc = (unsigned char)p[-1];
533 			break;
534 
535 		case '"':	/* start of quoted-string or no-fold-quote */
536 			*q++ = *p++;	/* copy the opening '"' */
537 			decode_quoted_string(&q, qend, &p, pend);
538 			lastc = (unsigned char)p[-1];
539 			break;
540 
541 		case '[':	/* start of domain-literal or no-fold-literal */
542 			*q++ = *p++;	/* copy the opening '[' */
543 			decode_domain_literal(&q, qend, &p, pend);
544 			lastc = (unsigned char)p[-1];
545 			break;
546 
547 		case '\\':	/* start of quoted-pair */
548 			if (p + 1 < pend) {		/* quoted pair */
549 				if (is_specials(p[1])) {
550 					*q++ = *p;
551 					if (q >= qend)
552 						break;
553 				}
554 				p++;	/* skip the '\\' */
555 			}
556 			goto copy_char;
557 
558 		case '=':
559 			/*
560 			 * At this level encoded words can appear via
561 			 * 'phrases' (possibly delimited by ',' as in
562 			 * 'keywords').  Thus we handle them as such.
563 			 * Hopefully this is sufficient.
564 			 */
565 			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
566 			    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
567 			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
568 				lastc = (unsigned char)*p1;
569 				p0 = p1;
570 				q = q1;
571 				p = skip_FWS(p1);
572 				/*
573 				 * XXX - this check should be
574 				 * unnecessary as *pend should be '\0'
575 				 * which will stop skip_FWS()
576 				 */
577 				if (p > pend)
578 					p = pend;
579 				break;
580 			}
581 			else {
582 				copy_skipped_FWS(&q, qend, &p0, p);
583 				if (q >= qend)
584 					break;
585 				goto copy_char;
586 			}
587 
588 		case '<':	/* start of angle-addr, msg-id, or path. */
589 			/*
590 			 * A msg-id cannot contain encoded-pairs or
591 			 * encoded-words, but angle-addr and path can.
592 			 * Distinguishing between them seems to be
593 			 * unnecessary, so let's be loose and just
594 			 * decode them as if they were all the same.
595 			 */
596 		default:
597 	copy_char:
598 			lastc = (unsigned char)*p;
599 			*q++ = *p++;
600 			break;
601 		}
602 	}
603 	copy_skipped_FWS(&q, qend, &p0, p);
604 	*q = '\0';	/* null terminate the result! */
605 }
606 
607 /*
608  * Returns the correct hfield decoder, or NULL if none.
609  * Info extracted from RFC 2822.
610  *
611  * name - pointer to field name of header line (with colon).
612  */
613 PUBLIC hfield_decoder_t
614 mime_hfield_decoder(const char *name)
615 {
616 	static const struct field_decoder_tbl_s {
617 		const char *field_name;
618 		size_t field_len;
619 		hfield_decoder_t decoder;
620 	} field_decoder_tbl[] = {
621 #define X(s)	s, sizeof(s) - 1
622 		{ X("Received:"),			NULL },
623 
624 		{ X("Content-Type:"),			NULL },
625 		{ X("Content-Disposition:"),		NULL },
626 		{ X("Content-Transfer-Encoding:"),	NULL },
627 		{ X("Content-Description:"),		mime_decode_sfield },
628 		{ X("Content-ID:"),			mime_decode_sfield },
629 		{ X("MIME-Version:"),			mime_decode_sfield },
630 
631 		{ X("Bcc:"),				mime_decode_sfield },
632 		{ X("Cc:"),				mime_decode_sfield },
633 		{ X("Date:"),				mime_decode_sfield },
634 		{ X("From:"),				mime_decode_sfield },
635 		{ X("In-Reply-To:"),			mime_decode_sfield },
636 		{ X("Keywords:"),			mime_decode_sfield },
637 		{ X("Message-ID:"),			mime_decode_sfield },
638 		{ X("References:"),			mime_decode_sfield },
639 		{ X("Reply-To:"),			mime_decode_sfield },
640 		{ X("Return-Path:"),			mime_decode_sfield },
641 		{ X("Sender:"),				mime_decode_sfield },
642 		{ X("To:"),				mime_decode_sfield },
643 		{ X("Subject:"),			mime_decode_usfield },
644 		{ X("Comments:"),			mime_decode_usfield },
645 		{ X("X-"),				mime_decode_usfield },
646 		{ NULL, 0,				mime_decode_usfield },	/* optional-fields */
647 #undef X
648 	};
649 	const struct field_decoder_tbl_s *fp;
650 
651 	/* XXX - this begs for a hash table! */
652 	for (fp = field_decoder_tbl; fp->field_name; fp++)
653 		if (strncasecmp(name, fp->field_name, fp->field_len) == 0)
654 			break;
655 	return fp->decoder;
656 }
657 
658 #endif /* MIME_SUPPORT */
659