1 /* $NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2006 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Anon Ymous. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 33 /* 34 * This module contains the core MIME header decoding routines. 35 * Please refer to RFC 2047 and RFC 2822. 
36 */ 37 38 #ifdef MIME_SUPPORT 39 40 #include <sys/cdefs.h> 41 #ifndef __lint__ 42 __RCSID("$NetBSD: mime_header.c,v 1.8 2009/04/10 13:08:25 christos Exp $"); 43 #endif /* not __lint__ */ 44 45 #include <assert.h> 46 #include <stdio.h> 47 #include <stdlib.h> 48 #include <string.h> 49 50 #include "def.h" 51 #include "extern.h" 52 #include "mime.h" 53 #include "mime_header.h" 54 #include "mime_codecs.h" 55 56 /* 57 * Our interface to mime_b64tobin() 58 * 59 * XXX - This should move to mime_codecs.c. 60 */ 61 static ssize_t 62 mime_B64_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen) 63 { 64 if (outlen < 3 * roundup(inlen, 4) / 4) 65 return -1; 66 67 return mime_b64tobin(outbuf, inbuf, inlen); 68 } 69 70 71 /* 72 * Header specific "quoted-printable" decode! 73 * Differences with body QP decoding (see rfc 2047, sec 4.2): 74 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed). 75 * 2) Spaces can be encoded as '_' in headers for readability. 76 * 77 * XXX - This should move to mime_codecs.c. 
/* Value of a single hexadecimal digit, or -1 if c is not one. */
static int
qph_hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

/*
 * Header specific "quoted-printable" decode!
 * Differences with body QP decoding (see rfc 2047, sec 4.2):
 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed).
 * 2) Spaces can be encoded as '_' in headers for readability.
 *
 * Decodes the inlen bytes at inbuf into outbuf, which has room for
 * outlen bytes.  The output is not NUL terminated.
 *
 * Returns the number of bytes stored in outbuf, or -1 if outbuf is
 * too small or if a '=' is not followed by exactly two hexadecimal
 * digits.  outbuf may have been partially written when -1 is returned.
 *
 * XXX - This should move to mime_codecs.c.
 */
static ssize_t
mime_QPh_decode(char *outbuf, size_t outlen, const char *inbuf, size_t inlen)
{
	const char *p, *inend;
	char *outend;
	char *q;

	outend = outbuf + outlen;
	inend = inbuf + inlen;
	q = outbuf;
	for (p = inbuf; p < inend; p++) {
		if (q >= outend)
			return -1;
		if (*p == '=') {
			int hi, lo;

			/* need the '=' plus two more characters */
			if (inend - p < 3)
				return -1;
			/*
			 * Check each digit explicitly: strtol(3) would also
			 * accept leading white space and a sign, neither of
			 * which is a legal escape.
			 */
			hi = qph_hexval((unsigned char)p[1]);
			lo = qph_hexval((unsigned char)p[2]);
			if (hi < 0 || lo < 0)
				return -1;
			*q++ = (char)((hi << 4) | lo);
			p += 2;		/* the loop increment skips the 2nd digit */
		} else if (*p == '_')	/* headers may encode ' ' as '_' */
			*q++ = ' ';
		else
			*q++ = *p;
	}
	return q - outbuf;
}

/*
 * Copy the charset name of an encoded-word into from_cs.
 *
 * p points at the first character of the charset name (just past the
 * leading "=?").  Characters are copied up to, but not including, the
 * next '?' and from_cs is NUL terminated.
 *
 * Returns a pointer to the character following that '?', or NULL if
 * the string ends before a '?' is seen or if the name plus its NUL
 * terminator does not fit in from_cs_len bytes.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	char *q, *qlast;

	if (from_cs_len == 0)		/* no room even for the NUL */
		return NULL;

	qlast = from_cs + from_cs_len - 1;	/* reserved for the NUL */
	for (q = from_cs; *p != '?'; p++) {
		if (*p == '\0' || q >= qlast)
			return NULL;
		*q++ = *p;
	}
	*q = '\0';
	return p + 1;	/* if here, then we got the '?' */
}
153 */ 154 static int 155 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs) 156 { 157 ssize_t declen; 158 size_t enclen, dstlen; 159 char decword[LINESIZE]; 160 char from_cs[LINESIZE]; 161 const char *encword, *iend, *p; 162 char *dstend; 163 char enctype; 164 165 p = *ibuf; 166 if (p[0] != '=' && p[1] != '?') 167 return -1; 168 if (strlen(p) < 2 + 1 + 3 + 1 + 2) 169 return -1; 170 p = grab_charset(from_cs, sizeof(from_cs), p + 2); 171 if (p == NULL) 172 return -1; 173 enctype = *p++; 174 if (*p++ != '?') 175 return -1; 176 encword = p; 177 p = strchr(p, '?'); 178 if (p == NULL || p[1] != '=') 179 return -1; 180 enclen = p - encword; /* length of encoded substring */ 181 iend = p + 2; 182 /* encoded words are at most 75 characters (RFC 2047, sec 2) */ 183 if (iend > *ibuf + 75) 184 return -1; 185 186 if (oend < *obuf + 1) { 187 assert(/*CONSTCOND*/ 0); /* We have a coding error! */ 188 return -1; 189 } 190 dstend = to_cs ? decword : *obuf; 191 dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1; 192 193 if (enctype == 'B' || enctype == 'b') 194 declen = mime_B64_decode(dstend, dstlen, encword, enclen); 195 else if (enctype == 'Q' || enctype == 'q') 196 declen = mime_QPh_decode(dstend, dstlen, encword, enclen); 197 else 198 return -1; 199 200 if (declen == -1) 201 return -1; 202 203 dstend += declen; 204 #ifdef CHARSET_SUPPORT 205 if (to_cs != NULL) { 206 iconv_t cd; 207 const char *src; 208 size_t srclen; 209 size_t cnt; 210 211 cd = iconv_open(to_cs, from_cs); 212 if (cd == (iconv_t)-1) 213 return -1; 214 215 src = decword; 216 srclen = declen; 217 dstend = *obuf; 218 dstlen = oend - *obuf - 1; 219 cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen); 220 221 (void)iconv_close(cd); 222 if (cnt == (size_t)-1) 223 return -1; 224 } 225 #endif /* CHARSET_SUPPORT */ 226 *dstend = '\0'; 227 *ibuf = iend; 228 *obuf = dstend; 229 return 0; 230 } 231 232 233 /* 234 * Folding White Space. See RFC 2822. 
/*
 * Folding White Space.  See RFC 2822.
 *
 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
 * pairs (i.e., "\r\n") and never separately.  However, by the time
 * mail(1) sees the messages, all CRLF pairs have been converted to
 * '\n' characters.
 *
 * XXX - pull is_FWS() and skip_FWS() up to def.h?
 */

/* True (1) for space, tab and newline; false (0) for anything else. */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}

/* Return a pointer to the first character at or after p that is not FWS. */
static inline const char *
skip_FWS(const char *p)
{
	for (; is_FWS(*p); p++)
		continue;
	return p;
}

/*
 * If *src is not NULL, copy the bytes from *src up to srcend to *dst,
 * stopping early if dstend is reached, then advance *dst past what was
 * copied and reset *src to NULL.  Does nothing when *src is NULL.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *from = *src;
	char *to = *dst;

	if (from == NULL)
		return;

	/* copy any skipped linear-white-space */
	for (; from < srcend && to < dstend; from++, to++)
		*to = *from;
	*dst = to;
	*src = NULL;
}
293 */ 294 static void 295 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring) 296 { 297 const char *p, *p0; 298 char *q, *qend; 299 int lastc; 300 const char *charset; 301 302 charset = value(ENAME_MIME_CHARSET); 303 qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */ 304 q = outbuf; 305 p = hstring; 306 p0 = NULL; 307 lastc = (unsigned char)' '; 308 while (*p && q < qend) { 309 const char *p1; 310 char *q1; 311 if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' && 312 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 313 (*p1 == '\0' || is_FWS(*p1))) { 314 p0 = p1; /* pointer to first character after encoded word */ 315 q = q1; 316 p = skip_FWS(p1); 317 lastc = (unsigned char)*p0; 318 } 319 else { 320 copy_skipped_FWS(&q, qend, &p0, p); 321 lastc = (unsigned char)*p; 322 if (q < qend) 323 *q++ = *p++; 324 } 325 } 326 copy_skipped_FWS(&q, qend, &p0, p); 327 *q = '\0'; 328 } 329 330 /* 331 * Decode a field comment. 332 * 333 * Comments only occur in structured fields, can be nested (rfc 2822, 334 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'. 335 * Otherwise, they can be regarded as unstructured fields that are 336 * bounded by '(' and ')' characters. 337 */ 338 static int 339 decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset) 340 { 341 const char *p, *pend, *p0; 342 char *q, *qend; 343 int lastc; 344 345 p = *ibuf; 346 q = *obuf; 347 pend = iend; 348 qend = oend; 349 lastc = ' '; 350 p0 = NULL; 351 while (p < pend && q < qend) { 352 const char *p1; 353 char *q1; 354 355 if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' 
&& 356 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 357 (*p1 == ')' || is_FWS(*p1))) { 358 lastc = (unsigned char)*p1; 359 p0 = p1; 360 q = q1; 361 p = skip_FWS(p1); 362 /* 363 * XXX - this check should be unnecessary as *pend should 364 * be '\0' which will stop skip_FWS() 365 */ 366 if (p > pend) 367 p = pend; 368 } 369 else { 370 copy_skipped_FWS(&q, qend, &p0, p); 371 if (q >= qend) /* XXX - q > qend cannot happen */ 372 break; 373 374 if (*p == ')') { 375 *q++ = *p++; /* copy the closing ')' */ 376 break; /* and get out of here! */ 377 } 378 379 if (*p == '(') { 380 *q++ = *p++; /* copy the opening '(' */ 381 if (decode_comment(&q, qend, &p, pend, charset) == -1) 382 return -1; /* is this right or should we update? */ 383 lastc = ')'; 384 } 385 else if (*p == '\\' && p + 1 < pend) { /* quoted-pair */ 386 if (p[1] == '(' || p[1] == ')' || p[1] == '\\') /* need quoted-pair*/ 387 *q++ = *p; 388 p++; 389 lastc = (unsigned char)*p; 390 if (q < qend) 391 *q++ = *p++; 392 } 393 else { 394 lastc = (unsigned char)*p; 395 *q++ = *p++; 396 } 397 } 398 } 399 *ibuf = p; 400 *obuf = q; 401 return 0; 402 } 403 404 /* 405 * Decode a quoted-string or no-fold-quote. 406 * 407 * These cannot contain encoded words. They can contain quoted-pairs, 408 * making '\\' special. They have no other structure. See RFC 2822 409 * sec 3.2.5 and 3.6.4. 410 */ 411 static void 412 decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend) 413 { 414 const char *p, *pend; 415 char *q, *qend; 416 417 qend = oend; 418 pend = iend; 419 p = *ibuf; 420 q = *obuf; 421 while (p < pend && q < qend) { 422 if (*p == '"') { 423 *q++ = *p++; /* copy the closing '"' */ 424 break; 425 } 426 if (*p == '\\' && p + 1 < pend) { /* quoted-pair */ 427 if (p[1] == '"' || p[1] == '\\') { 428 *q++ = *p; 429 if (q >= qend) 430 break; 431 } 432 p++; 433 } 434 *q++ = *p++; 435 } 436 *ibuf = p; 437 *obuf = q; 438 } 439 440 /* 441 * Decode a domain-literal or no-fold-literal. 
/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Input is copied
 * through the closing ']' (or until either buffer runs out); the '\\'
 * of a quoted-pair is kept only in front of '[', ']' and '\\'.  *ibuf
 * and *obuf are advanced accordingly.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == ']') {
			*out++ = *in++;	/* copy the closing ']' */
			break;
		}
		if (c == '\\' && in + 1 < iend) {	/* quoted-pair */
			const char next = in[1];

			if (next == '[' || next == ']' || next == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}

/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of the 'specials' characters and 0 for every
 * other value, including values outside the 7-bit range.  Assumes an
 * ASCII execution character set.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '"':
	case '(':
	case ')':
	case ',':
	case '.':
	case ':':
	case ';':
	case '<':
	case '>':
	case '@':
	case '[':
	case '\\':
	case ']':
		return 1;
	default:
		return 0;
	}
}
501 */ 502 static void 503 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring) 504 { 505 const char *p, *pend, *p0; 506 char *q, *qend; 507 const char *charset; 508 int lastc; 509 510 charset = value(ENAME_MIME_CHARSET); 511 512 p = hstring; 513 q = linebuf; 514 pend = hstring + strlen(hstring); 515 qend = linebuf + bufsize - 1; /* save room for the NULL terminator */ 516 lastc = (unsigned char)' '; 517 p0 = NULL; 518 while (p < pend && q < qend) { 519 const char *p1; 520 char *q1; 521 522 if (*p != '=') { 523 copy_skipped_FWS(&q, qend, &p0, p); 524 if (q >= qend) 525 break; 526 } 527 528 switch (*p) { 529 case '(': /* start of comment */ 530 *q++ = *p++; /* copy the opening '(' */ 531 (void)decode_comment(&q, qend, &p, pend, charset); 532 lastc = (unsigned char)p[-1]; 533 break; 534 535 case '"': /* start of quoted-string or no-fold-quote */ 536 *q++ = *p++; /* copy the opening '"' */ 537 decode_quoted_string(&q, qend, &p, pend); 538 lastc = (unsigned char)p[-1]; 539 break; 540 541 case '[': /* start of domain-literal or no-fold-literal */ 542 *q++ = *p++; /* copy the opening '[' */ 543 decode_domain_literal(&q, qend, &p, pend); 544 lastc = (unsigned char)p[-1]; 545 break; 546 547 case '\\': /* start of quoted-pair */ 548 if (p + 1 < pend) { /* quoted pair */ 549 if (is_specials(p[1])) { 550 *q++ = *p; 551 if (q >= qend) 552 break; 553 } 554 p++; /* skip the '\\' */ 555 } 556 goto copy_char; 557 558 case '=': 559 /* 560 * At this level encoded words can appear via 561 * 'phrases' (possibly delimited by ',' as in 562 * 'keywords'). Thus we handle them as such. 563 * Hopefully this is sufficient. 564 */ 565 if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' 
&& 566 decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 && 567 (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) { 568 lastc = (unsigned char)*p1; 569 p0 = p1; 570 q = q1; 571 p = skip_FWS(p1); 572 /* 573 * XXX - this check should be 574 * unnecessary as *pend should be '\0' 575 * which will stop skip_FWS() 576 */ 577 if (p > pend) 578 p = pend; 579 break; 580 } 581 else { 582 copy_skipped_FWS(&q, qend, &p0, p); 583 if (q >= qend) 584 break; 585 goto copy_char; 586 } 587 588 case '<': /* start of angle-addr, msg-id, or path. */ 589 /* 590 * A msg-id cannot contain encoded-pairs or 591 * encoded-words, but angle-addr and path can. 592 * Distinguishing between them seems to be 593 * unnecessary, so let's be loose and just 594 * decode them as if they were all the same. 595 */ 596 default: 597 copy_char: 598 lastc = (unsigned char)*p; 599 *q++ = *p++; 600 break; 601 } 602 } 603 copy_skipped_FWS(&q, qend, &p0, p); 604 *q = '\0'; /* null terminate the result! */ 605 } 606 607 /* 608 * Returns the correct hfield decoder, or NULL if none. 609 * Info extracted from RFC 2822. 610 * 611 * name - pointer to field name of header line (with colon). 
612 */ 613 PUBLIC hfield_decoder_t 614 mime_hfield_decoder(const char *name) 615 { 616 static const struct field_decoder_tbl_s { 617 const char *field_name; 618 size_t field_len; 619 hfield_decoder_t decoder; 620 } field_decoder_tbl[] = { 621 #define X(s) s, sizeof(s) - 1 622 { X("Received:"), NULL }, 623 624 { X("Content-Type:"), NULL }, 625 { X("Content-Disposition:"), NULL }, 626 { X("Content-Transfer-Encoding:"), NULL }, 627 { X("Content-Description:"), mime_decode_sfield }, 628 { X("Content-ID:"), mime_decode_sfield }, 629 { X("MIME-Version:"), mime_decode_sfield }, 630 631 { X("Bcc:"), mime_decode_sfield }, 632 { X("Cc:"), mime_decode_sfield }, 633 { X("Date:"), mime_decode_sfield }, 634 { X("From:"), mime_decode_sfield }, 635 { X("In-Reply-To:"), mime_decode_sfield }, 636 { X("Keywords:"), mime_decode_sfield }, 637 { X("Message-ID:"), mime_decode_sfield }, 638 { X("References:"), mime_decode_sfield }, 639 { X("Reply-To:"), mime_decode_sfield }, 640 { X("Return-Path:"), mime_decode_sfield }, 641 { X("Sender:"), mime_decode_sfield }, 642 { X("To:"), mime_decode_sfield }, 643 { X("Subject:"), mime_decode_usfield }, 644 { X("Comments:"), mime_decode_usfield }, 645 { X("X-"), mime_decode_usfield }, 646 { NULL, 0, mime_decode_usfield }, /* optional-fields */ 647 #undef X 648 }; 649 const struct field_decoder_tbl_s *fp; 650 651 /* XXX - this begs for a hash table! */ 652 for (fp = field_decoder_tbl; fp->field_name; fp++) 653 if (strncasecmp(name, fp->field_name, fp->field_len) == 0) 654 break; 655 return fp->decoder; 656 } 657 658 #endif /* MIME_SUPPORT */ 659