1 /*
2 ** Copyright (c) 2005, 2007, 2008 Sendmail, Inc. and its suppliers.
3 ** All rights reserved.
4 **
5 ** Copyright (c) 2009, 2010, 2012, 2014, The Trusted Domain Project.
6 ** All rights reserved.
7 */
8
/* system includes */
10 #include <sys/types.h>
11 #include <ctype.h>
12 #include <string.h>
13 #include <limits.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16
17 /* libopendkim includes */
18 #include "dkim-mailparse.h"
19
/* types */
typedef unsigned long cmap_elem_type;	/* one word of the character bitmap */

/* symbolic names */
#define DKIM_MAILPARSE_OK 		0 	/* success */
#define DKIM_MAILPARSE_ERR_PUNBALANCED	1	/* unbalanced parentheses */
#define DKIM_MAILPARSE_ERR_QUNBALANCED	2	/* unbalanced quotes */
#define DKIM_MAILPARSE_ERR_SUNBALANCED	3	/* unbalanced sq. brackets */

/*
**  A bitmap for the "specials" character class: one bit per possible
**  unsigned char value, packed into an array of cmap_elem_type words.
**
**  CMAP_BIT shifts a one of type cmap_elem_type (unsigned) rather than a
**  signed long, so selecting the top bit of a word (e.g. for '?', code 63,
**  when the word is 64 bits wide) is well defined.
*/
#define	CMAP_NBITS	 	(sizeof(cmap_elem_type) * CHAR_BIT)
#define	CMAP_NELEMS	  	((1 + UCHAR_MAX) / CMAP_NBITS)
#define	CMAP_INDEX(i)		((unsigned char)(i) / CMAP_NBITS)
#define	CMAP_BIT(i)  		((cmap_elem_type) 1 << ((unsigned char)(i) % CMAP_NBITS))
#define	CMAP_TST(ar, c)    	((ar)[CMAP_INDEX(c)] & CMAP_BIT(c))
#define	CMAP_SET(ar, c)    	((ar)[CMAP_INDEX(c)] |= CMAP_BIT(c))

/* characters that form single-character tokens of their own */
static unsigned char const SPECIALS[] = "<>@,;:\\\"/[]?=";

#ifndef FALSE
# define FALSE	0
#endif /* ! FALSE */
#ifndef TRUE
# define TRUE	1
#endif /* ! TRUE */
45
46 #ifdef DKIM_MAILPARSE_TEST
/*
**  DKIM_MAIL_UNESCAPE -- remove escape characters from a string
**
**  Each backslash is removed and the character following it is kept
**  verbatim (so "\\\\" becomes a single backslash).  A backslash that is
**  the last character of the string is dropped.  The string is rewritten
**  in place.
**
**  Parameters:
**  	s -- the string to be unescaped (may be NULL)
**
**  Return value:
**  	s.
*/

static char *
dkim_mail_unescape(char *s)
{
	char *w;			/* write position */
	char const *r, *p, *e;		/* read position, backslash, end */

	if (s == NULL)
		return NULL;

	r = w = s;
	e = s + strlen(s);

	/*
	**  Search only the unread remainder [r, e); using the full string
	**  length here would scan past the terminating NUL once r has
	**  advanced.
	*/

	while ((p = memchr(r, '\\', (size_t) (e - r))) != NULL)
	{
		/* move the literal run that precedes the backslash */
		if (p > r)
		{
			if (r != w)
				memmove(w, r, (size_t) (p - r));
			w += p - r;
		}

		if (p[1] == '\0')
		{
			/* trailing backslash: nothing to escape, drop it */
			r = p + 1;
		}
		else
		{
			/* keep the escaped character, skip the backslash */
			*w++ = p[1];
			r = p + 2;
		}
	}

	/* something was removed: move the tail down and re-terminate */
	if (r > w)
	{
		if (e > r)
		{
			memmove(w, r, (size_t) (e - r));
			w += e - r;
		}
		*w = '\0';
	}

	return s;
}
101 #endif /* DKIM_MAILPARSE_TEST */
102
/*
**  DKIM_MAIL_MATCHING_PAREN -- return the location of the matching close
**                              parenthesis
**
**  The scan starts at nesting depth one, i.e. "s" is expected to point
**  just past an opening character that the caller has already consumed.
**  A backslash escapes the character that follows it.
**
**  Parameters:
**  	s -- start of string to be processed
**  	e -- end of string to be processed
**  	open_paren -- open parenthesis character
**  	close_paren -- close parenthesis character
**
**  Return value:
**  	Location of the close parenthesis character that brings the nesting
**  	depth back to zero, or "e" if there is none in [s, e).
**  	For example, given "xxx(yyyy)zz)aaaa", would return the location
**  	of the second ")".  There may be more beyond that, but at that point
**  	everything is balanced.
*/

static u_char *
dkim_mail_matching_paren(u_char *s, u_char *e, int open_paren, int close_paren)
{
	int depth = 1;

	for (; s < e; s++)
	{
		if (*s == close_paren)
		{
			if (--depth == 0)
				break;
		}
		else if (*s == open_paren)
		{
			depth++;
		}
		else if (*s == '\\')
		{
			/*
			**  Skip the escaped character, but only when it lies
			**  inside [s, e), so the result never exceeds "e".
			*/

			if (s + 1 < e && s[1] != '\0')
				s++;
		}
	}

	return s;
}
145
146 /*
147 ** DKIM_MAIL_FIRST_SPECIAL -- find the first "special" character
148 **
149 ** Parameters:
150 ** p -- input string
151 ** e -- end of input string
152 ** special_out -- pointer to the first special character found
153 **
154 ** Return value:
155 ** 0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
156 */
157
158 static int
dkim_mail_first_special(u_char * p,u_char * e,u_char ** special_out)159 dkim_mail_first_special(u_char *p, u_char *e, u_char **special_out)
160 {
161 size_t i;
162 cmap_elem_type is_special[CMAP_NELEMS] = { 0 };
163 u_char *at_ptr = NULL;
164
165 /* set up special finder */
166 for (i = 0; SPECIALS[i] != '\0'; i++)
167 CMAP_SET(is_special, SPECIALS[i]);
168
169 for (; p < e && *p != '\0'; p++)
170 {
171 /* skip white space between tokens */
172 while (p < e && (*p == '(' ||
173 (isascii(*p) && isspace(*p))))
174 {
175 if (*p != '(')
176 {
177 p++;
178 }
179 else
180 {
181 p = dkim_mail_matching_paren(p + 1, e,
182 '(', ')');
183 if (*p == '\0')
184 return DKIM_MAILPARSE_ERR_PUNBALANCED;
185 else
186 p++;
187 }
188 }
189
190 if (*p == '\0')
191 break;
192
193 if (*p == '"')
194 {
195 p = dkim_mail_matching_paren(p + 1, e, '\0', '"');
196 if (*p == '\0')
197 return DKIM_MAILPARSE_ERR_QUNBALANCED;
198 }
199 else if (*p == '[')
200 {
201 p = dkim_mail_matching_paren(p + 1, e, '\0', ']');
202 if (*p == '\0')
203 return DKIM_MAILPARSE_ERR_SUNBALANCED;
204 }
205 else if (CMAP_TST(is_special, *p))
206 {
207 if (*p == '<')
208 {
209 *special_out = p;
210 return 0;
211 }
212 else if (*p == ':' || *p == ';' || *p == ',')
213 {
214 if (at_ptr != NULL)
215 *special_out = at_ptr;
216 else
217 *special_out = p;
218 return 0;
219 }
220 else if (*p == '@')
221 {
222 at_ptr = p;
223 }
224 }
225 else
226 {
227 while (*p != '\0' &&
228 !CMAP_TST(is_special, *p) &&
229 (!isascii(*p) ||
230 !isspace((unsigned char) *p)) &&
231 *p != '(')
232 p++;
233 p--;
234 }
235 }
236
237 *special_out = p;
238 return 0;
239 }
240
241 /*
242 ** DKIM_MAIL_TOKEN -- find the next token
243 **
244 ** Parameters:
245 ** s -- start of input string
246 ** e -- end of input string
247 ** type_out -- type of token (returned)
248 ** start_out -- start of token (returned)
249 ** end_out -- start of token (returned)
250 ** uncommented_whitespace -- set to TRUE if uncommented whitespace is
251 ** discovered (returned)
252 **
253 ** Return value:
254 ** 0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
255 */
256
257 static int
dkim_mail_token(u_char * s,u_char * e,int * type_out,u_char ** start_out,u_char ** end_out,int * uncommented_whitespace)258 dkim_mail_token(u_char *s, u_char *e, int *type_out, u_char **start_out,
259 u_char **end_out, int *uncommented_whitespace)
260 {
261 u_char *p;
262 int err = 0;
263 size_t i;
264 int token_type;
265 cmap_elem_type is_special[CMAP_NELEMS] = { 0 };
266 u_char *token_start, *token_end;
267
268 *start_out = NULL;
269 *end_out = NULL;
270 *type_out = 0;
271
272 err = 0;
273
274 /* set up special finder */
275 for (i = 0; SPECIALS[i] != '\0'; i++)
276 CMAP_SET(is_special, SPECIALS[i]);
277
278 p = s;
279
280 /* skip white space between tokens */
281 while (p < e && (*p == '(' ||
282 (isascii((unsigned char) *p) &&
283 isspace((unsigned char) *p))))
284 {
285 if (*p != '(')
286 {
287 *uncommented_whitespace = 1;
288 p++;
289 }
290 else
291 {
292 p = dkim_mail_matching_paren(p + 1, e, '(', ')');
293 if (*p == '\0')
294 return DKIM_MAILPARSE_ERR_PUNBALANCED;
295 else
296 p++;
297 }
298 }
299
300 if (p >= e || *p == '\0')
301 return 0;
302
303 /* our new token starts here */
304 token_start = p;
305
306 /* fill in the token contents and type */
307 if (*p == '"')
308 {
309 token_end = dkim_mail_matching_paren(p + 1, e, '\0', '"');
310 token_type = '"';
311 if (*token_end != '\0')
312 token_end++;
313 else
314 err = DKIM_MAILPARSE_ERR_QUNBALANCED;
315 }
316 else if (*p == '[')
317 {
318 token_end = p = dkim_mail_matching_paren(p + 1, e, '\0', ']');
319 token_type = '[';
320 if (*token_end != '\0')
321 token_end++;
322 else
323 err = DKIM_MAILPARSE_ERR_SUNBALANCED;
324 }
325 else if (CMAP_TST(is_special, *p))
326 {
327 token_end = p + 1;
328 token_type = *p;
329 }
330 else
331 {
332 while (p < e && *p != '\0' && !CMAP_TST(is_special, *p) &&
333 (!isascii(*p) || !isspace((unsigned char) *p)) &&
334 *p != '(')
335 p++;
336
337 token_end = p;
338 token_type = 'x';
339 }
340
341 *start_out = token_start;
342 *end_out = token_end;
343 *type_out = token_type;
344
345 return err;
346 }
347
/*
**  DKIM_MAIL_PARSE -- extract the local-part and hostname from a mail
**                     header field, e.g. "From:"
**
**  Parameters:
**  	line -- input line
**  	user_out -- pointer to "local-part" (returned)
**  	domain_out -- pointer to hostname (returned)
**
**  Return value:
**  	0 on success, or a DKIM_MAILPARSE_ERR_* on failure.
**
**  Notes:
**  	Input string is modified: the extracted pieces are compacted
**  	toward the start of "line" and NUL-terminated there, and the
**  	returned pointers refer into that buffer.  Either output may be
**  	NULL if the corresponding part was not found.
*/

int
dkim_mail_parse(unsigned char *line, unsigned char **user_out,
                unsigned char **domain_out)
{
	int tok_type;
	int saw_ws = 0;
	int status;
	size_t tok_len;
	u_char *end;
	u_char *special;
	u_char *tok_begin;
	u_char *tok_end;
	u_char *out = line;		/* where compacted output is written */

	*user_out = NULL;
	*domain_out = NULL;

	end = line + strlen((char *) line);

	for (;;)
	{
		status = dkim_mail_first_special(line, end, &special);
		if (status != 0)
			return status;

		/* a group name or result: step over it and look again */
		if (*special == ';' || *special == ':' || *special == ',')
		{
			line = special + 1;
			continue;
		}

		if (*special == '<')
		{
			/* display name <address> */
			for (line = special + 1; ; line = tok_end)
			{
				status = dkim_mail_token(line, end, &tok_type,
				                         &tok_begin, &tok_end,
				                         &saw_ws);
				if (status != 0)
					return status;

				switch (tok_type)
				{
				  case '>':
				  case '\0':
					/* end of the bracketed address */
					*out = '\0';
					return 0;

				  case '@':
					/* terminate local-part, start domain */
					*out++ = '\0';
					*domain_out = out;
					break;

				  case ',':
				  case ':':
					/* source route punctuation */
					*user_out = NULL;
					*domain_out = NULL;
					break;

				  default:
					if (*user_out == NULL)
						*user_out = out;
					tok_len = (size_t) (tok_end - tok_begin);
					memmove(out, tok_begin, tok_len);
					out += tok_len;
					break;
				}
			}
		}

		/* (display name) addr(display name)ess */
		saw_ws = 0;
		for (;; line = tok_end)
		{
			status = dkim_mail_token(line, end, &tok_type,
			                         &tok_begin, &tok_end, &saw_ws);
			if (status != 0)
				return status;

			switch (tok_type)
			{
			  case '\0':
			  case ',':
			  case ';':
				/* end of this address */
				*out = '\0';
				return 0;

			  case '@':
				/* terminate local-part, start domain */
				*out++ = '\0';
				*domain_out = out;
				saw_ws = 0;
				break;

			  default:
				if (*user_out == NULL)
				{
					*user_out = out;
				}
				else if (tok_type == 'x' && saw_ws == 1)
				{
					/* keep one blank between atoms */
					*out++ = ' ';
				}

				tok_len = (size_t) (tok_end - tok_begin);
				memmove(out, tok_begin, tok_len);
				out += tok_len;

				saw_ws = 0;
				break;
			}
		}
	}
}
476
477 /*
478 ** DKIM_MAIL_PARSE_MULTI -- extract the local-part and hostname from a mail
479 ** header field that might contain multiple
480 ** values, e.g. "To:", "Cc:"
481 **
482 ** Parameters:
483 ** line -- input line
484 ** users_out -- array of pointers to "local-part" (returned)
485 ** domains_out -- array of pointers to hostname (returned)
486 **
487 ** Return value:
488 ** 0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
489 **
490 ** Notes:
491 ** Input string is modified.
492 */
493
494 int
dkim_mail_parse_multi(unsigned char * line,unsigned char *** users_out,unsigned char *** domains_out)495 dkim_mail_parse_multi(unsigned char *line, unsigned char ***users_out,
496 unsigned char ***domains_out)
497 {
498 _Bool escaped = FALSE;
499 _Bool quoted = FALSE;
500 _Bool done = FALSE;
501 int a = 0;
502 int n = 0;
503 int status;
504 int parens = 0;
505 char *p;
506 char *addr;
507 unsigned char **uout = NULL;
508 unsigned char **dout = NULL;
509 unsigned char *u;
510 unsigned char *d;
511
512 /* walk the input string looking for unenclosed commas */
513 addr = line;
514 for (p = line; !done; p++)
515 {
516 if (escaped)
517 {
518 escaped = FALSE;
519 continue;
520 }
521
522 switch (*p)
523 {
524 case '\\':
525 escaped = TRUE;
526 continue;
527
528 case ':':
529 quoted = !quoted;
530 continue;
531
532 case '(':
533 parens++;
534 continue;
535
536 case ')':
537 parens--;
538 continue;
539
540 case ',':
541 case '\0':
542 if (parens != 0)
543 continue;
544
545 if (*p == '\0')
546 done = TRUE;
547 else
548 *p = '\0';
549
550 status = dkim_mail_parse(addr, &u, &d);
551 if (status != 0)
552 {
553 if (uout != NULL)
554 {
555 free(uout);
556 free(dout);
557 }
558
559 return status;
560 }
561
562 if (n == 0)
563 {
564 size_t newsize = 2 * sizeof(unsigned char *);
565
566 uout = (unsigned char **) malloc(newsize);
567 if (uout == NULL)
568 return -1;
569
570 dout = (unsigned char **) malloc(newsize);
571 if (dout == NULL)
572 {
573 free(uout);
574 return -1;
575 }
576
577 a = 2;
578 }
579 else if (n + 1 == a)
580 {
581 unsigned char **new;
582
583 size_t newsize = a * 2 * sizeof(unsigned char *);
584
585 new = (unsigned char **) realloc(uout, newsize);
586 if (new == NULL)
587 {
588 free(uout);
589 free(dout);
590 return -1;
591 }
592
593 uout = new;
594
595 new = (unsigned char **) realloc(dout, newsize);
596 if (new == NULL)
597 {
598 free(uout);
599 free(dout);
600 return -1;
601 }
602
603 dout = new;
604
605 a *= 2;
606 }
607
608 uout[n] = u;
609 dout[n++] = d;
610
611 uout[n] = '\0';
612 dout[n] = '\0';
613
614 addr = p + 1;
615
616 break;
617
618 default:
619 break;
620 }
621 }
622
623 *users_out = uout;
624 *domains_out = dout;
625
626 return 0;
627 }
628
629 #ifdef DKIM_MAILPARSE_TEST
/*
**  MAIN -- test driver: parse the header value given as the only
**          command line argument and print each address found
**
**  Parameters:
**  	argc -- argument count
**  	argv -- argument vector; argv[1] is parsed (and modified) in place
**
**  Return value:
**  	0; exits with status 64 on a usage error.
*/

int
main(int argc, char **argv)
{
	int err;
	unsigned char **domains = NULL;
	unsigned char **users = NULL;

	if (argc != 2)
	{
		fprintf(stderr, "Usage: %s mailheader\n", argv[0]);
		exit(64);
	}

	/* the parser works on unsigned char; argv strings are plain char */
	err = dkim_mail_parse_multi((unsigned char *) argv[1], &users,
	                            &domains);

	if (err != 0)
	{
		printf("error %d\n", err);
	}
	else
	{
		int n;

		for (n = 0; users[n] != NULL || domains[n] != NULL; n++)
		{
			printf("user: '%s'\ndomain: '%s'\n",
			       users[n] ? dkim_mail_unescape((char *) users[n])
			                : "null",
			       domains[n] ? dkim_mail_unescape((char *) domains[n])
			                  : "null");
		}

		/* the arrays belong to us; their entries point into argv[1] */
		free(users);
		free(domains);
	}

	return 0;
}
663 #endif /* DKIM_MAILPARSE_TEST */
664