1 /*
2 **  Copyright (c) 2005, 2007, 2008 Sendmail, Inc. and its suppliers.
3 **    All rights reserved.
4 **
5 **  Copyright (c) 2009, 2010, 2012, 2014, The Trusted Domain Project.
6 **    All rights reserved.
7 */
8 
/* system includes */
10 #include <sys/types.h>
11 #include <ctype.h>
12 #include <string.h>
13 #include <limits.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 
17 /* libopendkim includes */
18 #include "dkim-mailparse.h"
19 
/* types */

/* one word of the character bitmap used by the CMAP_* macros below */
typedef unsigned long cmap_elem_type;

/* symbolic names */
#define DKIM_MAILPARSE_OK 		0 	/* success */
#define DKIM_MAILPARSE_ERR_PUNBALANCED	1	/* unbalanced parentheses */
#define DKIM_MAILPARSE_ERR_QUNBALANCED	2	/* unbalanced quotes */
#define DKIM_MAILPARSE_ERR_SUNBALANCED	3	/* unbalanced sq. brackets */

/*
**  A bitmap for the "specials" character class: one bit for each possible
**  unsigned char value, packed into an array of CMAP_NELEMS words.
**
**  	CMAP_NBITS    -- number of bits held by one word
**  	CMAP_NELEMS   -- number of words needed to cover 0..UCHAR_MAX
**  	CMAP_INDEX(i) -- which word holds the bit for character i
**  	CMAP_BIT(i)   -- mask selecting the bit for character i in that word
**  	CMAP_TST      -- non-zero if the bit for character c is set in ar
**  	CMAP_SET      -- set the bit for character c in ar
**
**  CMAP_BIT shifts a value of type cmap_elem_type (unsigned) rather than a
**  signed long: '?' is character 63, which selects the top bit of a 64-bit
**  (or 32-bit) word, and shifting a signed 1 into the sign bit is undefined.
*/
#define	CMAP_NBITS	 	(sizeof(cmap_elem_type) * CHAR_BIT)
#define	CMAP_NELEMS	  	((1 + UCHAR_MAX) / CMAP_NBITS)
#define	CMAP_INDEX(i)		((unsigned char)(i) / CMAP_NBITS)
#define	CMAP_BIT(i)  		((cmap_elem_type) 1 << ((unsigned char)(i) % CMAP_NBITS))
#define	CMAP_TST(ar, c)    	((ar)[CMAP_INDEX(c)] &  CMAP_BIT(c))
#define	CMAP_SET(ar, c)    	((ar)[CMAP_INDEX(c)] |= CMAP_BIT(c))

/* the characters loaded into the "specials" bitmap by the tokenizer */
static unsigned char const SPECIALS[] = "<>@,;:\\\"/[]?=";

#ifndef FALSE
# define FALSE	0
#endif /* ! FALSE */
#ifndef TRUE
# define TRUE	1
#endif /* ! TRUE */
45 
46 #ifdef DKIM_MAILPARSE_TEST
/*
**  DKIM_MAIL_UNESCAPE -- remove escape characters from a string
**
**  Parameters:
**  	s -- the string to be unescaped (rewritten in place; may be NULL)
**
**  Return value:
**  	s.
**
**  Notes:
**  	Every backslash is dropped and the character that follows it is kept
**  	verbatim, so two consecutive backslashes collapse into one.  A
**  	backslash that is the very last character of the string is discarded.
*/

static char *
dkim_mail_unescape(char *s)
{
	char 		*w;		/* write cursor */
	char const 	*r, *p, *e;	/* read cursor, match, end of string */

	if (s == NULL)
		return NULL;

	r = w = s;
	e = s + strlen(s);

	/*
	**  Search only the unread tail [r, e).  Using the full string length
	**  here would let memchr() run past the terminator once r has
	**  advanced, and a backslash found out there would then be "unescaped"
	**  into memory that is not part of the string.
	*/
	while ((p = memchr(r, '\\', (size_t) (e - r))) != NULL)
	{
		/* move the literal run that precedes the backslash into place */
		if (p > r)
		{
			if (r != w)
				memmove(w, r, (size_t) (p - r));
			w += p - r;
		}

		if (p[1] == '\0')
		{
			/* trailing backslash: nothing to escape, just drop it */
			r = p + 1;
		}
		else
		{
			/* keep the escaped character, drop the backslash */
			*w++ = p[1];
			r = p + 2;
		}
	}

	/* r is ahead of w only if at least one backslash was removed */
	if (r > w)
	{
		if (e > r)
		{
			memmove(w, r, (size_t) (e - r));
			w += e - r;
		}
		*w = '\0';
	}

	return s;
}
101 #endif /* DKIM_MAILPARSE_TEST */
102 
/*
**  DKIM_MAIL_MATCHING_PAREN -- return the location of the close delimiter
**                              that balances an already-consumed opener
**
**  Parameters:
**  	s -- start of string to be processed (just past the opener)
**  	e -- end of string to be processed
**  	open_paren -- open parenthesis character
**  	close_paren -- close parenthesis character
**
**  Return value:
**  	Location of the final close parenthesis character in the string.
**  	For example, given "xxx((yyyy)zz)aaaa", would return the location
**  	of the second ")".  There may be more beyond that, but at that point
**  	everything is balanced.  If no balancing close character is found
**  	before "e", "e" itself is returned.
**
**  Notes:
**  	A backslash hides the character that follows it from the matching.
**  	Callers that have no nesting opener (e.g. quoted strings) pass '\0'
**  	as open_paren.
*/

static u_char *
dkim_mail_matching_paren(u_char *s, u_char *e, int open_paren, int close_paren)
{
	int 		depth = 1;	/* the caller already consumed one opener */

	for (; s < e; s++)
	{
		if (*s == close_paren)
		{
			if (--depth == 0)
				break;
		}
		else if (*s == open_paren)
		{
			depth++;
		}
		else if (*s == '\\')
		{
			/*
			**  Skip the escaped character, but only if it lies
			**  inside [s, e); otherwise the loop increment would
			**  carry the result beyond "e".
			*/
			if (s + 1 < e && s[1] != '\0')
				s++;
		}
	}

	return s;
}
145 
146 /*
147 **  DKIM_MAIL_FIRST_SPECIAL -- find the first "special" character
148 **
149 **  Parameters:
150 **  	p -- input string
151 **  	e -- end of input string
152 **  	special_out -- pointer to the first special character found
153 **
154 **  Return value:
155 **  	0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
156 */
157 
158 static int
dkim_mail_first_special(u_char * p,u_char * e,u_char ** special_out)159 dkim_mail_first_special(u_char *p, u_char *e, u_char **special_out)
160 {
161 	size_t		i;
162 	cmap_elem_type	is_special[CMAP_NELEMS] = { 0 };
163 	u_char		*at_ptr = NULL;
164 
165 	/* set up special finder */
166 	for (i = 0; SPECIALS[i] != '\0'; i++)
167 		CMAP_SET(is_special, SPECIALS[i]);
168 
169 	for (; p < e && *p != '\0'; p++)
170 	{
171 		/* skip white space between tokens */
172 		while (p < e && (*p == '(' ||
173 		                 (isascii(*p) && isspace(*p))))
174 		{
175 			if (*p != '(')
176 			{
177 				p++;
178 			}
179 			else
180 			{
181 				p = dkim_mail_matching_paren(p + 1, e,
182 				                             '(', ')');
183 				if (*p == '\0')
184 					return DKIM_MAILPARSE_ERR_PUNBALANCED;
185 				else
186 					p++;
187 			}
188 		}
189 
190 		if (*p == '\0')
191 			break;
192 
193 		if (*p == '"')
194 		{
195 			p = dkim_mail_matching_paren(p + 1, e, '\0', '"');
196 			if (*p == '\0')
197 				return DKIM_MAILPARSE_ERR_QUNBALANCED;
198 		}
199 		else if (*p == '[')
200 		{
201 			p = dkim_mail_matching_paren(p + 1, e, '\0', ']');
202 			if (*p == '\0')
203 				return DKIM_MAILPARSE_ERR_SUNBALANCED;
204 		}
205 		else if (CMAP_TST(is_special, *p))
206 		{
207 			if (*p == '<')
208 			{
209 				*special_out = p;
210 				return 0;
211 			}
212 			else if (*p == ':' || *p == ';' || *p == ',')
213 			{
214 				if (at_ptr != NULL)
215 					*special_out = at_ptr;
216 				else
217 					*special_out = p;
218 				return 0;
219 			}
220 			else if (*p == '@')
221 			{
222 				at_ptr = p;
223 			}
224 		}
225 		else
226 		{
227 			while (*p != '\0' &&
228 			       !CMAP_TST(is_special, *p) &&
229 			       (!isascii(*p) ||
230 			        !isspace((unsigned char) *p)) &&
231 			       *p != '(')
232 				p++;
233 			p--;
234 		}
235 	}
236 
237 	*special_out = p;
238 	return 0;
239 }
240 
241 /*
242 **  DKIM_MAIL_TOKEN -- find the next token
243 **
244 **  Parameters:
245 **  	s -- start of input string
246 **  	e -- end of input string
247 **  	type_out -- type of token (returned)
248 **  	start_out -- start of token (returned)
249 **  	end_out -- start of token (returned)
250 **  	uncommented_whitespace -- set to TRUE if uncommented whitespace is
251 **  	                          discovered (returned)
252 **
253 **  Return value:
254 **  	0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
255 */
256 
257 static int
dkim_mail_token(u_char * s,u_char * e,int * type_out,u_char ** start_out,u_char ** end_out,int * uncommented_whitespace)258 dkim_mail_token(u_char *s, u_char *e, int *type_out, u_char **start_out,
259                 u_char **end_out, int *uncommented_whitespace)
260 {
261 	u_char *p;
262 	int err = 0;
263 	size_t i;
264 	int token_type;
265 	cmap_elem_type is_special[CMAP_NELEMS] = { 0 };
266 	u_char *token_start, *token_end;
267 
268 	*start_out = NULL;
269 	*end_out   = NULL;
270 	*type_out  = 0;
271 
272 	err = 0;
273 
274 	/* set up special finder */
275 	for (i = 0; SPECIALS[i] != '\0'; i++)
276 		CMAP_SET(is_special, SPECIALS[i]);
277 
278 	p = s;
279 
280 	/* skip white space between tokens */
281 	while (p < e && (*p == '(' ||
282 	                 (isascii((unsigned char) *p) &&
283 	                  isspace((unsigned char) *p))))
284 	{
285 		if (*p != '(')
286 		{
287 			*uncommented_whitespace = 1;
288 			p++;
289 		}
290 		else
291 		{
292 			p = dkim_mail_matching_paren(p + 1, e, '(', ')');
293 			if (*p == '\0')
294 				return DKIM_MAILPARSE_ERR_PUNBALANCED;
295 			else
296 				p++;
297 		}
298 	}
299 
300 	if (p >= e || *p == '\0')
301 		return 0;
302 
303 	/* our new token starts here */
304 	token_start = p;
305 
306 	/* fill in the token contents and type */
307 	if (*p == '"')
308 	{
309 		token_end = dkim_mail_matching_paren(p + 1, e, '\0', '"');
310 		token_type = '"';
311 		if (*token_end != '\0')
312 			token_end++;
313 		else
314 			err = DKIM_MAILPARSE_ERR_QUNBALANCED;
315 	}
316 	else if (*p == '[')
317 	{
318 		token_end = p = dkim_mail_matching_paren(p + 1, e, '\0', ']');
319 		token_type = '[';
320 		if (*token_end != '\0')
321 			token_end++;
322 		else
323 			err = DKIM_MAILPARSE_ERR_SUNBALANCED;
324 	}
325 	else if (CMAP_TST(is_special, *p))
326 	{
327 		token_end  = p + 1;
328 		token_type = *p;
329 	}
330 	else
331 	{
332 		while (p < e && *p != '\0' && !CMAP_TST(is_special, *p) &&
333 		       (!isascii(*p) || !isspace((unsigned char) *p)) &&
334 		       *p != '(')
335 			p++;
336 
337 		token_end = p;
338 		token_type = 'x';
339 	}
340 
341 	*start_out = token_start;
342 	*end_out   = token_end;
343 	*type_out  = token_type;
344 
345 	return err;
346 }
347 
348 /*
349 **  DKIM_MAIL_PARSE -- extract the local-part and hostname from a mail
350 **                     header field, e.g. "From:"
351 **
352 **  Parameters:
353 **  	line -- input line
354 **  	user_out -- pointer to "local-part" (returned)
355 **  	domain_out -- pointer to hostname (returned)
356 **
357 **  Return value:
358 **  	0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
359 **
360 **  Notes:
361 **  	Input string is modified.
362 */
363 
364 int
dkim_mail_parse(unsigned char * line,unsigned char ** user_out,unsigned char ** domain_out)365 dkim_mail_parse(unsigned char *line, unsigned char **user_out,
366                 unsigned char **domain_out)
367 {
368 	int type;
369 	int ws;
370 	int err;
371 	u_char *e, *special;
372 	u_char *tok_s, *tok_e;
373 	u_char *w;
374 
375 	*user_out = NULL;
376 	*domain_out = NULL;
377 
378 	err = 0;
379 	w = line;
380 	e = line + strlen((char *) line);
381 	ws = 0;
382 
383 	for (;;)
384 	{
385 		err = dkim_mail_first_special(line, e, &special);
386 		if (err != 0)
387 			return err;
388 
389 		/* given the construct we're looking at, do the right thing */
390 		switch (*special)
391 		{
392 		  case '<':
393 			/* display name <address> */
394 			line = special + 1;
395 			for (;;)
396 			{
397 				err = dkim_mail_token(line, e, &type, &tok_s,
398 				                      &tok_e, &ws);
399 				if (err != 0)
400 					return err;
401 
402 				if (type == '>' || type == '\0')
403 				{
404 					*w = '\0';
405 					return 0;
406 				}
407 				else if (type == '@')
408 				{
409 					*w++ = '\0';
410 					*domain_out = w;
411 				}
412 				else if (type == ',' || type == ':')
413 				{
414 					/* source route punctuation */
415 					*user_out = NULL;
416 					*domain_out = NULL;
417 				}
418 				else
419 				{
420 					if (*user_out == NULL)
421 						*user_out = w;
422 					memmove(w, tok_s, tok_e - tok_s);
423 					w += tok_e - tok_s;
424 				}
425 				line = tok_e;
426 			}
427 
428 		  case ';':
429 		  case ':':
430 		  case ',':
431 			/* skip a group name or result */
432 		  	line = special + 1;
433 			break;
434 
435 		  default:
436 			/* (display name) addr(display name)ess */
437 			ws = 0;
438 			for (;;)
439 			{
440 				err = dkim_mail_token(line, e, &type, &tok_s,
441 				                      &tok_e, &ws);
442 				if (err != 0)
443 					return err;
444 
445 				if (type == '\0' ||  type == ',' || type == ';')
446 				{
447 					*w = '\0';
448 					break;
449 				}
450 				else if (type == '@')
451 				{
452 					*w++ = '\0';
453 					*domain_out = w;
454 					ws = 0;
455 				}
456 				else
457 				{
458 
459 					if (*user_out == NULL)
460 						*user_out = w;
461 					else if (type == 'x' && ws == 1)
462 						*w++ = ' ';
463 
464 					memmove(w, tok_s, tok_e - tok_s);
465 					w += tok_e - tok_s;
466 
467 					ws = 0;
468 				}
469 
470 				line = tok_e;
471 			}
472 			return 0;
473 		}
474 	}
475 }
476 
477 /*
478 **  DKIM_MAIL_PARSE_MULTI -- extract the local-part and hostname from a mail
479 **                           header field that might contain multiple
480 **                           values, e.g. "To:", "Cc:"
481 **
482 **  Parameters:
483 **  	line -- input line
484 **  	users_out -- array of pointers to "local-part" (returned)
485 **  	domains_out -- array of pointers to hostname (returned)
486 **
487 **  Return value:
488 **  	0 on success, or an DKIM_MAILPARSE_ERR_* on failure.
489 **
490 **  Notes:
491 **  	Input string is modified.
492 */
493 
494 int
dkim_mail_parse_multi(unsigned char * line,unsigned char *** users_out,unsigned char *** domains_out)495 dkim_mail_parse_multi(unsigned char *line, unsigned char ***users_out,
496                       unsigned char ***domains_out)
497 {
498 	_Bool escaped = FALSE;
499 	_Bool quoted = FALSE;
500 	_Bool done = FALSE;
501 	int a = 0;
502 	int n = 0;
503 	int status;
504 	int parens = 0;
505 	char *p;
506 	char *addr;
507 	unsigned char **uout = NULL;
508 	unsigned char **dout = NULL;
509 	unsigned char *u;
510 	unsigned char *d;
511 
512 	/* walk the input string looking for unenclosed commas */
513 	addr = line;
514 	for (p = line; !done; p++)
515 	{
516 		if (escaped)
517 		{
518 			escaped = FALSE;
519 			continue;
520 		}
521 
522 		switch (*p)
523 		{
524 		  case '\\':
525 			escaped = TRUE;
526 			continue;
527 
528 		  case ':':
529 			quoted = !quoted;
530 			continue;
531 
532 		  case '(':
533 			parens++;
534 			continue;
535 
536 		  case ')':
537 			parens--;
538 			continue;
539 
540 		  case ',':
541 		  case '\0':
542 			if (parens != 0)
543 				continue;
544 
545 			if (*p == '\0')
546 				done = TRUE;
547 			else
548 				*p = '\0';
549 
550 			status = dkim_mail_parse(addr, &u, &d);
551 			if (status != 0)
552 			{
553 				if (uout != NULL)
554 				{
555 					free(uout);
556 					free(dout);
557 				}
558 
559 				return status;
560 			}
561 
562 			if (n == 0)
563 			{
564 				size_t newsize = 2 * sizeof(unsigned char *);
565 
566 				uout = (unsigned char **) malloc(newsize);
567 				if (uout == NULL)
568 					return -1;
569 
570 				dout = (unsigned char **) malloc(newsize);
571 				if (dout == NULL)
572 				{
573 					free(uout);
574 					return -1;
575 				}
576 
577 				a = 2;
578 			}
579 			else if (n + 1 == a)
580 			{
581 				unsigned char **new;
582 
583 				size_t newsize = a * 2 * sizeof(unsigned char *);
584 
585 				new = (unsigned char **) realloc(uout, newsize);
586 				if (new == NULL)
587 				{
588 					free(uout);
589 					free(dout);
590 					return -1;
591 				}
592 
593 				uout = new;
594 
595 				new = (unsigned char **) realloc(dout, newsize);
596 				if (new == NULL)
597 				{
598 					free(uout);
599 					free(dout);
600 					return -1;
601 				}
602 
603 				dout = new;
604 
605 				a *= 2;
606 			}
607 
608 			uout[n] = u;
609 			dout[n++] = d;
610 
611 			uout[n] = '\0';
612 			dout[n] = '\0';
613 
614 			addr = p + 1;
615 
616 			break;
617 
618 		  default:
619 			break;
620 		}
621 	}
622 
623 	*users_out = uout;
624 	*domains_out = dout;
625 
626 	return 0;
627 }
628 
629 #ifdef DKIM_MAILPARSE_TEST
/*
**  Test driver: parse the single command line argument as an address
**  list and print each local-part/domain pair that was found.
*/
int
main(int argc, char **argv)
{
	unsigned char **users, **domains;
	int status;
	int idx;

	if (argc != 2)
	{
		fprintf(stderr, "Usage: %s mailheader\n", argv[0]);
		exit(64);
	}

	status = dkim_mail_parse_multi((unsigned char *) argv[1],
	                               &users, &domains);
	if (status != 0)
	{
		printf("error %d\n", status);
		return 0;
	}

	/* the arrays end at the first slot where both entries are NULL */
	idx = 0;
	while (users[idx] != NULL || domains[idx] != NULL)
	{
		const char *user = "null";
		const char *domain = "null";

		if (users[idx] != NULL)
			user = dkim_mail_unescape((char *) users[idx]);
		if (domains[idx] != NULL)
			domain = dkim_mail_unescape((char *) domains[idx]);

		printf("user: '%s'\ndomain: '%s'\n", user, domain);
		idx++;
	}

	return 0;
}
663 #endif /* DKIM_MAILPARSE_TEST */
664