1 /*-
2  * Copyright 2016 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "config.h"
18 #include "email_addr.h"
19 #include "message.h"
20 #include "printf.h"
21 #include "smtp_parsers.h"
22 
23 static void
rspamd_email_address_unescape(struct rspamd_email_address * addr)24 rspamd_email_address_unescape (struct rspamd_email_address *addr)
25 {
26 	const char *h, *end;
27 	char *t, *d;
28 
29 	if (addr->user_len == 0) {
30 		return;
31 	}
32 
33 	d = g_malloc (addr->user_len);
34 	t = d;
35 	h = addr->user;
36 	end = h + addr->user_len;
37 
38 	while (h < end) {
39 		if (*h != '\\') {
40 			*t++ = *h;
41 		}
42 		h ++;
43 	}
44 
45 	addr->user = d;
46 	addr->user_len = t - d;
47 	addr->flags |= RSPAMD_EMAIL_ADDR_USER_ALLOCATED;
48 }
49 
50 struct rspamd_email_address *
rspamd_email_address_from_smtp(const gchar * str,guint len)51 rspamd_email_address_from_smtp (const gchar *str, guint len)
52 {
53 	struct rspamd_email_address addr, *ret;
54 	gsize nlen;
55 
56 	if (str == NULL || len == 0) {
57 		return NULL;
58 	}
59 
60 	rspamd_smtp_addr_parse (str, len, &addr);
61 
62 	if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) {
63 		ret = g_malloc (sizeof (*ret));
64 		memcpy (ret, &addr, sizeof (addr));
65 
66 		if ((ret->flags & RSPAMD_EMAIL_ADDR_QUOTED) && ret->addr[0] == '"') {
67 			if (ret->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
68 				/* We also need to unquote user */
69 				rspamd_email_address_unescape (ret);
70 			}
71 
72 			/* We need to unquote addr */
73 			nlen = ret->domain_len + ret->user_len + 2;
74 			ret->addr = g_malloc (nlen + 1);
75 			ret->addr_len = rspamd_snprintf ((char *)ret->addr, nlen, "%*s@%*s",
76 					(gint)ret->user_len, ret->user,
77 					(gint)ret->domain_len, ret->domain);
78 			ret->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
79 		}
80 
81 		return ret;
82 	}
83 
84 	return NULL;
85 }
86 
87 void
rspamd_email_address_free(struct rspamd_email_address * addr)88 rspamd_email_address_free (struct rspamd_email_address *addr)
89 {
90 	if (addr) {
91 		if (addr->flags & RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED) {
92 			g_free ((void *) addr->addr);
93 		}
94 
95 		if (addr->flags & RSPAMD_EMAIL_ADDR_USER_ALLOCATED) {
96 			g_free ((void *) addr->user);
97 		}
98 
99 		g_free (addr);
100 	}
101 }
102 
103 static inline void
rspamd_email_address_add(rspamd_mempool_t * pool,GPtrArray * ar,struct rspamd_email_address * addr,GString * name)104 rspamd_email_address_add (rspamd_mempool_t *pool,
105 		GPtrArray *ar,
106 		struct rspamd_email_address *addr,
107 		GString *name)
108 {
109 	struct rspamd_email_address *elt;
110 	guint nlen;
111 
112 	elt = g_malloc0 (sizeof (*elt));
113 	rspamd_mempool_notify_alloc (pool, sizeof (*elt));
114 
115 	if (addr != NULL) {
116 		memcpy (elt, addr, sizeof (*addr));
117 	}
118 	else {
119 		elt->addr = "";
120 		elt->domain = "";
121 		elt->raw = "<>";
122 		elt->raw_len = 2;
123 		elt->user = "";
124 		elt->flags |= RSPAMD_EMAIL_ADDR_EMPTY;
125 	}
126 
127 	if ((elt->flags & RSPAMD_EMAIL_ADDR_QUOTED) && elt->addr[0] == '"') {
128 		if (elt->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
129 			/* We also need to unquote user */
130 			rspamd_email_address_unescape (elt);
131 		}
132 
133 		/* We need to unquote addr */
134 		nlen = elt->domain_len + elt->user_len + 2;
135 		elt->addr = g_malloc (nlen + 1);
136 		rspamd_mempool_notify_alloc (pool, nlen + 1);
137 		elt->addr_len = rspamd_snprintf ((char *)elt->addr, nlen, "%*s@%*s",
138 				(gint)elt->user_len, elt->user,
139 				(gint)elt->domain_len, elt->domain);
140 		elt->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
141 	}
142 
143 	if (name->len > 0) {
144 		rspamd_gstring_strip (name, " \t\v");
145 		elt->name = rspamd_mime_header_decode (pool, name->str, name->len, NULL);
146 	}
147 
148 	rspamd_mempool_notify_alloc (pool, name->len);
149 	g_ptr_array_add (ar, elt);
150 }
151 
152 /*
153  * Tries to parse an email address that doesn't conform RFC
154  */
155 static gboolean
rspamd_email_address_parse_heuristic(const char * data,size_t len,struct rspamd_email_address * addr)156 rspamd_email_address_parse_heuristic (const char *data, size_t len,
157 		struct rspamd_email_address *addr)
158 {
159 	const gchar *p = data, *at = NULL, *end = data + len;
160 	gboolean ret = FALSE;
161 
162 	memset (addr, 0, sizeof (*addr));
163 
164 	if (*p == '<' && len > 1) {
165 		/* Angled address */
166 		addr->addr_len = rspamd_memcspn (p + 1, ">", len - 1);
167 		addr->addr = p + 1;
168 		addr->raw = p;
169 		addr->raw_len = len;
170 		ret = TRUE;
171 
172 		p = p + 1;
173 		len = addr->addr_len;
174 		end = p + len;
175 	}
176 	else if (len > 0) {
177 		addr->addr = p;
178 		addr->addr_len = len;
179 		addr->raw = p;
180 		addr->raw_len = len;
181 		ret = TRUE;
182 	}
183 
184 	if (ret) {
185 		at = rspamd_memrchr (p, '@', len);
186 
187 		if (at != NULL && at + 1 < end) {
188 			addr->domain = at + 1;
189 			addr->domain_len = end - (at + 1);
190 			addr->user = p;
191 			addr->user_len = at - p;
192 		}
193 
194 		if (rspamd_str_has_8bit (p, len)) {
195 			addr->flags |= RSPAMD_EMAIL_ADDR_HAS_8BIT;
196 		}
197 	}
198 
199 	return ret;
200 }
201 
202 static inline int
rspamd_email_address_check_and_add(const gchar * start,gsize len,GPtrArray * res,rspamd_mempool_t * pool,GString * ns,gint max_elements)203 rspamd_email_address_check_and_add (const gchar *start, gsize len,
204 									GPtrArray *res,
205 									rspamd_mempool_t *pool,
206 									GString *ns,
207 									gint max_elements)
208 {
209 	struct rspamd_email_address addr;
210 
211 	g_assert (res != NULL);
212 
213 	if (max_elements > 0 && res->len >= max_elements) {
214 		msg_info_pool_check ("reached maximum number of elements %d when adding %v",
215 				max_elements,
216 				ns);
217 
218 		return -1;
219 	}
220 
221 	/* The whole email is likely address */
222 	memset (&addr, 0, sizeof (addr));
223 	rspamd_smtp_addr_parse (start, len, &addr);
224 
225 	if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) {
226 		rspamd_email_address_add (pool, res, &addr, ns);
227 	}
228 	else {
229 		/* Try heuristic */
230 		if (rspamd_email_address_parse_heuristic (start,
231 				len, &addr)) {
232 			rspamd_email_address_add (pool, res, &addr, ns);
233 
234 			return 1;
235 		}
236 		else {
237 			return 0;
238 		}
239 	}
240 
241 	return 1;
242 }
243 
244 GPtrArray *
rspamd_email_address_from_mime(rspamd_mempool_t * pool,const gchar * hdr,guint len,GPtrArray * src,gint max_elements)245 rspamd_email_address_from_mime (rspamd_mempool_t *pool, const gchar *hdr,
246 								guint len,
247 								GPtrArray *src,
248 								gint max_elements)
249 {
250 	GPtrArray *res = src;
251 	gboolean seen_at = FALSE, seen_obrace = FALSE;
252 
253 	const gchar *p = hdr, *end = hdr + len, *c = hdr, *t;
254 	GString *ns, *cpy;
255 	gint obraces, ebraces;
256 	enum {
257 		parse_name = 0,
258 		parse_quoted,
259 		parse_addr,
260 		skip_spaces
261 	} state = parse_name, next_state = parse_name;
262 
263 	if (res == NULL) {
264 		res = g_ptr_array_sized_new (2);
265 		rspamd_mempool_add_destructor (pool, rspamd_email_address_list_destroy,
266 				res);
267 	}
268 	else if (max_elements > 0 && res->len >= max_elements) {
269 		msg_info_pool_check ("reached maximum number of elements %d", max_elements);
270 
271 		return res;
272 	}
273 
274 	ns = g_string_sized_new (len);
275 	cpy = g_string_sized_new (len);
276 
277 	rspamd_mempool_add_destructor (pool, rspamd_gstring_free_hard, cpy);
278 
279 	/* First, we need to remove all comments as they are terrible */
280 	obraces = 0;
281 	ebraces = 0;
282 
283 	while (p < end) {
284 		if (state == parse_name) {
285 			if (*p == '\\') {
286 				if (obraces == 0) {
287 					g_string_append_c (cpy, *p);
288 				}
289 
290 				p++;
291 			}
292 			else {
293 				if (*p == '"') {
294 					state = parse_quoted;
295 				}
296 				else if (*p == '(') {
297 					obraces ++; /* To avoid ) itself being copied */
298 				}
299 				else if (*p == ')') {
300 					ebraces ++;
301 					p ++;
302 				}
303 
304 				if (obraces == ebraces) {
305 					obraces = 0;
306 					ebraces = 0;
307 				}
308 			}
309 
310 			if (p < end && obraces == 0) {
311 				g_string_append_c (cpy, *p);
312 			}
313 		}
314 		else {
315 			/* Quoted elt */
316 			if (*p == '\\') {
317 				g_string_append_c (cpy, *p);
318 				p++;
319 			}
320 			else {
321 				if (*p == '"') {
322 					state = parse_name;
323 				}
324 			}
325 
326 			if (p < end) {
327 				g_string_append_c (cpy, *p);
328 			}
329 		}
330 
331 		p++;
332 	}
333 
334 	state = parse_name;
335 
336 	p = cpy->str;
337 	c = p;
338 	end = p + cpy->len;
339 
340 	while (p < end) {
341 		switch (state) {
342 		case parse_name:
343 			if (*p == '"') {
344 				/* We need to strip last spaces and update `ns` */
345 				if (p > c) {
346 					guint nspaces = 0;
347 
348 					t = p - 1;
349 
350 					while (t > c && g_ascii_isspace (*t)) {
351 						t --;
352 						nspaces ++;
353 					}
354 
355 					g_string_append_len (ns, c, t - c + 1);
356 
357 					if (nspaces > 0) {
358 						g_string_append_c (ns, ' ');
359 					}
360 				}
361 
362 				state = parse_quoted;
363 				c = p + 1;
364 			}
365 			else if (*p == '<') {
366 				if (p > c) {
367 					t = p - 1;
368 
369 					while (t > c && g_ascii_isspace (*t)) {
370 						t --;
371 					}
372 
373 					g_string_append_len (ns, c, t - c + 1);
374 				}
375 
376 				c = p;
377 				state = parse_addr;
378 			}
379 			else if (*p == ',') {
380 				if (p > c && seen_at) {
381 					/*
382 					 * Last token must be the address:
383 					 * e.g. Some name name@domain.com
384 					 */
385 					t = p - 1;
386 
387 					while (t > c && g_ascii_isspace (*t)) {
388 						t --;
389 					}
390 
391 					int check = rspamd_email_address_check_and_add (c, t - c + 1,
392 							res, pool, ns, max_elements);
393 
394 					if (check == 0 && res->len == 0) {
395 						/* Insert fake address */
396 						rspamd_email_address_add (pool, res, NULL, ns);
397 					}
398 					else if (check != 1) {
399 						goto end;
400 					}
401 
402 					/* Cleanup for the next use */
403 					g_string_set_size (ns, 0);
404 					seen_at = FALSE;
405 				}
406 
407 				state = skip_spaces;
408 				next_state = parse_name;
409 			}
410 			else if (*p == '@') {
411 				seen_at = TRUE;
412 			}
413 
414 			p ++;
415 			break;
416 		case parse_quoted:
417 			if (*p == '\\') {
418 				if (p > c) {
419 					g_string_append_len (ns, c, p - c);
420 				}
421 
422 				p ++;
423 				c = p;
424 			}
425 			else if (*p == '"') {
426 				if (p > c) {
427 					g_string_append_len (ns, c, p - c);
428 				}
429 
430 				if (p + 1 < end && g_ascii_isspace (p[1])) {
431 					g_string_append_c (ns, ' ');
432 				}
433 
434 				state = skip_spaces;
435 				next_state = parse_name;
436 			}
437 			else if (*p == '@' && seen_obrace) {
438 				seen_at = TRUE;
439 			}
440 			else if (*p == '<') {
441 				seen_obrace = TRUE;
442 			}
443 			p ++;
444 			break;
445 		case parse_addr:
446 			if (*p == '>') {
447 				int check = rspamd_email_address_check_and_add (c, p - c + 1,
448 						res, pool, ns, max_elements);
449 				if (check == 0 && res->len == 0) {
450 					/* Insert a fake address */
451 					rspamd_email_address_add (pool, res, NULL, ns);
452 				}
453 				else if (check != 1) {
454 					goto end;
455 				}
456 
457 				/* Cleanup for the next use */
458 				g_string_set_size (ns, 0);
459 				seen_at = FALSE;
460 				state = skip_spaces;
461 				next_state = parse_name;
462 			}
463 			else if (*p == '@') {
464 				seen_at = TRUE;
465 			}
466 			p ++;
467 			break;
468 		case skip_spaces:
469 			if (!g_ascii_isspace (*p)) {
470 				c = p;
471 				state = next_state;
472 			}
473 			else {
474 				p ++;
475 			}
476 			break;
477 		}
478 	}
479 
480 	/* Handle leftover */
481 	switch (state) {
482 	case parse_name:
483 		/* Assume the whole header as name (bad thing) */
484 		if (p > c) {
485 			while (p > c && g_ascii_isspace (*p)) {
486 				p --;
487 			}
488 
489 			if (p > c) {
490 				if (seen_at) {
491 					/* The whole email is likely address */
492 					int check = rspamd_email_address_check_and_add (c, p - c,
493 							res, pool, ns, max_elements);
494 					if (check == 0 && res->len == 0) {
495 						/* Insert a fake address */
496 						rspamd_email_address_add (pool, res, NULL, ns);
497 					}
498 					else if (check != 1) {
499 						goto end;
500 					}
501 				} else {
502 					/* No @ seen */
503 					g_string_append_len (ns, c, p - c);
504 
505 					if (res->len == 0) {
506 						rspamd_email_address_add (pool, res, NULL, ns);
507 					}
508 				}
509 			}
510 			else if (res->len == 0) {
511 				rspamd_email_address_add (pool, res, NULL, ns);
512 			}
513 		}
514 		break;
515 	case parse_addr:
516 		if (p > c) {
517 			if (rspamd_email_address_check_and_add (c, p - c,
518 					res, pool, ns, max_elements) == 0) {
519 				if (res->len == 0) {
520 					rspamd_email_address_add (pool, res, NULL, ns);
521 				}
522 			}
523 		}
524 		break;
525 	case parse_quoted:
526 		/* Unfinished quoted string or a comment */
527 		/* If we have seen obrace + at, then we still can try to resolve address */
528 		if (seen_at && seen_obrace) {
529 			p = rspamd_memrchr (cpy->str, '<', cpy->len);
530 			g_assert (p != NULL);
531 			if (rspamd_email_address_check_and_add (p, end - p,
532 					res, pool, ns, max_elements) == 0) {
533 				if (res->len == 0) {
534 					rspamd_email_address_add (pool, res, NULL, ns);
535 				}
536 			}
537 		}
538 		break;
539 	default:
540 		/* Do nothing */
541 		break;
542 	}
543 end:
544 	rspamd_mempool_notify_alloc (pool, cpy->len);
545 	g_string_free (ns, TRUE);
546 
547 	return res;
548 }
549 
550 void
rspamd_email_address_list_destroy(gpointer ptr)551 rspamd_email_address_list_destroy (gpointer ptr)
552 {
553 	GPtrArray *ar = ptr;
554 	guint i;
555 	struct rspamd_email_address *addr;
556 
557 	PTR_ARRAY_FOREACH (ar, i, addr) {
558 		rspamd_email_address_free (addr);
559 	}
560 
561 	g_ptr_array_free (ar, TRUE);
562 }