1 /*-
2 * Copyright 2016 Vsevolod Stakhov
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "config.h"
18 #include "email_addr.h"
19 #include "message.h"
20 #include "printf.h"
21 #include "smtp_parsers.h"
22
23 static void
rspamd_email_address_unescape(struct rspamd_email_address * addr)24 rspamd_email_address_unescape (struct rspamd_email_address *addr)
25 {
26 const char *h, *end;
27 char *t, *d;
28
29 if (addr->user_len == 0) {
30 return;
31 }
32
33 d = g_malloc (addr->user_len);
34 t = d;
35 h = addr->user;
36 end = h + addr->user_len;
37
38 while (h < end) {
39 if (*h != '\\') {
40 *t++ = *h;
41 }
42 h ++;
43 }
44
45 addr->user = d;
46 addr->user_len = t - d;
47 addr->flags |= RSPAMD_EMAIL_ADDR_USER_ALLOCATED;
48 }
49
50 struct rspamd_email_address *
rspamd_email_address_from_smtp(const gchar * str,guint len)51 rspamd_email_address_from_smtp (const gchar *str, guint len)
52 {
53 struct rspamd_email_address addr, *ret;
54 gsize nlen;
55
56 if (str == NULL || len == 0) {
57 return NULL;
58 }
59
60 rspamd_smtp_addr_parse (str, len, &addr);
61
62 if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) {
63 ret = g_malloc (sizeof (*ret));
64 memcpy (ret, &addr, sizeof (addr));
65
66 if ((ret->flags & RSPAMD_EMAIL_ADDR_QUOTED) && ret->addr[0] == '"') {
67 if (ret->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
68 /* We also need to unquote user */
69 rspamd_email_address_unescape (ret);
70 }
71
72 /* We need to unquote addr */
73 nlen = ret->domain_len + ret->user_len + 2;
74 ret->addr = g_malloc (nlen + 1);
75 ret->addr_len = rspamd_snprintf ((char *)ret->addr, nlen, "%*s@%*s",
76 (gint)ret->user_len, ret->user,
77 (gint)ret->domain_len, ret->domain);
78 ret->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
79 }
80
81 return ret;
82 }
83
84 return NULL;
85 }
86
87 void
rspamd_email_address_free(struct rspamd_email_address * addr)88 rspamd_email_address_free (struct rspamd_email_address *addr)
89 {
90 if (addr) {
91 if (addr->flags & RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED) {
92 g_free ((void *) addr->addr);
93 }
94
95 if (addr->flags & RSPAMD_EMAIL_ADDR_USER_ALLOCATED) {
96 g_free ((void *) addr->user);
97 }
98
99 g_free (addr);
100 }
101 }
102
103 static inline void
rspamd_email_address_add(rspamd_mempool_t * pool,GPtrArray * ar,struct rspamd_email_address * addr,GString * name)104 rspamd_email_address_add (rspamd_mempool_t *pool,
105 GPtrArray *ar,
106 struct rspamd_email_address *addr,
107 GString *name)
108 {
109 struct rspamd_email_address *elt;
110 guint nlen;
111
112 elt = g_malloc0 (sizeof (*elt));
113 rspamd_mempool_notify_alloc (pool, sizeof (*elt));
114
115 if (addr != NULL) {
116 memcpy (elt, addr, sizeof (*addr));
117 }
118 else {
119 elt->addr = "";
120 elt->domain = "";
121 elt->raw = "<>";
122 elt->raw_len = 2;
123 elt->user = "";
124 elt->flags |= RSPAMD_EMAIL_ADDR_EMPTY;
125 }
126
127 if ((elt->flags & RSPAMD_EMAIL_ADDR_QUOTED) && elt->addr[0] == '"') {
128 if (elt->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
129 /* We also need to unquote user */
130 rspamd_email_address_unescape (elt);
131 }
132
133 /* We need to unquote addr */
134 nlen = elt->domain_len + elt->user_len + 2;
135 elt->addr = g_malloc (nlen + 1);
136 rspamd_mempool_notify_alloc (pool, nlen + 1);
137 elt->addr_len = rspamd_snprintf ((char *)elt->addr, nlen, "%*s@%*s",
138 (gint)elt->user_len, elt->user,
139 (gint)elt->domain_len, elt->domain);
140 elt->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
141 }
142
143 if (name->len > 0) {
144 rspamd_gstring_strip (name, " \t\v");
145 elt->name = rspamd_mime_header_decode (pool, name->str, name->len, NULL);
146 }
147
148 rspamd_mempool_notify_alloc (pool, name->len);
149 g_ptr_array_add (ar, elt);
150 }
151
152 /*
153 * Tries to parse an email address that doesn't conform RFC
154 */
155 static gboolean
rspamd_email_address_parse_heuristic(const char * data,size_t len,struct rspamd_email_address * addr)156 rspamd_email_address_parse_heuristic (const char *data, size_t len,
157 struct rspamd_email_address *addr)
158 {
159 const gchar *p = data, *at = NULL, *end = data + len;
160 gboolean ret = FALSE;
161
162 memset (addr, 0, sizeof (*addr));
163
164 if (*p == '<' && len > 1) {
165 /* Angled address */
166 addr->addr_len = rspamd_memcspn (p + 1, ">", len - 1);
167 addr->addr = p + 1;
168 addr->raw = p;
169 addr->raw_len = len;
170 ret = TRUE;
171
172 p = p + 1;
173 len = addr->addr_len;
174 end = p + len;
175 }
176 else if (len > 0) {
177 addr->addr = p;
178 addr->addr_len = len;
179 addr->raw = p;
180 addr->raw_len = len;
181 ret = TRUE;
182 }
183
184 if (ret) {
185 at = rspamd_memrchr (p, '@', len);
186
187 if (at != NULL && at + 1 < end) {
188 addr->domain = at + 1;
189 addr->domain_len = end - (at + 1);
190 addr->user = p;
191 addr->user_len = at - p;
192 }
193
194 if (rspamd_str_has_8bit (p, len)) {
195 addr->flags |= RSPAMD_EMAIL_ADDR_HAS_8BIT;
196 }
197 }
198
199 return ret;
200 }
201
202 static inline int
rspamd_email_address_check_and_add(const gchar * start,gsize len,GPtrArray * res,rspamd_mempool_t * pool,GString * ns,gint max_elements)203 rspamd_email_address_check_and_add (const gchar *start, gsize len,
204 GPtrArray *res,
205 rspamd_mempool_t *pool,
206 GString *ns,
207 gint max_elements)
208 {
209 struct rspamd_email_address addr;
210
211 g_assert (res != NULL);
212
213 if (max_elements > 0 && res->len >= max_elements) {
214 msg_info_pool_check ("reached maximum number of elements %d when adding %v",
215 max_elements,
216 ns);
217
218 return -1;
219 }
220
221 /* The whole email is likely address */
222 memset (&addr, 0, sizeof (addr));
223 rspamd_smtp_addr_parse (start, len, &addr);
224
225 if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) {
226 rspamd_email_address_add (pool, res, &addr, ns);
227 }
228 else {
229 /* Try heuristic */
230 if (rspamd_email_address_parse_heuristic (start,
231 len, &addr)) {
232 rspamd_email_address_add (pool, res, &addr, ns);
233
234 return 1;
235 }
236 else {
237 return 0;
238 }
239 }
240
241 return 1;
242 }
243
244 GPtrArray *
rspamd_email_address_from_mime(rspamd_mempool_t * pool,const gchar * hdr,guint len,GPtrArray * src,gint max_elements)245 rspamd_email_address_from_mime (rspamd_mempool_t *pool, const gchar *hdr,
246 guint len,
247 GPtrArray *src,
248 gint max_elements)
249 {
250 GPtrArray *res = src;
251 gboolean seen_at = FALSE, seen_obrace = FALSE;
252
253 const gchar *p = hdr, *end = hdr + len, *c = hdr, *t;
254 GString *ns, *cpy;
255 gint obraces, ebraces;
256 enum {
257 parse_name = 0,
258 parse_quoted,
259 parse_addr,
260 skip_spaces
261 } state = parse_name, next_state = parse_name;
262
263 if (res == NULL) {
264 res = g_ptr_array_sized_new (2);
265 rspamd_mempool_add_destructor (pool, rspamd_email_address_list_destroy,
266 res);
267 }
268 else if (max_elements > 0 && res->len >= max_elements) {
269 msg_info_pool_check ("reached maximum number of elements %d", max_elements);
270
271 return res;
272 }
273
274 ns = g_string_sized_new (len);
275 cpy = g_string_sized_new (len);
276
277 rspamd_mempool_add_destructor (pool, rspamd_gstring_free_hard, cpy);
278
279 /* First, we need to remove all comments as they are terrible */
280 obraces = 0;
281 ebraces = 0;
282
283 while (p < end) {
284 if (state == parse_name) {
285 if (*p == '\\') {
286 if (obraces == 0) {
287 g_string_append_c (cpy, *p);
288 }
289
290 p++;
291 }
292 else {
293 if (*p == '"') {
294 state = parse_quoted;
295 }
296 else if (*p == '(') {
297 obraces ++; /* To avoid ) itself being copied */
298 }
299 else if (*p == ')') {
300 ebraces ++;
301 p ++;
302 }
303
304 if (obraces == ebraces) {
305 obraces = 0;
306 ebraces = 0;
307 }
308 }
309
310 if (p < end && obraces == 0) {
311 g_string_append_c (cpy, *p);
312 }
313 }
314 else {
315 /* Quoted elt */
316 if (*p == '\\') {
317 g_string_append_c (cpy, *p);
318 p++;
319 }
320 else {
321 if (*p == '"') {
322 state = parse_name;
323 }
324 }
325
326 if (p < end) {
327 g_string_append_c (cpy, *p);
328 }
329 }
330
331 p++;
332 }
333
334 state = parse_name;
335
336 p = cpy->str;
337 c = p;
338 end = p + cpy->len;
339
340 while (p < end) {
341 switch (state) {
342 case parse_name:
343 if (*p == '"') {
344 /* We need to strip last spaces and update `ns` */
345 if (p > c) {
346 guint nspaces = 0;
347
348 t = p - 1;
349
350 while (t > c && g_ascii_isspace (*t)) {
351 t --;
352 nspaces ++;
353 }
354
355 g_string_append_len (ns, c, t - c + 1);
356
357 if (nspaces > 0) {
358 g_string_append_c (ns, ' ');
359 }
360 }
361
362 state = parse_quoted;
363 c = p + 1;
364 }
365 else if (*p == '<') {
366 if (p > c) {
367 t = p - 1;
368
369 while (t > c && g_ascii_isspace (*t)) {
370 t --;
371 }
372
373 g_string_append_len (ns, c, t - c + 1);
374 }
375
376 c = p;
377 state = parse_addr;
378 }
379 else if (*p == ',') {
380 if (p > c && seen_at) {
381 /*
382 * Last token must be the address:
383 * e.g. Some name name@domain.com
384 */
385 t = p - 1;
386
387 while (t > c && g_ascii_isspace (*t)) {
388 t --;
389 }
390
391 int check = rspamd_email_address_check_and_add (c, t - c + 1,
392 res, pool, ns, max_elements);
393
394 if (check == 0 && res->len == 0) {
395 /* Insert fake address */
396 rspamd_email_address_add (pool, res, NULL, ns);
397 }
398 else if (check != 1) {
399 goto end;
400 }
401
402 /* Cleanup for the next use */
403 g_string_set_size (ns, 0);
404 seen_at = FALSE;
405 }
406
407 state = skip_spaces;
408 next_state = parse_name;
409 }
410 else if (*p == '@') {
411 seen_at = TRUE;
412 }
413
414 p ++;
415 break;
416 case parse_quoted:
417 if (*p == '\\') {
418 if (p > c) {
419 g_string_append_len (ns, c, p - c);
420 }
421
422 p ++;
423 c = p;
424 }
425 else if (*p == '"') {
426 if (p > c) {
427 g_string_append_len (ns, c, p - c);
428 }
429
430 if (p + 1 < end && g_ascii_isspace (p[1])) {
431 g_string_append_c (ns, ' ');
432 }
433
434 state = skip_spaces;
435 next_state = parse_name;
436 }
437 else if (*p == '@' && seen_obrace) {
438 seen_at = TRUE;
439 }
440 else if (*p == '<') {
441 seen_obrace = TRUE;
442 }
443 p ++;
444 break;
445 case parse_addr:
446 if (*p == '>') {
447 int check = rspamd_email_address_check_and_add (c, p - c + 1,
448 res, pool, ns, max_elements);
449 if (check == 0 && res->len == 0) {
450 /* Insert a fake address */
451 rspamd_email_address_add (pool, res, NULL, ns);
452 }
453 else if (check != 1) {
454 goto end;
455 }
456
457 /* Cleanup for the next use */
458 g_string_set_size (ns, 0);
459 seen_at = FALSE;
460 state = skip_spaces;
461 next_state = parse_name;
462 }
463 else if (*p == '@') {
464 seen_at = TRUE;
465 }
466 p ++;
467 break;
468 case skip_spaces:
469 if (!g_ascii_isspace (*p)) {
470 c = p;
471 state = next_state;
472 }
473 else {
474 p ++;
475 }
476 break;
477 }
478 }
479
480 /* Handle leftover */
481 switch (state) {
482 case parse_name:
483 /* Assume the whole header as name (bad thing) */
484 if (p > c) {
485 while (p > c && g_ascii_isspace (*p)) {
486 p --;
487 }
488
489 if (p > c) {
490 if (seen_at) {
491 /* The whole email is likely address */
492 int check = rspamd_email_address_check_and_add (c, p - c,
493 res, pool, ns, max_elements);
494 if (check == 0 && res->len == 0) {
495 /* Insert a fake address */
496 rspamd_email_address_add (pool, res, NULL, ns);
497 }
498 else if (check != 1) {
499 goto end;
500 }
501 } else {
502 /* No @ seen */
503 g_string_append_len (ns, c, p - c);
504
505 if (res->len == 0) {
506 rspamd_email_address_add (pool, res, NULL, ns);
507 }
508 }
509 }
510 else if (res->len == 0) {
511 rspamd_email_address_add (pool, res, NULL, ns);
512 }
513 }
514 break;
515 case parse_addr:
516 if (p > c) {
517 if (rspamd_email_address_check_and_add (c, p - c,
518 res, pool, ns, max_elements) == 0) {
519 if (res->len == 0) {
520 rspamd_email_address_add (pool, res, NULL, ns);
521 }
522 }
523 }
524 break;
525 case parse_quoted:
526 /* Unfinished quoted string or a comment */
527 /* If we have seen obrace + at, then we still can try to resolve address */
528 if (seen_at && seen_obrace) {
529 p = rspamd_memrchr (cpy->str, '<', cpy->len);
530 g_assert (p != NULL);
531 if (rspamd_email_address_check_and_add (p, end - p,
532 res, pool, ns, max_elements) == 0) {
533 if (res->len == 0) {
534 rspamd_email_address_add (pool, res, NULL, ns);
535 }
536 }
537 }
538 break;
539 default:
540 /* Do nothing */
541 break;
542 }
543 end:
544 rspamd_mempool_notify_alloc (pool, cpy->len);
545 g_string_free (ns, TRUE);
546
547 return res;
548 }
549
550 void
rspamd_email_address_list_destroy(gpointer ptr)551 rspamd_email_address_list_destroy (gpointer ptr)
552 {
553 GPtrArray *ar = ptr;
554 guint i;
555 struct rspamd_email_address *addr;
556
557 PTR_ARRAY_FOREACH (ar, i, addr) {
558 rspamd_email_address_free (addr);
559 }
560
561 g_ptr_array_free (ar, TRUE);
562 }