1 /* -*- mode: C; mode: fold -*- */
2 /*
3 This file is part of SLRN.
4
5 Copyright (c) 2009-2016 John E. Davis <jed@jedsoft.org>
6
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2 of the License, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 more details.
16
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation, Inc.,
19 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 */
21
22 #include <stdio.h>
23 #include <string.h>
24
25 #include "config.h"
26 #ifndef SLRNPULL_CODE
27 #include "slrnfeat.h"
28 #endif
29
30 #ifdef HAVE_STDLIB_H
31 # include <stdlib.h>
32 #endif
33
34 #ifdef HAVE_UNISTD_H
35 # include <unistd.h>
36 #endif
37
38 #include "parse2822.h"
39 #include "strutil.h"
40
41 /* The grammar from rfc2822 is:
42 *
43 * address = mailbox | group
44 * mailbox = name-addr | addr-spec
45 * name-addr = [display-name] angle-addr
 * angle-addr = [CFWS] "<" addr-spec ">" [CFWS] | obs-angle-addr
47 * group = display-name ":" [mailbox-list | CFWS ] ";"
48 * display-name = phrase
49 * mailbox-list = (mailbox *("," mailbox)) | obs-mbox-list
50 * addr-spec = local-part "@" domain
51 * local-part = dot-atom | quoted-string | obs-local-part
52 * domain = dot-atom | domain-literal | obs-domain
53 * domain-literal = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
54 * dcontent = dtext | quoted-pair
55 * dtext = non-white-space-ctrl | 33-90 | 94-126
56 * dot-atom = [CFWS] dot-atom-text [CFWS]
57 * dot-atom-text = 1*atext *("." 1*atext)
58 * atom = [CFWS] 1*atext [CFWS]
59 * atext = ascii except controls, space, and specials
 * quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
61 * qcontent = qtext | quoted-pair
62 * qtext = NO-WS-CTRL | 33-126 except \ and "
63 * FWS = ([*WSP CRLF] 1*WSP) | obs-FWS
64 * WSP = SPACE | TAB
65 * CFWS = *([FWS] comment) (([FWS] comment) | FWS)
66 * comment = "(" *([FWS] ccontent) [FWS] ")"
67 * ccontent = ctext | quoted-pair | comment
68 * ctext = NO-WS-CTRL | 33-39 | 42-91 | 93-126
69 * quoted-pair = ("\" text )
70 * text = ASCII except CR and LF
71 *
72 * Obsolete:
73 *
74 * word = atom | quoted-string
75 * phrase = 1*word | obs-phrase
76 * obs-phrase = word *(word | "." | CFWS)
77 * obs-local-part = word *("." word)
78 *
79 */
/* Address styles.  rfc2822_parse classifies each address as TYPE_ADD_ONLY
 * (no '<' seen) or TYPE_RFC_2822 ("phrase <addr>"); TYPE_OLD_STYLE is not
 * referenced by the code in this file.
 */
#define TYPE_ADD_ONLY 1
#define TYPE_OLD_STYLE 2
#define TYPE_RFC_2822 3

/* Character sets used by the individual parsers.  Each string lists the
 * bytes that are NOT allowed (or need special handling) in that context.
 */
#define RFC_2822_SPECIAL_CHARS "()<>[]:;@\\,.\""
#define RFC_2822_NOT_ATOM_CHARS RFC_2822_SPECIAL_CHARS
#define RFC_2822_NOT_DOTATOM_CHARS "(),:;<>@[\\]\""
#define RFC_2822_NOT_QUOTED_CHARS "\t\\\""
#define RFC_2822_NOT_DOMLIT_CHARS "[\\]"
#define RFC_2822_NOT_COMMENT_CHARS "(\\)"

/* Non-zero if ch is one of the RFC 2822 "specials". */
#define IS_RFC2822_SPECIAL(ch) \
   (slrn_strbyte(RFC_2822_SPECIAL_CHARS, (ch)) != NULL)

/* Non-zero if ch may appear in an atom: any byte above 32 (space) that is
 * not a special.  Note that this admits bytes with the high bit set.
 */
#define IS_RFC2822_ATEXT(ch) \
   ((32 < (unsigned char)(ch)) && !IS_RFC2822_SPECIAL(ch))

/* Non-zero if ch may appear unquoted in a phrase (display name): blank,
 * tab, dot, or any atext character.
 */
#define IS_RFC2822_PHRASE_CHAR(ch) \
   ((' ' == (ch)) || ('\t' == (ch)) || ('.' == (ch)) || IS_RFC2822_ATEXT(ch))
97
/* Validate the character that follows the backslash of a quoted-pair.
 *
 * p points at the character after the '\'; pmax is one past the last byte
 * that may be examined.  Returns 0 if that character exists and is neither
 * CR nor LF.  Otherwise *errmsg is set and -1 is returned.  Nothing is
 * consumed here; advancing past the character is left to the caller.
 */
static int check_quoted_pair (char *p, char *pmax, char **errmsg)
{
    if (p == pmax) {
        /* The backslash was the very last byte of the region. */
        *errmsg = _("Expecting a quoted-pair in the header.");
        return -1;
    }

    switch (*p) {
    case '\r':
    case '\n':
        *errmsg = _("Illegal quoted-pair character in header.");
        return -1;

    default:
        return 0;
    }
}
117
skip_quoted_string(char * p,char * pmax,char ** errmsg)118 static char *skip_quoted_string (char *p, char *pmax, char **errmsg)
119 {
120 while (p < pmax)
121 {
122 char ch = *p++;
123
124 if (ch == '"')
125 return p;
126
127 if (ch == '\\')
128 {
129 if (-1 == check_quoted_pair (p, pmax, errmsg))
130 return NULL;
131
132 p++;
133 continue;
134 }
135
136 if (NULL != slrn_strbyte(RFC_2822_NOT_QUOTED_CHARS, ch))
137 {
138 *errmsg = _("Illegal char in displayname of address header.");
139 return NULL;
140 }
141 }
142
143 *errmsg = _("Quoted string opened but never closed in address header.");
144 return NULL;
145 }
146
147 /* This function gets called with *startp positioned to the character past the
148 * opening '('. Find the matching ')' and encode everything in between.
149 */
parse_rfc2822_comment(char * header,char * parsemap,unsigned int * startp,unsigned int stop,char ** errmsg)150 static int parse_rfc2822_comment (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg) /*{{{*/
151 {
152 unsigned int start;
153
154 start = *startp;
155 while (start < stop)
156 {
157 unsigned char ch = (unsigned char) header[start];
158
159 if (ch == '(')
160 {
161 start++;
162 if (-1 == parse_rfc2822_comment (header, parsemap, &start, stop, errmsg))
163 {
164 *startp = start;
165 return -1;
166 }
167 continue;
168 }
169
170 if (ch == ')')
171 {
172 start++;
173 *startp = start;
174 return 0;
175 }
176
177 if (ch == '\\')
178 {
179 parsemap[start] = 'C';
180 start++;
181 if (-1 == check_quoted_pair (header+start, header+stop, errmsg))
182 {
183 *startp = start;
184 return -1;
185 }
186 parsemap[start] = 'C';
187 start++;
188 continue;
189 }
190
191 if (NULL != slrn_strbyte(RFC_2822_NOT_COMMENT_CHARS, ch))
192 {
193 *errmsg = _("Illegal char in displayname of address header.");
194 *startp = start;
195 return -1;
196 }
197 parsemap[start] = 'C';
198 start++;
199 }
200
201 *errmsg = _("Comment opened but never closed in address header.");
202 return -1;
203 }
204
205 /*}}}*/
206
/* Skip optional CFWS: any mix of blanks, tabs and parenthesised comments.
 *
 * On return *startp indexes the first byte that is neither blank, tab, nor
 * the start of a comment (or equals stop if the region is exhausted).
 * Comment text is flagged in parsemap by parse_rfc2822_comment.
 * Returns 0 on success, or -1 with *errmsg set if a comment fails to parse.
 */
static int parse_rfc2822_cfws (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg)
{
    for (;;) {
        unsigned int pos = *startp;

        while ((pos < stop) && ((header[pos] == ' ') || (header[pos] == '\t')))
            pos++;

        *startp = pos;
        if ((pos == stop) || (header[pos] != '('))
            return 0;

        /* Step over the '(' and let the comment parser find its ')'. */
        *startp = pos + 1;
        if (-1 == parse_rfc2822_comment (header, parsemap, startp, stop, errmsg))
            return -1;
    }
}
228
/* Consume a run of one or more atext characters (see IS_RFC2822_ATEXT)
 * beginning at *startp.  A '.' is a special and therefore ends the run.
 *
 * On success *startp indexes the first byte after the run and 0 is
 * returned.  If the region is already exhausted, or the first byte is not
 * atext, *errmsg is set, *startp is left unchanged and -1 is returned.
 * parsemap is not touched.
 */
static int parse_rfc2822_atext (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg)
{
    unsigned int pos = *startp;

    (void) parsemap;
    /* Not referenced by this function; kept because the definition is
     * visible to the rest of the file. */
#define IS_NOT_ATEXT_CHAR(ch) \
   (((ch) & 0x80) || ((unsigned char)(ch) <= ' ') || ((ch) == '.') \
    || (NULL != slrn_strbyte(RFC_2822_NOT_DOTATOM_CHARS, (ch))))

    if (pos >= stop) {
        *errmsg = _("premature end of parse seen in atext portion of email address");
        return -1;
    }

    if (!IS_RFC2822_ATEXT(header[pos])) {
        *errmsg = _("Expecting an atext character");
        return -1;
    }

    /* The first character is known to be atext; swallow the rest of the run. */
    do
        pos++;
    while ((pos < stop) && IS_RFC2822_ATEXT(header[pos]));

    *startp = pos;
    return 0;
}
263
/* Parse a quoted string followed by optional CFWS.  *startp must index the
 * byte that follows the opening '"'.
 *
 * On success *startp is moved past the closing '"' and any trailing CFWS.
 * Returns 0 on success, -1 with *errmsg set otherwise; if the quoted string
 * itself is malformed *startp is left unchanged.
 */
static int parse_rfc2822_quoted_string (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg)
{
    char *after_quote;

    after_quote = skip_quoted_string (header + *startp, header + stop, errmsg);
    if (NULL == after_quote)
        return -1;

    *startp = (unsigned int) (after_quote - header);

    return parse_rfc2822_cfws (header, parsemap, startp, stop, errmsg);
}
280
/* Parse one element of a dot-atom: a quoted string (only when
 * allow_quoted_string is non-zero and the next byte is '"') or else a run
 * of atext.  Returns 0 on success, -1 with *errmsg set on failure.
 */
static int parse_rfc2822_dotatom_word (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg, int allow_quoted_string)
{
    if (allow_quoted_string
        && (*startp < stop) && (header[*startp] == '"')) {
        *startp += 1;               /* skip the opening quote */
        return parse_rfc2822_quoted_string (header, parsemap, startp, stop, errmsg);
    }

    return parse_rfc2822_atext (header, parsemap, startp, stop, errmsg);
}

/* dotatom: [CFWS] atext [.atext ...] [CFWS]
 * Note that the obsolete forms allow CFWS on both sides of the dot.
 * Moreover, they allow quoted-strings between the dots.
 * This is also permitted here:
 *    [CFWS] word [[CFWS] "." [CFWS] word ...] [CFWS]
 * where word is atext, or a quoted string if allow_quoted_string is non-zero.
 *
 * On success *startp indexes the first byte after the dot-atom and its
 * trailing CFWS, and 0 is returned.  On failure *errmsg is set and -1 is
 * returned.  No byte at or beyond index stop is examined when looking for
 * the next '.'.
 */
static int parse_rfc2822_dotatom (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg, int allow_quoted_string)
{
    if (-1 == parse_rfc2822_cfws (header, parsemap, startp, stop, errmsg))
        return -1;

    if (-1 == parse_rfc2822_dotatom_word (header, parsemap, startp, stop, errmsg, allow_quoted_string))
        return -1;

    for (;;) {
        if (-1 == parse_rfc2822_cfws (header, parsemap, startp, stop, errmsg))
            return -1;

        /* The CFWS above may have consumed the rest of the region, so the
         * bound must be re-checked before looking at the next byte. */
        if ((*startp >= stop) || (header[*startp] != '.'))
            break;

        *startp += 1;               /* skip the '.' */

        if (-1 == parse_rfc2822_cfws (header, parsemap, startp, stop, errmsg))
            return -1;

        if (-1 == parse_rfc2822_dotatom_word (header, parsemap, startp, stop, errmsg, allow_quoted_string))
            return -1;
    }

    /* Trailing CFWS has already been consumed by the last loop pass. */
    return 0;
}
331
/* Parse the display-name part of "some phrase <address>", stopping at the
 * first byte that cannot belong to a phrase (normally the '<').
 *
 * A phrase here is any sequence of whitespace/control bytes other than CR
 * and LF, comments, quoted strings, dots and atext characters.  Every byte
 * consumed is flagged 'C' in parsemap, with these exceptions: the
 * parentheses of comments and the opening '"' of a quoted string.  (The
 * closing '"' and any CFWS following a quoted string are flagged.)
 *
 * On success *startp indexes the byte where parsing stopped and 0 is
 * returned.  On failure *errmsg is set and -1 is returned.
 */
static int parse_rfc2822_phrase (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg) /*{{{*/
{
    unsigned int pos = *startp;

    while (pos < stop) {
        unsigned char ch = (unsigned char) header[pos];

        if ((ch == '\r') || (ch == '\n')) {
            *errmsg = _("Illegal char in displayname of address header.");
            return -1;
        }

        if (ch <= 32) {
            /* blank, tab, or another control byte */
            parsemap[pos++] = 'C';
            continue;
        }

        if (ch == '(') {
            pos++;
            if (-1 == parse_rfc2822_comment (header, parsemap, &pos, stop, errmsg)) {
                *startp = pos;
                return -1;
            }
            continue;
        }

        if (ch == '"') {
            unsigned int mark;

            pos++;                  /* skip the opening quote */
            mark = pos;
            if (-1 == parse_rfc2822_quoted_string (header, parsemap, &pos, stop, errmsg)) {
                *startp = pos;
                return -1;
            }

            /* Flag everything the quoted-string parser consumed. */
            for (; mark < pos; mark++)
                parsemap[mark] = 'C';

            continue;
        }

        if (!IS_RFC2822_PHRASE_CHAR(ch))
            break;

        parsemap[pos++] = 'C';
    }

    *startp = pos;
    return 0;
}
397
398 /*}}}*/
399
/* Parse the local part of an addr-spec.  RFC 2822 gives
 *    local-part = dot-atom | quoted-string | obs-local-part
 * where obs-local-part resembles a dot-atom but lets CFWS surround each "."
 * All of these are covered by the dot-atom parser with quoted strings
 * enabled.  Returns 0 on success, -1 with *errmsg set on failure.
 */
static int parse_rfc2822_localpart (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg) /*{{{*/
{
    const int allow_quoted_string = 1;

    return parse_rfc2822_dotatom (header, parsemap, startp, stop, errmsg,
                                  allow_quoted_string);
}
409
410 /*}}}*/
411
/* Parse a domain that is not a domain-literal.  It is treated as a dot-atom
 * (CFWS permitted around the dots) in which quoted strings are not allowed.
 * Returns 0 on success, -1 with *errmsg set on failure.
 */
static int parse_rfc2822_domain (char *header, char *parsemap, unsigned int *startp, unsigned int stop, char **errmsg) /*{{{*/
{
    const int allow_quoted_string = 0;
    int status;

    status = parse_rfc2822_dotatom (header, parsemap, startp, stop, errmsg,
                                    allow_quoted_string);
    return (status == -1) ? -1 : 0;
}
420
421 /*}}}*/
422
parse_rfc2822_domainlit(char * header,char * parsemap,unsigned int * start,unsigned int end,char ** errmsg)423 static int parse_rfc2822_domainlit (char *header, char *parsemap, unsigned int *start, unsigned int end, char **errmsg) /*{{{*/
424 {
425 unsigned int pos = *start;
426
427 (void) parsemap;
428
429 while (pos < end)
430 {
431 #ifndef HAVE_LIBIDN
432 if (header[pos] & 0x80)
433 {
434 *errmsg = _("Non 7-bit char in domain of address header. libidn is not yet supported.");
435 return -1;
436 }
437 #endif
438 if (header[pos] == ']')
439 {
440 *start=pos;
441 return 0;
442 }
443 if (NULL != slrn_strbyte (RFC_2822_NOT_DOMLIT_CHARS, header[pos]))
444 {
445 *errmsg = _("Illegal char in domain-literal of address header.");
446 return -1;
447 }
448 pos++;
449 }
450 *errmsg = _("domain-literal opened but never closed.");
451 return -1;
452 }
453
454 /*}}}*/
455
456 /* The encodes a comma separated list of addresses. Each item in the list
457 * is assumed to be of the following forms:
458 *
459 * address (Comment-text)
460 * address (Comment-text)
461 * Comment-text <address>
462 *
463 * Here address is local@domain, local@[domain], or local.
464 *
465 * Here is an example of something that is permitted:
466 *
467 * From: Pete(A wonderful \) chap) <pete(his account)@silly.test(his host)>
468 * To:A Group(Some people)
469 * :Chris Jones <c@(Chris's host.)public.example>,
470 * joe@example.org,
471 * John <jdoe@one.test> (my dear friend); (the end of the group)
472 * Cc:(Empty list)(start)Undisclosed recipients :(nobody(that I know)) ;
473 *
474 * The example shows that the "local" part can contain comments, and that
475 * the backquote serves as a quote character in the comments.
476 */
rfc2822_parse(char * header,char * parsemap,int skip_colon,char ** errmsg)477 static int rfc2822_parse (char *header, char *parsemap, int skip_colon, char **errmsg) /*{{{*/
478 {
479 unsigned int head_start=0, head_end;
480 int type=0;
481 unsigned int pos=0;
482 char ch;
483
484 if (skip_colon)
485 {
486 while ((0 != (ch = header[head_start]))
487 && (ch != ':'))
488 head_start++;
489
490 if (ch != ':')
491 {
492 *errmsg = _("A colon is missing from the address header");
493 return -1;
494 }
495 head_start++; /* skip colon */
496 }
497
498 while (1)
499 {
500 int in_comment, in_quote;
501
502 /* skip past leading whitespace */
503 while (1)
504 {
505 ch = header[head_start];
506 if (ch == 0)
507 return 0;
508
509 if ((ch != ' ') && (ch != '\t'))
510 break;
511
512 head_start++;
513 }
514
515 /* If multiple addresses are given, split at ',' */
516 head_end=head_start;
517 in_quote=0;
518 in_comment = 0;
519 type=TYPE_ADD_ONLY;
520
521 /* Loop until end of string is reached, or a ',' found */
522 while (1)
523 {
524 ch = header[head_end];
525 if (ch == 0)
526 break;
527
528 head_end++;
529
530 if (in_quote)
531 {
532 if (ch == '"')
533 {
534 in_quote = !in_quote;
535 continue;
536 }
537
538 if (ch == '\\')
539 {
540 ch = header[head_end];
541 if ((ch == 0) || (ch == '\r'))
542 {
543 *errmsg = _("Illegal quoted character in address header.");
544 return -1;
545 }
546 head_end++;
547 continue;
548 }
549 continue;
550 }
551
552 if (in_comment)
553 {
554 if (ch == '(')
555 {
556 in_comment++;
557 continue;
558 }
559 if (ch == ')')
560 {
561 in_comment--;
562 continue;
563 }
564 if (ch == '\\')
565 {
566 ch = header[head_end];
567 if ((ch == 0) || (ch == '\r'))
568 {
569 *errmsg = _("Illegal quoted character in address header.");
570 return -1;
571 }
572 head_end++;
573 continue;
574 }
575 continue;
576 }
577
578 if (ch == '"')
579 {
580 in_quote = 1;
581 continue;
582 }
583
584 if (ch == '(')
585 {
586 in_comment++;
587 continue;
588 }
589
590 if (ch == '<')
591 {
592 type = TYPE_RFC_2822;
593 continue;
594 }
595
596 if (ch == ',')
597 {
598 head_end--;
599 break;
600 }
601 }
602
603 if (in_quote)
604 {
605 *errmsg = _("Quote opened but never closed in address header.");
606 return -1;
607 }
608
609 if (in_comment)
610 {
611 *errmsg = _("Comment opened but never closed in address header.");
612 return -1;
613 }
614
615 pos=head_start;
616 if (type == TYPE_RFC_2822) /* foo <bar> */
617 {
618 if (header[pos] != '<')
619 {
620 /* phrase <bar> */
621 if (-1 == parse_rfc2822_phrase (header, parsemap, &pos, head_end, errmsg))
622 return -1;
623 }
624 /* at this point, pos should be at '<' */
625 if (header[pos] != '<')
626 {
627 *errmsg = _("Address appears to have a misplaced '<'.");
628 return -1;
629 }
630 pos++;
631 }
632 if (-1 == parse_rfc2822_localpart (header, parsemap, &pos, head_end, errmsg))
633 return -1;
634
635 if (header[pos] == '@')
636 {
637 pos++;
638 if (header[pos] == '[')
639 {
640 pos++;
641 if (-1 == parse_rfc2822_domainlit (header, parsemap, &pos, head_end, errmsg))
642 return -1;
643 pos++; /* skip ']' */
644 }
645 else
646 {
647 if (-1 == parse_rfc2822_domain (header, parsemap, &pos, head_end, errmsg))
648 return -1;
649 }
650 }
651
652 if (type == TYPE_RFC_2822)
653 {
654 if (header[pos] != '>')
655 {
656 *errmsg = _("Expected closing '>' character in the address");
657 return -1;
658 }
659 pos++;
660 }
661
662 /* after domainpart only (folding) Whitespace and comments are allowed*/
663 if (-1 == parse_rfc2822_cfws (header, parsemap, &pos, head_end, errmsg))
664 return -1;
665
666 if (pos != head_end)
667 {
668 *errmsg = _("Junk found at the end of email-address");
669 /* fprintf (stderr, "BAD: %s\n", header); fprintf (stderr, "END: %s\n", header+pos); */
670 return -1;
671 }
672 if (header[head_end] == 0)
673 return 0;
674
675 /* head_end should be at ',', so skip over it. */
676 head_start=head_end+1;
677 }
678 }
679 /*}}}*/
680
681 /*}}}*/
682
/* This function takes a header of the form "KEY: VALUE" and parses it
 * according to rfc2822.  It returns a newly allocated string, of the same
 * length as the header, that indicates where the header may be encoded.
 * For example, if the header is:
 *
 *    "From: Thomas Paine <thomas@unknown.isp>"
 *
 * then the following string will be returned:
 *
 *    "      CCCCCCCCCCCCC                    "
 *
 * The Cs indicate that the corresponding bytes of the header may be
 * encoded; all other positions hold a space.  The "KEY:" prefix is skipped
 * and is never flagged.
 *
 * If an error occurs, NULL will be returned and *errmsg will be set to a
 * string describing the error.  On success *errmsg is NULL.  The returned
 * string comes from slrn_malloc; presumably the caller releases it with
 * slrn_free, as is done here on the error path.
 */
char *slrn_parse_rfc2822_addr (char *header, char **errmsg)
{
    char *encodemap;
    unsigned int len;

    *errmsg = NULL;

    len = strlen (header);
    encodemap = slrn_malloc (len+1, 0, 1);
    if (encodemap == NULL) {
        *errmsg = _("Out of memory");
        return NULL;
    }

    memset (encodemap, ' ', len);
    encodemap[len] = 0;

    /* skip_colon = 1: the header still carries its "KEY:" prefix, which must
     * be stepped over before the address list is parsed.  With 0 the parse
     * would stop at the ':' and reject every header. */
    if (-1 == rfc2822_parse (header, encodemap, 1, errmsg)) {
        if (*errmsg == NULL)
            *errmsg = _("Error encountered while parsing an RFC2822 header");
        slrn_free (encodemap);
        return NULL;
    }

    return encodemap;
}
724