1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5/*
6Package mail implements parsing of mail messages.
7
8For the most part, this package follows the syntax as specified by RFC 5322 and
9extended by RFC 6532.
10Notable divergences:
11	* Obsolete address formats are not parsed, including addresses with
12	  embedded route information.
13	* The full range of spacing (the CFWS syntax element) is not supported,
14	  such as breaking addresses across lines.
15	* No unicode normalization is performed.
	* The special characters ()[];@\, are allowed to appear unquoted in names.
17*/
18package mail
19
20import (
21	"bufio"
22	"errors"
23	"fmt"
24	"io"
25	"log"
26	"mime"
27	"net/textproto"
28	"strings"
29	"sync"
30	"time"
31	"unicode/utf8"
32)
33
// debug is the package's trace switch; it is compiled in as disabled.
var debug = debugT(false)

// debugT is a boolean whose Printf method logs only when the value is true.
type debugT bool

// Printf forwards format and args to log.Printf when d is true and is a
// no-op otherwise.
func (d debugT) Printf(format string, args ...interface{}) {
	if !d {
		return
	}
	log.Printf(format, args...)
}
43
// A Message represents a parsed mail message.
// ReadMessage fills in both fields.
type Message struct {
	Header Header    // parsed header fields
	Body   io.Reader // remainder of the input after the header
}
49
50// ReadMessage reads a message from r.
51// The headers are parsed, and the body of the message will be available
52// for reading from msg.Body.
53func ReadMessage(r io.Reader) (msg *Message, err error) {
54	tp := textproto.NewReader(bufio.NewReader(r))
55
56	hdr, err := tp.ReadMIMEHeader()
57	if err != nil {
58		return nil, err
59	}
60
61	return &Message{
62		Header: Header(hdr),
63		Body:   tp.R,
64	}, nil
65}
66
// Layouts suitable for passing to time.Parse.
// These are tried in order.
var (
	dateLayoutsBuildOnce sync.Once // runs buildDateLayouts once, from ParseDate
	dateLayouts          []string  // appended to by buildDateLayouts
)
73
74func buildDateLayouts() {
75	// Generate layouts based on RFC 5322, section 3.3.
76
77	dows := [...]string{"", "Mon, "}   // day-of-week
78	days := [...]string{"2", "02"}     // day = 1*2DIGIT
79	years := [...]string{"2006", "06"} // year = 4*DIGIT / 2*DIGIT
80	seconds := [...]string{":05", ""}  // second
81	// "-0700 (MST)" is not in RFC 5322, but is common.
82	zones := [...]string{"-0700", "MST"} // zone = (("+" / "-") 4DIGIT) / "GMT" / ...
83
84	for _, dow := range dows {
85		for _, day := range days {
86			for _, year := range years {
87				for _, second := range seconds {
88					for _, zone := range zones {
89						s := dow + day + " Jan " + year + " 15:04" + second + " " + zone
90						dateLayouts = append(dateLayouts, s)
91					}
92				}
93			}
94		}
95	}
96}
97
98// ParseDate parses an RFC 5322 date string.
99func ParseDate(date string) (time.Time, error) {
100	dateLayoutsBuildOnce.Do(buildDateLayouts)
101	// CR and LF must match and are tolerated anywhere in the date field.
102	date = strings.ReplaceAll(date, "\r\n", "")
103	if strings.Index(date, "\r") != -1 {
104		return time.Time{}, errors.New("mail: header has a CR without LF")
105	}
106	// Re-using some addrParser methods which support obsolete text, i.e. non-printable ASCII
107	p := addrParser{date, nil}
108	p.skipSpace()
109
110	// RFC 5322: zone = (FWS ( "+" / "-" ) 4DIGIT) / obs-zone
111	// zone length is always 5 chars unless obsolete (obs-zone)
112	if ind := strings.IndexAny(p.s, "+-"); ind != -1 && len(p.s) >= ind+5 {
113		date = p.s[:ind+5]
114		p.s = p.s[ind+5:]
115	} else if ind := strings.Index(p.s, "T"); ind != -1 && len(p.s) >= ind+1 {
116		// The last letter T of the obsolete time zone is checked when no standard time zone is found.
117		// If T is misplaced, the date to parse is garbage.
118		date = p.s[:ind+1]
119		p.s = p.s[ind+1:]
120	}
121	if !p.skipCFWS() {
122		return time.Time{}, errors.New("mail: misformatted parenthetical comment")
123	}
124	for _, layout := range dateLayouts {
125		t, err := time.Parse(layout, date)
126		if err == nil {
127			return t, nil
128		}
129	}
130	return time.Time{}, errors.New("mail: header could not be parsed")
131}
132
// A Header represents the key-value pairs in a mail message header.
// Each key maps to the list of values recorded for that field. Get looks
// keys up in canonical MIME form; keys stored in any other form are only
// reachable by indexing the map directly.
type Header map[string][]string
135
136// Get gets the first value associated with the given key.
137// It is case insensitive; CanonicalMIMEHeaderKey is used
138// to canonicalize the provided key.
139// If there are no values associated with the key, Get returns "".
140// To access multiple values of a key, or to use non-canonical keys,
141// access the map directly.
142func (h Header) Get(key string) string {
143	return textproto.MIMEHeader(h).Get(key)
144}
145
// ErrHeaderNotPresent is returned by Header.Date and Header.AddressList
// when Get yields an empty string for the requested field.
var ErrHeaderNotPresent = errors.New("mail: header not in message")
147
148// Date parses the Date header field.
149func (h Header) Date() (time.Time, error) {
150	hdr := h.Get("Date")
151	if hdr == "" {
152		return time.Time{}, ErrHeaderNotPresent
153	}
154	return ParseDate(hdr)
155}
156
157// AddressList parses the named header field as a list of addresses.
158func (h Header) AddressList(key string) ([]*Address, error) {
159	hdr := h.Get(key)
160	if hdr == "" {
161		return nil, ErrHeaderNotPresent
162	}
163	return ParseAddressList(hdr)
164}
165
// Address represents a single mail address.
// An address such as "Barry Gibbs <bg@example.com>" is represented
// as Address{Name: "Barry Gibbs", Address: "bg@example.com"}.
// String renders the value back into header form.
type Address struct {
	Name    string // Proper name; may be empty.
	Address string // user@domain
}
173
174// ParseAddress parses a single RFC 5322 address, e.g. "Barry Gibbs <bg@example.com>"
175func ParseAddress(address string) (*Address, error) {
176	return (&addrParser{s: address}).parseSingleAddress()
177}
178
179// ParseAddressList parses the given string as a list of addresses.
180func ParseAddressList(list string) ([]*Address, error) {
181	return (&addrParser{s: list}).parseAddressList()
182}
183
184// An AddressParser is an RFC 5322 address parser.
185type AddressParser struct {
186	// WordDecoder optionally specifies a decoder for RFC 2047 encoded-words.
187	WordDecoder *mime.WordDecoder
188}
189
190// Parse parses a single RFC 5322 address of the
191// form "Gogh Fir <gf@example.com>" or "foo@example.com".
192func (p *AddressParser) Parse(address string) (*Address, error) {
193	return (&addrParser{s: address, dec: p.WordDecoder}).parseSingleAddress()
194}
195
196// ParseList parses the given string as a list of comma-separated addresses
197// of the form "Gogh Fir <gf@example.com>" or "foo@example.com".
198func (p *AddressParser) ParseList(list string) ([]*Address, error) {
199	return (&addrParser{s: list, dec: p.WordDecoder}).parseAddressList()
200}
201
202// String formats the address as a valid RFC 5322 address.
203// If the address's name contains non-ASCII characters
204// the name will be rendered according to RFC 2047.
205func (a *Address) String() string {
206	// Format address local@domain
207	at := strings.LastIndex(a.Address, "@")
208	var local, domain string
209	if at < 0 {
210		// This is a malformed address ("@" is required in addr-spec);
211		// treat the whole address as local-part.
212		local = a.Address
213	} else {
214		local, domain = a.Address[:at], a.Address[at+1:]
215	}
216
217	// Add quotes if needed
218	quoteLocal := false
219	for i, r := range local {
220		if isAtext(r, false, false) {
221			continue
222		}
223		if r == '.' {
224			// Dots are okay if they are surrounded by atext.
225			// We only need to check that the previous byte is
226			// not a dot, and this isn't the end of the string.
227			if i > 0 && local[i-1] != '.' && i < len(local)-1 {
228				continue
229			}
230		}
231		quoteLocal = true
232		break
233	}
234	if quoteLocal {
235		local = quoteString(local)
236
237	}
238
239	s := "<" + local + "@" + domain + ">"
240
241	if a.Name == "" {
242		return s
243	}
244
245	// If every character is printable ASCII, quoting is simple.
246	allPrintable := true
247	for _, r := range a.Name {
248		// isWSP here should actually be isFWS,
249		// but we don't support folding yet.
250		if !isVchar(r) && !isWSP(r) || isMultibyte(r) {
251			allPrintable = false
252			break
253		}
254	}
255	if allPrintable {
256		return quoteString(a.Name) + " " + s
257	}
258
259	// Text in an encoded-word in a display-name must not contain certain
260	// characters like quotes or parentheses (see RFC 2047 section 5.3).
261	// When this is the case encode the name using base64 encoding.
262	if strings.ContainsAny(a.Name, "\"#$%&'(),.:;<>@[]^`{|}~") {
263		return mime.BEncoding.Encode("utf-8", a.Name) + " " + s
264	}
265	return mime.QEncoding.Encode("utf-8", a.Name) + " " + s
266}
267
// addrParser carries the state of a parse in progress. The consume* and
// skip* methods advance s as they recognize input. When dec is nil,
// decodeRFC2047Word falls back to the package-level rfc2047Decoder.
type addrParser struct {
	s   string            // input not yet consumed
	dec *mime.WordDecoder // may be nil
}
272
273func (p *addrParser) parseAddressList() ([]*Address, error) {
274	var list []*Address
275	for {
276		p.skipSpace()
277		addrs, err := p.parseAddress(true)
278		if err != nil {
279			return nil, err
280		}
281		list = append(list, addrs...)
282
283		if !p.skipCFWS() {
284			return nil, errors.New("mail: misformatted parenthetical comment")
285		}
286		if p.empty() {
287			break
288		}
289		if !p.consume(',') {
290			return nil, errors.New("mail: expected comma")
291		}
292	}
293	return list, nil
294}
295
296func (p *addrParser) parseSingleAddress() (*Address, error) {
297	addrs, err := p.parseAddress(true)
298	if err != nil {
299		return nil, err
300	}
301	if !p.skipCFWS() {
302		return nil, errors.New("mail: misformatted parenthetical comment")
303	}
304	if !p.empty() {
305		return nil, fmt.Errorf("mail: expected single address, got %q", p.s)
306	}
307	if len(addrs) == 0 {
308		return nil, errors.New("mail: empty group")
309	}
310	if len(addrs) > 1 {
311		return nil, errors.New("mail: group with multiple addresses")
312	}
313	return addrs[0], nil
314}
315
316// parseAddress parses a single RFC 5322 address at the start of p.
317func (p *addrParser) parseAddress(handleGroup bool) ([]*Address, error) {
318	debug.Printf("parseAddress: %q", p.s)
319	p.skipSpace()
320	if p.empty() {
321		return nil, errors.New("mail: no address")
322	}
323
324	// address = mailbox / group
325	// mailbox = name-addr / addr-spec
326	// group = display-name ":" [group-list] ";" [CFWS]
327
328	// addr-spec has a more restricted grammar than name-addr,
329	// so try parsing it first, and fallback to name-addr.
330	// TODO(dsymonds): Is this really correct?
331	spec, err := p.consumeAddrSpec()
332	if err == nil {
333		var displayName string
334		p.skipSpace()
335		if !p.empty() && p.peek() == '(' {
336			displayName, err = p.consumeDisplayNameComment()
337			if err != nil {
338				return nil, err
339			}
340		}
341
342		return []*Address{{
343			Name:    displayName,
344			Address: spec,
345		}}, err
346	}
347	debug.Printf("parseAddress: not an addr-spec: %v", err)
348	debug.Printf("parseAddress: state is now %q", p.s)
349
350	// display-name
351	var displayName string
352	if p.peek() != '<' {
353		displayName, err = p.consumePhrase()
354		if err != nil {
355			return nil, err
356		}
357	}
358	debug.Printf("parseAddress: displayName=%q", displayName)
359
360	p.skipSpace()
361	if handleGroup {
362		if p.consume(':') {
363			return p.consumeGroupList()
364		}
365	}
366	// angle-addr = "<" addr-spec ">"
367	if !p.consume('<') {
368		atext := true
369		for _, r := range displayName {
370			if !isAtext(r, true, false) {
371				atext = false
372				break
373			}
374		}
375		if atext {
376			// The input is like "foo.bar"; it's possible the input
377			// meant to be "foo.bar@domain", or "foo.bar <...>".
378			return nil, errors.New("mail: missing '@' or angle-addr")
379		}
380		// The input is like "Full Name", which couldn't possibly be a
381		// valid email address if followed by "@domain"; the input
382		// likely meant to be "Full Name <...>".
383		return nil, errors.New("mail: no angle-addr")
384	}
385	spec, err = p.consumeAddrSpec()
386	if err != nil {
387		return nil, err
388	}
389	if !p.consume('>') {
390		return nil, errors.New("mail: unclosed angle-addr")
391	}
392	debug.Printf("parseAddress: spec=%q", spec)
393
394	return []*Address{{
395		Name:    displayName,
396		Address: spec,
397	}}, nil
398}
399
400func (p *addrParser) consumeGroupList() ([]*Address, error) {
401	var group []*Address
402	// handle empty group.
403	p.skipSpace()
404	if p.consume(';') {
405		p.skipCFWS()
406		return group, nil
407	}
408
409	for {
410		p.skipSpace()
411		// embedded groups not allowed.
412		addrs, err := p.parseAddress(false)
413		if err != nil {
414			return nil, err
415		}
416		group = append(group, addrs...)
417
418		if !p.skipCFWS() {
419			return nil, errors.New("mail: misformatted parenthetical comment")
420		}
421		if p.consume(';') {
422			p.skipCFWS()
423			break
424		}
425		if !p.consume(',') {
426			return nil, errors.New("mail: expected comma")
427		}
428	}
429	return group, nil
430}
431
432// consumeAddrSpec parses a single RFC 5322 addr-spec at the start of p.
433func (p *addrParser) consumeAddrSpec() (spec string, err error) {
434	debug.Printf("consumeAddrSpec: %q", p.s)
435
436	orig := *p
437	defer func() {
438		if err != nil {
439			*p = orig
440		}
441	}()
442
443	// local-part = dot-atom / quoted-string
444	var localPart string
445	p.skipSpace()
446	if p.empty() {
447		return "", errors.New("mail: no addr-spec")
448	}
449	if p.peek() == '"' {
450		// quoted-string
451		debug.Printf("consumeAddrSpec: parsing quoted-string")
452		localPart, err = p.consumeQuotedString()
453		if localPart == "" {
454			err = errors.New("mail: empty quoted string in addr-spec")
455		}
456	} else {
457		// dot-atom
458		debug.Printf("consumeAddrSpec: parsing dot-atom")
459		localPart, err = p.consumeAtom(true, false)
460	}
461	if err != nil {
462		debug.Printf("consumeAddrSpec: failed: %v", err)
463		return "", err
464	}
465
466	if !p.consume('@') {
467		return "", errors.New("mail: missing @ in addr-spec")
468	}
469
470	// domain = dot-atom / domain-literal
471	var domain string
472	p.skipSpace()
473	if p.empty() {
474		return "", errors.New("mail: no domain in addr-spec")
475	}
476	// TODO(dsymonds): Handle domain-literal
477	domain, err = p.consumeAtom(true, false)
478	if err != nil {
479		return "", err
480	}
481
482	return localPart + "@" + domain, nil
483}
484
485// consumePhrase parses the RFC 5322 phrase at the start of p.
486func (p *addrParser) consumePhrase() (phrase string, err error) {
487	debug.Printf("consumePhrase: [%s]", p.s)
488	// phrase = 1*word
489	var words []string
490	var isPrevEncoded bool
491	for {
492		// word = atom / quoted-string
493		var word string
494		p.skipSpace()
495		if p.empty() {
496			break
497		}
498		isEncoded := false
499		if p.peek() == '"' {
500			// quoted-string
501			word, err = p.consumeQuotedString()
502		} else {
503			// atom
504			// We actually parse dot-atom here to be more permissive
505			// than what RFC 5322 specifies.
506			word, err = p.consumeAtom(true, true)
507			if err == nil {
508				word, isEncoded, err = p.decodeRFC2047Word(word)
509			}
510		}
511
512		if err != nil {
513			break
514		}
515		debug.Printf("consumePhrase: consumed %q", word)
516		if isPrevEncoded && isEncoded {
517			words[len(words)-1] += word
518		} else {
519			words = append(words, word)
520		}
521		isPrevEncoded = isEncoded
522	}
523	// Ignore any error if we got at least one word.
524	if err != nil && len(words) == 0 {
525		debug.Printf("consumePhrase: hit err: %v", err)
526		return "", fmt.Errorf("mail: missing word in phrase: %v", err)
527	}
528	phrase = strings.Join(words, " ")
529	return phrase, nil
530}
531
532// consumeQuotedString parses the quoted string at the start of p.
533func (p *addrParser) consumeQuotedString() (qs string, err error) {
534	// Assume first byte is '"'.
535	i := 1
536	qsb := make([]rune, 0, 10)
537
538	escaped := false
539
540Loop:
541	for {
542		r, size := utf8.DecodeRuneInString(p.s[i:])
543
544		switch {
545		case size == 0:
546			return "", errors.New("mail: unclosed quoted-string")
547
548		case size == 1 && r == utf8.RuneError:
549			return "", fmt.Errorf("mail: invalid utf-8 in quoted-string: %q", p.s)
550
551		case escaped:
552			//  quoted-pair = ("\" (VCHAR / WSP))
553
554			if !isVchar(r) && !isWSP(r) {
555				return "", fmt.Errorf("mail: bad character in quoted-string: %q", r)
556			}
557
558			qsb = append(qsb, r)
559			escaped = false
560
561		case isQtext(r) || isWSP(r):
562			// qtext (printable US-ASCII excluding " and \), or
563			// FWS (almost; we're ignoring CRLF)
564			qsb = append(qsb, r)
565
566		case r == '"':
567			break Loop
568
569		case r == '\\':
570			escaped = true
571
572		default:
573			return "", fmt.Errorf("mail: bad character in quoted-string: %q", r)
574
575		}
576
577		i += size
578	}
579	p.s = p.s[i+1:]
580	return string(qsb), nil
581}
582
583// consumeAtom parses an RFC 5322 atom at the start of p.
584// If dot is true, consumeAtom parses an RFC 5322 dot-atom instead.
585// If permissive is true, consumeAtom will not fail on:
586// - leading/trailing/double dots in the atom (see golang.org/issue/4938)
587// - special characters (RFC 5322 3.2.3) except '<', '>', ':' and '"' (see golang.org/issue/21018)
588func (p *addrParser) consumeAtom(dot bool, permissive bool) (atom string, err error) {
589	i := 0
590
591Loop:
592	for {
593		r, size := utf8.DecodeRuneInString(p.s[i:])
594		switch {
595		case size == 1 && r == utf8.RuneError:
596			return "", fmt.Errorf("mail: invalid utf-8 in address: %q", p.s)
597
598		case size == 0 || !isAtext(r, dot, permissive):
599			break Loop
600
601		default:
602			i += size
603
604		}
605	}
606
607	if i == 0 {
608		return "", errors.New("mail: invalid string")
609	}
610	atom, p.s = p.s[:i], p.s[i:]
611	if !permissive {
612		if strings.HasPrefix(atom, ".") {
613			return "", errors.New("mail: leading dot in atom")
614		}
615		if strings.Contains(atom, "..") {
616			return "", errors.New("mail: double dot in atom")
617		}
618		if strings.HasSuffix(atom, ".") {
619			return "", errors.New("mail: trailing dot in atom")
620		}
621	}
622	return atom, nil
623}
624
625func (p *addrParser) consumeDisplayNameComment() (string, error) {
626	if !p.consume('(') {
627		return "", errors.New("mail: comment does not start with (")
628	}
629	comment, ok := p.consumeComment()
630	if !ok {
631		return "", errors.New("mail: misformatted parenthetical comment")
632	}
633
634	// TODO(stapelberg): parse quoted-string within comment
635	words := strings.FieldsFunc(comment, func(r rune) bool { return r == ' ' || r == '\t' })
636	for idx, word := range words {
637		decoded, isEncoded, err := p.decodeRFC2047Word(word)
638		if err != nil {
639			return "", err
640		}
641		if isEncoded {
642			words[idx] = decoded
643		}
644	}
645
646	return strings.Join(words, " "), nil
647}
648
649func (p *addrParser) consume(c byte) bool {
650	if p.empty() || p.peek() != c {
651		return false
652	}
653	p.s = p.s[1:]
654	return true
655}
656
657// skipSpace skips the leading space and tab characters.
658func (p *addrParser) skipSpace() {
659	p.s = strings.TrimLeft(p.s, " \t")
660}
661
662func (p *addrParser) peek() byte {
663	return p.s[0]
664}
665
666func (p *addrParser) empty() bool {
667	return p.len() == 0
668}
669
670func (p *addrParser) len() int {
671	return len(p.s)
672}
673
674// skipCFWS skips CFWS as defined in RFC5322.
675func (p *addrParser) skipCFWS() bool {
676	p.skipSpace()
677
678	for {
679		if !p.consume('(') {
680			break
681		}
682
683		if _, ok := p.consumeComment(); !ok {
684			return false
685		}
686
687		p.skipSpace()
688	}
689
690	return true
691}
692
693func (p *addrParser) consumeComment() (string, bool) {
694	// '(' already consumed.
695	depth := 1
696
697	var comment string
698	for {
699		if p.empty() || depth == 0 {
700			break
701		}
702
703		if p.peek() == '\\' && p.len() > 1 {
704			p.s = p.s[1:]
705		} else if p.peek() == '(' {
706			depth++
707		} else if p.peek() == ')' {
708			depth--
709		}
710		if depth > 0 {
711			comment += p.s[:1]
712		}
713		p.s = p.s[1:]
714	}
715
716	return comment, depth == 0
717}
718
719func (p *addrParser) decodeRFC2047Word(s string) (word string, isEncoded bool, err error) {
720	if p.dec != nil {
721		word, err = p.dec.Decode(s)
722	} else {
723		word, err = rfc2047Decoder.Decode(s)
724	}
725
726	if err == nil {
727		return word, true, nil
728	}
729
730	if _, ok := err.(charsetError); ok {
731		return s, true, err
732	}
733
734	// Ignore invalid RFC 2047 encoded-word errors.
735	return s, false, nil
736}
737
// rfc2047Decoder is the decoder used when a parser has none of its own.
// Its CharsetReader never converts anything: every charset it is asked
// about is reported back as a charsetError.
var rfc2047Decoder = mime.WordDecoder{
	CharsetReader: func(charset string, _ io.Reader) (io.Reader, error) {
		return nil, charsetError(charset)
	},
}

// charsetError is the error produced for a charset that rfc2047Decoder
// cannot handle; its value is the charset name.
type charsetError string

// Error implements the error interface.
func (ce charsetError) Error() string {
	return fmt.Sprintf("charset not supported: %q", string(ce))
}
749
750// isAtext reports whether r is an RFC 5322 atext character.
751// If dot is true, period is included.
752// If permissive is true, RFC 5322 3.2.3 specials is included,
753// except '<', '>', ':' and '"'.
754func isAtext(r rune, dot, permissive bool) bool {
755	switch r {
756	case '.':
757		return dot
758
759	// RFC 5322 3.2.3. specials
760	case '(', ')', '[', ']', ';', '@', '\\', ',':
761		return permissive
762
763	case '<', '>', '"', ':':
764		return false
765	}
766	return isVchar(r)
767}
768
769// isQtext reports whether r is an RFC 5322 qtext character.
770func isQtext(r rune) bool {
771	// Printable US-ASCII, excluding backslash or quote.
772	if r == '\\' || r == '"' {
773		return false
774	}
775	return isVchar(r)
776}
777
778// quoteString renders a string as an RFC 5322 quoted-string.
779func quoteString(s string) string {
780	var buf strings.Builder
781	buf.WriteByte('"')
782	for _, r := range s {
783		if isQtext(r) || isWSP(r) {
784			buf.WriteRune(r)
785		} else if isVchar(r) {
786			buf.WriteByte('\\')
787			buf.WriteRune(r)
788		}
789	}
790	buf.WriteByte('"')
791	return buf.String()
792}
793
794// isVchar reports whether r is an RFC 5322 VCHAR character.
795func isVchar(r rune) bool {
796	// Visible (printing) characters.
797	return '!' <= r && r <= '~' || isMultibyte(r)
798}
799
// isMultibyte reports whether r is a multi-byte UTF-8 character
// as supported by RFC 6532, i.e. whether it lies at or above utf8.RuneSelf.
func isMultibyte(r rune) bool {
	return utf8.RuneSelf <= r
}
805
// isWSP reports whether r is a WSP (white space).
// WSP is a space or horizontal tab (RFC 5234 Appendix B).
func isWSP(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
811