1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5/*
6Package mail implements parsing of mail messages.
7
8For the most part, this package follows the syntax as specified by RFC 5322 and
9extended by RFC 6532.
10Notable divergences:
11	* Obsolete address formats are not parsed, including addresses with
12	  embedded route information.
13	* The full range of spacing (the CFWS syntax element) is not supported,
14	  such as breaking addresses across lines.
15	* No unicode normalization is performed.
	* The special characters ()[];@\, are allowed to appear unquoted in names.
17*/
18package mail
19
20import (
21	"bufio"
22	"bytes"
23	"errors"
24	"fmt"
25	"io"
26	"log"
27	"mime"
28	"net/textproto"
29	"strings"
30	"time"
31	"unicode/utf8"
32)
33
// debug gates the package's diagnostic logging; it is off by default.
var debug = debugT(false)

// debugT is a boolean switch with a Printf method, so trace calls can be
// left in the code and cost nothing but a branch when disabled.
type debugT bool

// Printf forwards to log.Printf when d is true and does nothing otherwise.
func (d debugT) Printf(format string, args ...interface{}) {
	if !d {
		return
	}
	log.Printf(format, args...)
}
43
// A Message represents a parsed mail message.
type Message struct {
	// Header holds the header fields parsed from the start of the message.
	Header Header
	// Body reads the remainder of the input that follows the header.
	// ReadMessage sets it to the buffered reader used for header parsing.
	Body   io.Reader
}
49
50// ReadMessage reads a message from r.
51// The headers are parsed, and the body of the message will be available
52// for reading from msg.Body.
53func ReadMessage(r io.Reader) (msg *Message, err error) {
54	tp := textproto.NewReader(bufio.NewReader(r))
55
56	hdr, err := tp.ReadMIMEHeader()
57	if err != nil {
58		return nil, err
59	}
60
61	return &Message{
62		Header: Header(hdr),
63		Body:   tp.R,
64	}, nil
65}
66
// dateLayouts holds the layouts handed to time.Parse by ParseDate.
// They are attempted in slice order.
var dateLayouts []string

func init() {
	// Build the cross product of the optional parts of the date-time
	// grammar in RFC 5322, section 3.3.
	var (
		weekdayForms = []string{"", "Mon, "}   // optional day-of-week
		dayForms     = []string{"2", "02"}     // day = 1*2DIGIT
		yearForms    = []string{"2006", "06"} // year = 4*DIGIT / 2*DIGIT
		secondForms  = []string{":05", ""}    // optional seconds
		// "-0700 (MST)" is not in RFC 5322, but is common.
		zoneForms = []string{"-0700", "MST", "-0700 (MST)"} // zone = (("+" / "-") 4DIGIT) / "GMT" / ...
	)

	for _, wd := range weekdayForms {
		for _, d := range dayForms {
			datePrefix := wd + d + " Jan "
			for _, y := range yearForms {
				for _, sec := range secondForms {
					clock := " 15:04" + sec + " "
					for _, z := range zoneForms {
						dateLayouts = append(dateLayouts, datePrefix+y+clock+z)
					}
				}
			}
		}
	}
}
94
95// ParseDate parses an RFC 5322 date string.
96func ParseDate(date string) (time.Time, error) {
97	for _, layout := range dateLayouts {
98		t, err := time.Parse(layout, date)
99		if err == nil {
100			return t, nil
101		}
102	}
103	return time.Time{}, errors.New("mail: header could not be parsed")
104}
105
// A Header represents the key-value pairs in a mail message header.
// Each key maps to the list of values recorded for that field; the Get
// method returns only the first of them.
type Header map[string][]string
108
109// Get gets the first value associated with the given key.
110// It is case insensitive; CanonicalMIMEHeaderKey is used
111// to canonicalize the provided key.
112// If there are no values associated with the key, Get returns "".
113// To access multiple values of a key, or to use non-canonical keys,
114// access the map directly.
115func (h Header) Get(key string) string {
116	return textproto.MIMEHeader(h).Get(key)
117}
118
// ErrHeaderNotPresent is returned by Header.Date and Header.AddressList
// when Get yields an empty string for the requested header field.
var ErrHeaderNotPresent = errors.New("mail: header not in message")
120
121// Date parses the Date header field.
122func (h Header) Date() (time.Time, error) {
123	hdr := h.Get("Date")
124	if hdr == "" {
125		return time.Time{}, ErrHeaderNotPresent
126	}
127	return ParseDate(hdr)
128}
129
130// AddressList parses the named header field as a list of addresses.
131func (h Header) AddressList(key string) ([]*Address, error) {
132	hdr := h.Get(key)
133	if hdr == "" {
134		return nil, ErrHeaderNotPresent
135	}
136	return ParseAddressList(hdr)
137}
138
// Address represents a single mail address.
// An address such as "Barry Gibbs <bg@example.com>" is represented
// as Address{Name: "Barry Gibbs", Address: "bg@example.com"}.
// The String method renders the value back into header form, quoting or
// encoding the parts as needed.
type Address struct {
	Name    string // Proper name; may be empty.
	Address string // user@domain
}
146
147// Parses a single RFC 5322 address, e.g. "Barry Gibbs <bg@example.com>"
148func ParseAddress(address string) (*Address, error) {
149	return (&addrParser{s: address}).parseSingleAddress()
150}
151
152// ParseAddressList parses the given string as a list of addresses.
153func ParseAddressList(list string) ([]*Address, error) {
154	return (&addrParser{s: list}).parseAddressList()
155}
156
// An AddressParser is an RFC 5322 address parser.
// Its Parse and ParseList methods behave like ParseAddress and
// ParseAddressList but pass WordDecoder on to the underlying parser.
type AddressParser struct {
	// WordDecoder optionally specifies a decoder for RFC 2047 encoded-words.
	// When nil, the package's built-in decoder is used instead.
	WordDecoder *mime.WordDecoder
}
162
163// Parse parses a single RFC 5322 address of the
164// form "Gogh Fir <gf@example.com>" or "foo@example.com".
165func (p *AddressParser) Parse(address string) (*Address, error) {
166	return (&addrParser{s: address, dec: p.WordDecoder}).parseSingleAddress()
167}
168
169// ParseList parses the given string as a list of comma-separated addresses
170// of the form "Gogh Fir <gf@example.com>" or "foo@example.com".
171func (p *AddressParser) ParseList(list string) ([]*Address, error) {
172	return (&addrParser{s: list, dec: p.WordDecoder}).parseAddressList()
173}
174
175// String formats the address as a valid RFC 5322 address.
176// If the address's name contains non-ASCII characters
177// the name will be rendered according to RFC 2047.
178func (a *Address) String() string {
179	// Format address local@domain
180	at := strings.LastIndex(a.Address, "@")
181	var local, domain string
182	if at < 0 {
183		// This is a malformed address ("@" is required in addr-spec);
184		// treat the whole address as local-part.
185		local = a.Address
186	} else {
187		local, domain = a.Address[:at], a.Address[at+1:]
188	}
189
190	// Add quotes if needed
191	quoteLocal := false
192	for i, r := range local {
193		if isAtext(r, false, false) {
194			continue
195		}
196		if r == '.' {
197			// Dots are okay if they are surrounded by atext.
198			// We only need to check that the previous byte is
199			// not a dot, and this isn't the end of the string.
200			if i > 0 && local[i-1] != '.' && i < len(local)-1 {
201				continue
202			}
203		}
204		quoteLocal = true
205		break
206	}
207	if quoteLocal {
208		local = quoteString(local)
209
210	}
211
212	s := "<" + local + "@" + domain + ">"
213
214	if a.Name == "" {
215		return s
216	}
217
218	// If every character is printable ASCII, quoting is simple.
219	allPrintable := true
220	for _, r := range a.Name {
221		// isWSP here should actually be isFWS,
222		// but we don't support folding yet.
223		if !isVchar(r) && !isWSP(r) || isMultibyte(r) {
224			allPrintable = false
225			break
226		}
227	}
228	if allPrintable {
229		return quoteString(a.Name) + " " + s
230	}
231
232	// Text in an encoded-word in a display-name must not contain certain
233	// characters like quotes or parentheses (see RFC 2047 section 5.3).
234	// When this is the case encode the name using base64 encoding.
235	if strings.ContainsAny(a.Name, "\"#$%&'(),.:;<>@[]^`{|}~") {
236		return mime.BEncoding.Encode("utf-8", a.Name) + " " + s
237	}
238	return mime.QEncoding.Encode("utf-8", a.Name) + " " + s
239}
240
// addrParser carries the state of one address parse.
// Its methods consume input by re-slicing s from the front.
type addrParser struct {
	s   string // input that has not been consumed yet
	dec *mime.WordDecoder // may be nil
}
245
246func (p *addrParser) parseAddressList() ([]*Address, error) {
247	var list []*Address
248	for {
249		p.skipSpace()
250		addrs, err := p.parseAddress(true)
251		if err != nil {
252			return nil, err
253		}
254		list = append(list, addrs...)
255
256		if !p.skipCFWS() {
257			return nil, errors.New("mail: misformatted parenthetical comment")
258		}
259		if p.empty() {
260			break
261		}
262		if !p.consume(',') {
263			return nil, errors.New("mail: expected comma")
264		}
265	}
266	return list, nil
267}
268
269func (p *addrParser) parseSingleAddress() (*Address, error) {
270	addrs, err := p.parseAddress(true)
271	if err != nil {
272		return nil, err
273	}
274	if !p.skipCFWS() {
275		return nil, errors.New("mail: misformatted parenthetical comment")
276	}
277	if !p.empty() {
278		return nil, fmt.Errorf("mail: expected single address, got %q", p.s)
279	}
280	if len(addrs) == 0 {
281		return nil, errors.New("mail: empty group")
282	}
283	if len(addrs) > 1 {
284		return nil, errors.New("mail: group with multiple addresses")
285	}
286	return addrs[0], nil
287}
288
// parseAddress parses a single RFC 5322 address at the start of p.
//
// A mailbox yields a one-element slice. When handleGroup is true, a group
// ("name: a, b;") is also accepted and yields whatever consumeGroupList
// returns for its members. Callers pass false while already inside a group
// so that groups cannot nest.
func (p *addrParser) parseAddress(handleGroup bool) ([]*Address, error) {
	debug.Printf("parseAddress: %q", p.s)
	p.skipSpace()
	if p.empty() {
		return nil, errors.New("mail: no address")
	}

	// address = mailbox / group
	// mailbox = name-addr / addr-spec
	// group = display-name ":" [group-list] ";" [CFWS]

	// addr-spec has a more restricted grammar than name-addr,
	// so try parsing it first, and fallback to name-addr.
	// TODO(dsymonds): Is this really correct?
	spec, err := p.consumeAddrSpec()
	if err == nil {
		// Bare addr-spec. A comment that directly follows it is taken
		// as the display name, e.g. "jdoe@example.com (John Doe)".
		var displayName string
		p.skipSpace()
		if !p.empty() && p.peek() == '(' {
			displayName, err = p.consumeDisplayNameComment()
			if err != nil {
				return nil, err
			}
		}

		// err is nil on every path that reaches this return.
		return []*Address{{
			Name:    displayName,
			Address: spec,
		}}, err
	}
	// consumeAddrSpec restores p when it fails, so the name-addr / group
	// attempt below starts from the same (non-empty) input.
	debug.Printf("parseAddress: not an addr-spec: %v", err)
	debug.Printf("parseAddress: state is now %q", p.s)

	// display-name
	var displayName string
	if p.peek() != '<' {
		displayName, err = p.consumePhrase()
		if err != nil {
			return nil, err
		}
	}
	debug.Printf("parseAddress: displayName=%q", displayName)

	p.skipSpace()
	if handleGroup {
		// A ':' after the phrase marks the phrase as a group name.
		if p.consume(':') {
			return p.consumeGroupList()
		}
	}
	// angle-addr = "<" addr-spec ">"
	if !p.consume('<') {
		return nil, errors.New("mail: no angle-addr")
	}
	spec, err = p.consumeAddrSpec()
	if err != nil {
		return nil, err
	}
	if !p.consume('>') {
		return nil, errors.New("mail: unclosed angle-addr")
	}
	debug.Printf("parseAddress: spec=%q", spec)

	return []*Address{{
		Name:    displayName,
		Address: spec,
	}}, nil
}
357
358func (p *addrParser) consumeGroupList() ([]*Address, error) {
359	var group []*Address
360	// handle empty group.
361	p.skipSpace()
362	if p.consume(';') {
363		p.skipCFWS()
364		return group, nil
365	}
366
367	for {
368		p.skipSpace()
369		// embedded groups not allowed.
370		addrs, err := p.parseAddress(false)
371		if err != nil {
372			return nil, err
373		}
374		group = append(group, addrs...)
375
376		if !p.skipCFWS() {
377			return nil, errors.New("mail: misformatted parenthetical comment")
378		}
379		if p.consume(';') {
380			p.skipCFWS()
381			break
382		}
383		if !p.consume(',') {
384			return nil, errors.New("mail: expected comma")
385		}
386	}
387	return group, nil
388}
389
390// consumeAddrSpec parses a single RFC 5322 addr-spec at the start of p.
391func (p *addrParser) consumeAddrSpec() (spec string, err error) {
392	debug.Printf("consumeAddrSpec: %q", p.s)
393
394	orig := *p
395	defer func() {
396		if err != nil {
397			*p = orig
398		}
399	}()
400
401	// local-part = dot-atom / quoted-string
402	var localPart string
403	p.skipSpace()
404	if p.empty() {
405		return "", errors.New("mail: no addr-spec")
406	}
407	if p.peek() == '"' {
408		// quoted-string
409		debug.Printf("consumeAddrSpec: parsing quoted-string")
410		localPart, err = p.consumeQuotedString()
411		if localPart == "" {
412			err = errors.New("mail: empty quoted string in addr-spec")
413		}
414	} else {
415		// dot-atom
416		debug.Printf("consumeAddrSpec: parsing dot-atom")
417		localPart, err = p.consumeAtom(true, false)
418	}
419	if err != nil {
420		debug.Printf("consumeAddrSpec: failed: %v", err)
421		return "", err
422	}
423
424	if !p.consume('@') {
425		return "", errors.New("mail: missing @ in addr-spec")
426	}
427
428	// domain = dot-atom / domain-literal
429	var domain string
430	p.skipSpace()
431	if p.empty() {
432		return "", errors.New("mail: no domain in addr-spec")
433	}
434	// TODO(dsymonds): Handle domain-literal
435	domain, err = p.consumeAtom(true, false)
436	if err != nil {
437		return "", err
438	}
439
440	return localPart + "@" + domain, nil
441}
442
443// consumePhrase parses the RFC 5322 phrase at the start of p.
444func (p *addrParser) consumePhrase() (phrase string, err error) {
445	debug.Printf("consumePhrase: [%s]", p.s)
446	// phrase = 1*word
447	var words []string
448	var isPrevEncoded bool
449	for {
450		// word = atom / quoted-string
451		var word string
452		p.skipSpace()
453		if p.empty() {
454			break
455		}
456		isEncoded := false
457		if p.peek() == '"' {
458			// quoted-string
459			word, err = p.consumeQuotedString()
460		} else {
461			// atom
462			// We actually parse dot-atom here to be more permissive
463			// than what RFC 5322 specifies.
464			word, err = p.consumeAtom(true, true)
465			if err == nil {
466				word, isEncoded, err = p.decodeRFC2047Word(word)
467			}
468		}
469
470		if err != nil {
471			break
472		}
473		debug.Printf("consumePhrase: consumed %q", word)
474		if isPrevEncoded && isEncoded {
475			words[len(words)-1] += word
476		} else {
477			words = append(words, word)
478		}
479		isPrevEncoded = isEncoded
480	}
481	// Ignore any error if we got at least one word.
482	if err != nil && len(words) == 0 {
483		debug.Printf("consumePhrase: hit err: %v", err)
484		return "", fmt.Errorf("mail: missing word in phrase: %v", err)
485	}
486	phrase = strings.Join(words, " ")
487	return phrase, nil
488}
489
490// consumeQuotedString parses the quoted string at the start of p.
491func (p *addrParser) consumeQuotedString() (qs string, err error) {
492	// Assume first byte is '"'.
493	i := 1
494	qsb := make([]rune, 0, 10)
495
496	escaped := false
497
498Loop:
499	for {
500		r, size := utf8.DecodeRuneInString(p.s[i:])
501
502		switch {
503		case size == 0:
504			return "", errors.New("mail: unclosed quoted-string")
505
506		case size == 1 && r == utf8.RuneError:
507			return "", fmt.Errorf("mail: invalid utf-8 in quoted-string: %q", p.s)
508
509		case escaped:
510			//  quoted-pair = ("\" (VCHAR / WSP))
511
512			if !isVchar(r) && !isWSP(r) {
513				return "", fmt.Errorf("mail: bad character in quoted-string: %q", r)
514			}
515
516			qsb = append(qsb, r)
517			escaped = false
518
519		case isQtext(r) || isWSP(r):
520			// qtext (printable US-ASCII excluding " and \), or
521			// FWS (almost; we're ignoring CRLF)
522			qsb = append(qsb, r)
523
524		case r == '"':
525			break Loop
526
527		case r == '\\':
528			escaped = true
529
530		default:
531			return "", fmt.Errorf("mail: bad character in quoted-string: %q", r)
532
533		}
534
535		i += size
536	}
537	p.s = p.s[i+1:]
538	return string(qsb), nil
539}
540
541// consumeAtom parses an RFC 5322 atom at the start of p.
542// If dot is true, consumeAtom parses an RFC 5322 dot-atom instead.
543// If permissive is true, consumeAtom will not fail on:
544// - leading/trailing/double dots in the atom (see golang.org/issue/4938)
545// - special characters (RFC 5322 3.2.3) except '<', '>', ':' and '"' (see golang.org/issue/21018)
546func (p *addrParser) consumeAtom(dot bool, permissive bool) (atom string, err error) {
547	i := 0
548
549Loop:
550	for {
551		r, size := utf8.DecodeRuneInString(p.s[i:])
552		switch {
553		case size == 1 && r == utf8.RuneError:
554			return "", fmt.Errorf("mail: invalid utf-8 in address: %q", p.s)
555
556		case size == 0 || !isAtext(r, dot, permissive):
557			break Loop
558
559		default:
560			i += size
561
562		}
563	}
564
565	if i == 0 {
566		return "", errors.New("mail: invalid string")
567	}
568	atom, p.s = p.s[:i], p.s[i:]
569	if !permissive {
570		if strings.HasPrefix(atom, ".") {
571			return "", errors.New("mail: leading dot in atom")
572		}
573		if strings.Contains(atom, "..") {
574			return "", errors.New("mail: double dot in atom")
575		}
576		if strings.HasSuffix(atom, ".") {
577			return "", errors.New("mail: trailing dot in atom")
578		}
579	}
580	return atom, nil
581}
582
583func (p *addrParser) consumeDisplayNameComment() (string, error) {
584	if !p.consume('(') {
585		return "", errors.New("mail: comment does not start with (")
586	}
587	comment, ok := p.consumeComment()
588	if !ok {
589		return "", errors.New("mail: misformatted parenthetical comment")
590	}
591
592	// TODO(stapelberg): parse quoted-string within comment
593	words := strings.FieldsFunc(comment, func(r rune) bool { return r == ' ' || r == '\t' })
594	for idx, word := range words {
595		decoded, isEncoded, err := p.decodeRFC2047Word(word)
596		if err != nil {
597			return "", err
598		}
599		if isEncoded {
600			words[idx] = decoded
601		}
602	}
603
604	return strings.Join(words, " "), nil
605}
606
607func (p *addrParser) consume(c byte) bool {
608	if p.empty() || p.peek() != c {
609		return false
610	}
611	p.s = p.s[1:]
612	return true
613}
614
615// skipSpace skips the leading space and tab characters.
616func (p *addrParser) skipSpace() {
617	p.s = strings.TrimLeft(p.s, " \t")
618}
619
// peek returns the first byte of the remaining input without consuming it.
// It indexes p.s directly, so it panics on empty input; callers must check
// empty first.
func (p *addrParser) peek() byte {
	return p.s[0]
}
623
624func (p *addrParser) empty() bool {
625	return p.len() == 0
626}
627
// len returns the number of bytes of input that remain to be consumed.
func (p *addrParser) len() int {
	return len(p.s)
}
631
632// skipCFWS skips CFWS as defined in RFC5322.
633func (p *addrParser) skipCFWS() bool {
634	p.skipSpace()
635
636	for {
637		if !p.consume('(') {
638			break
639		}
640
641		if _, ok := p.consumeComment(); !ok {
642			return false
643		}
644
645		p.skipSpace()
646	}
647
648	return true
649}
650
651func (p *addrParser) consumeComment() (string, bool) {
652	// '(' already consumed.
653	depth := 1
654
655	var comment string
656	for {
657		if p.empty() || depth == 0 {
658			break
659		}
660
661		if p.peek() == '\\' && p.len() > 1 {
662			p.s = p.s[1:]
663		} else if p.peek() == '(' {
664			depth++
665		} else if p.peek() == ')' {
666			depth--
667		}
668		if depth > 0 {
669			comment += p.s[:1]
670		}
671		p.s = p.s[1:]
672	}
673
674	return comment, depth == 0
675}
676
677func (p *addrParser) decodeRFC2047Word(s string) (word string, isEncoded bool, err error) {
678	if p.dec != nil {
679		word, err = p.dec.Decode(s)
680	} else {
681		word, err = rfc2047Decoder.Decode(s)
682	}
683
684	if err == nil {
685		return word, true, nil
686	}
687
688	if _, ok := err.(charsetError); ok {
689		return s, true, err
690	}
691
692	// Ignore invalid RFC 2047 encoded-word errors.
693	return s, false, nil
694}
695
696var rfc2047Decoder = mime.WordDecoder{
697	CharsetReader: func(charset string, input io.Reader) (io.Reader, error) {
698		return nil, charsetError(charset)
699	},
700}
701
// charsetError is the error for an encoded-word whose charset is not
// supported; its value is the charset name.
type charsetError string

// Error implements the error interface, quoting the charset name.
func (e charsetError) Error() string {
	name := string(e)
	return fmt.Sprintf("charset not supported: %q", name)
}
707
708// isAtext reports whether r is an RFC 5322 atext character.
709// If dot is true, period is included.
710// If permissive is true, RFC 5322 3.2.3 specials is included,
711// except '<', '>', ':' and '"'.
712func isAtext(r rune, dot, permissive bool) bool {
713	switch r {
714	case '.':
715		return dot
716
717	// RFC 5322 3.2.3. specials
718	case '(', ')', '[', ']', ';', '@', '\\', ',':
719		return permissive
720
721	case '<', '>', '"', ':':
722		return false
723	}
724	return isVchar(r)
725}
726
727// isQtext reports whether r is an RFC 5322 qtext character.
728func isQtext(r rune) bool {
729	// Printable US-ASCII, excluding backslash or quote.
730	if r == '\\' || r == '"' {
731		return false
732	}
733	return isVchar(r)
734}
735
736// quoteString renders a string as an RFC 5322 quoted-string.
737func quoteString(s string) string {
738	var buf bytes.Buffer
739	buf.WriteByte('"')
740	for _, r := range s {
741		if isQtext(r) || isWSP(r) {
742			buf.WriteRune(r)
743		} else if isVchar(r) {
744			buf.WriteByte('\\')
745			buf.WriteRune(r)
746		}
747	}
748	buf.WriteByte('"')
749	return buf.String()
750}
751
752// isVchar reports whether r is an RFC 5322 VCHAR character.
753func isVchar(r rune) bool {
754	// Visible (printing) characters.
755	return '!' <= r && r <= '~' || isMultibyte(r)
756}
757
// isMultibyte reports whether r is a multi-byte UTF-8 character
// as supported by RFC 6532, i.e. whether it is at or above utf8.RuneSelf.
func isMultibyte(r rune) bool {
	const firstMultibyte = utf8.RuneSelf
	return firstMultibyte <= r
}
763
// isWSP reports whether r is a WSP (white space).
// WSP is a space or horizontal tab (RFC 5234 Appendix B).
func isWSP(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
769