1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package mime
6
7import (
8	"bytes"
9	"encoding/base64"
10	"errors"
11	"fmt"
12	"io"
13	"strings"
14	"unicode"
15	"unicode/utf8"
16)
17
// A WordEncoder is an RFC 2047 encoded-word encoder.
//
// The underlying byte is the encoding letter that is written into the header
// of each encoded-word produced. Any value other than BEncoding encodes its
// content with the Q encoding.
type WordEncoder byte
20
21const (
22	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
23	BEncoding = WordEncoder('b')
24	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
25	QEncoding = WordEncoder('q')
26)
27
// errInvalidWord is returned when the input is not a well-formed RFC 2047
// encoded-word.
var errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
31
32// Encode returns the encoded-word form of s. If s is ASCII without special
33// characters, it is returned unchanged. The provided charset is the IANA
34// charset name of s. It is case insensitive.
35func (e WordEncoder) Encode(charset, s string) string {
36	if !needsEncoding(s) {
37		return s
38	}
39	return e.encodeWord(charset, s)
40}
41
// needsEncoding reports whether s contains any rune other than printable
// ASCII (' ' through '~') or a horizontal tab.
func needsEncoding(s string) bool {
	for _, r := range s {
		switch {
		case r == '\t':
			// Tabs are allowed to pass through unencoded.
		case r < ' ', r > '~':
			return true
		}
	}
	return false
}
50
51// encodeWord encodes a string into an encoded-word.
52func (e WordEncoder) encodeWord(charset, s string) string {
53	var buf strings.Builder
54	// Could use a hint like len(s)*3, but that's not enough for cases
55	// with word splits and too much for simpler inputs.
56	// 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
57	buf.Grow(48)
58
59	e.openWord(&buf, charset)
60	if e == BEncoding {
61		e.bEncode(&buf, charset, s)
62	} else {
63		e.qEncode(&buf, charset, s)
64	}
65	closeWord(&buf)
66
67	return buf.String()
68}
69
const (
	// The maximum length of an encoded-word is 75 characters.
	// See RFC 2047, section 2.
	maxEncodedWordLen = 75
	// maxContentLen is how many characters of encoded text fit in a single
	// encoded-word once the "=?UTF-8?q?" header and the 2-byte "?=" footer
	// have been accounted for. The B-encoding header has the same length, so
	// the value applies to both encodings.
	maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
)
78
// maxBase64Len is the decoded length corresponding to maxContentLen bytes of
// base64 text. bEncode uses it as the limit on raw bytes per encoded-word.
var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
80
81// bEncode encodes s using base64 encoding and writes it to buf.
82func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
83	w := base64.NewEncoder(base64.StdEncoding, buf)
84	// If the charset is not UTF-8 or if the content is short, do not bother
85	// splitting the encoded-word.
86	if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
87		io.WriteString(w, s)
88		w.Close()
89		return
90	}
91
92	var currentLen, last, runeLen int
93	for i := 0; i < len(s); i += runeLen {
94		// Multi-byte characters must not be split across encoded-words.
95		// See RFC 2047, section 5.3.
96		_, runeLen = utf8.DecodeRuneInString(s[i:])
97
98		if currentLen+runeLen <= maxBase64Len {
99			currentLen += runeLen
100		} else {
101			io.WriteString(w, s[last:i])
102			w.Close()
103			e.splitWord(buf, charset)
104			last = i
105			currentLen = runeLen
106		}
107	}
108	io.WriteString(w, s[last:])
109	w.Close()
110}
111
112// qEncode encodes s using Q encoding and writes it to buf. It splits the
113// encoded-words when necessary.
114func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
115	// We only split encoded-words when the charset is UTF-8.
116	if !isUTF8(charset) {
117		writeQString(buf, s)
118		return
119	}
120
121	var currentLen, runeLen int
122	for i := 0; i < len(s); i += runeLen {
123		b := s[i]
124		// Multi-byte characters must not be split across encoded-words.
125		// See RFC 2047, section 5.3.
126		var encLen int
127		if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
128			runeLen, encLen = 1, 1
129		} else {
130			_, runeLen = utf8.DecodeRuneInString(s[i:])
131			encLen = 3 * runeLen
132		}
133
134		if currentLen+encLen > maxContentLen {
135			e.splitWord(buf, charset)
136			currentLen = 0
137		}
138		writeQString(buf, s[i:i+runeLen])
139		currentLen += encLen
140	}
141}
142
143// writeQString encodes s using Q encoding and writes it to buf.
144func writeQString(buf *strings.Builder, s string) {
145	for i := 0; i < len(s); i++ {
146		switch b := s[i]; {
147		case b == ' ':
148			buf.WriteByte('_')
149		case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
150			buf.WriteByte(b)
151		default:
152			buf.WriteByte('=')
153			buf.WriteByte(upperhex[b>>4])
154			buf.WriteByte(upperhex[b&0x0f])
155		}
156	}
157}
158
159// openWord writes the beginning of an encoded-word into buf.
160func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
161	buf.WriteString("=?")
162	buf.WriteString(charset)
163	buf.WriteByte('?')
164	buf.WriteByte(byte(e))
165	buf.WriteByte('?')
166}
167
// closeWord writes the "?=" footer that terminates an encoded-word into buf.
func closeWord(buf *strings.Builder) {
	buf.WriteByte('?')
	buf.WriteByte('=')
}
172
173// splitWord closes the current encoded-word and opens a new one.
174func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
175	closeWord(buf)
176	buf.WriteByte(' ')
177	e.openWord(buf, charset)
178}
179
// isUTF8 reports whether charset names UTF-8, ignoring case.
func isUTF8(charset string) bool {
	const name = "UTF-8"
	return strings.EqualFold(name, charset)
}
183
// upperhex maps a 4-bit value to its upper-case hexadecimal digit. It is
// indexed by writeQString to produce "=XX" escapes.
const upperhex = "0123456789ABCDEF"
185
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// The charset name is lower-cased before it is passed to CharsetReader.
	// The utf-8, iso-8859-1 and us-ascii charsets are handled by default and
	// never reach CharsetReader.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
196
197// Decode decodes an RFC 2047 encoded-word.
198func (d *WordDecoder) Decode(word string) (string, error) {
199	// See https://tools.ietf.org/html/rfc2047#section-2 for details.
200	// Our decoder is permissive, we accept empty encoded-text.
201	if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
202		return "", errInvalidWord
203	}
204	word = word[2 : len(word)-2]
205
206	// split delimits the first 2 fields
207	split := strings.IndexByte(word, '?')
208
209	// split word "UTF-8?q?ascii" into "UTF-8", 'q', and "ascii"
210	charset := word[:split]
211	if len(charset) == 0 {
212		return "", errInvalidWord
213	}
214	if len(word) < split+3 {
215		return "", errInvalidWord
216	}
217	encoding := word[split+1]
218	// the field after split must only be one byte
219	if word[split+2] != '?' {
220		return "", errInvalidWord
221	}
222	text := word[split+3:]
223
224	content, err := decode(encoding, text)
225	if err != nil {
226		return "", err
227	}
228
229	var buf strings.Builder
230
231	if err := d.convert(&buf, charset, content); err != nil {
232		return "", err
233	}
234
235	return buf.String(), nil
236}
237
// DecodeHeader decodes all encoded-words of the given string. Text that is
// not part of a well-formed, decodable encoded-word is copied through
// unchanged, except that white space separating two adjacent encoded-words is
// dropped.
//
// It returns an error only when charset conversion fails: the charset is not
// one handled by default and d.CharsetReader is nil, d.CharsetReader returns
// an error, or reading the converted content fails.
func (d *WordDecoder) DecodeHeader(header string) (string, error) {
	// If there is no encoded-word, returns before creating a buffer.
	i := strings.Index(header, "=?")
	if i == -1 {
		return header, nil
	}

	var buf strings.Builder

	// Copy everything before the first "=?" verbatim and scan from there.
	buf.WriteString(header[:i])
	header = header[i:]

	// betweenWords is true when the previously consumed item was a
	// successfully decoded encoded-word.
	betweenWords := false
	// Each iteration tries to parse one "=?charset?encoding?text?=" from the
	// front of header. Every break below leaves header untouched, so the
	// remainder is copied verbatim after the loop.
	for {
		start := strings.Index(header, "=?")
		if start == -1 {
			break
		}
		cur := start + len("=?")

		// The charset runs up to the next '?'.
		i := strings.Index(header[cur:], "?")
		if i == -1 {
			break
		}
		charset := header[cur : cur+i]
		cur += i + len("?")

		// There must be room for the encoding byte, a '?' and the closing "?=".
		if len(header) < cur+len("Q??=") {
			break
		}
		encoding := header[cur]
		cur++

		if header[cur] != '?' {
			break
		}
		cur++

		// The encoded text runs up to the closing "?=".
		j := strings.Index(header[cur:], "?=")
		if j == -1 {
			break
		}
		text := header[cur : cur+j]
		end := cur + j + len("?=")

		content, err := decode(encoding, text)
		if err != nil {
			// Not a decodable encoded-word: emit everything through this
			// "=?" literally and resume scanning right after it.
			betweenWords = false
			buf.WriteString(header[:start+2])
			header = header[start+2:]
			continue
		}

		// Write characters before the encoded-word. White-space and newline
		// characters separating two encoded-words must be deleted.
		if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
			buf.WriteString(header[:start])
		}

		if err := d.convert(&buf, charset, content); err != nil {
			return "", err
		}

		header = header[end:]
		betweenWords = true
	}

	// Whatever could not be parsed as an encoded-word is kept as is.
	if len(header) > 0 {
		buf.WriteString(header)
	}

	return buf.String(), nil
}
313
314func decode(encoding byte, text string) ([]byte, error) {
315	switch encoding {
316	case 'B', 'b':
317		return base64.StdEncoding.DecodeString(text)
318	case 'Q', 'q':
319		return qDecode(text)
320	default:
321		return nil, errInvalidWord
322	}
323}
324
325func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
326	switch {
327	case strings.EqualFold("utf-8", charset):
328		buf.Write(content)
329	case strings.EqualFold("iso-8859-1", charset):
330		for _, c := range content {
331			buf.WriteRune(rune(c))
332		}
333	case strings.EqualFold("us-ascii", charset):
334		for _, c := range content {
335			if c >= utf8.RuneSelf {
336				buf.WriteRune(unicode.ReplacementChar)
337			} else {
338				buf.WriteByte(c)
339			}
340		}
341	default:
342		if d.CharsetReader == nil {
343			return fmt.Errorf("mime: unhandled charset %q", charset)
344		}
345		r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
346		if err != nil {
347			return err
348		}
349		if _, err = io.Copy(buf, r); err != nil {
350			return err
351		}
352	}
353	return nil
354}
355
// hasNonWhitespace reports whether s (assumed to be ASCII) contains anything
// other than space, tab, line feed and carriage return.
//
// Encoded-words can only be separated by linear white space, which does not
// include vertical tabs (\v).
func hasNonWhitespace(s string) bool {
	for _, r := range s {
		if r != ' ' && r != '\t' && r != '\n' && r != '\r' {
			return true
		}
	}
	return false
}
370
371// qDecode decodes a Q encoded string.
372func qDecode(s string) ([]byte, error) {
373	dec := make([]byte, len(s))
374	n := 0
375	for i := 0; i < len(s); i++ {
376		switch c := s[i]; {
377		case c == '_':
378			dec[n] = ' '
379		case c == '=':
380			if i+2 >= len(s) {
381				return nil, errInvalidWord
382			}
383			b, err := readHexByte(s[i+1], s[i+2])
384			if err != nil {
385				return nil, err
386			}
387			dec[n] = b
388			i += 2
389		case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
390			dec[n] = c
391		default:
392			return nil, errInvalidWord
393		}
394		n++
395	}
396
397	return dec[:n], nil
398}
399
400// readHexByte returns the byte from its quoted-printable representation.
401func readHexByte(a, b byte) (byte, error) {
402	var hb, lb byte
403	var err error
404	if hb, err = fromHex(a); err != nil {
405		return 0, err
406	}
407	if lb, err = fromHex(b); err != nil {
408		return 0, err
409	}
410	return hb<<4 | lb, nil
411}
412
// fromHex returns the value of the hexadecimal digit b. Lower-case digits
// are accepted even though they are not valid in a well-formed encoding.
func fromHex(b byte) (byte, error) {
	if '0' <= b && b <= '9' {
		return b - '0', nil
	}
	if 'A' <= b && b <= 'F' {
		return b - 'A' + 10, nil
	}
	// Accept badly encoded bytes.
	if 'a' <= b && b <= 'f' {
		return b - 'a' + 10, nil
	}
	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
}
425