1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package encoding defines an interface for character encodings, such as Shift
6// JIS and Windows 1252, that can convert to and from UTF-8.
7//
8// Encoding implementations are provided in other packages, such as
9// golang.org/x/text/encoding/charmap and
10// golang.org/x/text/encoding/japanese.
11package encoding // import "golang.org/x/text/encoding"
12
13import (
14	"errors"
15	"io"
16	"strconv"
17	"unicode/utf8"
18
19	"golang.org/x/text/encoding/internal/identifier"
20	"golang.org/x/text/transform"
21)
22
// TODO:
// - There seems to be some inconsistency in when decoders return errors
//   and when not. Also, the documentation seems to suggest they shouldn't
//   return errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
//   normal form. Perhaps add an example of how users could prepare their
//   output.
29
30// Encoding is a character set encoding that can be transformed to and from
31// UTF-8.
32type Encoding interface {
33	// NewDecoder returns a Decoder.
34	NewDecoder() *Decoder
35
36	// NewEncoder returns an Encoder.
37	NewEncoder() *Encoder
38}
39
40// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
41//
42// Transforming source bytes that are not of that encoding will not result in an
43// error per se. Each byte that cannot be transcoded will be represented in the
44// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
45type Decoder struct {
46	transform.Transformer
47
48	// This forces external creators of Decoders to use names in struct
49	// initializers, allowing for future extendibility without having to break
50	// code.
51	_ struct{}
52}
53
54// Bytes converts the given encoded bytes to UTF-8. It returns the converted
55// bytes or nil, err if any error occurred.
56func (d *Decoder) Bytes(b []byte) ([]byte, error) {
57	b, _, err := transform.Bytes(d, b)
58	if err != nil {
59		return nil, err
60	}
61	return b, nil
62}
63
64// String converts the given encoded string to UTF-8. It returns the converted
65// string or "", err if any error occurred.
66func (d *Decoder) String(s string) (string, error) {
67	s, _, err := transform.String(d, s)
68	if err != nil {
69		return "", err
70	}
71	return s, nil
72}
73
74// Reader wraps another Reader to decode its bytes.
75//
76// The Decoder may not be used for any other operation as long as the returned
77// Reader is in use.
78func (d *Decoder) Reader(r io.Reader) io.Reader {
79	return transform.NewReader(r, d)
80}
81
82// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
83//
84// Each rune that cannot be transcoded will result in an error. In this case,
85// the transform will consume all source byte up to, not including the offending
86// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
87// `\uFFFD`. To return early with an error instead, use transform.Chain to
88// preprocess the data with a UTF8Validator.
89type Encoder struct {
90	transform.Transformer
91
92	// This forces external creators of Encoders to use names in struct
93	// initializers, allowing for future extendibility without having to break
94	// code.
95	_ struct{}
96}
97
98// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
99// any error occurred.
100func (e *Encoder) Bytes(b []byte) ([]byte, error) {
101	b, _, err := transform.Bytes(e, b)
102	if err != nil {
103		return nil, err
104	}
105	return b, nil
106}
107
108// String converts a string from UTF-8. It returns the converted string or
109// "", err if any error occurred.
110func (e *Encoder) String(s string) (string, error) {
111	s, _, err := transform.String(e, s)
112	if err != nil {
113		return "", err
114	}
115	return s, nil
116}
117
118// Writer wraps another Writer to encode its UTF-8 output.
119//
120// The Encoder may not be used for any other operation as long as the returned
121// Writer is in use.
122func (e *Encoder) Writer(w io.Writer) io.Writer {
123	return transform.NewWriter(w, e)
124}
125
// ASCIISub is the ASCII substitute character (byte value 0x1A), as
// recommended by https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
129
130// Nop is the nop encoding. Its transformed bytes are the same as the source
131// bytes; it does not replace invalid UTF-8 sequences.
132var Nop Encoding = nop{}
133
134type nop struct{}
135
136func (nop) NewDecoder() *Decoder {
137	return &Decoder{Transformer: transform.Nop}
138}
139func (nop) NewEncoder() *Encoder {
140	return &Encoder{Transformer: transform.Nop}
141}
142
143// Replacement is the replacement encoding. Decoding from the replacement
144// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
145// the replacement encoding yields the same as the source bytes except that
146// invalid UTF-8 is converted to '\uFFFD'.
147//
148// It is defined at http://encoding.spec.whatwg.org/#replacement
149var Replacement Encoding = replacement{}
150
151type replacement struct{}
152
153func (replacement) NewDecoder() *Decoder {
154	return &Decoder{Transformer: replacementDecoder{}}
155}
156
157func (replacement) NewEncoder() *Encoder {
158	return &Encoder{Transformer: replacementEncoder{}}
159}
160
161func (replacement) ID() (mib identifier.MIB, other string) {
162	return identifier.Replacement, ""
163}
164
165type replacementDecoder struct{ transform.NopResetter }
166
167func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
168	if len(dst) < 3 {
169		return 0, 0, transform.ErrShortDst
170	}
171	if atEOF {
172		const fffd = "\ufffd"
173		dst[0] = fffd[0]
174		dst[1] = fffd[1]
175		dst[2] = fffd[2]
176		nDst = 3
177	}
178	return nDst, len(src), nil
179}
180
181type replacementEncoder struct{ transform.NopResetter }
182
183func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
184	r, size := rune(0), 0
185
186	for ; nSrc < len(src); nSrc += size {
187		r = rune(src[nSrc])
188
189		// Decode a 1-byte rune.
190		if r < utf8.RuneSelf {
191			size = 1
192
193		} else {
194			// Decode a multi-byte rune.
195			r, size = utf8.DecodeRune(src[nSrc:])
196			if size == 1 {
197				// All valid runes of size 1 (those below utf8.RuneSelf) were
198				// handled above. We have invalid UTF-8 or we haven't seen the
199				// full character yet.
200				if !atEOF && !utf8.FullRune(src[nSrc:]) {
201					err = transform.ErrShortSrc
202					break
203				}
204				r = '\ufffd'
205			}
206		}
207
208		if nDst+utf8.RuneLen(r) > len(dst) {
209			err = transform.ErrShortDst
210			break
211		}
212		nDst += utf8.EncodeRune(dst[nDst:], r)
213	}
214	return nDst, nSrc, err
215}
216
217// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
218// repertoire of the destination encoding with HTML escape sequences.
219//
220// This wrapper exists to comply to URL and HTML forms requiring a
221// non-terminating legacy encoder. The produced sequences may lead to data
222// loss as they are indistinguishable from legitimate input. To avoid this
223// issue, use UTF-8 encodings whenever possible.
224func HTMLEscapeUnsupported(e *Encoder) *Encoder {
225	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
226}
227
228// ReplaceUnsupported wraps encoders to replace source runes outside the
229// repertoire of the destination encoding with an encoding-specific
230// replacement.
231//
232// This wrapper is only provided for backwards compatibility and legacy
233// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
234func ReplaceUnsupported(e *Encoder) *Encoder {
235	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
236}
237
238type errorHandler struct {
239	*Encoder
240	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
241}
242
243// TODO: consider making this error public in some form.
244type repertoireError interface {
245	Replacement() byte
246}
247
248func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
249	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
250	for err != nil {
251		rerr, ok := err.(repertoireError)
252		if !ok {
253			return nDst, nSrc, err
254		}
255		r, sz := utf8.DecodeRune(src[nSrc:])
256		n, ok := h.handler(dst[nDst:], r, rerr)
257		if !ok {
258			return nDst, nSrc, transform.ErrShortDst
259		}
260		err = nil
261		nDst += n
262		if nSrc += sz; nSrc < len(src) {
263			var dn, sn int
264			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
265			nDst += dn
266			nSrc += sn
267		}
268	}
269	return nDst, nSrc, err
270}
271
272func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
273	buf := [8]byte{}
274	b := strconv.AppendUint(buf[:0], uint64(r), 10)
275	if n = len(b) + len("&#;"); n >= len(dst) {
276		return 0, false
277	}
278	dst[0] = '&'
279	dst[1] = '#'
280	dst[copy(dst[2:], b)+2] = ';'
281	return n, true
282}
283
284func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
285	if len(dst) == 0 {
286		return 0, false
287	}
288	dst[0] = err.Replacement()
289	return 1, true
290}
291
// ErrInvalidUTF8 is the error a transformer returns to report that its input
// contained invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
294
295// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
296// input byte that is not valid UTF-8.
297var UTF8Validator transform.Transformer = utf8Validator{}
298
299type utf8Validator struct{ transform.NopResetter }
300
301func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
302	n := len(src)
303	if n > len(dst) {
304		n = len(dst)
305	}
306	for i := 0; i < n; {
307		if c := src[i]; c < utf8.RuneSelf {
308			dst[i] = c
309			i++
310			continue
311		}
312		_, size := utf8.DecodeRune(src[i:])
313		if size == 1 {
314			// All valid runes of size 1 (those below utf8.RuneSelf) were
315			// handled above. We have invalid UTF-8 or we haven't seen the
316			// full character yet.
317			err = ErrInvalidUTF8
318			if !atEOF && !utf8.FullRune(src[i:]) {
319				err = transform.ErrShortSrc
320			}
321			return i, i, err
322		}
323		if i+size > len(dst) {
324			return i, i, transform.ErrShortDst
325		}
326		for ; size > 0; size-- {
327			dst[i] = src[i]
328			i++
329		}
330	}
331	if len(src) > len(dst) {
332		err = transform.ErrShortDst
333	}
334	return n, n, err
335}
336