1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package utf32 provides the UTF-32 Unicode encoding.
6//
7// Please note that support for UTF-32 is discouraged as it is a rare and
8// inefficient encoding, unfit for use as an interchange format. For use
9// on the web, the W3C strongly discourages its use
10// (https://www.w3.org/TR/html5/document-metadata.html#charset)
11// while WHATWG directly prohibits supporting it
12// (https://html.spec.whatwg.org/multipage/syntax.html#character-encodings).
13package utf32 // import "golang.org/x/text/encoding/unicode/utf32"
14
15import (
16	"errors"
17	"unicode/utf8"
18
19	"golang.org/x/text/encoding"
20	"golang.org/x/text/encoding/internal/identifier"
21	"golang.org/x/text/transform"
22)
23
// All lists a configuration for each IANA-defined UTF-32 variant.
var All = []encoding.Encoding{
	// Maps to identifier.UTF32: a BOM is written when encoding and honoured
	// when decoding, with big-endian as the default byte order.
	UTF32(BigEndian, UseBOM),
	// Maps to identifier.UTF32BE: big-endian, BOMs are not interpreted.
	UTF32(BigEndian, IgnoreBOM),
	// Maps to identifier.UTF32LE: little-endian, BOMs are not interpreted.
	UTF32(LittleEndian, IgnoreBOM),
}
30
// ErrMissingBOM means that decoding UTF-32 input with ExpectBOM did not
// find a starting byte order mark. The decoder returns it when the input is
// empty at EOF or when the first four bytes are not a BOM.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
34
35// UTF32 returns a UTF-32 Encoding for the given default endianness and
36// byte order mark (BOM) policy.
37//
38// When decoding from UTF-32 to UTF-8, if the BOMPolicy is IgnoreBOM then
39// neither BOMs U+FEFF nor ill-formed code units 0xFFFE0000 in the input
40// stream will affect the endianness used for decoding. Instead BOMs will
41// be output as their standard UTF-8 encoding "\xef\xbb\xbf" while
42// 0xFFFE0000 code units will be output as "\xef\xbf\xbd", the standard
43// UTF-8 encoding for the Unicode replacement character. If the BOMPolicy
44// is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8
45// output. Instead, it overrides the default endianness e for the remainder
46// of the transformation. Any subsequent BOMs U+FEFF or ill-formed code
47// units 0xFFFE0000 will not affect the endianness used, and will instead
48// be output as their standard UTF-8 (replacement) encodings. For UseBOM,
49// if there is no starting BOM, it will proceed with the default
50// Endianness. For ExpectBOM, in that case, the transformation will return
51// early with an ErrMissingBOM error.
52//
53// When encoding from UTF-8 to UTF-32, a BOM will be inserted at the start
54// of the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM
55// will not be inserted. The UTF-8 input does not need to contain a BOM.
56//
57// There is no concept of a 'native' endianness. If the UTF-32 data is
58// produced and consumed in a greater context that implies a certain
59// endianness, use IgnoreBOM. Otherwise, use ExpectBOM and always produce
60// and consume a BOM.
61//
62// In the language of https://www.unicode.org/faq/utf_bom.html#bom10,
63// IgnoreBOM corresponds to "Where the precise type of the data stream is
64// known... the BOM should not be used" and ExpectBOM corresponds to "A
65// particular protocol... may require use of the BOM".
66func UTF32(e Endianness, b BOMPolicy) encoding.Encoding {
67	return utf32Encoding{config{e, b}, mibValue[e][b&bomMask]}
68}
69
70// mibValue maps Endianness and BOMPolicy settings to MIB constants for UTF-32.
71// Note that some configurations map to the same MIB identifier.
72var mibValue = map[Endianness][numBOMValues]identifier.MIB{
73	BigEndian: [numBOMValues]identifier.MIB{
74		IgnoreBOM: identifier.UTF32BE,
75		UseBOM:    identifier.UTF32,
76	},
77	LittleEndian: [numBOMValues]identifier.MIB{
78		IgnoreBOM: identifier.UTF32LE,
79		UseBOM:    identifier.UTF32,
80	},
81	// ExpectBOM is not widely used and has no valid MIB identifier.
82}
83
// BOMPolicy is a UTF-32 encoding's byte order mark policy.
//
// The exported values IgnoreBOM, UseBOM and ExpectBOM are combinations of
// the unexported flag bits writeBOM, acceptBOM and requireBOM.
type BOMPolicy uint8
86
const (
	// Flag bits from which the exported policies below are composed:
	//
	//	writeBOM   the encoder writes a BOM at the start of its output.
	//	acceptBOM  the decoder looks for a BOM at the start of its input.
	//	requireBOM the decoder reports ErrMissingBOM if no starting BOM is found.
	//	bomMask    all flag bits; UTF32 masks the policy with it before
	//	           indexing mibValue.
	writeBOM   BOMPolicy = 0x01
	acceptBOM  BOMPolicy = 0x02
	requireBOM BOMPolicy = 0x04
	bomMask    BOMPolicy = 0x07

	// numBOMValues is the length of the per-endianness arrays in mibValue,
	// which are indexed by a masked BOMPolicy.
	//
	// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
	// map of an array of length 8 of a type that is also used as a key or value
	// in another map). See golang.org/issue/11354.
	// TODO: consider changing this value back to 8 if the use of 1.4.* has
	// been minimized.
	numBOMValues = 8 + 1

	// IgnoreBOM means to ignore any byte order marks.
	// This is the Unicode-compliant interpretation for UTF-32BE/LE.
	IgnoreBOM BOMPolicy = 0

	// UseBOM means that the UTF-32 form may start with a byte order mark,
	// which will be used to override the default encoding.
	// This is the Unicode-compliant interpretation for UTF-32.
	UseBOM BOMPolicy = writeBOM | acceptBOM

	// ExpectBOM means that the UTF-32 form must start with a byte order mark,
	// which will be used to override the default encoding.
	// This is consistent with the BOMPolicy definition in
	// golang.org/x/text/encoding/unicode.
	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
)
114
// Endianness is a UTF-32 encoding's default endianness. The decoder may
// replace the default with the byte order indicated by a starting BOM when
// the BOM policy accepts one.
type Endianness bool
117
const (
	// BigEndian is UTF-32BE. It is the zero value of Endianness.
	BigEndian Endianness = false
	// LittleEndian is UTF-32LE.
	LittleEndian Endianness = true
)
124
// config holds the two settings that define a UTF-32 variant. The decoder
// keeps one copy as its initial state and one as its working state.
type config struct {
	endianness Endianness // byte order used when no BOM overrides it
	bomPolicy  BOMPolicy  // how byte order marks are written and interpreted
}
129
// utf32Encoding is the Encoding value returned by UTF32. It carries the
// requested configuration together with the MIB identifier looked up for it.
type utf32Encoding struct {
	config
	// mib is the value reported by ID. It is the zero value for
	// configurations that have no entry in mibValue.
	mib identifier.MIB
}
134
135func (u utf32Encoding) NewDecoder() *encoding.Decoder {
136	return &encoding.Decoder{Transformer: &utf32Decoder{
137		initial: u.config,
138		current: u.config,
139	}}
140}
141
142func (u utf32Encoding) NewEncoder() *encoding.Encoder {
143	return &encoding.Encoder{Transformer: &utf32Encoder{
144		endianness:       u.endianness,
145		initialBOMPolicy: u.bomPolicy,
146		currentBOMPolicy: u.bomPolicy,
147	}}
148}
149
150func (u utf32Encoding) ID() (mib identifier.MIB, other string) {
151	return u.mib, ""
152}
153
154func (u utf32Encoding) String() string {
155	e, b := "B", ""
156	if u.endianness == LittleEndian {
157		e = "L"
158	}
159	switch u.bomPolicy {
160	case ExpectBOM:
161		b = "Expect"
162	case UseBOM:
163		b = "Use"
164	case IgnoreBOM:
165		b = "Ignore"
166	}
167	return "UTF-32" + e + "E (" + b + " BOM)"
168}
169
// utf32Decoder is the Transformer that converts UTF-32 to UTF-8.
type utf32Decoder struct {
	// initial is the configuration the decoder was created with; Reset
	// restores it.
	initial config
	// current is the working configuration. Transform updates its
	// endianness from a starting BOM and sets its bomPolicy to IgnoreBOM
	// once the start of the input has been handled.
	current config
}
174
175func (u *utf32Decoder) Reset() {
176	u.current = u.initial
177}
178
179func (u *utf32Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
180	if len(src) == 0 {
181		if atEOF && u.current.bomPolicy&requireBOM != 0 {
182			return 0, 0, ErrMissingBOM
183		}
184		return 0, 0, nil
185	}
186	if u.current.bomPolicy&acceptBOM != 0 {
187		if len(src) < 4 {
188			return 0, 0, transform.ErrShortSrc
189		}
190		switch {
191		case src[0] == 0x00 && src[1] == 0x00 && src[2] == 0xfe && src[3] == 0xff:
192			u.current.endianness = BigEndian
193			nSrc = 4
194		case src[0] == 0xff && src[1] == 0xfe && src[2] == 0x00 && src[3] == 0x00:
195			u.current.endianness = LittleEndian
196			nSrc = 4
197		default:
198			if u.current.bomPolicy&requireBOM != 0 {
199				return 0, 0, ErrMissingBOM
200			}
201		}
202		u.current.bomPolicy = IgnoreBOM
203	}
204
205	var r rune
206	var dSize, sSize int
207	for nSrc < len(src) {
208		if nSrc+3 < len(src) {
209			x := uint32(src[nSrc+0])<<24 | uint32(src[nSrc+1])<<16 |
210				uint32(src[nSrc+2])<<8 | uint32(src[nSrc+3])
211			if u.current.endianness == LittleEndian {
212				x = x>>24 | (x >> 8 & 0x0000FF00) | (x << 8 & 0x00FF0000) | x<<24
213			}
214			r, sSize = rune(x), 4
215			if dSize = utf8.RuneLen(r); dSize < 0 {
216				r, dSize = utf8.RuneError, 3
217			}
218		} else if atEOF {
219			// 1..3 trailing bytes.
220			r, dSize, sSize = utf8.RuneError, 3, len(src)-nSrc
221		} else {
222			err = transform.ErrShortSrc
223			break
224		}
225		if nDst+dSize > len(dst) {
226			err = transform.ErrShortDst
227			break
228		}
229		nDst += utf8.EncodeRune(dst[nDst:], r)
230		nSrc += sSize
231	}
232	return nDst, nSrc, err
233}
234
// utf32Encoder is the Transformer that converts UTF-8 to UTF-32.
type utf32Encoder struct {
	// endianness is the byte order of the output, including the BOM.
	endianness Endianness
	// initialBOMPolicy is the policy the encoder was created with; Reset
	// restores it.
	initialBOMPolicy BOMPolicy
	// currentBOMPolicy is the working policy. Transform sets it to
	// IgnoreBOM after writing the BOM so the BOM is written only once.
	currentBOMPolicy BOMPolicy
}
240
241func (u *utf32Encoder) Reset() {
242	u.currentBOMPolicy = u.initialBOMPolicy
243}
244
245func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
246	if u.currentBOMPolicy&writeBOM != 0 {
247		if len(dst) < 4 {
248			return 0, 0, transform.ErrShortDst
249		}
250		dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff
251		u.currentBOMPolicy = IgnoreBOM
252		nDst = 4
253	}
254
255	r, size := rune(0), 0
256	for nSrc < len(src) {
257		r = rune(src[nSrc])
258
259		// Decode a 1-byte rune.
260		if r < utf8.RuneSelf {
261			size = 1
262
263		} else {
264			// Decode a multi-byte rune.
265			r, size = utf8.DecodeRune(src[nSrc:])
266			if size == 1 {
267				// All valid runes of size 1 (those below utf8.RuneSelf) were
268				// handled above. We have invalid UTF-8 or we haven't seen the
269				// full character yet.
270				if !atEOF && !utf8.FullRune(src[nSrc:]) {
271					err = transform.ErrShortSrc
272					break
273				}
274			}
275		}
276
277		if nDst+4 > len(dst) {
278			err = transform.ErrShortDst
279			break
280		}
281
282		dst[nDst+0] = uint8(r >> 24)
283		dst[nDst+1] = uint8(r >> 16)
284		dst[nDst+2] = uint8(r >> 8)
285		dst[nDst+3] = uint8(r)
286		nDst += 4
287		nSrc += size
288	}
289
290	if u.endianness == LittleEndian {
291		for i := 0; i < nDst; i += 4 {
292			dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i]
293		}
294	}
295	return nDst, nSrc, err
296}
297