// Package utfbom detects a leading BOM (Unicode Byte Order Mark) in a stream and removes it.
// It wraps an io.Reader in another object (Reader) that also implements the io.Reader
// interface and transparently checks for a BOM, stripping it when one is present.
4package utfbom
5
6import (
7	"errors"
8	"io"
9)
10
// Encoding identifies the UTF encoding detected from a BOM.
// It is a distinct integer type; its valid values are the constants declared
// in this package (Unknown, UTF8, UTF16BigEndian, ...).
type Encoding int
13
// Constants identifying the UTF encodings that can be detected from a BOM.
const (
	// Unknown encoding, returned when no BOM was detected.
	// It is the zero value of Encoding.
	Unknown Encoding = iota

	// UTF8, BOM bytes: EF BB BF.
	UTF8

	// UTF16BigEndian is UTF-16, big-endian, BOM bytes: FE FF.
	UTF16BigEndian

	// UTF16LittleEndian is UTF-16, little-endian, BOM bytes: FF FE.
	UTF16LittleEndian

	// UTF32BigEndian is UTF-32, big-endian, BOM bytes: 00 00 FE FF.
	UTF32BigEndian

	// UTF32LittleEndian is UTF-32, little-endian, BOM bytes: FF FE 00 00.
	UTF32LittleEndian
)
34
// maxConsecutiveEmptyReads bounds how many consecutive reads returning no
// data readBOM tolerates from the underlying reader before it gives up.
const maxConsecutiveEmptyReads = 100
36
37// Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
38// It also returns the encoding detected by the BOM.
39// If the detected encoding is not needed, you can call the SkipOnly function.
40func Skip(rd io.Reader) (*Reader, Encoding) {
41	// Is it already a Reader?
42	b, ok := rd.(*Reader)
43	if ok {
44		return b, Unknown
45	}
46
47	enc, left, err := detectUtf(rd)
48	return &Reader{
49		rd:  rd,
50		buf: left,
51		err: err,
52	}, enc
53}
54
55// SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
56func SkipOnly(rd io.Reader) *Reader {
57	r, _ := Skip(rd)
58	return r
59}
60
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
// Values are created by Skip or SkipOnly.
type Reader struct {
	rd  io.Reader // reader provided by the client
	buf []byte    // bytes read during BOM detection that are not part of the BOM; nil once drained
	err error     // error met during BOM detection; Read returns it once after buf is drained, then clears it
}
68
69// Read is an implementation of io.Reader interface.
70// The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary.
71func (r *Reader) Read(p []byte) (n int, err error) {
72	if len(p) == 0 {
73		return 0, nil
74	}
75
76	if r.buf == nil {
77		if r.err != nil {
78			return 0, r.readErr()
79		}
80
81		return r.rd.Read(p)
82	}
83
84	// copy as much as we can
85	n = copy(p, r.buf)
86	r.buf = nilIfEmpty(r.buf[n:])
87	return n, nil
88}
89
90func (r *Reader) readErr() error {
91	err := r.err
92	r.err = nil
93	return err
94}
95
// errNegativeRead is the value readBOM panics with when the underlying
// reader returns a negative byte count from Read.
var errNegativeRead = errors.New("utfbom: reader returned negative count from Read")
97
98func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
99	buf, err = readBOM(rd)
100
101	if len(buf) >= 4 {
102		if isUTF32BigEndianBOM4(buf) {
103			return UTF32BigEndian, nilIfEmpty(buf[4:]), err
104		}
105		if isUTF32LittleEndianBOM4(buf) {
106			return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
107		}
108	}
109
110	if len(buf) > 2 && isUTF8BOM3(buf) {
111		return UTF8, nilIfEmpty(buf[3:]), err
112	}
113
114	if (err != nil && err != io.EOF) || (len(buf) < 2) {
115		return Unknown, nilIfEmpty(buf), err
116	}
117
118	if isUTF16BigEndianBOM2(buf) {
119		return UTF16BigEndian, nilIfEmpty(buf[2:]), err
120	}
121	if isUTF16LittleEndianBOM2(buf) {
122		return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
123	}
124
125	return Unknown, nilIfEmpty(buf), err
126}
127
128func readBOM(rd io.Reader) (buf []byte, err error) {
129	const maxBOMSize = 4
130	var bom [maxBOMSize]byte // used to read BOM
131
132	// read as many bytes as possible
133	for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
134		if n, err = rd.Read(bom[len(buf):]); n < 0 {
135			panic(errNegativeRead)
136		}
137		if n > 0 {
138			nEmpty = 0
139		} else {
140			nEmpty++
141			if nEmpty >= maxConsecutiveEmptyReads {
142				err = io.ErrNoProgress
143			}
144		}
145	}
146	return
147}
148
// isUTF32BigEndianBOM4 reports whether buf starts with the UTF-32 big-endian
// BOM (00 00 FE FF). Bytes are compared in order and the comparison stops at
// the first mismatch; the caller must supply at least four bytes for a full
// comparison.
func isUTF32BigEndianBOM4(buf []byte) bool {
	for i, want := range [...]byte{0x00, 0x00, 0xFE, 0xFF} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
152
// isUTF32LittleEndianBOM4 reports whether buf starts with the UTF-32
// little-endian BOM (FF FE 00 00). Bytes are compared in order and the
// comparison stops at the first mismatch; the caller must supply at least
// four bytes for a full comparison.
func isUTF32LittleEndianBOM4(buf []byte) bool {
	for i, want := range [...]byte{0xFF, 0xFE, 0x00, 0x00} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
156
// isUTF8BOM3 reports whether buf starts with the UTF-8 BOM (EF BB BF).
// Bytes are compared in order and the comparison stops at the first
// mismatch; the caller must supply at least three bytes for a full comparison.
func isUTF8BOM3(buf []byte) bool {
	for i, want := range [...]byte{0xEF, 0xBB, 0xBF} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
160
// isUTF16BigEndianBOM2 reports whether buf starts with the UTF-16 big-endian
// BOM (FE FF). The second byte is only examined when the first one matches;
// the caller must supply at least two bytes for a full comparison.
func isUTF16BigEndianBOM2(buf []byte) bool {
	if buf[0] != 0xFE {
		return false
	}
	return buf[1] == 0xFF
}
164
// isUTF16LittleEndianBOM2 reports whether buf starts with the UTF-16
// little-endian BOM (FF FE). The second byte is only examined when the first
// one matches; the caller must supply at least two bytes for a full comparison.
func isUTF16LittleEndianBOM2(buf []byte) bool {
	if buf[0] != 0xFF {
		return false
	}
	return buf[1] == 0xFE
}
168
// nilIfEmpty returns buf unchanged when it holds at least one byte and nil
// otherwise, so callers can use a nil check to mean "no data left".
func nilIfEmpty(buf []byte) (res []byte) {
	if len(buf) == 0 {
		return nil
	}
	return buf
}
175