1// Package utfbom implements the detection of the BOM (Unicode Byte Order Mark) and removing as necessary. 2// It wraps an io.Reader object, creating another object (Reader) that also implements the io.Reader 3// interface but provides automatic BOM checking and removing as necessary. 4package utfbom 5 6import ( 7 "errors" 8 "io" 9) 10 11// Encoding is type alias for detected UTF encoding. 12type Encoding int 13 14// Constants to identify detected UTF encodings. 15const ( 16 // Unknown encoding, returned when no BOM was detected 17 Unknown Encoding = iota 18 19 // UTF8, BOM bytes: EF BB BF 20 UTF8 21 22 // UTF-16, big-endian, BOM bytes: FE FF 23 UTF16BigEndian 24 25 // UTF-16, little-endian, BOM bytes: FF FE 26 UTF16LittleEndian 27 28 // UTF-32, big-endian, BOM bytes: 00 00 FE FF 29 UTF32BigEndian 30 31 // UTF-32, little-endian, BOM bytes: FF FE 00 00 32 UTF32LittleEndian 33) 34 35const maxConsecutiveEmptyReads = 100 36 37// Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary. 38// It also returns the encoding detected by the BOM. 39// If the detected encoding is not needed, you can call the SkipOnly function. 40func Skip(rd io.Reader) (*Reader, Encoding) { 41 // Is it already a Reader? 42 b, ok := rd.(*Reader) 43 if ok { 44 return b, Unknown 45 } 46 47 enc, left, err := detectUtf(rd) 48 return &Reader{ 49 rd: rd, 50 buf: left, 51 err: err, 52 }, enc 53} 54 55// SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary. 56func SkipOnly(rd io.Reader) *Reader { 57 r, _ := Skip(rd) 58 return r 59} 60 61// Reader implements automatic BOM (Unicode Byte Order Mark) checking and 62// removing as necessary for an io.Reader object. 63type Reader struct { 64 rd io.Reader // reader provided by the client 65 buf []byte // buffered data 66 err error // last error 67} 68 69// Read is an implementation of io.Reader interface. 70// The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary. 71func (r *Reader) Read(p []byte) (n int, err error) { 72 if len(p) == 0 { 73 return 0, nil 74 } 75 76 if r.buf == nil { 77 if r.err != nil { 78 return 0, r.readErr() 79 } 80 81 return r.rd.Read(p) 82 } 83 84 // copy as much as we can 85 n = copy(p, r.buf) 86 r.buf = nilIfEmpty(r.buf[n:]) 87 return n, nil 88} 89 90func (r *Reader) readErr() error { 91 err := r.err 92 r.err = nil 93 return err 94} 95 96var errNegativeRead = errors.New("utfbom: reader returned negative count from Read") 97 98func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) { 99 buf, err = readBOM(rd) 100 101 if len(buf) >= 4 { 102 if isUTF32BigEndianBOM4(buf) { 103 return UTF32BigEndian, nilIfEmpty(buf[4:]), err 104 } 105 if isUTF32LittleEndianBOM4(buf) { 106 return UTF32LittleEndian, nilIfEmpty(buf[4:]), err 107 } 108 } 109 110 if len(buf) > 2 && isUTF8BOM3(buf) { 111 return UTF8, nilIfEmpty(buf[3:]), err 112 } 113 114 if (err != nil && err != io.EOF) || (len(buf) < 2) { 115 return Unknown, nilIfEmpty(buf), err 116 } 117 118 if isUTF16BigEndianBOM2(buf) { 119 return UTF16BigEndian, nilIfEmpty(buf[2:]), err 120 } 121 if isUTF16LittleEndianBOM2(buf) { 122 return UTF16LittleEndian, nilIfEmpty(buf[2:]), err 123 } 124 125 return Unknown, nilIfEmpty(buf), err 126} 127 128func readBOM(rd io.Reader) (buf []byte, err error) { 129 const maxBOMSize = 4 130 var bom [maxBOMSize]byte // used to read BOM 131 132 // read as many bytes as possible 133 for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] { 134 if n, err = rd.Read(bom[len(buf):]); n < 0 { 135 panic(errNegativeRead) 136 } 137 if n > 0 { 138 nEmpty = 0 139 } else { 140 nEmpty++ 141 if nEmpty >= maxConsecutiveEmptyReads { 142 err = io.ErrNoProgress 143 } 144 } 145 } 146 return 147} 148 149func isUTF32BigEndianBOM4(buf []byte) bool { 150 return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF 151} 152 153func isUTF32LittleEndianBOM4(buf []byte) bool { 154 return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00 155} 156 157func isUTF8BOM3(buf []byte) bool { 158 return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF 159} 160 161func isUTF16BigEndianBOM2(buf []byte) bool { 162 return buf[0] == 0xFE && buf[1] == 0xFF 163} 164 165func isUTF16LittleEndianBOM2(buf []byte) bool { 166 return buf[0] == 0xFF && buf[1] == 0xFE 167} 168 169func nilIfEmpty(buf []byte) (res []byte) { 170 if len(buf) > 0 { 171 res = buf 172 } 173 return 174} 175