// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding // import "golang.org/x/text/encoding"

import (
	"errors"
	"io"
	"strconv"
	"unicode/utf8"

	"golang.org/x/text/encoding/internal/identifier"
	"golang.org/x/text/transform"
)

// TODO:
// - There seems to be some inconsistency in when decoders return errors
//   and when not. Also documentation seems to suggest they shouldn't return
//   errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
//   normal form. Perhaps add an example how users could prepare their output.

// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
	// NewDecoder returns a Decoder.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder.
	NewEncoder() *Encoder
}

// A Decoder converts bytes to UTF-8. It implements transform.Transformer by
// embedding one.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
	transform.Transformer

	// This forces external creators of Decoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}

// Bytes converts the given encoded bytes to UTF-8. It returns the converted
// bytes or nil, err if any error occurred.
56func (d *Decoder) Bytes(b []byte) ([]byte, error) { 57 b, _, err := transform.Bytes(d, b) 58 if err != nil { 59 return nil, err 60 } 61 return b, nil 62} 63 64// String converts the given encoded string to UTF-8. It returns the converted 65// string or "", err if any error occurred. 66func (d *Decoder) String(s string) (string, error) { 67 s, _, err := transform.String(d, s) 68 if err != nil { 69 return "", err 70 } 71 return s, nil 72} 73 74// Reader wraps another Reader to decode its bytes. 75// 76// The Decoder may not be used for any other operation as long as the returned 77// Reader is in use. 78func (d *Decoder) Reader(r io.Reader) io.Reader { 79 return transform.NewReader(r, d) 80} 81 82// An Encoder converts bytes from UTF-8. It implements transform.Transformer. 83// 84// Each rune that cannot be transcoded will result in an error. In this case, 85// the transform will consume all source byte up to, not including the offending 86// rune. Transforming source bytes that are not valid UTF-8 will be replaced by 87// `\uFFFD`. To return early with an error instead, use transform.Chain to 88// preprocess the data with a UTF8Validator. 89type Encoder struct { 90 transform.Transformer 91 92 // This forces external creators of Encoders to use names in struct 93 // initializers, allowing for future extendibility without having to break 94 // code. 95 _ struct{} 96} 97 98// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if 99// any error occurred. 100func (e *Encoder) Bytes(b []byte) ([]byte, error) { 101 b, _, err := transform.Bytes(e, b) 102 if err != nil { 103 return nil, err 104 } 105 return b, nil 106} 107 108// String converts a string from UTF-8. It returns the converted string or 109// "", err if any error occurred. 
110func (e *Encoder) String(s string) (string, error) { 111 s, _, err := transform.String(e, s) 112 if err != nil { 113 return "", err 114 } 115 return s, nil 116} 117 118// Writer wraps another Writer to encode its UTF-8 output. 119// 120// The Encoder may not be used for any other operation as long as the returned 121// Writer is in use. 122func (e *Encoder) Writer(w io.Writer) io.Writer { 123 return transform.NewWriter(w, e) 124} 125 126// ASCIISub is the ASCII substitute character, as recommended by 127// https://unicode.org/reports/tr36/#Text_Comparison 128const ASCIISub = '\x1a' 129 130// Nop is the nop encoding. Its transformed bytes are the same as the source 131// bytes; it does not replace invalid UTF-8 sequences. 132var Nop Encoding = nop{} 133 134type nop struct{} 135 136func (nop) NewDecoder() *Decoder { 137 return &Decoder{Transformer: transform.Nop} 138} 139func (nop) NewEncoder() *Encoder { 140 return &Encoder{Transformer: transform.Nop} 141} 142 143// Replacement is the replacement encoding. Decoding from the replacement 144// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to 145// the replacement encoding yields the same as the source bytes except that 146// invalid UTF-8 is converted to '\uFFFD'. 
147// 148// It is defined at http://encoding.spec.whatwg.org/#replacement 149var Replacement Encoding = replacement{} 150 151type replacement struct{} 152 153func (replacement) NewDecoder() *Decoder { 154 return &Decoder{Transformer: replacementDecoder{}} 155} 156 157func (replacement) NewEncoder() *Encoder { 158 return &Encoder{Transformer: replacementEncoder{}} 159} 160 161func (replacement) ID() (mib identifier.MIB, other string) { 162 return identifier.Replacement, "" 163} 164 165type replacementDecoder struct{ transform.NopResetter } 166 167func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 168 if len(dst) < 3 { 169 return 0, 0, transform.ErrShortDst 170 } 171 if atEOF { 172 const fffd = "\ufffd" 173 dst[0] = fffd[0] 174 dst[1] = fffd[1] 175 dst[2] = fffd[2] 176 nDst = 3 177 } 178 return nDst, len(src), nil 179} 180 181type replacementEncoder struct{ transform.NopResetter } 182 183func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 184 r, size := rune(0), 0 185 186 for ; nSrc < len(src); nSrc += size { 187 r = rune(src[nSrc]) 188 189 // Decode a 1-byte rune. 190 if r < utf8.RuneSelf { 191 size = 1 192 193 } else { 194 // Decode a multi-byte rune. 195 r, size = utf8.DecodeRune(src[nSrc:]) 196 if size == 1 { 197 // All valid runes of size 1 (those below utf8.RuneSelf) were 198 // handled above. We have invalid UTF-8 or we haven't seen the 199 // full character yet. 200 if !atEOF && !utf8.FullRune(src[nSrc:]) { 201 err = transform.ErrShortSrc 202 break 203 } 204 r = '\ufffd' 205 } 206 } 207 208 if nDst+utf8.RuneLen(r) > len(dst) { 209 err = transform.ErrShortDst 210 break 211 } 212 nDst += utf8.EncodeRune(dst[nDst:], r) 213 } 214 return nDst, nSrc, err 215} 216 217// HTMLEscapeUnsupported wraps encoders to replace source runes outside the 218// repertoire of the destination encoding with HTML escape sequences. 
219// 220// This wrapper exists to comply to URL and HTML forms requiring a 221// non-terminating legacy encoder. The produced sequences may lead to data 222// loss as they are indistinguishable from legitimate input. To avoid this 223// issue, use UTF-8 encodings whenever possible. 224func HTMLEscapeUnsupported(e *Encoder) *Encoder { 225 return &Encoder{Transformer: &errorHandler{e, errorToHTML}} 226} 227 228// ReplaceUnsupported wraps encoders to replace source runes outside the 229// repertoire of the destination encoding with an encoding-specific 230// replacement. 231// 232// This wrapper is only provided for backwards compatibility and legacy 233// handling. Its use is strongly discouraged. Use UTF-8 whenever possible. 234func ReplaceUnsupported(e *Encoder) *Encoder { 235 return &Encoder{Transformer: &errorHandler{e, errorToReplacement}} 236} 237 238type errorHandler struct { 239 *Encoder 240 handler func(dst []byte, r rune, err repertoireError) (n int, ok bool) 241} 242 243// TODO: consider making this error public in some form. 
244type repertoireError interface { 245 Replacement() byte 246} 247 248func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 249 nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF) 250 for err != nil { 251 rerr, ok := err.(repertoireError) 252 if !ok { 253 return nDst, nSrc, err 254 } 255 r, sz := utf8.DecodeRune(src[nSrc:]) 256 n, ok := h.handler(dst[nDst:], r, rerr) 257 if !ok { 258 return nDst, nSrc, transform.ErrShortDst 259 } 260 err = nil 261 nDst += n 262 if nSrc += sz; nSrc < len(src) { 263 var dn, sn int 264 dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF) 265 nDst += dn 266 nSrc += sn 267 } 268 } 269 return nDst, nSrc, err 270} 271 272func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) { 273 buf := [8]byte{} 274 b := strconv.AppendUint(buf[:0], uint64(r), 10) 275 if n = len(b) + len("&#;"); n >= len(dst) { 276 return 0, false 277 } 278 dst[0] = '&' 279 dst[1] = '#' 280 dst[copy(dst[2:], b)+2] = ';' 281 return n, true 282} 283 284func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) { 285 if len(dst) == 0 { 286 return 0, false 287 } 288 dst[0] = err.Replacement() 289 return 1, true 290} 291 292// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8. 293var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8") 294 295// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first 296// input byte that is not valid UTF-8. 
297var UTF8Validator transform.Transformer = utf8Validator{} 298 299type utf8Validator struct{ transform.NopResetter } 300 301func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 302 n := len(src) 303 if n > len(dst) { 304 n = len(dst) 305 } 306 for i := 0; i < n; { 307 if c := src[i]; c < utf8.RuneSelf { 308 dst[i] = c 309 i++ 310 continue 311 } 312 _, size := utf8.DecodeRune(src[i:]) 313 if size == 1 { 314 // All valid runes of size 1 (those below utf8.RuneSelf) were 315 // handled above. We have invalid UTF-8 or we haven't seen the 316 // full character yet. 317 err = ErrInvalidUTF8 318 if !atEOF && !utf8.FullRune(src[i:]) { 319 err = transform.ErrShortSrc 320 } 321 return i, i, err 322 } 323 if i+size > len(dst) { 324 return i, i, transform.ErrShortDst 325 } 326 for ; size > 0; size-- { 327 dst[i] = src[i] 328 i++ 329 } 330 } 331 if len(src) > len(dst) { 332 err = transform.ErrShortDst 333 } 334 return n, n, err 335} 336