1// Copyright 2013 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package unicode provides Unicode encodings such as UTF-16. 6package unicode // import "golang.org/x/text/encoding/unicode" 7 8import ( 9 "errors" 10 "unicode/utf16" 11 "unicode/utf8" 12 13 "golang.org/x/text/encoding" 14 "golang.org/x/text/encoding/internal" 15 "golang.org/x/text/encoding/internal/identifier" 16 "golang.org/x/text/internal/utf8internal" 17 "golang.org/x/text/runes" 18 "golang.org/x/text/transform" 19) 20 21// TODO: I think the Transformers really should return errors on unmatched 22// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781, 23// which leaves it open, but is suggested by WhatWG. It will allow for all error 24// modes as defined by WhatWG: fatal, HTML and Replacement. This would require 25// the introduction of some kind of error type for conveying the erroneous code 26// point. 27 28// UTF8 is the UTF-8 encoding. 29var UTF8 encoding.Encoding = utf8enc 30 31var utf8enc = &internal.Encoding{ 32 &internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()}, 33 "UTF-8", 34 identifier.UTF8, 35} 36 37type utf8Decoder struct{ transform.NopResetter } 38 39func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 40 var pSrc int // point from which to start copy in src 41 var accept utf8internal.AcceptRange 42 43 // The decoder can only make the input larger, not smaller. 44 n := len(src) 45 if len(dst) < n { 46 err = transform.ErrShortDst 47 n = len(dst) 48 atEOF = false 49 } 50 for nSrc < n { 51 c := src[nSrc] 52 if c < utf8.RuneSelf { 53 nSrc++ 54 continue 55 } 56 first := utf8internal.First[c] 57 size := int(first & utf8internal.SizeMask) 58 if first == utf8internal.FirstInvalid { 59 goto handleInvalid // invalid starter byte 60 } 61 accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift] 62 if nSrc+size > n { 63 if !atEOF { 64 // We may stop earlier than necessary here if the short sequence 65 // has invalid bytes. Not checking for this simplifies the code 66 // and may avoid duplicate computations in certain conditions. 67 if err == nil { 68 err = transform.ErrShortSrc 69 } 70 break 71 } 72 // Determine the maximal subpart of an ill-formed subsequence. 73 switch { 74 case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]: 75 size = 1 76 case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]: 77 size = 2 78 default: 79 size = 3 // As we are short, the maximum is 3. 80 } 81 goto handleInvalid 82 } 83 if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c { 84 size = 1 85 goto handleInvalid // invalid continuation byte 86 } else if size == 2 { 87 } else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c { 88 size = 2 89 goto handleInvalid // invalid continuation byte 90 } else if size == 3 { 91 } else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c { 92 size = 3 93 goto handleInvalid // invalid continuation byte 94 } 95 nSrc += size 96 continue 97 98 handleInvalid: 99 // Copy the scanned input so far. 100 nDst += copy(dst[nDst:], src[pSrc:nSrc]) 101 102 // Append RuneError to the destination. 103 const runeError = "\ufffd" 104 if nDst+len(runeError) > len(dst) { 105 return nDst, nSrc, transform.ErrShortDst 106 } 107 nDst += copy(dst[nDst:], runeError) 108 109 // Skip the maximal subpart of an ill-formed subsequence according to 110 // the W3C standard way instead of the Go way. This Transform is 111 // probably the only place in the text repo where it is warranted. 112 nSrc += size 113 pSrc = nSrc 114 115 // Recompute the maximum source length. 116 if sz := len(dst) - nDst; sz < len(src)-nSrc { 117 err = transform.ErrShortDst 118 n = nSrc + sz 119 atEOF = false 120 } 121 } 122 return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err 123} 124 125// UTF16 returns a UTF-16 Encoding for the given default endianness and byte 126// order mark (BOM) policy. 127// 128// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then 129// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect 130// the endianness used for decoding, and will instead be output as their 131// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy 132// is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output. 133// Instead, it overrides the default endianness e for the remainder of the 134// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not 135// affect the endianness used, and will instead be output as their standard 136// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed 137// with the default Endianness. For ExpectBOM, in that case, the transformation 138// will return early with an ErrMissingBOM error. 139// 140// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of 141// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not 142// be inserted. The UTF-8 input does not need to contain a BOM. 143// 144// There is no concept of a 'native' endianness. If the UTF-16 data is produced 145// and consumed in a greater context that implies a certain endianness, use 146// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM. 147// 148// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM 149// corresponds to "Where the precise type of the data stream is known... the 150// BOM should not be used" and ExpectBOM corresponds to "A particular 151// protocol... may require use of the BOM". 152func UTF16(e Endianness, b BOMPolicy) encoding.Encoding { 153 return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]} 154} 155 156// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that 157// some configurations map to the same MIB identifier. RFC 2781 has requirements 158// and recommendations. Some of the "configurations" are merely recommendations, 159// so multiple configurations could match. 160var mibValue = map[Endianness][numBOMValues]identifier.MIB{ 161 BigEndian: [numBOMValues]identifier.MIB{ 162 IgnoreBOM: identifier.UTF16BE, 163 UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781. 164 // TODO: acceptBOM | strictBOM would map to UTF16BE as well. 165 }, 166 LittleEndian: [numBOMValues]identifier.MIB{ 167 IgnoreBOM: identifier.UTF16LE, 168 UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows. 169 // TODO: acceptBOM | strictBOM would map to UTF16LE as well. 170 }, 171 // ExpectBOM is not widely used and has no valid MIB identifier. 172} 173 174// All lists a configuration for each IANA-defined UTF-16 variant. 175var All = []encoding.Encoding{ 176 UTF8, 177 UTF16(BigEndian, UseBOM), 178 UTF16(BigEndian, IgnoreBOM), 179 UTF16(LittleEndian, IgnoreBOM), 180} 181 182// BOMPolicy is a UTF-16 encoding's byte order mark policy. 183type BOMPolicy uint8 184 185const ( 186 writeBOM BOMPolicy = 0x01 187 acceptBOM BOMPolicy = 0x02 188 requireBOM BOMPolicy = 0x04 189 bomMask BOMPolicy = 0x07 190 191 // HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a 192 // map of an array of length 8 of a type that is also used as a key or value 193 // in another map). See golang.org/issue/11354. 194 // TODO: consider changing this value back to 8 if the use of 1.4.* has 195 // been minimized. 196 numBOMValues = 8 + 1 197 198 // IgnoreBOM means to ignore any byte order marks. 199 IgnoreBOM BOMPolicy = 0 200 // Common and RFC 2781-compliant interpretation for UTF-16BE/LE. 201 202 // UseBOM means that the UTF-16 form may start with a byte order mark, which 203 // will be used to override the default encoding. 204 UseBOM BOMPolicy = writeBOM | acceptBOM 205 // Common and RFC 2781-compliant interpretation for UTF-16. 206 207 // ExpectBOM means that the UTF-16 form must start with a byte order mark, 208 // which will be used to override the default encoding. 209 ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM 210 // Used in Java as Unicode (not to be confused with Java's UTF-16) and 211 // ICU's UTF-16,version=1. Not compliant with RFC 2781. 212 213 // TODO (maybe): strictBOM: BOM must match Endianness. This would allow: 214 // - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM 215 // (UnicodeBig and UnicodeLittle in Java) 216 // - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E: 217 // acceptBOM | strictBOM (e.g. assigned to CheckBOM). 218 // This addition would be consistent with supporting ExpectBOM. 219) 220 221// Endianness is a UTF-16 encoding's default endianness. 222type Endianness bool 223 224const ( 225 // BigEndian is UTF-16BE. 226 BigEndian Endianness = false 227 // LittleEndian is UTF-16LE. 228 LittleEndian Endianness = true 229) 230 231// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a 232// starting byte order mark. 233var ErrMissingBOM = errors.New("encoding: missing byte order mark") 234 235type utf16Encoding struct { 236 config 237 mib identifier.MIB 238} 239 240type config struct { 241 endianness Endianness 242 bomPolicy BOMPolicy 243} 244 245func (u utf16Encoding) NewDecoder() *encoding.Decoder { 246 return &encoding.Decoder{Transformer: &utf16Decoder{ 247 initial: u.config, 248 current: u.config, 249 }} 250} 251 252func (u utf16Encoding) NewEncoder() *encoding.Encoder { 253 return &encoding.Encoder{Transformer: &utf16Encoder{ 254 endianness: u.endianness, 255 initialBOMPolicy: u.bomPolicy, 256 currentBOMPolicy: u.bomPolicy, 257 }} 258} 259 260func (u utf16Encoding) ID() (mib identifier.MIB, other string) { 261 return u.mib, "" 262} 263 264func (u utf16Encoding) String() string { 265 e, b := "B", "" 266 if u.endianness == LittleEndian { 267 e = "L" 268 } 269 switch u.bomPolicy { 270 case ExpectBOM: 271 b = "Expect" 272 case UseBOM: 273 b = "Use" 274 case IgnoreBOM: 275 b = "Ignore" 276 } 277 return "UTF-16" + e + "E (" + b + " BOM)" 278} 279 280type utf16Decoder struct { 281 initial config 282 current config 283} 284 285func (u *utf16Decoder) Reset() { 286 u.current = u.initial 287} 288 289func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 290 if len(src) == 0 { 291 if atEOF && u.current.bomPolicy&requireBOM != 0 { 292 return 0, 0, ErrMissingBOM 293 } 294 return 0, 0, nil 295 } 296 if u.current.bomPolicy&acceptBOM != 0 { 297 if len(src) < 2 { 298 return 0, 0, transform.ErrShortSrc 299 } 300 switch { 301 case src[0] == 0xfe && src[1] == 0xff: 302 u.current.endianness = BigEndian 303 nSrc = 2 304 case src[0] == 0xff && src[1] == 0xfe: 305 u.current.endianness = LittleEndian 306 nSrc = 2 307 default: 308 if u.current.bomPolicy&requireBOM != 0 { 309 return 0, 0, ErrMissingBOM 310 } 311 } 312 u.current.bomPolicy = IgnoreBOM 313 } 314 315 var r rune 316 var dSize, sSize int 317 for nSrc < len(src) { 318 if nSrc+1 < len(src) { 319 x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1]) 320 if u.current.endianness == LittleEndian { 321 x = x>>8 | x<<8 322 } 323 r, sSize = rune(x), 2 324 if utf16.IsSurrogate(r) { 325 if nSrc+3 < len(src) { 326 x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3]) 327 if u.current.endianness == LittleEndian { 328 x = x>>8 | x<<8 329 } 330 // Save for next iteration if it is not a high surrogate. 331 if isHighSurrogate(rune(x)) { 332 r, sSize = utf16.DecodeRune(r, rune(x)), 4 333 } 334 } else if !atEOF { 335 err = transform.ErrShortSrc 336 break 337 } 338 } 339 if dSize = utf8.RuneLen(r); dSize < 0 { 340 r, dSize = utf8.RuneError, 3 341 } 342 } else if atEOF { 343 // Single trailing byte. 344 r, dSize, sSize = utf8.RuneError, 3, 1 345 } else { 346 err = transform.ErrShortSrc 347 break 348 } 349 if nDst+dSize > len(dst) { 350 err = transform.ErrShortDst 351 break 352 } 353 nDst += utf8.EncodeRune(dst[nDst:], r) 354 nSrc += sSize 355 } 356 return nDst, nSrc, err 357} 358 359func isHighSurrogate(r rune) bool { 360 return 0xDC00 <= r && r <= 0xDFFF 361} 362 363type utf16Encoder struct { 364 endianness Endianness 365 initialBOMPolicy BOMPolicy 366 currentBOMPolicy BOMPolicy 367} 368 369func (u *utf16Encoder) Reset() { 370 u.currentBOMPolicy = u.initialBOMPolicy 371} 372 373func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 374 if u.currentBOMPolicy&writeBOM != 0 { 375 if len(dst) < 2 { 376 return 0, 0, transform.ErrShortDst 377 } 378 dst[0], dst[1] = 0xfe, 0xff 379 u.currentBOMPolicy = IgnoreBOM 380 nDst = 2 381 } 382 383 r, size := rune(0), 0 384 for nSrc < len(src) { 385 r = rune(src[nSrc]) 386 387 // Decode a 1-byte rune. 388 if r < utf8.RuneSelf { 389 size = 1 390 391 } else { 392 // Decode a multi-byte rune. 393 r, size = utf8.DecodeRune(src[nSrc:]) 394 if size == 1 { 395 // All valid runes of size 1 (those below utf8.RuneSelf) were 396 // handled above. We have invalid UTF-8 or we haven't seen the 397 // full character yet. 398 if !atEOF && !utf8.FullRune(src[nSrc:]) { 399 err = transform.ErrShortSrc 400 break 401 } 402 } 403 } 404 405 if r <= 0xffff { 406 if nDst+2 > len(dst) { 407 err = transform.ErrShortDst 408 break 409 } 410 dst[nDst+0] = uint8(r >> 8) 411 dst[nDst+1] = uint8(r) 412 nDst += 2 413 } else { 414 if nDst+4 > len(dst) { 415 err = transform.ErrShortDst 416 break 417 } 418 r1, r2 := utf16.EncodeRune(r) 419 dst[nDst+0] = uint8(r1 >> 8) 420 dst[nDst+1] = uint8(r1) 421 dst[nDst+2] = uint8(r2 >> 8) 422 dst[nDst+3] = uint8(r2) 423 nDst += 4 424 } 425 nSrc += size 426 } 427 428 if u.endianness == LittleEndian { 429 for i := 0; i < nDst; i += 2 { 430 dst[i], dst[i+1] = dst[i+1], dst[i] 431 } 432 } 433 return nDst, nSrc, err 434} 435