1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT 2 // file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the 11 // "top-level directory" in the above notice refers to 12 // third_party/rust/encoding_c/. 13 14 #ifndef mozilla_Encoding_h 15 #define mozilla_Encoding_h 16 17 #include "mozilla/CheckedInt.h" 18 #include "mozilla/Maybe.h" 19 #include "mozilla/NotNull.h" 20 #include "mozilla/Span.h" 21 #include "mozilla/Tuple.h" 22 #include "nsString.h" 23 24 namespace mozilla { 25 class Encoding; 26 class Decoder; 27 class Encoder; 28 }; // namespace mozilla 29 30 #define ENCODING_RS_ENCODING mozilla::Encoding 31 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \ 32 mozilla::NotNull<const mozilla::Encoding*> 33 #define ENCODING_RS_ENCODER mozilla::Encoder 34 #define ENCODING_RS_DECODER mozilla::Decoder 35 36 #include "encoding_rs.h" 37 38 extern "C" { 39 40 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding, 41 uint8_t const* src, size_t src_len, 42 nsAString* dst); 43 44 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal( 45 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len, 46 nsAString* dst); 47 48 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling( 49 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len, 50 nsAString* dst); 51 52 nsresult 53 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement( 54 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len, 55 nsAString* dst); 56 57 nsresult 
mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding, 58 char16_t const* src, size_t src_len, 59 nsACString* dst); 60 61 nsresult mozilla_encoding_decode_to_nscstring( 62 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst); 63 64 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal( 65 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst); 66 67 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling( 68 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst); 69 70 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling( 71 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len, 72 nsACString* dst, size_t already_validated); 73 74 nsresult 75 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( 76 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst); 77 78 nsresult mozilla_encoding_encode_from_nscstring( 79 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst); 80 81 } // extern "C" 82 83 namespace mozilla { 84 85 /** 86 * Return value from `Decoder`/`Encoder` to indicate that input 87 * was exhausted. 88 */ 89 const uint32_t kInputEmpty = INPUT_EMPTY; 90 91 /** 92 * Return value from `Decoder`/`Encoder` to indicate that output 93 * space was insufficient. 94 */ 95 const uint32_t kOutputFull = OUTPUT_FULL; 96 97 /** 98 * An encoding as defined in the Encoding Standard 99 * (https://encoding.spec.whatwg.org/). 100 * 101 * See https://docs.rs/encoding_rs/ for the Rust API docs. 102 * 103 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point 104 * sequence and, in most cases, vice versa. Each encoding has a name, an output 105 * encoding, and one or more labels. 106 * 107 * _Labels_ are ASCII-case-insensitive strings that are used to identify an 108 * encoding in formats and protocols. 
The _name_ of the encoding is the 109 * preferred label in the case appropriate for returning from the 110 * `characterSet` property of the `Document` DOM interface, except for 111 * the replacement encoding whose name is not one of its labels. 112 * 113 * The _output encoding_ is the encoding used for form submission and URL 114 * parsing on Web pages in the encoding. This is UTF-8 for the replacement, 115 * UTF-16LE and UTF-16BE encodings and the encoding itself for other 116 * encodings. 117 * 118 * # Streaming vs. Non-Streaming 119 * 120 * When you have the entire input in a single buffer, you can use the 121 * methods `Decode()`, `DecodeWithBOMRemoval()`, 122 * `DecodeWithoutBOMHandling()`, 123 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and 124 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and 125 * NewEncoder()` methods), these methods perform heap allocations. You should 126 * the `Decoder` and `Encoder` objects when your input is split into multiple 127 * buffers or when you want to control the allocation of the output buffers. 128 * 129 * # Instances 130 * 131 * All instances of `Encoding` are statically allocated and have the process's 132 * lifetime. There is precisely one unique `Encoding` instance for each 133 * encoding defined in the Encoding Standard. 134 * 135 * To obtain a reference to a particular encoding whose identity you know at 136 * compile time, use a `static` that refers to encoding. There is a `static` 137 * for each encoding. The `static`s are named in all caps with hyphens 138 * replaced with underscores and with `_ENCODING` appended to the 139 * name. For example, if you know at compile time that you will want to 140 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`. 141 * 142 * If you don't know what encoding you need at compile time and need to 143 * dynamically get an encoding by label, use `Encoding::for_label()`. 
144 * 145 * Pointers to `Encoding` can be compared with `==` to check for the sameness 146 * of two encodings. 147 * 148 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer 149 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use 150 * `const mozilla::Encoding*` in the C signature and 151 * `*const encoding_rs::Encoding` is the corresponding Rust signature. 152 */ 153 class Encoding final { 154 public: 155 /** 156 * Implements the _get an encoding_ algorithm 157 * (https://encoding.spec.whatwg.org/#concept-encoding-get). 158 * 159 * If, after ASCII-lowercasing and removing leading and trailing 160 * whitespace, the argument matches a label defined in the Encoding 161 * Standard, `const Encoding*` representing the corresponding 162 * encoding is returned. If there is no match, `nullptr` is returned. 163 * 164 * This is the right method to use if the action upon the method returning 165 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) 166 * instead. When the action upon the method returning `nullptr` is not to 167 * proceed with a fallback but to refuse processing, 168 * `ForLabelNoReplacement()` is more appropriate. 169 */ ForLabel(Span<const char> aLabel)170 static inline const Encoding* ForLabel(Span<const char> aLabel) { 171 return encoding_for_label( 172 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length()); 173 } 174 175 /** 176 * `nsAString` argument version. See above for docs. 177 */ ForLabel(const nsAString & aLabel)178 static inline const Encoding* ForLabel(const nsAString& aLabel) { 179 return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel)); 180 } 181 182 /** 183 * This method behaves the same as `ForLabel()`, except when `ForLabel()` 184 * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead. 
185 * 186 * This method is useful in scenarios where a fatal error is required 187 * upon invalid label, because in those cases the caller typically wishes 188 * to treat the labels that map to the replacement encoding as fatal 189 * errors, too. 190 * 191 * It is not OK to use this method when the action upon the method returning 192 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In 193 * such a case, the `ForLabel()` method should be used instead in order to 194 * avoid unsafe fallback for labels that `ForLabel()` maps to 195 * `REPLACEMENT_ENCODING`. 196 */ ForLabelNoReplacement(Span<const char> aLabel)197 static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) { 198 return encoding_for_label_no_replacement( 199 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length()); 200 } 201 202 /** 203 * `nsAString` argument version. See above for docs. 204 */ ForLabelNoReplacement(const nsAString & aLabel)205 static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) { 206 return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel)); 207 } 208 209 /** 210 * Performs non-incremental BOM sniffing. 211 * 212 * The argument must either be a buffer representing the entire input 213 * stream (non-streaming case) or a buffer representing at least the first 214 * three bytes of the input stream (streaming case). 215 * 216 * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)` 217 * or `MakeTuple(UTF_16BE_ENCODING, 3)` if the argument starts with the 218 * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise. 219 */ ForBOM(Span<const uint8_t> aBuffer)220 static inline Tuple<const Encoding*, size_t> ForBOM( 221 Span<const uint8_t> aBuffer) { 222 size_t len = aBuffer.Length(); 223 const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len); 224 return MakeTuple(encoding, len); 225 } 226 227 /** 228 * Writes the name of this encoding into `aName`. 
229 * 230 * This name is appropriate to return as-is from the DOM 231 * `document.characterSet` property. 232 */ Name(nsACString & aName)233 inline void Name(nsACString& aName) const { 234 aName.SetLength(ENCODING_NAME_MAX_LENGTH); 235 size_t length = 236 encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting())); 237 aName.SetLength(length); // truncation is the 64-bit case is OK 238 } 239 240 /** 241 * Checks whether the _output encoding_ of this encoding can encode every 242 * Unicode code point. (Only true if the output encoding is UTF-8.) 243 */ CanEncodeEverything()244 inline bool CanEncodeEverything() const { 245 return encoding_can_encode_everything(this); 246 } 247 248 /** 249 * Checks whether this encoding maps one byte to one Basic Multilingual 250 * Plane code point (i.e. byte length equals decoded UTF-16 length) and 251 * vice versa (for mappable characters). 252 * 253 * `true` iff this encoding is on the list of Legacy single-byte 254 * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings) 255 * in the spec or x-user-defined. 256 */ IsSingleByte()257 inline bool IsSingleByte() const { return encoding_is_single_byte(this); } 258 259 /** 260 * Checks whether the bytes 0x00...0x7F map exclusively to the characters 261 * U+0000...U+007F and vice versa. 262 */ IsAsciiCompatible()263 inline bool IsAsciiCompatible() const { 264 return encoding_is_ascii_compatible(this); 265 } 266 267 /** 268 * Checks whether this is a Japanese legacy encoding. 269 */ IsJapaneseLegacy()270 inline bool IsJapaneseLegacy() const { 271 return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING || 272 this == ISO_2022_JP_ENCODING; 273 } 274 275 /** 276 * Returns the _output encoding_ of this encoding. This is UTF-8 for 277 * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise. 
278 */ OutputEncoding()279 inline NotNull<const mozilla::Encoding*> OutputEncoding() const { 280 return WrapNotNull(encoding_output_encoding(this)); 281 } 282 283 /** 284 * Decode complete input to `nsACString` _with BOM sniffing_ and with 285 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 286 * entire input is available as a single buffer (i.e. the end of the 287 * buffer marks the end of the stream). 288 * 289 * This method implements the (non-streaming version of) the 290 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. 291 * 292 * The second item in the returned tuple is the encoding that was actually 293 * used (which may differ from this encoding thanks to BOM sniffing). 294 * 295 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 296 * if there were malformed sequences (that were replaced with the 297 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the 298 * tuple. 299 * 300 * The backing buffer of the string isn't copied if the input buffer 301 * is heap-allocated and decoding from UTF-8 and the input is valid 302 * BOMless UTF-8, decoding from an ASCII-compatible encoding and 303 * the input is valid ASCII or decoding from ISO-2022-JP and the 304 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass 305 * the same string as both arguments. 306 * 307 * _Note:_ It is wrong to use this when the input buffer represents only 308 * a segment of the input instead of the whole input. Use `NewDecoder()` 309 * when decoding segmented input. 
310 */ Decode(const nsACString & aBytes,nsACString & aOut)311 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode( 312 const nsACString& aBytes, nsACString& aOut) const { 313 const Encoding* encoding = this; 314 const nsACString* bytes = &aBytes; 315 nsACString* out = &aOut; 316 nsresult rv; 317 if (bytes == out) { 318 nsAutoCString temp(aBytes); 319 rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out); 320 } else { 321 rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out); 322 } 323 return MakeTuple(rv, WrapNotNull(encoding)); 324 } 325 326 /** 327 * Decode complete input to `nsAString` _with BOM sniffing_ and with 328 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 329 * entire input is available as a single buffer (i.e. the end of the 330 * buffer marks the end of the stream). 331 * 332 * This method implements the (non-streaming version of) the 333 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. 334 * 335 * The second item in the returned tuple is the encoding that was actually 336 * used (which may differ from this encoding thanks to BOM sniffing). 337 * 338 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 339 * if there were malformed sequences (that were replaced with the 340 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the 341 * tuple. 342 * 343 * _Note:_ It is wrong to use this when the input buffer represents only 344 * a segment of the input instead of the whole input. Use `NewDecoder()` 345 * when decoding segmented input. 
346 */ Decode(Span<const uint8_t> aBytes,nsAString & aOut)347 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode( 348 Span<const uint8_t> aBytes, nsAString& aOut) const { 349 const Encoding* encoding = this; 350 nsresult rv = mozilla_encoding_decode_to_nsstring( 351 &encoding, aBytes.Elements(), aBytes.Length(), &aOut); 352 return MakeTuple(rv, WrapNotNull(encoding)); 353 } 354 355 /** 356 * Decode complete input to `nsACString` _with BOM removal_ and with 357 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 358 * entire input is available as a single buffer (i.e. the end of the 359 * buffer marks the end of the stream). 360 * 361 * When invoked on `UTF_8`, this method implements the (non-streaming 362 * version of) the _UTF-8 decode_ 363 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. 364 * 365 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 366 * if there were malformed sequences (that were replaced with the 367 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 368 * 369 * The backing buffer of the string isn't copied if the input buffer 370 * is heap-allocated and decoding from UTF-8 and the input is valid 371 * BOMless UTF-8, decoding from an ASCII-compatible encoding and 372 * the input is valid ASCII or decoding from ISO-2022-JP and the 373 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass 374 * the same string as both arguments. 375 * 376 * _Note:_ It is wrong to use this when the input buffer represents only 377 * a segment of the input instead of the whole input. Use 378 * `NewDecoderWithBOMRemoval()` when decoding segmented input. 
379 */ DecodeWithBOMRemoval(const nsACString & aBytes,nsACString & aOut)380 inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes, 381 nsACString& aOut) const { 382 const nsACString* bytes = &aBytes; 383 nsACString* out = &aOut; 384 if (bytes == out) { 385 nsAutoCString temp(aBytes); 386 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp, 387 out); 388 } 389 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes, 390 out); 391 } 392 393 /** 394 * Decode complete input to `nsAString` _with BOM removal_ and with 395 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 396 * entire input is available as a single buffer (i.e. the end of the 397 * buffer marks the end of the stream). 398 * 399 * When invoked on `UTF_8`, this method implements the (non-streaming 400 * version of) the _UTF-8 decode_ 401 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. 402 * 403 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 404 * if there were malformed sequences (that were replaced with the 405 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 406 * 407 * _Note:_ It is wrong to use this when the input buffer represents only 408 * a segment of the input instead of the whole input. Use 409 * `NewDecoderWithBOMRemoval()` when decoding segmented input. 410 */ DecodeWithBOMRemoval(Span<const uint8_t> aBytes,nsAString & aOut)411 inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes, 412 nsAString& aOut) const { 413 return mozilla_encoding_decode_to_nsstring_with_bom_removal( 414 this, aBytes.Elements(), aBytes.Length(), &aOut); 415 } 416 417 /** 418 * Decode complete input to `nsACString` _without BOM handling_ and 419 * with malformed sequences replaced with the REPLACEMENT CHARACTER when 420 * the entire input is available as a single buffer (i.e. the end of the 421 * buffer marks the end of the stream). 
422 * 423 * When invoked on `UTF_8`, this method implements the (non-streaming 424 * version of) the _UTF-8 decode without BOM_ 425 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. 426 * 427 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 428 * if there were malformed sequences (that were replaced with the 429 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 430 * 431 * The backing buffer of the string isn't copied if the input buffer 432 * is heap-allocated and decoding from UTF-8 and the input is valid 433 * UTF-8, decoding from an ASCII-compatible encoding and the input 434 * is valid ASCII or decoding from ISO-2022-JP and the input stays 435 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string 436 * as both arguments. 437 * 438 * _Note:_ It is wrong to use this when the input buffer represents only 439 * a segment of the input instead of the whole input. Use 440 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 441 */ DecodeWithoutBOMHandling(const nsACString & aBytes,nsACString & aOut)442 inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes, 443 nsACString& aOut) const { 444 const nsACString* bytes = &aBytes; 445 nsACString* out = &aOut; 446 if (bytes == out) { 447 nsAutoCString temp(aBytes); 448 return mozilla_encoding_decode_to_nscstring_without_bom_handling( 449 this, &temp, out); 450 } 451 return mozilla_encoding_decode_to_nscstring_without_bom_handling( 452 this, bytes, out); 453 } 454 455 /** 456 * Decode complete input to `nsAString` _without BOM handling_ and 457 * with malformed sequences replaced with the REPLACEMENT CHARACTER when 458 * the entire input is available as a single buffer (i.e. the end of the 459 * buffer marks the end of the stream). 460 * 461 * When invoked on `UTF_8`, this method implements the (non-streaming 462 * version of) the _UTF-8 decode without BOM_ 463 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. 
464 * 465 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 466 * if there were malformed sequences (that were replaced with the 467 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 468 * 469 * _Note:_ It is wrong to use this when the input buffer represents only 470 * a segment of the input instead of the whole input. Use 471 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 472 */ DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,nsAString & aOut)473 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes, 474 nsAString& aOut) const { 475 return mozilla_encoding_decode_to_nsstring_without_bom_handling( 476 this, aBytes.Elements(), aBytes.Length(), &aOut); 477 } 478 479 /** 480 * Decode complete input to `nsACString` _without BOM handling_ and 481 * _with malformed sequences treated as fatal_ when the entire input is 482 * available as a single buffer (i.e. the end of the buffer marks the end 483 * of the stream). 484 * 485 * When invoked on `UTF_8`, this method implements the (non-streaming 486 * version of) the _UTF-8 decode without BOM or fail_ 487 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) 488 * spec concept. 489 * 490 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT` 491 * if a malformed sequence was encountered and `NS_OK` otherwise. 492 * 493 * The backing buffer of the string isn't copied if the input buffer 494 * is heap-allocated and decoding from UTF-8 and the input is valid 495 * UTF-8, decoding from an ASCII-compatible encoding and the input 496 * is valid ASCII or decoding from ISO-2022-JP and the input stays 497 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string 498 * as both arguments. 499 * 500 * _Note:_ It is wrong to use this when the input buffer represents only 501 * a segment of the input instead of the whole input. Use 502 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 
503 */ DecodeWithoutBOMHandlingAndWithoutReplacement(const nsACString & aBytes,nsACString & aOut)504 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement( 505 const nsACString& aBytes, nsACString& aOut) const { 506 const nsACString* bytes = &aBytes; 507 nsACString* out = &aOut; 508 if (bytes == out) { 509 nsAutoCString temp(aBytes); 510 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( 511 this, &temp, out); 512 } 513 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( 514 this, bytes, out); 515 } 516 517 /** 518 * Decode complete input to `nsACString` _without BOM handling_ and 519 * with malformed sequences replaced with the REPLACEMENT CHARACTER when 520 * the entire input is available as a single buffer (i.e. the end of the 521 * buffer marks the end of the stream) _asserting that a number of bytes 522 * from the start are already known to be valid UTF-8_. 523 * 524 * The use case for this method is avoiding copying when dealing with 525 * input that has a UTF-8 BOM. _When in doubt, do not use this method._ 526 * 527 * When invoked on `UTF_8`, this method implements the (non-streaming 528 * version of) the _UTF-8 decode without BOM_ 529 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. 530 * 531 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 532 * if there were malformed sequences (that were replaced with the 533 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 534 * 535 * _Note:_ It is wrong to use this when the input buffer represents only 536 * a segment of the input instead of the whole input. Use 537 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 538 * 539 * # Safety 540 * 541 * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8. 542 * `aBytes` _must not_ alias the buffer (if any) of `aOut`. 
543 */ DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,nsACString & aOut,size_t aAlreadyValidated)544 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes, 545 nsACString& aOut, 546 size_t aAlreadyValidated) const { 547 return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling( 548 this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated); 549 } 550 551 /** 552 * Decode complete input to `nsAString` _without BOM handling_ and 553 * _with malformed sequences treated as fatal_ when the entire input is 554 * available as a single buffer (i.e. the end of the buffer marks the end 555 * of the stream). 556 * 557 * When invoked on `UTF_8`, this method implements the (non-streaming 558 * version of) the _UTF-8 decode without BOM or fail_ 559 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) 560 * spec concept. 561 * 562 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT` 563 * if a malformed sequence was encountered and `NS_OK` otherwise. 564 * 565 * _Note:_ It is wrong to use this when the input buffer represents only 566 * a segment of the input instead of the whole input. Use 567 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 568 */ DecodeWithoutBOMHandlingAndWithoutReplacement(Span<const uint8_t> aBytes,nsAString & aOut)569 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement( 570 Span<const uint8_t> aBytes, nsAString& aOut) const { 571 return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement( 572 this, aBytes.Elements(), aBytes.Length(), &aOut); 573 } 574 575 /** 576 * Encode complete input to `nsACString` with unmappable characters 577 * replaced with decimal numeric character references when the entire input 578 * is available as a single buffer (i.e. the end of the buffer marks the 579 * end of the stream). 
580 * 581 * This method implements the (non-streaming version of) the 582 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. 583 * 584 * The second item in the returned tuple is the encoding that was actually 585 * used (which may differ from this encoding thanks to some encodings 586 * having UTF-8 as their output encoding). 587 * 588 * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if 589 * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM, 590 * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were 591 * replaced with numeric character references) and `NS_OK` otherwise. 592 * 593 * The backing buffer of the string isn't copied if the input buffer 594 * is heap-allocated and encoding to UTF-8 and the input is valid 595 * UTF-8, encoding to an ASCII-compatible encoding and the input 596 * is valid ASCII or encoding from ISO-2022-JP and the input stays 597 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string 598 * as both arguments. 599 * 600 * _Note:_ It is wrong to use this when the input buffer represents only 601 * a segment of the input instead of the whole input. Use `NewEncoder()` 602 * when encoding segmented output. 
603 */ Encode(const nsACString & aString,nsACString & aOut)604 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode( 605 const nsACString& aString, nsACString& aOut) const { 606 const Encoding* encoding = this; 607 const nsACString* string = &aString; 608 nsACString* out = &aOut; 609 nsresult rv; 610 if (string == out) { 611 nsAutoCString temp(aString); 612 rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out); 613 } else { 614 rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out); 615 } 616 return MakeTuple(rv, WrapNotNull(encoding)); 617 } 618 619 /** 620 * Encode complete input to `nsACString` with unmappable characters 621 * replaced with decimal numeric character references when the entire input 622 * is available as a single buffer (i.e. the end of the buffer marks the 623 * end of the stream). 624 * 625 * This method implements the (non-streaming version of) the 626 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. 627 * 628 * The second item in the returned tuple is the encoding that was actually 629 * used (which may differ from this encoding thanks to some encodings 630 * having UTF-8 as their output encoding). 631 * 632 * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon 633 * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that 634 * were replaced with numeric character references) and `NS_OK` otherwise. 635 636 * _Note:_ It is wrong to use this when the input buffer represents only 637 * a segment of the input instead of the whole input. Use `NewEncoder()` 638 * when encoding segmented output. 
639 */ Encode(Span<const char16_t> aString,nsACString & aOut)640 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode( 641 Span<const char16_t> aString, nsACString& aOut) const { 642 const Encoding* encoding = this; 643 nsresult rv = mozilla_encoding_encode_from_utf16( 644 &encoding, aString.Elements(), aString.Length(), &aOut); 645 return MakeTuple(rv, WrapNotNull(encoding)); 646 } 647 648 /** 649 * Instantiates a new decoder for this encoding with BOM sniffing enabled. 650 * 651 * BOM sniffing may cause the returned decoder to morph into a decoder 652 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. 653 */ NewDecoder()654 inline UniquePtr<Decoder> NewDecoder() const { 655 UniquePtr<Decoder> decoder(encoding_new_decoder(this)); 656 return decoder; 657 } 658 659 /** 660 * Instantiates a new decoder for this encoding with BOM sniffing enabled 661 * into memory occupied by a previously-instantiated decoder. 662 * 663 * BOM sniffing may cause the returned decoder to morph into a decoder 664 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. 665 */ NewDecoderInto(Decoder & aDecoder)666 inline void NewDecoderInto(Decoder& aDecoder) const { 667 encoding_new_decoder_into(this, &aDecoder); 668 } 669 670 /** 671 * Instantiates a new decoder for this encoding with BOM removal. 672 * 673 * If the input starts with bytes that are the BOM for this encoding, 674 * those bytes are removed. However, the decoder never morphs into a 675 * decoder for another encoding: A BOM for another encoding is treated as 676 * (potentially malformed) input to the decoding algorithm for this 677 * encoding. 678 */ NewDecoderWithBOMRemoval()679 inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const { 680 UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this)); 681 return decoder; 682 } 683 684 /** 685 * Instantiates a new decoder for this encoding with BOM removal 686 * into memory occupied by a previously-instantiated decoder. 
687 * 688 * If the input starts with bytes that are the BOM for this encoding, 689 * those bytes are removed. However, the decoder never morphs into a 690 * decoder for another encoding: A BOM for another encoding is treated as 691 * (potentially malformed) input to the decoding algorithm for this 692 * encoding. 693 */ NewDecoderWithBOMRemovalInto(Decoder & aDecoder)694 inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const { 695 encoding_new_decoder_with_bom_removal_into(this, &aDecoder); 696 } 697 698 /** 699 * Instantiates a new decoder for this encoding with BOM handling disabled. 700 * 701 * If the input starts with bytes that look like a BOM, those bytes are 702 * not treated as a BOM. (Hence, the decoder never morphs into a decoder 703 * for another encoding.) 704 * 705 * _Note:_ If the caller has performed BOM sniffing on its own but has not 706 * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()` 707 * instead of this method to cause the BOM to be removed. 708 */ NewDecoderWithoutBOMHandling()709 inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const { 710 UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this)); 711 return decoder; 712 } 713 714 /** 715 * Instantiates a new decoder for this encoding with BOM handling disabled 716 * into memory occupied by a previously-instantiated decoder. 717 * 718 * If the input starts with bytes that look like a BOM, those bytes are 719 * not treated as a BOM. (Hence, the decoder never morphs into a decoder 720 * for another encoding.) 721 * 722 * _Note:_ If the caller has performed BOM sniffing on its own but has not 723 * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()` 724 * instead of this method to cause the BOM to be removed. 
725 */ NewDecoderWithoutBOMHandlingInto(Decoder & aDecoder)726 inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const { 727 encoding_new_decoder_without_bom_handling_into(this, &aDecoder); 728 } 729 730 /** 731 * Instantiates a new encoder for the output encoding of this encoding. 732 */ NewEncoder()733 inline UniquePtr<Encoder> NewEncoder() const { 734 UniquePtr<Encoder> encoder(encoding_new_encoder(this)); 735 return encoder; 736 } 737 738 /** 739 * Instantiates a new encoder for the output encoding of this encoding 740 * into memory occupied by a previously-instantiated encoder. 741 */ NewEncoderInto(Encoder & aEncoder)742 inline void NewEncoderInto(Encoder& aEncoder) const { 743 encoding_new_encoder_into(this, &aEncoder); 744 } 745 746 /** 747 * Validates UTF-8. 748 * 749 * Returns the index of the first byte that makes the input malformed as 750 * UTF-8 or the length of the input if the input is entirely valid. 751 */ UTF8ValidUpTo(Span<const uint8_t> aBuffer)752 static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) { 753 return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length()); 754 } 755 756 /** 757 * Validates ASCII. 758 * 759 * Returns the index of the first byte that makes the input malformed as 760 * ASCII or the length of the input if the input is entirely valid. 761 */ ASCIIValidUpTo(Span<const uint8_t> aBuffer)762 static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) { 763 return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length()); 764 } 765 766 /** 767 * Validates ISO-2022-JP ASCII-state data. 768 * 769 * Returns the index of the first byte that makes the input not 770 * representable in the ASCII state of ISO-2022-JP or the length of the 771 * input if the input is entirely representable in the ASCII state of 772 * ISO-2022-JP. 
773 */ ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer)774 static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) { 775 return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(), 776 aBuffer.Length()); 777 } 778 779 private: 780 Encoding() = delete; 781 Encoding(const Encoding&) = delete; 782 Encoding& operator=(const Encoding&) = delete; 783 ~Encoding() = delete; 784 }; 785 786 /** 787 * A converter that decodes a byte stream into Unicode according to a 788 * character encoding in a streaming (incremental) manner. 789 * 790 * The various `Decode*` methods take an input buffer (`aSrc`) and an output 791 * buffer `aDst` both of which are caller-allocated. There are variants for 792 * both UTF-8 and UTF-16 output buffers. 793 * 794 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored 795 * into `aDst` until one of the following three things happens: 796 * 797 * 1. A malformed byte sequence is encountered (`*WithoutReplacement` 798 * variants only). 799 * 800 * 2. The output buffer has been filled so near capacity that the decoder 801 * cannot be sure that processing an additional byte of input wouldn't 802 * cause so much output that the output buffer would overflow. 803 * 804 * 3. All the input bytes have been processed. 805 * 806 * The `Decode*` method then returns tuple of a status indicating which one 807 * of the three reasons to return happened, how many input bytes were read, 808 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t` 809 * when decoding to UTF-16) were written, and in the case of the 810 * variants performing replacement, a boolean indicating whether an error was 811 * replaced with the REPLACEMENT CHARACTER during the call. 812 * 813 * The number of bytes "written" is what's logically written. Garbage may be 814 * written in the output buffer beyond the point logically written to. 
*
 * In the case of the `*WithoutReplacement` variants, the status is a
 * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
 * listed above).
 *
 * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
 *
 * In the case of methods whose name does not end with
 * `*WithoutReplacement`, malformed sequences are automatically replaced
 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
 * return early.
 *
 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
 * space. When decoding to UTF-16, the output buffer must have at least two
 * UTF-16 code units (`char16_t`) of space.
 *
 * When decoding to UTF-8 without replacement, the methods are guaranteed
 * not to return indicating that more output space is needed if the length
 * of the output buffer is at least the length returned by
 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
 * with replacement, the length of the output buffer that guarantees the
 * methods not to return indicating that more output space is needed is given
 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
 * or without replacement, the length of the output buffer that guarantees
 * the methods not to return indicating that more output space is needed is
 * given by `MaxUTF16BufferLength()`.
*
 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
 * and the output after each `Decode*` call is guaranteed to consist of
 * complete characters. (I.e. the code unit sequence for the last character is
 * guaranteed not to be split across output buffers.)
 *
 * The boolean argument `aLast` indicates that the end of the stream is reached
 * when all the bytes in `aSrc` have been consumed.
 *
 * A `Decoder` object can be used to incrementally decode a byte stream.
 *
 * During the processing of a single stream, the caller must call `Decode*`
 * zero or more times with `aLast` set to `false` and then call `Decode*` at
 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
 * the processing of the stream has ended. Otherwise, the caller must call
 * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
 *
 * Once the stream has ended, the `Decoder` object must not be used anymore.
 * That is, you need to create another one to process another stream.
 *
 * When the decoder returns `kOutputFull` or the decoder returns a malformed
 * result and the caller does not wish to treat it as a fatal error, the input
 * buffer `aSrc` may not have been completely consumed. In that case, the caller
 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
 * call.
 *
 * # Infinite loops
 *
 * When converting with a fixed-size output buffer whose size is too small to
 * accommodate one character of output, an infinite loop ensues. When
 * converting with a fixed-size output buffer, it generally makes sense to
 * make the buffer fairly large (e.g. a couple of kilobytes).
880 */ 881 class Decoder final { 882 public: 883 ~Decoder() = default; delete(void * aDecoder)884 static void operator delete(void* aDecoder) { 885 decoder_free(reinterpret_cast<Decoder*>(aDecoder)); 886 } 887 888 /** 889 * The `Encoding` this `Decoder` is for. 890 * 891 * BOM sniffing can change the return value of this method during the life 892 * of the decoder. 893 */ Encoding()894 inline NotNull<const mozilla::Encoding*> Encoding() const { 895 return WrapNotNull(decoder_encoding(this)); 896 } 897 898 /** 899 * Query the worst-case UTF-8 output size _with replacement_. 900 * 901 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) 902 * that will not overflow given the current state of the decoder and 903 * `aByteLength` number of additional input bytes when decoding with 904 * errors handled by outputting a REPLACEMENT CHARACTER for each malformed 905 * sequence. 906 */ MaxUTF8BufferLength(size_t aByteLength)907 inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const { 908 CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength)); 909 if (max.value() == std::numeric_limits<size_t>::max()) { 910 // Mark invalid by overflowing 911 max++; 912 MOZ_ASSERT(!max.isValid()); 913 } 914 return max; 915 } 916 917 /** 918 * Query the worst-case UTF-8 output size _without replacement_. 919 * 920 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) 921 * that will not overflow given the current state of the decoder and 922 * `aByteLength` number of additional input bytes when decoding without 923 * replacement error handling. 924 * 925 * Note that this value may be too small for the `WithReplacement` case. 926 * Use `MaxUTF8BufferLength()` for that case. 
927 */ MaxUTF8BufferLengthWithoutReplacement(size_t aByteLength)928 inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement( 929 size_t aByteLength) const { 930 CheckedInt<size_t> max( 931 decoder_max_utf8_buffer_length_without_replacement(this, aByteLength)); 932 if (max.value() == std::numeric_limits<size_t>::max()) { 933 // Mark invalid by overflowing 934 max++; 935 MOZ_ASSERT(!max.isValid()); 936 } 937 return max; 938 } 939 940 /** 941 * Incrementally decode a byte stream into UTF-8 with malformed sequences 942 * replaced with the REPLACEMENT CHARACTER. 943 * 944 * See the documentation of the class for documentation for `Decode*` 945 * methods collectively. 946 */ DecodeToUTF8(Span<const uint8_t> aSrc,Span<uint8_t> aDst,bool aLast)947 inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8( 948 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) { 949 size_t srcRead = aSrc.Length(); 950 size_t dstWritten = aDst.Length(); 951 bool hadReplacements; 952 uint32_t result = 953 decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(), 954 &dstWritten, aLast, &hadReplacements); 955 return MakeTuple(result, srcRead, dstWritten, hadReplacements); 956 } 957 958 /** 959 * Incrementally decode a byte stream into UTF-8 _without replacement_. 960 * 961 * See the documentation of the class for documentation for `Decode*` 962 * methods collectively. 963 */ DecodeToUTF8WithoutReplacement(Span<const uint8_t> aSrc,Span<uint8_t> aDst,bool aLast)964 inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement( 965 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) { 966 size_t srcRead = aSrc.Length(); 967 size_t dstWritten = aDst.Length(); 968 uint32_t result = decoder_decode_to_utf8_without_replacement( 969 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); 970 return MakeTuple(result, srcRead, dstWritten); 971 } 972 973 /** 974 * Query the worst-case UTF-16 output size (with or without replacement). 
975 * 976 * Returns the size of the output buffer in UTF-16 code units (`char16_t`) 977 * that will not overflow given the current state of the decoder and 978 * `aByteLength` number of additional input bytes. 979 * 980 * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the 981 * return value of this method applies also in the 982 * `_without_replacement` case. 983 */ MaxUTF16BufferLength(size_t aU16Length)984 inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const { 985 CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length)); 986 if (max.value() == std::numeric_limits<size_t>::max()) { 987 // Mark invalid by overflowing 988 max++; 989 MOZ_ASSERT(!max.isValid()); 990 } 991 return max; 992 } 993 994 /** 995 * Incrementally decode a byte stream into UTF-16 with malformed sequences 996 * replaced with the REPLACEMENT CHARACTER. 997 * 998 * See the documentation of the class for documentation for `Decode*` 999 * methods collectively. 1000 */ DecodeToUTF16(Span<const uint8_t> aSrc,Span<char16_t> aDst,bool aLast)1001 inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16( 1002 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) { 1003 size_t srcRead = aSrc.Length(); 1004 size_t dstWritten = aDst.Length(); 1005 bool hadReplacements; 1006 uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead, 1007 aDst.Elements(), &dstWritten, 1008 aLast, &hadReplacements); 1009 return MakeTuple(result, srcRead, dstWritten, hadReplacements); 1010 } 1011 1012 /** 1013 * Incrementally decode a byte stream into UTF-16 _without replacement_. 1014 * 1015 * See the documentation of the class for documentation for `Decode*` 1016 * methods collectively. 
1017 */ DecodeToUTF16WithoutReplacement(Span<const uint8_t> aSrc,Span<char16_t> aDst,bool aLast)1018 inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement( 1019 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) { 1020 size_t srcRead = aSrc.Length(); 1021 size_t dstWritten = aDst.Length(); 1022 uint32_t result = decoder_decode_to_utf16_without_replacement( 1023 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); 1024 return MakeTuple(result, srcRead, dstWritten); 1025 } 1026 1027 /** 1028 * Checks for compatibility with storing Unicode scalar values as unsigned 1029 * bytes taking into account the state of the decoder. 1030 * 1031 * Returns `mozilla::Nothing()` if the decoder is not in a neutral state, 1032 * including waiting for the BOM, or if the encoding is never 1033 * Latin1-byte-compatible. 1034 * 1035 * Otherwise returns the index of the first byte whose unsigned value doesn't 1036 * directly correspond to the decoded Unicode scalar value, or the length 1037 * of the input if all bytes in the input decode directly to scalar values 1038 * corresponding to the unsigned byte values. 1039 * 1040 * Does not change the state of the decoder. 1041 * 1042 * Do not use this unless you are supporting SpiderMonkey-style string 1043 * storage optimizations. 
1044 */ Latin1ByteCompatibleUpTo(Span<const uint8_t> aBuffer)1045 inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo( 1046 Span<const uint8_t> aBuffer) const { 1047 size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(), 1048 aBuffer.Length()); 1049 if (upTo == std::numeric_limits<size_t>::max()) { 1050 return mozilla::Nothing(); 1051 } 1052 return mozilla::Some(upTo); 1053 } 1054 1055 private: 1056 Decoder() = delete; 1057 Decoder(const Decoder&) = delete; 1058 Decoder& operator=(const Decoder&) = delete; 1059 }; 1060 1061 /** 1062 * A converter that encodes a Unicode stream into bytes according to a 1063 * character encoding in a streaming (incremental) manner. 1064 * 1065 * The various `Encode*` methods take an input buffer (`aSrc`) and an output 1066 * buffer `aDst` both of which are caller-allocated. There are variants for 1067 * both UTF-8 and UTF-16 input buffers. 1068 * 1069 * An `Encode*` method encode characters from `aSrc` into bytes characters 1070 * stored into `aDst` until one of the following three things happens: 1071 * 1072 * 1. An unmappable character is encountered (`*WithoutReplacement` variants 1073 * only). 1074 * 1075 * 2. The output buffer has been filled so near capacity that the decoder 1076 * cannot be sure that processing an additional character of input wouldn't 1077 * cause so much output that the output buffer would overflow. 1078 * 1079 * 3. All the input characters have been processed. 1080 * 1081 * The `Encode*` method then returns tuple of a status indicating which one 1082 * of the three reasons to return happened, how many input code units (`uint8_t` 1083 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read, 1084 * how many output bytes were written, and in the case of the variants that 1085 * perform replacement, a boolean indicating whether an unmappable 1086 * character was replaced with a numeric character reference during the call. 
*
 * The number of bytes "written" is what's logically written. Garbage may be
 * written in the output buffer beyond the point logically written to.
 *
 * In the case of the methods whose name ends with
 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
 * to the three cases listed above).
 *
 * In the case of methods whose name does not end with
 * `*WithoutReplacement`, unmappable characters are automatically replaced
 * with the corresponding numeric character references and unmappable
 * characters do not cause the methods to return early.
 *
 * When encoding from UTF-8 without replacement, the methods are guaranteed
 * not to return indicating that more output space is needed if the length
 * of the output buffer is at least the length returned by
 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
 * UTF-8 with replacement, the length of the output buffer that guarantees the
 * methods not to return indicating that more output space is needed in the
 * absence of unmappable characters is given by
 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
 * UTF-16 without replacement, the methods are guaranteed not to return
 * indicating that more output space is needed if the length of the output
 * buffer is at least the length returned by
 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
 * from UTF-16 with replacement, the length of the output buffer that
 * guarantees the methods not to return indicating that more output space is
 * needed in the absence of unmappable characters is given by
 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
* When encoding with replacement, applications are not expected to size the
 * buffer for the worst case ahead of time but to resize the buffer if there
 * are unmappable characters. This is why max length queries are only available
 * for the case where there are no unmappable characters.
 *
 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
 * surrogate pairs are not split across input buffer boundaries.
 *
 * After an `Encode*` call returns, the output produced so far, taken as a
 * whole from the start of the stream, is guaranteed to consist of a valid
 * byte sequence in the target encoding. (I.e. the code unit sequence for a
 * character is guaranteed not to be split across output buffers. However, due
 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
 * from the start for it to be valid. For other encodings, the validity holds
 * on a per-output buffer basis.)
 *
 * The boolean argument `aLast` indicates that the end of the stream is reached
 * when all the characters in `aSrc` have been consumed. This argument is needed
 * for ISO-2022-JP and is ignored for other encodings.
 *
 * An `Encoder` object can be used to incrementally encode a Unicode stream
 * into bytes.
 *
 * During the processing of a single stream, the caller must call `Encode*`
 * zero or more times with `aLast` set to `false` and then call `Encode*` at
 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
 * the processing of the stream has ended. Otherwise, the caller must call
 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
 * i.e.
neither `kInputEmpty` nor `kOutputFull`, as a fatal error). 1148 * 1149 * Once the stream has ended, the `Encoder` object must not be used anymore. 1150 * That is, you need to create another one to process another stream. 1151 * 1152 * When the encoder returns `kOutputFull` or the encoder returns an unmappable 1153 * result and the caller does not wish to treat it as a fatal error, the input 1154 * buffer `aSrc` may not have been completely consumed. In that case, the caller 1155 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next 1156 * call. 1157 * 1158 * # Infinite loops 1159 * 1160 * When converting with a fixed-size output buffer whose size is too small to 1161 * accommodate one character of output, an infinite loop ensues. When 1162 * converting with a fixed-size output buffer, it generally makes sense to 1163 * make the buffer fairly large (e.g. couple of kilobytes). 1164 */ 1165 class Encoder final { 1166 public: 1167 ~Encoder() = default; 1168 delete(void * aEncoder)1169 static void operator delete(void* aEncoder) { 1170 encoder_free(reinterpret_cast<Encoder*>(aEncoder)); 1171 } 1172 1173 /** 1174 * The `Encoding` this `Encoder` is for. 1175 */ Encoding()1176 inline NotNull<const mozilla::Encoding*> Encoding() const { 1177 return WrapNotNull(encoder_encoding(this)); 1178 } 1179 1180 /** 1181 * Returns `true` if this is an ISO-2022-JP encoder that's not in the 1182 * ASCII state and `false` otherwise. 1183 */ HasPendingState()1184 inline bool HasPendingState() const { 1185 return encoder_has_pending_state(this); 1186 } 1187 1188 /** 1189 * Query the worst-case output size when encoding from UTF-8 with 1190 * replacement. 1191 * 1192 * Returns the size of the output buffer in bytes that will not overflow 1193 * given the current state of the encoder and `aByteLength` number of 1194 * additional input code units if there are no unmappable characters in 1195 * the input. 
1196 */ MaxBufferLengthFromUTF8IfNoUnmappables(size_t aByteLength)1197 inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables( 1198 size_t aByteLength) const { 1199 CheckedInt<size_t> max( 1200 encoder_max_buffer_length_from_utf8_if_no_unmappables(this, 1201 aByteLength)); 1202 if (max.value() == std::numeric_limits<size_t>::max()) { 1203 // Mark invalid by overflowing 1204 max++; 1205 MOZ_ASSERT(!max.isValid()); 1206 } 1207 return max; 1208 } 1209 1210 /** 1211 * Query the worst-case output size when encoding from UTF-8 without 1212 * replacement. 1213 * 1214 * Returns the size of the output buffer in bytes that will not overflow 1215 * given the current state of the encoder and `aByteLength` number of 1216 * additional input code units. 1217 */ MaxBufferLengthFromUTF8WithoutReplacement(size_t aByteLength)1218 inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement( 1219 size_t aByteLength) const { 1220 CheckedInt<size_t> max( 1221 encoder_max_buffer_length_from_utf8_without_replacement(this, 1222 aByteLength)); 1223 if (max.value() == std::numeric_limits<size_t>::max()) { 1224 // Mark invalid by overflowing 1225 max++; 1226 MOZ_ASSERT(!max.isValid()); 1227 } 1228 return max; 1229 } 1230 1231 /** 1232 * Incrementally encode into byte stream from UTF-8 with unmappable 1233 * characters replaced with HTML (decimal) numeric character references. 1234 * 1235 * See the documentation of the class for documentation for `Encode*` 1236 * methods collectively. 1237 * 1238 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING: 1239 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless 1240 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check. 
1241 */ EncodeFromUTF8(Span<const uint8_t> aSrc,Span<uint8_t> aDst,bool aLast)1242 inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8( 1243 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) { 1244 size_t srcRead = aSrc.Length(); 1245 size_t dstWritten = aDst.Length(); 1246 bool hadReplacements; 1247 uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead, 1248 aDst.Elements(), &dstWritten, 1249 aLast, &hadReplacements); 1250 return MakeTuple(result, srcRead, dstWritten, hadReplacements); 1251 } 1252 1253 /** 1254 * Incrementally encode into byte stream from UTF-8 _without replacement_. 1255 * 1256 * See the documentation of the class for documentation for `Encode*` 1257 * methods collectively. 1258 * 1259 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING: 1260 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless 1261 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check. 1262 */ EncodeFromUTF8WithoutReplacement(Span<const uint8_t> aSrc,Span<uint8_t> aDst,bool aLast)1263 inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement( 1264 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) { 1265 size_t srcRead = aSrc.Length(); 1266 size_t dstWritten = aDst.Length(); 1267 uint32_t result = encoder_encode_from_utf8_without_replacement( 1268 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); 1269 return MakeTuple(result, srcRead, dstWritten); 1270 } 1271 1272 /** 1273 * Query the worst-case output size when encoding from UTF-16 with 1274 * replacement. 1275 * 1276 * Returns the size of the output buffer in bytes that will not overflow 1277 * given the current state of the encoder and `aU16Length` number of 1278 * additional input code units if there are no unmappable characters in 1279 * the input. 
1280 */ MaxBufferLengthFromUTF16IfNoUnmappables(size_t aU16Length)1281 inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables( 1282 size_t aU16Length) const { 1283 CheckedInt<size_t> max( 1284 encoder_max_buffer_length_from_utf16_if_no_unmappables(this, 1285 aU16Length)); 1286 if (max.value() == std::numeric_limits<size_t>::max()) { 1287 // Mark invalid by overflowing 1288 max++; 1289 MOZ_ASSERT(!max.isValid()); 1290 } 1291 return max; 1292 } 1293 1294 /** 1295 * Query the worst-case output size when encoding from UTF-16 without 1296 * replacement. 1297 * 1298 * Returns the size of the output buffer in bytes that will not overflow 1299 * given the current state of the encoder and `aU16Length` number of 1300 * additional input code units. 1301 */ MaxBufferLengthFromUTF16WithoutReplacement(size_t aU16Length)1302 inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement( 1303 size_t aU16Length) const { 1304 CheckedInt<size_t> max( 1305 encoder_max_buffer_length_from_utf16_without_replacement(this, 1306 aU16Length)); 1307 if (max.value() == std::numeric_limits<size_t>::max()) { 1308 // Mark invalid by overflowing 1309 max++; 1310 MOZ_ASSERT(!max.isValid()); 1311 } 1312 return max; 1313 } 1314 1315 /** 1316 * Incrementally encode into byte stream from UTF-16 with unmappable 1317 * characters replaced with HTML (decimal) numeric character references. 1318 * 1319 * See the documentation of the class for documentation for `Encode*` 1320 * methods collectively. 
1321 */ EncodeFromUTF16(Span<const char16_t> aSrc,Span<uint8_t> aDst,bool aLast)1322 inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16( 1323 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) { 1324 size_t srcRead = aSrc.Length(); 1325 size_t dstWritten = aDst.Length(); 1326 bool hadReplacements; 1327 uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead, 1328 aDst.Elements(), &dstWritten, 1329 aLast, &hadReplacements); 1330 return MakeTuple(result, srcRead, dstWritten, hadReplacements); 1331 } 1332 1333 /** 1334 * Incrementally encode into byte stream from UTF-16 _without replacement_. 1335 * 1336 * See the documentation of the class for documentation for `Encode*` 1337 * methods collectively. 1338 */ EncodeFromUTF16WithoutReplacement(Span<const char16_t> aSrc,Span<uint8_t> aDst,bool aLast)1339 inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement( 1340 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) { 1341 size_t srcRead = aSrc.Length(); 1342 size_t dstWritten = aDst.Length(); 1343 uint32_t result = encoder_encode_from_utf16_without_replacement( 1344 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); 1345 return MakeTuple(result, srcRead, dstWritten); 1346 } 1347 1348 private: 1349 Encoder() = delete; 1350 Encoder(const Encoder&) = delete; 1351 Encoder& operator=(const Encoder&) = delete; 1352 }; 1353 1354 }; // namespace mozilla 1355 1356 #endif // mozilla_Encoding_h 1357