1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT 2 // file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 #pragma once 11 12 #ifndef encoding_rs_cpp_h_ 13 #define encoding_rs_cpp_h_ 14 15 #include <memory> 16 #include <optional> 17 #include <string> 18 #include <string_view> 19 #include <tuple> 20 #include <vector> 21 #include "gsl/gsl" 22 23 namespace encoding_rs { 24 class Encoding; 25 class Decoder; 26 class Encoder; 27 }; // namespace encoding_rs 28 29 #define ENCODING_RS_ENCODING encoding_rs::Encoding 30 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \ 31 gsl::not_null<const encoding_rs::Encoding*> 32 #define ENCODING_RS_ENCODER encoding_rs::Encoder 33 #define ENCODING_RS_DECODER encoding_rs::Decoder 34 35 #include "encoding_rs.h" 36 37 namespace encoding_rs { 38 39 /** 40 * A converter that decodes a byte stream into Unicode according to a 41 * character encoding in a streaming (incremental) manner. 42 * 43 * The various `decode_*` methods take an input buffer (`src`) and an output 44 * buffer `dst` both of which are caller-allocated. There are variants for 45 * both UTF-8 and UTF-16 output buffers. 46 * 47 * A `decode_*` method decodes bytes from `src` into Unicode characters stored 48 * into `dst` until one of the following three things happens: 49 * 50 * 1. A malformed byte sequence is encountered (`*_without_replacement` 51 * variants only). 52 * 53 * 2. The output buffer has been filled so near capacity that the decoder 54 * cannot be sure that processing an additional byte of input wouldn't 55 * cause so much output that the output buffer would overflow. 56 * 57 * 3. All the input bytes have been processed. 
 *
 * The `decode_*` method then returns a tuple of a status indicating which one
 * of the three reasons to return happened, how many input bytes were read,
 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
 * when decoding to UTF-16) were written, and in the case of the
 * variants performing replacement, a boolean indicating whether an error was
 * replaced with the REPLACEMENT CHARACTER during the call.
 *
 * The number of bytes "written" is what's logically written. Garbage may be
 * written in the output buffer beyond the point logically written to.
 *
 * In the case of the `*_without_replacement` variants, the status is a
 * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `OUTPUT_FULL` and `INPUT_EMPTY` (corresponding to the three cases
 * listed above).
 *
 * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence, and the next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
 *
 * In the case of methods whose name does not end with
 * `*_without_replacement`, malformed sequences are automatically replaced
 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
 * return early.
 *
 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
 * space. When decoding to UTF-16, the output buffer must have at least two
 * UTF-16 code units (`char16_t`) of space.
 *
 * When decoding to UTF-8 without replacement, the methods are guaranteed
 * not to return indicating that more output space is needed if the length
 * of the output buffer is at least the length returned by
 * `max_utf8_buffer_length_without_replacement()`. When decoding to UTF-8
 * with replacement, the length of the output buffer that guarantees the
 * methods not to return indicating that more output space is needed is given
 * by `max_utf8_buffer_length()`. When decoding to UTF-16 with
 * or without replacement, the length of the output buffer that guarantees
 * the methods not to return indicating that more output space is needed is
 * given by `max_utf16_buffer_length()`.
 *
 * The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
 * and the output after each `decode_*` call is guaranteed to consist of
 * complete characters. (I.e. the code unit sequence for the last character is
 * guaranteed not to be split across output buffers.)
 *
 * The boolean argument `last` indicates that the end of the stream is reached
 * when all the bytes in `src` have been consumed.
 *
 * A `Decoder` object can be used to incrementally decode a byte stream.
 *
 * During the processing of a single stream, the caller must call `decode_*`
 * zero or more times with `last` set to `false` and then call `decode_*` at
 * least once with `last` set to `true`. If `decode_*` returns `INPUT_EMPTY`,
 * the processing of the stream has ended. Otherwise, the caller must call
 * `decode_*` again with `last` set to `true` (or treat a malformed result,
 * i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error).
 *
 * Once the stream has ended, the `Decoder` object must not be used anymore.
 * That is, you need to create another one to process another stream.
120 * 121 * When the decoder returns `OUTPUT_FULL` or the decoder returns a malformed 122 * result and the caller does not wish to treat it as a fatal error, the input 123 * buffer `src` may not have been completely consumed. In that case, the caller 124 * must pass the unconsumed contents of `src` to `decode_*` again upon the next 125 * call. 126 * 127 * # Infinite loops 128 * 129 * When converting with a fixed-size output buffer whose size is too small to 130 * accommodate one character of output, an infinite loop ensues. When 131 * converting with a fixed-size output buffer, it generally makes sense to 132 * make the buffer fairly large (e.g. couple of kilobytes). 133 */ 134 class Decoder final { 135 public: ~Decoder()136 ~Decoder() {} delete(void * decoder)137 static inline void operator delete(void* decoder) { 138 decoder_free(reinterpret_cast<Decoder*>(decoder)); 139 } 140 141 /** 142 * The `Encoding` this `Decoder` is for. 143 * 144 * BOM sniffing can change the return value of this method during the life 145 * of the decoder. 146 */ encoding()147 inline gsl::not_null<const Encoding*> encoding() const { 148 return gsl::not_null<const Encoding*>(decoder_encoding(this)); 149 } 150 151 /** 152 * Query the worst-case UTF-8 output size _with replacement_. 153 * 154 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) 155 * that will not overflow given the current state of the decoder and 156 * `byte_length` number of additional input bytes when decoding with 157 * errors handled by outputting a REPLACEMENT CHARACTER for each malformed 158 * sequence or `std::optional` without value if `size_t` would overflow. 
159 */ max_utf8_buffer_length(size_t byte_length)160 inline std::optional<size_t> max_utf8_buffer_length( 161 size_t byte_length) const { 162 size_t val = decoder_max_utf8_buffer_length(this, byte_length); 163 if (val == SIZE_MAX) { 164 return std::nullopt; 165 } 166 return val; 167 } 168 169 /** 170 * Query the worst-case UTF-8 output size _without replacement_. 171 * 172 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) 173 * that will not overflow given the current state of the decoder and 174 * `byte_length` number of additional input bytes when decoding without 175 * replacement error handling or `std::optional` without value if `size_t` 176 * would overflow. 177 * 178 * Note that this value may be too small for the `_with_replacement` case. 179 * Use `max_utf8_buffer_length()` for that case. 180 */ max_utf8_buffer_length_without_replacement(size_t byte_length)181 inline std::optional<size_t> max_utf8_buffer_length_without_replacement( 182 size_t byte_length) const { 183 size_t val = 184 decoder_max_utf8_buffer_length_without_replacement(this, byte_length); 185 if (val == SIZE_MAX) { 186 return std::nullopt; 187 } 188 return val; 189 } 190 191 /** 192 * Incrementally decode a byte stream into UTF-8 with malformed sequences 193 * replaced with the REPLACEMENT CHARACTER. 194 * 195 * See the documentation of the class for documentation for `decode_*` 196 * methods collectively. 
197 */ decode_to_utf8(gsl::span<const uint8_t> src,gsl::span<uint8_t> dst,bool last)198 inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf8( 199 gsl::span<const uint8_t> src, gsl::span<uint8_t> dst, bool last) { 200 size_t src_read = src.size(); 201 size_t dst_written = dst.size(); 202 bool had_replacements; 203 uint32_t result = 204 decoder_decode_to_utf8(this, null_to_bogus<const uint8_t>(src.data()), 205 &src_read, null_to_bogus<uint8_t>(dst.data()), 206 &dst_written, last, &had_replacements); 207 return {result, src_read, dst_written, had_replacements}; 208 } 209 210 /** 211 * Incrementally decode a byte stream into UTF-8 _without replacement_. 212 * 213 * See the documentation of the class for documentation for `decode_*` 214 * methods collectively. 215 */ 216 inline std::tuple<uint32_t, size_t, size_t> decode_to_utf8_without_replacement(gsl::span<const uint8_t> src,gsl::span<uint8_t> dst,bool last)217 decode_to_utf8_without_replacement(gsl::span<const uint8_t> src, 218 gsl::span<uint8_t> dst, bool last) { 219 size_t src_read = src.size(); 220 size_t dst_written = dst.size(); 221 uint32_t result = decoder_decode_to_utf8_without_replacement( 222 this, null_to_bogus<const uint8_t>(src.data()), &src_read, 223 null_to_bogus<uint8_t>(dst.data()), &dst_written, last); 224 return {result, src_read, dst_written}; 225 } 226 227 /** 228 * Query the worst-case UTF-16 output size (with or without replacement). 229 * 230 * Returns the size of the output buffer in UTF-16 code units (`char16_t`) 231 * that will not overflow given the current state of the decoder and 232 * `byte_length` number of additional input bytes or `std::optional` 233 * without value if `size_t` would overflow. 234 * 235 * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the 236 * return value of this method applies also in the 237 * `_without_replacement` case. 
238 */ max_utf16_buffer_length(size_t byte_length)239 inline std::optional<size_t> max_utf16_buffer_length( 240 size_t byte_length) const { 241 size_t val = decoder_max_utf16_buffer_length(this, byte_length); 242 if (val == SIZE_MAX) { 243 return std::nullopt; 244 } 245 return val; 246 } 247 248 /** 249 * Incrementally decode a byte stream into UTF-16 with malformed sequences 250 * replaced with the REPLACEMENT CHARACTER. 251 * 252 * See the documentation of the class for documentation for `decode_*` 253 * methods collectively. 254 */ decode_to_utf16(gsl::span<const uint8_t> src,gsl::span<char16_t> dst,bool last)255 inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf16( 256 gsl::span<const uint8_t> src, gsl::span<char16_t> dst, bool last) { 257 size_t src_read = src.size(); 258 size_t dst_written = dst.size(); 259 bool had_replacements; 260 uint32_t result = 261 decoder_decode_to_utf16(this, null_to_bogus<const uint8_t>(src.data()), 262 &src_read, null_to_bogus<char16_t>(dst.data()), 263 &dst_written, last, &had_replacements); 264 return {result, src_read, dst_written, had_replacements}; 265 } 266 267 /** 268 * Incrementally decode a byte stream into UTF-16 _without replacement_. 269 * 270 * See the documentation of the class for documentation for `decode_*` 271 * methods collectively. 
272 */ 273 inline std::tuple<uint32_t, size_t, size_t> decode_to_utf16_without_replacement(gsl::span<const uint8_t> src,gsl::span<char16_t> dst,bool last)274 decode_to_utf16_without_replacement(gsl::span<const uint8_t> src, 275 gsl::span<char16_t> dst, bool last) { 276 size_t src_read = src.size(); 277 size_t dst_written = dst.size(); 278 uint32_t result = decoder_decode_to_utf16_without_replacement( 279 this, null_to_bogus<const uint8_t>(src.data()), &src_read, 280 null_to_bogus<char16_t>(dst.data()), &dst_written, last); 281 return {result, src_read, dst_written}; 282 } 283 284 /** 285 * Checks for compatibility with storing Unicode scalar values as unsigned 286 * bytes taking into account the state of the decoder. 287 * 288 * Returns `std::nullopt` if the decoder is not in a neutral state, including 289 * waiting for the BOM, or if the encoding is never Latin1-byte-compatible. 290 * 291 * Otherwise returns the index of the first byte whose unsigned value doesn't 292 * directly correspond to the decoded Unicode scalar value, or the length 293 * of the input if all bytes in the input decode directly to scalar values 294 * corresponding to the unsigned byte values. 295 * 296 * Does not change the state of the decoder. 297 * 298 * Do not use this unless you are supporting SpiderMonkey/V8-style string 299 * storage optimizations. 300 */ latin1_byte_compatible_up_to(gsl::span<const uint8_t> buffer)301 inline std::optional<size_t> latin1_byte_compatible_up_to( 302 gsl::span<const uint8_t> buffer) const { 303 size_t val = decoder_latin1_byte_compatible_up_to( 304 this, null_to_bogus<const uint8_t>(buffer.data()), 305 static_cast<size_t>(buffer.size())); 306 if (val == SIZE_MAX) { 307 return std::nullopt; 308 } 309 return val; 310 } 311 312 private: 313 /** 314 * Replaces `nullptr` with a bogus pointer suitable for use as part of a 315 * zero-length Rust slice. 
316 */ 317 template <class T> null_to_bogus(T * ptr)318 static inline T* null_to_bogus(T* ptr) { 319 return ptr ? ptr : reinterpret_cast<T*>(alignof(T)); 320 } 321 322 Decoder() = delete; 323 Decoder(const Decoder&) = delete; 324 Decoder& operator=(const Decoder&) = delete; 325 }; 326 327 /** 328 * A converter that encodes a Unicode stream into bytes according to a 329 * character encoding in a streaming (incremental) manner. 330 * 331 * The various `encode_*` methods take an input buffer (`src`) and an output 332 * buffer `dst` both of which are caller-allocated. There are variants for 333 * both UTF-8 and UTF-16 input buffers. 334 * 335 * An `encode_*` method encode characters from `src` into bytes characters 336 * stored into `dst` until one of the following three things happens: 337 * 338 * 1. An unmappable character is encountered (`*_without_replacement` variants 339 * only). 340 * 341 * 2. The output buffer has been filled so near capacity that the decoder 342 * cannot be sure that processing an additional character of input wouldn't 343 * cause so much output that the output buffer would overflow. 344 * 345 * 3. All the input characters have been processed. 346 * 347 * The `encode_*` method then returns tuple of a status indicating which one 348 * of the three reasons to return happened, how many input code units (`uint8_t` 349 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read, 350 * how many output bytes were written, and in the case of the variants that 351 * perform replacement, a boolean indicating whether an unmappable 352 * character was replaced with a numeric character reference during the call. 353 * 354 * The number of bytes "written" is what's logically written. Garbage may be 355 * written in the output buffer beyond the point logically written to. 
 *
 * In the case of the methods whose name ends with
 * `*_without_replacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `OUTPUT_FULL` and `INPUT_EMPTY` (corresponding
 * to the three cases listed above).
 *
 * In the case of methods whose name does not end with
 * `*_without_replacement`, unmappable characters are automatically replaced
 * with the corresponding numeric character references and unmappable
 * characters do not cause the methods to return early.
 *
 * When encoding from UTF-8 without replacement, the methods are guaranteed
 * not to return indicating that more output space is needed if the length
 * of the output buffer is at least the length returned by
 * `max_buffer_length_from_utf8_without_replacement()`. When encoding from
 * UTF-8 with replacement, the length of the output buffer that guarantees the
 * methods not to return indicating that more output space is needed in the
 * absence of unmappable characters is given by
 * `max_buffer_length_from_utf8_if_no_unmappables()`. When encoding from
 * UTF-16 without replacement, the methods are guaranteed not to return
 * indicating that more output space is needed if the length of the output
 * buffer is at least the length returned by
 * `max_buffer_length_from_utf16_without_replacement()`. When encoding
 * from UTF-16 with replacement, the length of the output buffer that
 * guarantees the methods not to return indicating that more output space is
 * needed in the absence of unmappable characters is given by
 * `max_buffer_length_from_utf16_if_no_unmappables()`.
 * When encoding with replacement, applications are not expected to size the
 * buffer for the worst case ahead of time but to resize the buffer if there
 * are unmappable characters. This is why max length queries are only available
 * for the case where there are no unmappable characters.
 *
 * When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. When
 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
 * surrogate pairs are not split across input buffer boundaries.
 *
 * After an `encode_*` call returns, the output produced so far, taken as a
 * whole from the start of the stream, is guaranteed to consist of a valid
 * byte sequence in the target encoding. (I.e. the code unit sequence for a
 * character is guaranteed not to be split across output buffers. However, due
 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
 * from the start for it to be valid. For other encodings, the validity holds
 * on a per-output buffer basis.)
 *
 * The boolean argument `last` indicates that the end of the stream is reached
 * when all the characters in `src` have been consumed. This argument is needed
 * for ISO-2022-JP and is ignored for other encodings.
 *
 * An `Encoder` object can be used to incrementally encode a Unicode stream.
 *
 * During the processing of a single stream, the caller must call `encode_*`
 * zero or more times with `last` set to `false` and then call `encode_*` at
 * least once with `last` set to `true`. If `encode_*` returns `INPUT_EMPTY`,
 * the processing of the stream has ended. Otherwise, the caller must call
 * `encode_*` again with `last` set to `true` (or treat an unmappable result,
 * i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error).
 *
 * Once the stream has ended, the `Encoder` object must not be used anymore.
 * That is, you need to create another one to process another stream.
417 * 418 * When the encoder returns `OUTPUT_FULL` or the encoder returns an unmappable 419 * result and the caller does not wish to treat it as a fatal error, the input 420 * buffer `src` may not have been completely consumed. In that case, the caller 421 * must pass the unconsumed contents of `src` to `encode_*` again upon the next 422 * call. 423 * 424 * # Infinite loops 425 * 426 * When converting with a fixed-size output buffer whose size is too small to 427 * accommodate one character of output, an infinite loop ensues. When 428 * converting with a fixed-size output buffer, it generally makes sense to 429 * make the buffer fairly large (e.g. couple of kilobytes). 430 */ 431 class Encoder final { 432 public: ~Encoder()433 ~Encoder() {} 434 delete(void * encoder)435 static inline void operator delete(void* encoder) { 436 encoder_free(reinterpret_cast<Encoder*>(encoder)); 437 } 438 439 /** 440 * The `Encoding` this `Encoder` is for. 441 */ encoding()442 inline gsl::not_null<const Encoding*> encoding() const { 443 return gsl::not_null<const Encoding*>(encoder_encoding(this)); 444 } 445 446 /** 447 * Returns `true` if this is an ISO-2022-JP encoder that's not in the 448 * ASCII state and `false` otherwise. 449 */ has_pending_state()450 inline bool has_pending_state() const { 451 return encoder_has_pending_state(this); 452 } 453 454 /** 455 * Query the worst-case output size when encoding from UTF-8 with 456 * replacement. 457 * 458 * Returns the size of the output buffer in bytes that will not overflow 459 * given the current state of the encoder and `byte_length` number of 460 * additional input code units if there are no unmappable characters in 461 * the input or `SIZE_MAX` if `size_t` would overflow. 
462 */ max_buffer_length_from_utf8_if_no_unmappables(size_t byte_length)463 inline std::optional<size_t> max_buffer_length_from_utf8_if_no_unmappables( 464 size_t byte_length) const { 465 size_t val = encoder_max_buffer_length_from_utf8_if_no_unmappables( 466 this, byte_length); 467 if (val == SIZE_MAX) { 468 return std::nullopt; 469 } 470 return val; 471 } 472 473 /** 474 * Query the worst-case output size when encoding from UTF-8 without 475 * replacement. 476 * 477 * Returns the size of the output buffer in bytes that will not overflow 478 * given the current state of the encoder and `byte_length` number of 479 * additional input code units or `SIZE_MAX` if `size_t` would overflow. 480 */ max_buffer_length_from_utf8_without_replacement(size_t byte_length)481 inline std::optional<size_t> max_buffer_length_from_utf8_without_replacement( 482 size_t byte_length) const { 483 size_t val = encoder_max_buffer_length_from_utf8_without_replacement( 484 this, byte_length); 485 if (val == SIZE_MAX) { 486 return std::nullopt; 487 } 488 return val; 489 } 490 491 /** 492 * Incrementally encode into byte stream from UTF-8 with unmappable 493 * characters replaced with HTML (decimal) numeric character references. 494 * 495 * See the documentation of the class for documentation for `encode_*` 496 * methods collectively. 
497 */ encode_from_utf8(std::string_view src,gsl::span<uint8_t> dst,bool last)498 inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf8( 499 std::string_view src, gsl::span<uint8_t> dst, bool last) { 500 size_t src_read = src.size(); 501 size_t dst_written = dst.size(); 502 bool had_replacements; 503 uint32_t result = encoder_encode_from_utf8( 504 this, 505 null_to_bogus<const uint8_t>( 506 reinterpret_cast<const uint8_t*>(src.data())), 507 &src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last, 508 &had_replacements); 509 return {result, src_read, dst_written, had_replacements}; 510 } 511 512 /** 513 * Incrementally encode into byte stream from UTF-8 _without replacement_. 514 * 515 * See the documentation of the class for documentation for `encode_*` 516 * methods collectively. 517 */ 518 inline std::tuple<uint32_t, size_t, size_t> encode_from_utf8_without_replacement(std::string_view src,gsl::span<uint8_t> dst,bool last)519 encode_from_utf8_without_replacement(std::string_view src, 520 gsl::span<uint8_t> dst, bool last) { 521 size_t src_read = src.size(); 522 size_t dst_written = dst.size(); 523 uint32_t result = encoder_encode_from_utf8_without_replacement( 524 this, 525 null_to_bogus<const uint8_t>( 526 reinterpret_cast<const uint8_t*>(src.data())), 527 &src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last); 528 return {result, src_read, dst_written}; 529 } 530 531 /** 532 * Query the worst-case output size when encoding from UTF-16 with 533 * replacement. 534 * 535 * Returns the size of the output buffer in bytes that will not overflow 536 * given the current state of the encoder and `u16_length` number of 537 * additional input code units if there are no unmappable characters in 538 * the input or `SIZE_MAX` if `size_t` would overflow. 
539 */ max_buffer_length_from_utf16_if_no_unmappables(size_t u16_length)540 inline std::optional<size_t> max_buffer_length_from_utf16_if_no_unmappables( 541 size_t u16_length) const { 542 size_t val = encoder_max_buffer_length_from_utf16_if_no_unmappables( 543 this, u16_length); 544 if (val == SIZE_MAX) { 545 return std::nullopt; 546 } 547 return val; 548 } 549 550 /** 551 * Query the worst-case output size when encoding from UTF-16 without 552 * replacement. 553 * 554 * Returns the size of the output buffer in bytes that will not overflow 555 * given the current state of the encoder and `u16_length` number of 556 * additional input code units or `SIZE_MAX` if `size_t` would overflow. 557 */ max_buffer_length_from_utf16_without_replacement(size_t u16_length)558 inline std::optional<size_t> max_buffer_length_from_utf16_without_replacement( 559 size_t u16_length) const { 560 size_t val = encoder_max_buffer_length_from_utf16_without_replacement( 561 this, u16_length); 562 if (val == SIZE_MAX) { 563 return std::nullopt; 564 } 565 return val; 566 } 567 568 /** 569 * Incrementally encode into byte stream from UTF-16 with unmappable 570 * characters replaced with HTML (decimal) numeric character references. 571 * 572 * See the documentation of the class for documentation for `encode_*` 573 * methods collectively. 
574 */ encode_from_utf16(std::u16string_view src,gsl::span<uint8_t> dst,bool last)575 inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf16( 576 std::u16string_view src, gsl::span<uint8_t> dst, bool last) { 577 size_t src_read = src.size(); 578 size_t dst_written = dst.size(); 579 bool had_replacements; 580 uint32_t result = encoder_encode_from_utf16( 581 this, null_to_bogus<const char16_t>(src.data()), &src_read, 582 null_to_bogus<uint8_t>(dst.data()), &dst_written, last, 583 &had_replacements); 584 return {result, src_read, dst_written, had_replacements}; 585 } 586 587 /** 588 * Incrementally encode into byte stream from UTF-16 _without replacement_. 589 * 590 * See the documentation of the class for documentation for `encode_*` 591 * methods collectively. 592 */ 593 inline std::tuple<uint32_t, size_t, size_t> encode_from_utf16_without_replacement(std::u16string_view src,gsl::span<uint8_t> dst,bool last)594 encode_from_utf16_without_replacement(std::u16string_view src, 595 gsl::span<uint8_t> dst, bool last) { 596 size_t src_read = src.size(); 597 size_t dst_written = dst.size(); 598 uint32_t result = encoder_encode_from_utf16_without_replacement( 599 this, null_to_bogus<const char16_t>(src.data()), &src_read, 600 null_to_bogus<uint8_t>(dst.data()), &dst_written, last); 601 return {result, src_read, dst_written}; 602 } 603 604 private: 605 /** 606 * Replaces `nullptr` with a bogus pointer suitable for use as part of a 607 * zero-length Rust slice. 608 */ 609 template <class T> null_to_bogus(T * ptr)610 static inline T* null_to_bogus(T* ptr) { 611 return ptr ? ptr : reinterpret_cast<T*>(alignof(T)); 612 } 613 614 Encoder() = delete; 615 Encoder(const Encoder&) = delete; 616 Encoder& operator=(const Encoder&) = delete; 617 }; 618 619 /** 620 * An encoding as defined in the Encoding Standard 621 * (https://encoding.spec.whatwg.org/). 
 *
 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
 * sequence and, in most cases, vice versa. Each encoding has a name, an output
 * encoding, and one or more labels.
 *
 * _Labels_ are ASCII-case-insensitive strings that are used to identify an
 * encoding in formats and protocols. The _name_ of the encoding is the
 * preferred label in the case appropriate for returning from the
 * `characterSet` property of the `Document` DOM interface, except for
 * the replacement encoding whose name is not one of its labels.
 *
 * The _output encoding_ is the encoding used for form submission and URL
 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
 * encodings.
 *
 * # Streaming vs. Non-Streaming
 *
 * When you have the entire input in a single buffer, you can use the
 * methods `decode()`, `decode_with_bom_removal()`,
 * `decode_without_bom_handling()`,
 * `decode_without_bom_handling_and_without_replacement()` and
 * `encode()`. Unlike the rest of the API, these methods perform heap
 * allocations. You should use the `Decoder` and `Encoder` objects when your
 * input is split into multiple buffers or when you want to control the
 * allocation of the output buffers.
 *
 * # Instances
 *
 * All instances of `Encoding` are statically allocated and have the process's
 * lifetime. There is precisely one unique `Encoding` instance for each
 * encoding defined in the Encoding Standard.
 *
 * To obtain a reference to a particular encoding whose identity you know at
 * compile time, use a `static` that refers to the encoding. There is a `static`
 * for each encoding. The `static`s are named in all caps with hyphens
 * replaced with underscores and with `_ENCODING` appended to the
 * name.
For example, if you know at compile time that you will want to 660 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`. 661 * 662 * If you don't know what encoding you need at compile time and need to 663 * dynamically get an encoding by label, use `Encoding::for_label()`. 664 * 665 * Instances of `Encoding` can be compared with `==`. 666 */ 667 class Encoding final { 668 public: 669 /** 670 * Implements the _get an encoding_ algorithm 671 * (https://encoding.spec.whatwg.org/#concept-encoding-get). 672 * 673 * If, after ASCII-lowercasing and removing leading and trailing 674 * whitespace, the argument matches a label defined in the Encoding 675 * Standard, `const Encoding*` representing the corresponding 676 * encoding is returned. If there is no match, `nullptr` is returned. 677 * 678 * This is the right method to use if the action upon the method returning 679 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) 680 * instead. When the action upon the method returning `nullptr` is not to 681 * proceed with a fallback but to refuse processing, 682 * `for_label_no_replacement()` is more appropriate. 683 */ for_label(gsl::cstring_span<> label)684 static inline const Encoding* for_label(gsl::cstring_span<> label) { 685 return encoding_for_label( 686 null_to_bogus<const uint8_t>( 687 reinterpret_cast<const uint8_t*>(label.data())), 688 label.length()); 689 } 690 691 /** 692 * This method behaves the same as `for_label()`, except when `for_label()` 693 * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead. 694 * 695 * This method is useful in scenarios where a fatal error is required 696 * upon invalid label, because in those cases the caller typically wishes 697 * to treat the labels that map to the replacement encoding as fatal 698 * errors, too. 699 * 700 * It is not OK to use this method when the action upon the method returning 701 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). 
In 702 * such a case, the `for_label()` method should be used instead in order to 703 * avoid 704 * unsafe fallback for labels that `for_label()` maps to 705 * `REPLACEMENT_ENCODING`. 706 */ for_label_no_replacement(gsl::cstring_span<> label)707 static inline const Encoding* for_label_no_replacement( 708 gsl::cstring_span<> label) { 709 return encoding_for_label_no_replacement( 710 null_to_bogus<const uint8_t>( 711 reinterpret_cast<const uint8_t*>(label.data())), 712 label.length()); 713 } 714 715 /** 716 * Performs non-incremental BOM sniffing. 717 * 718 * The argument must either be a buffer representing the entire input 719 * stream (non-streaming case) or a buffer representing at least the first 720 * three bytes of the input stream (streaming case). 721 * 722 * Returns a std::optinal wrapping `make_tuple(UTF_8_ENCODING, 3)`, 723 * `make_tuple(UTF_16LE_ENCODING, 2)` or `make_tuple(UTF_16BE_ENCODING, 3)` 724 * if the argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or 725 * `std::nullopt` otherwise. 726 */ 727 static inline std::optional< 728 std::tuple<gsl::not_null<const Encoding*>, size_t>> for_bom(gsl::span<const uint8_t> buffer)729 for_bom(gsl::span<const uint8_t> buffer) { 730 size_t len = buffer.size(); 731 const Encoding* encoding = 732 encoding_for_bom(null_to_bogus(buffer.data()), &len); 733 if (encoding) { 734 return std::make_tuple(gsl::not_null<const Encoding*>(encoding), len); 735 } 736 return std::nullopt; 737 } 738 739 /** 740 * Returns the name of this encoding. 741 * 742 * This name is appropriate to return as-is from the DOM 743 * `document.characterSet` property. 
744 */ name()745 inline std::string name() const { 746 std::string name(ENCODING_NAME_MAX_LENGTH, '\0'); 747 // http://herbsutter.com/2008/04/07/cringe-not-vectors-are-guaranteed-to-be-contiguous/#comment-483 748 size_t length = encoding_name(this, reinterpret_cast<uint8_t*>(&name[0])); 749 name.resize(length); 750 return name; 751 } 752 753 /** 754 * Checks whether the _output encoding_ of this encoding can encode every 755 * Unicode code point. (Only true if the output encoding is UTF-8.) 756 */ can_encode_everything()757 inline bool can_encode_everything() const { 758 return encoding_can_encode_everything(this); 759 } 760 761 /** 762 * Checks whether the bytes 0x00...0x7F map exclusively to the characters 763 * U+0000...U+007F and vice versa. 764 */ is_ascii_compatible()765 inline bool is_ascii_compatible() const { 766 return encoding_is_ascii_compatible(this); 767 } 768 769 /** 770 * Checks whether this encoding maps one byte to one Basic Multilingual 771 * Plane code point (i.e. byte length equals decoded UTF-16 length) and 772 * vice versa (for mappable characters). 773 * 774 * `true` iff this encoding is on the list of Legacy single-byte 775 * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings) 776 * in the spec or x-user-defined. 777 */ is_single_byte()778 inline bool is_single_byte() const { return encoding_is_single_byte(this); } 779 780 /** 781 * Returns the _output encoding_ of this encoding. This is UTF-8 for 782 * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise. 783 */ output_encoding()784 inline gsl::not_null<const Encoding*> output_encoding() const { 785 return gsl::not_null<const Encoding*>(encoding_output_encoding(this)); 786 } 787 788 /** 789 * Decode complete input to `std::string` _with BOM sniffing_ and with 790 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 791 * entire input is available as a single buffer (i.e. the end of the 792 * buffer marks the end of the stream). 
793 * 794 * This method implements the (non-streaming version of) the 795 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. 796 * 797 * The second item in the returned tuple is the encoding that was actually 798 * used (which may differ from this encoding thanks to BOM sniffing). 799 * 800 * The third item in the returned tuple indicates whether there were 801 * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 802 * 803 * _Note:_ It is wrong to use this when the input buffer represents only 804 * a segment of the input instead of the whole input. Use `new_decoder()` 805 * when decoding segmented input. 806 */ decode(gsl::span<const uint8_t> bytes)807 inline std::tuple<std::string, gsl::not_null<const Encoding*>, bool> decode( 808 gsl::span<const uint8_t> bytes) const { 809 auto opt = Encoding::for_bom(bytes); 810 const Encoding* encoding; 811 if (opt) { 812 size_t bom_length; 813 std::tie(encoding, bom_length) = *opt; 814 bytes = bytes.subspan(bom_length); 815 } else { 816 encoding = this; 817 } 818 auto [str, had_errors] = encoding->decode_without_bom_handling(bytes); 819 return {str, gsl::not_null<const Encoding*>(encoding), had_errors}; 820 } 821 822 /** 823 * Decode complete input to `std::string` _with BOM removal_ and with 824 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 825 * entire input is available as a single buffer (i.e. the end of the 826 * buffer marks the end of the stream). 827 * 828 * When invoked on `UTF_8`, this method implements the (non-streaming 829 * version of) the _UTF-8 decode_ 830 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. 831 * 832 * The second item in the returned pair indicates whether there were 833 * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 834 * 835 * _Note:_ It is wrong to use this when the input buffer represents only 836 * a segment of the input instead of the whole input. 
Use 837 * `new_decoder_with_bom_removal()` when decoding segmented input. 838 */ decode_with_bom_removal(gsl::span<const uint8_t> bytes)839 inline std::tuple<std::string, bool> decode_with_bom_removal( 840 gsl::span<const uint8_t> bytes) const { 841 if (this == UTF_8_ENCODING && bytes.size() >= 3 && 842 (gsl::as_bytes(bytes.first<3>()) == 843 gsl::as_bytes(gsl::make_span("\xEF\xBB\xBF")))) { 844 bytes = bytes.subspan(3, bytes.size() - 3); 845 } else if (this == UTF_16LE_ENCODING && bytes.size() >= 2 && 846 (gsl::as_bytes(bytes.first<2>()) == 847 gsl::as_bytes(gsl::make_span("\xFF\xFE")))) { 848 bytes = bytes.subspan(2, bytes.size() - 2); 849 } else if (this == UTF_16BE_ENCODING && bytes.size() >= 2 && 850 (gsl::as_bytes(bytes.first<2>()) == 851 gsl::as_bytes(gsl::make_span("\xFE\xFF")))) { 852 bytes = bytes.subspan(2, bytes.size() - 2); 853 } 854 return decode_without_bom_handling(bytes); 855 } 856 857 /** 858 * Decode complete input to `std::string` _without BOM handling_ and 859 * with malformed sequences replaced with the REPLACEMENT CHARACTER when 860 * the entire input is available as a single buffer (i.e. the end of the 861 * buffer marks the end of the stream). 862 * 863 * When invoked on `UTF_8`, this method implements the (non-streaming 864 * version of) the _UTF-8 decode without BOM_ 865 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. 866 * 867 * The second item in the returned pair indicates whether there were 868 * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 869 * 870 * _Note:_ It is wrong to use this when the input buffer represents only 871 * a segment of the input instead of the whole input. Use 872 * `new_decoder_without_bom_handling()` when decoding segmented input. 
873 */ decode_without_bom_handling(gsl::span<const uint8_t> bytes)874 inline std::tuple<std::string, bool> decode_without_bom_handling( 875 gsl::span<const uint8_t> bytes) const { 876 auto decoder = new_decoder_without_bom_handling(); 877 auto needed = decoder->max_utf8_buffer_length(bytes.size()); 878 if (!needed) { 879 throw std::overflow_error("Overflow in buffer size computation."); 880 } 881 std::string string(needed.value(), '\0'); 882 const auto [result, read, written, had_errors] = decoder->decode_to_utf8( 883 bytes, 884 gsl::make_span(reinterpret_cast<uint8_t*>(&string[0]), string.size()), 885 true); 886 assert(read == static_cast<size_t>(bytes.size())); 887 assert(written <= static_cast<size_t>(string.size())); 888 assert(result == INPUT_EMPTY); 889 string.resize(written); 890 return {string, had_errors}; 891 } 892 893 /** 894 * Decode complete input to `std::string` _without BOM handling_ and 895 * _with malformed sequences treated as fatal_ when the entire input is 896 * available as a single buffer (i.e. the end of the buffer marks the end 897 * of the stream). 898 * 899 * When invoked on `UTF_8`, this method implements the (non-streaming 900 * version of) the _UTF-8 decode without BOM or fail_ 901 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) 902 * spec concept. 903 * 904 * Returns `std::nullopt` if a malformed sequence was encountered and the result 905 * of the decode as `std::optional<std::string>` otherwise. 906 * 907 * _Note:_ It is wrong to use this when the input buffer represents only 908 * a segment of the input instead of the whole input. Use 909 * `new_decoder_without_bom_handling()` when decoding segmented input. 
910 */ 911 inline std::optional<std::string> decode_without_bom_handling_and_without_replacement(gsl::span<const uint8_t> bytes)912 decode_without_bom_handling_and_without_replacement( 913 gsl::span<const uint8_t> bytes) const { 914 auto decoder = new_decoder_without_bom_handling(); 915 auto needed = 916 decoder->max_utf8_buffer_length_without_replacement(bytes.size()); 917 if (!needed) { 918 throw std::overflow_error("Overflow in buffer size computation."); 919 } 920 std::string string(needed.value(), '\0'); 921 const auto [result, read, written] = 922 decoder->decode_to_utf8_without_replacement( 923 bytes, 924 gsl::make_span(reinterpret_cast<uint8_t*>(&string[0]), 925 string.size()), 926 true); 927 assert(result != OUTPUT_FULL); 928 if (result == INPUT_EMPTY) { 929 assert(read == static_cast<size_t>(bytes.size())); 930 assert(written <= static_cast<size_t>(string.size())); 931 string.resize(written); 932 return string; 933 } 934 return std::nullopt; 935 } 936 937 /** 938 * Decode complete input to `std::u16string` _with BOM sniffing_ and with 939 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 940 * entire input is available as a single buffer (i.e. the end of the 941 * buffer marks the end of the stream). 942 * 943 * This method implements the (non-streaming version of) the 944 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. 945 * 946 * The second item in the returned tuple is the encoding that was actually 947 * used (which may differ from this encoding thanks to BOM sniffing). 948 * 949 * The third item in the returned tuple indicates whether there were 950 * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 951 * 952 * _Note:_ It is wrong to use this when the input buffer represents only 953 * a segment of the input instead of the whole input. Use `new_decoder()` 954 * when decoding segmented input. 
955 */ 956 inline std::tuple<std::u16string, gsl::not_null<const Encoding*>, bool> decode16(gsl::span<const uint8_t> bytes)957 decode16(gsl::span<const uint8_t> bytes) const { 958 auto opt = Encoding::for_bom(bytes); 959 const Encoding* encoding; 960 if (opt) { 961 size_t bom_length; 962 std::tie(encoding, bom_length) = *opt; 963 bytes = bytes.subspan(bom_length); 964 } else { 965 encoding = this; 966 } 967 auto [str, had_errors] = encoding->decode16_without_bom_handling(bytes); 968 return {str, gsl::not_null<const Encoding*>(encoding), had_errors}; 969 } 970 971 /** 972 * Decode complete input to `std::u16string` _with BOM removal_ and with 973 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 974 * entire input is available as a single buffer (i.e. the end of the 975 * buffer marks the end of the stream). 976 * 977 * When invoked on `UTF_8`, this method implements the (non-streaming 978 * version of) the _UTF-8 decode_ 979 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. 980 * 981 * The second item in the returned pair indicates whether there were 982 * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 983 * 984 * _Note:_ It is wrong to use this when the input buffer represents only 985 * a segment of the input instead of the whole input. Use 986 * `new_decoder_with_bom_removal()` when decoding segmented input. 
987 */ decode16_with_bom_removal(gsl::span<const uint8_t> bytes)988 inline std::tuple<std::u16string, bool> decode16_with_bom_removal( 989 gsl::span<const uint8_t> bytes) const { 990 if (this == UTF_8_ENCODING && bytes.size() >= 3 && 991 (gsl::as_bytes(bytes.first<3>()) == 992 gsl::as_bytes(gsl::make_span("\xEF\xBB\xBF")))) { 993 bytes = bytes.subspan(3, bytes.size() - 3); 994 } else if (this == UTF_16LE_ENCODING && bytes.size() >= 2 && 995 (gsl::as_bytes(bytes.first<2>()) == 996 gsl::as_bytes(gsl::make_span("\xFF\xFE")))) { 997 bytes = bytes.subspan(2, bytes.size() - 2); 998 } else if (this == UTF_16BE_ENCODING && bytes.size() >= 2 && 999 (gsl::as_bytes(bytes.first<2>()) == 1000 gsl::as_bytes(gsl::make_span("\xFE\xFF")))) { 1001 bytes = bytes.subspan(2, bytes.size() - 2); 1002 } 1003 return decode16_without_bom_handling(bytes); 1004 } 1005 1006 /** 1007 * Decode complete input to `std::u16string` _without BOM handling_ and 1008 * with malformed sequences replaced with the REPLACEMENT CHARACTER when 1009 * the entire input is available as a single buffer (i.e. the end of the 1010 * buffer marks the end of the stream). 1011 * 1012 * When invoked on `UTF_8`, this method implements the (non-streaming 1013 * version of) the _UTF-8 decode without BOM_ 1014 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. 1015 * 1016 * The second item in the returned pair indicates whether there were 1017 * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 1018 * 1019 * _Note:_ It is wrong to use this when the input buffer represents only 1020 * a segment of the input instead of the whole input. Use 1021 * `new_decoder_without_bom_handling()` when decoding segmented input. 
1022 */ decode16_without_bom_handling(gsl::span<const uint8_t> bytes)1023 inline std::tuple<std::u16string, bool> decode16_without_bom_handling( 1024 gsl::span<const uint8_t> bytes) const { 1025 auto decoder = new_decoder_without_bom_handling(); 1026 auto needed = decoder->max_utf16_buffer_length(bytes.size()); 1027 if (!needed) { 1028 throw std::overflow_error("Overflow in buffer size computation."); 1029 } 1030 std::u16string string(needed.value(), '\0'); 1031 const auto [result, read, written, had_errors] = decoder->decode_to_utf16( 1032 bytes, gsl::make_span(&string[0], string.size()), true); 1033 assert(read == static_cast<size_t>(bytes.size())); 1034 assert(written <= static_cast<size_t>(string.size())); 1035 assert(result == INPUT_EMPTY); 1036 string.resize(written); 1037 return {string, had_errors}; 1038 } 1039 1040 /** 1041 * Decode complete input to `std::u16string` _without BOM handling_ and 1042 * _with malformed sequences treated as fatal_ when the entire input is 1043 * available as a single buffer (i.e. the end of the buffer marks the end 1044 * of the stream). 1045 * 1046 * When invoked on `UTF_8`, this method implements the (non-streaming 1047 * version of) the _UTF-8 decode without BOM or fail_ 1048 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) 1049 * spec concept. 1050 * 1051 * Returns `std::nullopt` if a malformed sequence was encountered and the result 1052 * of the decode as `std::optional<std::u16string>` otherwise. 1053 * 1054 * _Note:_ It is wrong to use this when the input buffer represents only 1055 * a segment of the input instead of the whole input. Use 1056 * `new_decoder_without_bom_handling()` when decoding segmented input. 
1057 */ 1058 inline std::optional<std::u16string> decode16_without_bom_handling_and_without_replacement(gsl::span<const uint8_t> bytes)1059 decode16_without_bom_handling_and_without_replacement( 1060 gsl::span<const uint8_t> bytes) const { 1061 auto decoder = new_decoder_without_bom_handling(); 1062 auto needed = decoder->max_utf16_buffer_length(bytes.size()); 1063 if (!needed) { 1064 throw std::overflow_error("Overflow in buffer size computation."); 1065 } 1066 std::u16string string(needed.value(), '\0'); 1067 const auto [result, read, written] = 1068 decoder->decode_to_utf16_without_replacement( 1069 bytes, gsl::make_span(&string[0], string.size()), true); 1070 assert(result != OUTPUT_FULL); 1071 if (result == INPUT_EMPTY) { 1072 assert(read == static_cast<size_t>(bytes.size())); 1073 assert(written <= static_cast<size_t>(string.size())); 1074 string.resize(written); 1075 return string; 1076 } 1077 return std::nullopt; 1078 } 1079 1080 /** 1081 * Encode complete input to `std::vector<uint8_t>` with unmappable characters 1082 * replaced with decimal numeric character references when the entire input 1083 * is available as a single buffer (i.e. the end of the buffer marks the 1084 * end of the stream). 1085 * 1086 * This method implements the (non-streaming version of) the 1087 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. 1088 * 1089 * The second item in the returned tuple is the encoding that was actually 1090 * used (which may differ from this encoding thanks to some encodings 1091 * having UTF-8 as their output encoding). 1092 * 1093 * The third item in the returned tuple indicates whether there were 1094 * unmappable characters (that were replaced with HTML numeric character 1095 * references). 1096 * 1097 * _Note:_ It is wrong to use this when the input buffer represents only 1098 * a segment of the input instead of the whole input. Use `new_encoder()` 1099 * when encoding segmented output. 
1100 */ 1101 inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool> encode(std::string_view string)1102 encode(std::string_view string) const { 1103 auto output_enc = output_encoding(); 1104 if (output_enc == UTF_8_ENCODING) { 1105 std::vector<uint8_t> vec(string.size()); 1106 std::memcpy(&vec[0], string.data(), string.size()); 1107 } 1108 auto encoder = output_enc->new_encoder(); 1109 auto needed = 1110 encoder->max_buffer_length_from_utf8_if_no_unmappables(string.size()); 1111 if (!needed) { 1112 throw std::overflow_error("Overflow in buffer size computation."); 1113 } 1114 std::vector<uint8_t> vec(needed.value()); 1115 bool total_had_errors = false; 1116 size_t total_read = 0; 1117 size_t total_written = 0; 1118 for (;;) { 1119 const auto [result, read, written, had_errors] = 1120 encoder->encode_from_utf8(string.substr(total_read), 1121 gsl::make_span(vec).subspan(total_written), 1122 true); 1123 total_read += read; 1124 total_written += written; 1125 total_had_errors |= had_errors; 1126 if (result == INPUT_EMPTY) { 1127 assert(total_read == static_cast<size_t>(string.size())); 1128 assert(total_written <= static_cast<size_t>(vec.size())); 1129 vec.resize(total_written); 1130 return {vec, gsl::not_null<const Encoding*>(output_enc), 1131 total_had_errors}; 1132 } 1133 auto needed = encoder->max_buffer_length_from_utf8_if_no_unmappables( 1134 string.size() - total_read); 1135 if (!needed) { 1136 throw std::overflow_error("Overflow in buffer size computation."); 1137 } 1138 vec.resize(total_written + needed.value()); 1139 } 1140 } 1141 1142 /** 1143 * Encode complete input to `std::vector<uint8_t>` with unmappable characters 1144 * replaced with decimal numeric character references when the entire input 1145 * is available as a single buffer (i.e. the end of the buffer marks the 1146 * end of the stream). 
1147 * 1148 * This method implements the (non-streaming version of) the 1149 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. 1150 * 1151 * The second item in the returned tuple is the encoding that was actually 1152 * used (which may differ from this encoding thanks to some encodings 1153 * having UTF-8 as their output encoding). 1154 * 1155 * The third item in the returned tuple indicates whether there were 1156 * unmappable characters (that were replaced with HTML numeric character 1157 * references). 1158 * 1159 * _Note:_ It is wrong to use this when the input buffer represents only 1160 * a segment of the input instead of the whole input. Use `new_encoder()` 1161 * when encoding segmented output. 1162 */ 1163 inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool> encode(std::u16string_view string)1164 encode(std::u16string_view string) const { 1165 auto output_enc = output_encoding(); 1166 auto encoder = output_enc->new_encoder(); 1167 auto needed = 1168 encoder->max_buffer_length_from_utf16_if_no_unmappables(string.size()); 1169 if (!needed) { 1170 throw std::overflow_error("Overflow in buffer size computation."); 1171 } 1172 std::vector<uint8_t> vec(needed.value()); 1173 bool total_had_errors = false; 1174 size_t total_read = 0; 1175 size_t total_written = 0; 1176 for (;;) { 1177 const auto [result, read, written, had_errors] = 1178 encoder->encode_from_utf16(string.substr(total_read), 1179 gsl::make_span(vec).subspan(total_written), 1180 true); 1181 total_read += read; 1182 total_written += written; 1183 total_had_errors |= had_errors; 1184 if (result == INPUT_EMPTY) { 1185 assert(total_read == static_cast<size_t>(string.size())); 1186 assert(total_written <= static_cast<size_t>(vec.size())); 1187 vec.resize(total_written); 1188 return {vec, gsl::not_null<const Encoding*>(output_enc), 1189 total_had_errors}; 1190 } 1191 auto needed = encoder->max_buffer_length_from_utf16_if_no_unmappables( 1192 string.size() - 
total_read); 1193 if (!needed) { 1194 throw std::overflow_error("Overflow in buffer size computation."); 1195 } 1196 vec.resize(total_written + needed.value()); 1197 } 1198 } 1199 1200 /** 1201 * Instantiates a new decoder for this encoding with BOM sniffing enabled. 1202 * 1203 * BOM sniffing may cause the returned decoder to morph into a decoder 1204 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. 1205 */ new_decoder()1206 inline std::unique_ptr<Decoder> new_decoder() const { 1207 return std::unique_ptr<Decoder>(encoding_new_decoder(this)); 1208 } 1209 1210 /** 1211 * Instantiates a new decoder for this encoding with BOM sniffing enabled 1212 * into memory occupied by a previously-instantiated decoder. 1213 * 1214 * BOM sniffing may cause the returned decoder to morph into a decoder 1215 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. 1216 */ new_decoder_into(Decoder & decoder)1217 inline void new_decoder_into(Decoder& decoder) const { 1218 encoding_new_decoder_into(this, &decoder); 1219 } 1220 1221 /** 1222 * Instantiates a new decoder for this encoding with BOM removal. 1223 * 1224 * If the input starts with bytes that are the BOM for this encoding, 1225 * those bytes are removed. However, the decoder never morphs into a 1226 * decoder for another encoding: A BOM for another encoding is treated as 1227 * (potentially malformed) input to the decoding algorithm for this 1228 * encoding. 1229 */ new_decoder_with_bom_removal()1230 inline std::unique_ptr<Decoder> new_decoder_with_bom_removal() const { 1231 return std::unique_ptr<Decoder>( 1232 encoding_new_decoder_with_bom_removal(this)); 1233 } 1234 1235 /** 1236 * Instantiates a new decoder for this encoding with BOM removal 1237 * into memory occupied by a previously-instantiated decoder. 1238 * 1239 * If the input starts with bytes that are the BOM for this encoding, 1240 * those bytes are removed. 
However, the decoder never morphs into a 1241 * decoder for another encoding: A BOM for another encoding is treated as 1242 * (potentially malformed) input to the decoding algorithm for this 1243 * encoding. 1244 */ new_decoder_with_bom_removal_into(Decoder & decoder)1245 inline void new_decoder_with_bom_removal_into(Decoder& decoder) const { 1246 encoding_new_decoder_with_bom_removal_into(this, &decoder); 1247 } 1248 1249 /** 1250 * Instantiates a new decoder for this encoding with BOM handling disabled. 1251 * 1252 * If the input starts with bytes that look like a BOM, those bytes are 1253 * not treated as a BOM. (Hence, the decoder never morphs into a decoder 1254 * for another encoding.) 1255 * 1256 * _Note:_ If the caller has performed BOM sniffing on its own but has not 1257 * removed the BOM, the caller should use `new_decoder_with_bom_removal()` 1258 * instead of this method to cause the BOM to be removed. 1259 */ new_decoder_without_bom_handling()1260 inline std::unique_ptr<Decoder> new_decoder_without_bom_handling() const { 1261 return std::unique_ptr<Decoder>( 1262 encoding_new_decoder_without_bom_handling(this)); 1263 } 1264 1265 /** 1266 * Instantiates a new decoder for this encoding with BOM handling disabled 1267 * into memory occupied by a previously-instantiated decoder. 1268 * 1269 * If the input starts with bytes that look like a BOM, those bytes are 1270 * not treated as a BOM. (Hence, the decoder never morphs into a decoder 1271 * for another encoding.) 1272 * 1273 * _Note:_ If the caller has performed BOM sniffing on its own but has not 1274 * removed the BOM, the caller should use 1275 * `new_decoder_with_bom_removal_into()` 1276 * instead of this method to cause the BOM to be removed. 
1277 */ new_decoder_without_bom_handling_into(Decoder & decoder)1278 inline void new_decoder_without_bom_handling_into(Decoder& decoder) const { 1279 encoding_new_decoder_without_bom_handling_into(this, &decoder); 1280 } 1281 1282 /** 1283 * Instantiates a new encoder for the output encoding of this encoding. 1284 */ new_encoder()1285 inline std::unique_ptr<Encoder> new_encoder() const { 1286 return std::unique_ptr<Encoder>(encoding_new_encoder(this)); 1287 } 1288 1289 /** 1290 * Instantiates a new encoder for the output encoding of this encoding 1291 * into memory occupied by a previously-instantiated encoder. 1292 */ new_encoder_into(Encoder & encoder)1293 inline void new_encoder_into(Encoder& encoder) const { 1294 encoding_new_encoder_into(this, &encoder); 1295 } 1296 1297 /** 1298 * Validates UTF-8. 1299 * 1300 * Returns the index of the first byte that makes the input malformed as 1301 * UTF-8 or the length of the input if the input is entirely valid. 1302 */ utf8_valid_up_to(gsl::span<const uint8_t> buffer)1303 static inline size_t utf8_valid_up_to(gsl::span<const uint8_t> buffer) { 1304 return encoding_utf8_valid_up_to( 1305 null_to_bogus<const uint8_t>(buffer.data()), buffer.size()); 1306 } 1307 1308 /** 1309 * Validates ASCII. 1310 * 1311 * Returns the index of the first byte that makes the input malformed as 1312 * ASCII or the length of the input if the input is entirely valid. 1313 */ ascii_valid_up_to(gsl::span<const uint8_t> buffer)1314 static inline size_t ascii_valid_up_to(gsl::span<const uint8_t> buffer) { 1315 return encoding_ascii_valid_up_to( 1316 null_to_bogus<const uint8_t>(buffer.data()), buffer.size()); 1317 } 1318 1319 /** 1320 * Validates ISO-2022-JP ASCII-state data. 1321 * 1322 * Returns the index of the first byte that makes the input not 1323 * representable in the ASCII state of ISO-2022-JP or the length of the 1324 * input if the input is entirely representable in the ASCII state of 1325 * ISO-2022-JP. 
1326 */ iso_2022_jp_ascii_valid_up_to(gsl::span<const uint8_t> buffer)1327 static inline size_t iso_2022_jp_ascii_valid_up_to( 1328 gsl::span<const uint8_t> buffer) { 1329 return encoding_iso_2022_jp_ascii_valid_up_to( 1330 null_to_bogus<const uint8_t>(buffer.data()), buffer.size()); 1331 } 1332 1333 private: 1334 /** 1335 * Replaces `nullptr` with a bogus pointer suitable for use as part of a 1336 * zero-length Rust slice. 1337 */ 1338 template <class T> null_to_bogus(T * ptr)1339 static inline T* null_to_bogus(T* ptr) { 1340 return ptr ? ptr : reinterpret_cast<T*>(alignof(T)); 1341 } 1342 1343 Encoding() = delete; 1344 Encoding(const Encoding&) = delete; 1345 Encoding& operator=(const Encoding&) = delete; 1346 ~Encoding() = delete; 1347 }; 1348 1349 }; // namespace encoding_rs 1350 1351 #endif // encoding_rs_cpp_h_ 1352