1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 #pragma once
11 
12 #ifndef encoding_rs_cpp_h_
13 #define encoding_rs_cpp_h_
14 
15 #include <memory>
16 #include <optional>
17 #include <string>
18 #include <string_view>
19 #include <tuple>
20 #include <vector>
21 #include "gsl/gsl"
22 
// Forward declarations of the wrapper classes so that they can be named
// (as pointees) before their definitions further down in this header.
namespace encoding_rs {
class Encoding;
class Decoder;
class Encoder;
}  // namespace encoding_rs
28 
// Type-name overrides. NOTE(review): these are presumably consumed by
// "encoding_rs.h" so that the C declarations use the C++ wrapper types;
// keep them ahead of that #include and confirm against the C header.

// Converter types.
#define ENCODING_RS_DECODER encoding_rs::Decoder
#define ENCODING_RS_ENCODER encoding_rs::Encoder

// Encoding type and its never-null const pointer form.
#define ENCODING_RS_ENCODING encoding_rs::Encoding
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR                                \
  gsl::not_null<const encoding_rs::Encoding*>
34 
35 #include "encoding_rs.h"
36 
37 namespace encoding_rs {
38 
39 /**
40  * A converter that decodes a byte stream into Unicode according to a
41  * character encoding in a streaming (incremental) manner.
42  *
43  * The various `decode_*` methods take an input buffer (`src`) and an output
44  * buffer `dst` both of which are caller-allocated. There are variants for
45  * both UTF-8 and UTF-16 output buffers.
46  *
47  * A `decode_*` method decodes bytes from `src` into Unicode characters stored
48  * into `dst` until one of the following three things happens:
49  *
50  * 1. A malformed byte sequence is encountered (`*_without_replacement`
51  *    variants only).
52  *
53  * 2. The output buffer has been filled so near capacity that the decoder
54  *    cannot be sure that processing an additional byte of input wouldn't
55  *    cause so much output that the output buffer would overflow.
56  *
57  * 3. All the input bytes have been processed.
58  *
 * The `decode_*` method then returns a tuple of a status indicating which one
60  * of the three reasons to return happened, how many input bytes were read,
61  * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
62  * when decoding to UTF-16) were written, and in the case of the
63  * variants performing replacement, a boolean indicating whether an error was
64  * replaced with the REPLACEMENT CHARACTER during the call.
65  *
66  * The number of bytes "written" is what's logically written. Garbage may be
67  * written in the output buffer beyond the point logically written to.
68  *
 * In the case of the `*_without_replacement` variants, the status is a
 * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `OUTPUT_FULL` and `INPUT_EMPTY` (corresponding to the three cases
 * listed above).
73  *
 * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
80  *
81  * In the case of methods whose name does not end with
82  * `*_without_replacement`, malformed sequences are automatically replaced
83  * with the REPLACEMENT CHARACTER and errors do not cause the methods to
84  * return early.
85  *
86  * When decoding to UTF-8, the output buffer must have at least 4 bytes of
87  * space. When decoding to UTF-16, the output buffer must have at least two
88  * UTF-16 code units (`char16_t`) of space.
89  *
90  * When decoding to UTF-8 without replacement, the methods are guaranteed
91  * not to return indicating that more output space is needed if the length
92  * of the output buffer is at least the length returned by
93  * `max_utf8_buffer_length_without_replacement()`. When decoding to UTF-8
94  * with replacement, the length of the output buffer that guarantees the
95  * methods not to return indicating that more output space is needed is given
96  * by `max_utf8_buffer_length()`. When decoding to UTF-16 with
97  * or without replacement, the length of the output buffer that guarantees
98  * the methods not to return indicating that more output space is needed is
99  * given by `max_utf16_buffer_length()`.
100  *
101  * The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
102  * and the output after each `decode_*` call is guaranteed to consist of
103  * complete characters. (I.e. the code unit sequence for the last character is
104  * guaranteed not to be split across output buffers.)
105  *
106  * The boolean argument `last` indicates that the end of the stream is reached
107  * when all the bytes in `src` have been consumed.
108  *
109  * A `Decoder` object can be used to incrementally decode a byte stream.
110  *
111  * During the processing of a single stream, the caller must call `decode_*`
112  * zero or more times with `last` set to `false` and then call `decode_*` at
113  * least once with `last` set to `true`. If `decode_*` returns `INPUT_EMPTY`,
114  * the processing of the stream has ended. Otherwise, the caller must call
115  * `decode_*` again with `last` set to `true` (or treat a malformed result,
116  * i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error).
117  *
118  * Once the stream has ended, the `Decoder` object must not be used anymore.
119  * That is, you need to create another one to process another stream.
120  *
121  * When the decoder returns `OUTPUT_FULL` or the decoder returns a malformed
122  * result and the caller does not wish to treat it as a fatal error, the input
123  * buffer `src` may not have been completely consumed. In that case, the caller
124  * must pass the unconsumed contents of `src` to `decode_*` again upon the next
125  * call.
126  *
127  * # Infinite loops
128  *
129  * When converting with a fixed-size output buffer whose size is too small to
130  * accommodate one character of output, an infinite loop ensues. When
131  * converting with a fixed-size output buffer, it generally makes sense to
132  * make the buffer fairly large (e.g. couple of kilobytes).
133  */
134 class Decoder final {
135  public:
~Decoder()136   ~Decoder() {}
delete(void * decoder)137   static inline void operator delete(void* decoder) {
138     decoder_free(reinterpret_cast<Decoder*>(decoder));
139   }
140 
141   /**
142    * The `Encoding` this `Decoder` is for.
143    *
144    * BOM sniffing can change the return value of this method during the life
145    * of the decoder.
146    */
encoding()147   inline gsl::not_null<const Encoding*> encoding() const {
148     return gsl::not_null<const Encoding*>(decoder_encoding(this));
149   }
150 
151   /**
152    * Query the worst-case UTF-8 output size _with replacement_.
153    *
154    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
155    * that will not overflow given the current state of the decoder and
156    * `byte_length` number of additional input bytes when decoding with
157    * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
158    * sequence or `std::optional` without value if `size_t` would overflow.
159    */
max_utf8_buffer_length(size_t byte_length)160   inline std::optional<size_t> max_utf8_buffer_length(
161       size_t byte_length) const {
162     size_t val = decoder_max_utf8_buffer_length(this, byte_length);
163     if (val == SIZE_MAX) {
164       return std::nullopt;
165     }
166     return val;
167   }
168 
169   /**
170    * Query the worst-case UTF-8 output size _without replacement_.
171    *
172    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
173    * that will not overflow given the current state of the decoder and
174    * `byte_length` number of additional input bytes when decoding without
175    * replacement error handling or `std::optional` without value if `size_t`
176    * would overflow.
177    *
178    * Note that this value may be too small for the `_with_replacement` case.
179    * Use `max_utf8_buffer_length()` for that case.
180    */
max_utf8_buffer_length_without_replacement(size_t byte_length)181   inline std::optional<size_t> max_utf8_buffer_length_without_replacement(
182       size_t byte_length) const {
183     size_t val =
184         decoder_max_utf8_buffer_length_without_replacement(this, byte_length);
185     if (val == SIZE_MAX) {
186       return std::nullopt;
187     }
188     return val;
189   }
190 
191   /**
192    * Incrementally decode a byte stream into UTF-8 with malformed sequences
193    * replaced with the REPLACEMENT CHARACTER.
194    *
195    * See the documentation of the class for documentation for `decode_*`
196    * methods collectively.
197    */
decode_to_utf8(gsl::span<const uint8_t> src,gsl::span<uint8_t> dst,bool last)198   inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf8(
199       gsl::span<const uint8_t> src, gsl::span<uint8_t> dst, bool last) {
200     size_t src_read = src.size();
201     size_t dst_written = dst.size();
202     bool had_replacements;
203     uint32_t result =
204         decoder_decode_to_utf8(this, null_to_bogus<const uint8_t>(src.data()),
205                                &src_read, null_to_bogus<uint8_t>(dst.data()),
206                                &dst_written, last, &had_replacements);
207     return {result, src_read, dst_written, had_replacements};
208   }
209 
210   /**
211    * Incrementally decode a byte stream into UTF-8 _without replacement_.
212    *
213    * See the documentation of the class for documentation for `decode_*`
214    * methods collectively.
215    */
216   inline std::tuple<uint32_t, size_t, size_t>
decode_to_utf8_without_replacement(gsl::span<const uint8_t> src,gsl::span<uint8_t> dst,bool last)217   decode_to_utf8_without_replacement(gsl::span<const uint8_t> src,
218                                      gsl::span<uint8_t> dst, bool last) {
219     size_t src_read = src.size();
220     size_t dst_written = dst.size();
221     uint32_t result = decoder_decode_to_utf8_without_replacement(
222         this, null_to_bogus<const uint8_t>(src.data()), &src_read,
223         null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
224     return {result, src_read, dst_written};
225   }
226 
227   /**
228    * Query the worst-case UTF-16 output size (with or without replacement).
229    *
230    * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
231    * that will not overflow given the current state of the decoder and
232    * `byte_length` number of additional input bytes  or `std::optional`
233    * without value if `size_t` would overflow.
234    *
235    * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
236    * return value of this method applies also in the
237    * `_without_replacement` case.
238    */
max_utf16_buffer_length(size_t byte_length)239   inline std::optional<size_t> max_utf16_buffer_length(
240       size_t byte_length) const {
241     size_t val = decoder_max_utf16_buffer_length(this, byte_length);
242     if (val == SIZE_MAX) {
243       return std::nullopt;
244     }
245     return val;
246   }
247 
248   /**
249    * Incrementally decode a byte stream into UTF-16 with malformed sequences
250    * replaced with the REPLACEMENT CHARACTER.
251    *
252    * See the documentation of the class for documentation for `decode_*`
253    * methods collectively.
254    */
decode_to_utf16(gsl::span<const uint8_t> src,gsl::span<char16_t> dst,bool last)255   inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf16(
256       gsl::span<const uint8_t> src, gsl::span<char16_t> dst, bool last) {
257     size_t src_read = src.size();
258     size_t dst_written = dst.size();
259     bool had_replacements;
260     uint32_t result =
261         decoder_decode_to_utf16(this, null_to_bogus<const uint8_t>(src.data()),
262                                 &src_read, null_to_bogus<char16_t>(dst.data()),
263                                 &dst_written, last, &had_replacements);
264     return {result, src_read, dst_written, had_replacements};
265   }
266 
267   /**
268    * Incrementally decode a byte stream into UTF-16 _without replacement_.
269    *
270    * See the documentation of the class for documentation for `decode_*`
271    * methods collectively.
272    */
273   inline std::tuple<uint32_t, size_t, size_t>
decode_to_utf16_without_replacement(gsl::span<const uint8_t> src,gsl::span<char16_t> dst,bool last)274   decode_to_utf16_without_replacement(gsl::span<const uint8_t> src,
275                                       gsl::span<char16_t> dst, bool last) {
276     size_t src_read = src.size();
277     size_t dst_written = dst.size();
278     uint32_t result = decoder_decode_to_utf16_without_replacement(
279         this, null_to_bogus<const uint8_t>(src.data()), &src_read,
280         null_to_bogus<char16_t>(dst.data()), &dst_written, last);
281     return {result, src_read, dst_written};
282   }
283 
284   /**
285    * Checks for compatibility with storing Unicode scalar values as unsigned
286    * bytes taking into account the state of the decoder.
287    *
288    * Returns `std::nullopt` if the decoder is not in a neutral state, including
289    * waiting for the BOM, or if the encoding is never Latin1-byte-compatible.
290    *
291    * Otherwise returns the index of the first byte whose unsigned value doesn't
292    * directly correspond to the decoded Unicode scalar value, or the length
293    * of the input if all bytes in the input decode directly to scalar values
294    * corresponding to the unsigned byte values.
295    *
296    * Does not change the state of the decoder.
297    *
298    * Do not use this unless you are supporting SpiderMonkey/V8-style string
299    * storage optimizations.
300    */
latin1_byte_compatible_up_to(gsl::span<const uint8_t> buffer)301   inline std::optional<size_t> latin1_byte_compatible_up_to(
302       gsl::span<const uint8_t> buffer) const {
303     size_t val = decoder_latin1_byte_compatible_up_to(
304         this, null_to_bogus<const uint8_t>(buffer.data()),
305         static_cast<size_t>(buffer.size()));
306     if (val == SIZE_MAX) {
307       return std::nullopt;
308     }
309     return val;
310   }
311 
312  private:
313   /**
314    * Replaces `nullptr` with a bogus pointer suitable for use as part of a
315    * zero-length Rust slice.
316    */
317   template <class T>
null_to_bogus(T * ptr)318   static inline T* null_to_bogus(T* ptr) {
319     return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
320   }
321 
322   Decoder() = delete;
323   Decoder(const Decoder&) = delete;
324   Decoder& operator=(const Decoder&) = delete;
325 };
326 
327 /**
328  * A converter that encodes a Unicode stream into bytes according to a
329  * character encoding in a streaming (incremental) manner.
330  *
331  * The various `encode_*` methods take an input buffer (`src`) and an output
332  * buffer `dst` both of which are caller-allocated. There are variants for
333  * both UTF-8 and UTF-16 input buffers.
334  *
 * An `encode_*` method encodes characters from `src` into bytes stored into
 * `dst` until one of the following three things happens:
337  *
338  * 1. An unmappable character is encountered (`*_without_replacement` variants
339  *    only).
340  *
 * 2. The output buffer has been filled so near capacity that the encoder
342  *    cannot be sure that processing an additional character of input wouldn't
343  *    cause so much output that the output buffer would overflow.
344  *
345  * 3. All the input characters have been processed.
346  *
 * The `encode_*` method then returns a tuple of a status indicating which one
348  * of the three reasons to return happened, how many input code units (`uint8_t`
349  * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
350  * how many output bytes were written, and in the case of the variants that
351  * perform replacement, a boolean indicating whether an unmappable
352  * character was replaced with a numeric character reference during the call.
353  *
354  * The number of bytes "written" is what's logically written. Garbage may be
355  * written in the output buffer beyond the point logically written to.
356  *
 * In the case of the methods whose name ends with
 * `*_without_replacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `OUTPUT_FULL` and `INPUT_EMPTY` (corresponding
 * to the three cases listed above).
361  *
362  * In the case of methods whose name does not end with
363  * `*_without_replacement`, unmappable characters are automatically replaced
364  * with the corresponding numeric character references and unmappable
365  * characters do not cause the methods to return early.
366  *
367  * When encoding from UTF-8 without replacement, the methods are guaranteed
368  * not to return indicating that more output space is needed if the length
369  * of the output buffer is at least the length returned by
370  * `max_buffer_length_from_utf8_without_replacement()`. When encoding from
371  * UTF-8 with replacement, the length of the output buffer that guarantees the
372  * methods not to return indicating that more output space is needed in the
373  * absence of unmappable characters is given by
374  * `max_buffer_length_from_utf8_if_no_unmappables()`. When encoding from
375  * UTF-16 without replacement, the methods are guaranteed not to return
376  * indicating that more output space is needed if the length of the output
377  * buffer is at least the length returned by
378  * `max_buffer_length_from_utf16_without_replacement()`. When encoding
 * from UTF-16 with replacement, the length of the output buffer that
380  * guarantees the methods not to return indicating that more output space is
381  * needed in the absence of unmappable characters is given by
382  * `max_buffer_length_from_utf16_if_no_unmappables()`.
383  * When encoding with replacement, applications are not expected to size the
384  * buffer for the worst case ahead of time but to resize the buffer if there
385  * are unmappable characters. This is why max length queries are only available
386  * for the case where there are no unmappable characters.
387  *
388  * When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. When
389  * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
390  * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
391  * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
392  * surrogate pairs are not split across input buffer boundaries.
393  *
394  * After an `encode_*` call returns, the output produced so far, taken as a
395  * whole from the start of the stream, is guaranteed to consist of a valid
396  * byte sequence in the target encoding. (I.e. the code unit sequence for a
397  * character is guaranteed not to be split across output buffers. However, due
398  * to the stateful nature of ISO-2022-JP, the stream needs to be considered
399  * from the start for it to be valid. For other encodings, the validity holds
400  * on a per-output buffer basis.)
401  *
402  * The boolean argument `last` indicates that the end of the stream is reached
403  * when all the characters in `src` have been consumed. This argument is needed
404  * for ISO-2022-JP and is ignored for other encodings.
405  *
 * An `Encoder` object can be used to incrementally encode a Unicode stream.
407  *
408  * During the processing of a single stream, the caller must call `encode_*`
409  * zero or more times with `last` set to `false` and then call `encode_*` at
410  * least once with `last` set to `true`. If `encode_*` returns `INPUT_EMPTY`,
411  * the processing of the stream has ended. Otherwise, the caller must call
412  * `encode_*` again with `last` set to `true` (or treat an unmappable result,
413  * i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error).
414  *
415  * Once the stream has ended, the `Encoder` object must not be used anymore.
416  * That is, you need to create another one to process another stream.
417  *
418  * When the encoder returns `OUTPUT_FULL` or the encoder returns an unmappable
419  * result and the caller does not wish to treat it as a fatal error, the input
420  * buffer `src` may not have been completely consumed. In that case, the caller
421  * must pass the unconsumed contents of `src` to `encode_*` again upon the next
422  * call.
423  *
424  * # Infinite loops
425  *
426  * When converting with a fixed-size output buffer whose size is too small to
427  * accommodate one character of output, an infinite loop ensues. When
428  * converting with a fixed-size output buffer, it generally makes sense to
429  * make the buffer fairly large (e.g. couple of kilobytes).
430  */
431 class Encoder final {
432  public:
~Encoder()433   ~Encoder() {}
434 
delete(void * encoder)435   static inline void operator delete(void* encoder) {
436     encoder_free(reinterpret_cast<Encoder*>(encoder));
437   }
438 
439   /**
440    * The `Encoding` this `Encoder` is for.
441    */
encoding()442   inline gsl::not_null<const Encoding*> encoding() const {
443     return gsl::not_null<const Encoding*>(encoder_encoding(this));
444   }
445 
446   /**
447    * Returns `true` if this is an ISO-2022-JP encoder that's not in the
448    * ASCII state and `false` otherwise.
449    */
has_pending_state()450   inline bool has_pending_state() const {
451     return encoder_has_pending_state(this);
452   }
453 
454   /**
455    * Query the worst-case output size when encoding from UTF-8 with
456    * replacement.
457    *
458    * Returns the size of the output buffer in bytes that will not overflow
459    * given the current state of the encoder and `byte_length` number of
460    * additional input code units if there are no unmappable characters in
461    * the input or `SIZE_MAX` if `size_t` would overflow.
462    */
max_buffer_length_from_utf8_if_no_unmappables(size_t byte_length)463   inline std::optional<size_t> max_buffer_length_from_utf8_if_no_unmappables(
464       size_t byte_length) const {
465     size_t val = encoder_max_buffer_length_from_utf8_if_no_unmappables(
466         this, byte_length);
467     if (val == SIZE_MAX) {
468       return std::nullopt;
469     }
470     return val;
471   }
472 
473   /**
474    * Query the worst-case output size when encoding from UTF-8 without
475    * replacement.
476    *
477    * Returns the size of the output buffer in bytes that will not overflow
478    * given the current state of the encoder and `byte_length` number of
479    * additional input code units or `SIZE_MAX` if `size_t` would overflow.
480    */
max_buffer_length_from_utf8_without_replacement(size_t byte_length)481   inline std::optional<size_t> max_buffer_length_from_utf8_without_replacement(
482       size_t byte_length) const {
483     size_t val = encoder_max_buffer_length_from_utf8_without_replacement(
484         this, byte_length);
485     if (val == SIZE_MAX) {
486       return std::nullopt;
487     }
488     return val;
489   }
490 
491   /**
492    * Incrementally encode into byte stream from UTF-8 with unmappable
493    * characters replaced with HTML (decimal) numeric character references.
494    *
495    * See the documentation of the class for documentation for `encode_*`
496    * methods collectively.
497    */
encode_from_utf8(std::string_view src,gsl::span<uint8_t> dst,bool last)498   inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf8(
499       std::string_view src, gsl::span<uint8_t> dst, bool last) {
500     size_t src_read = src.size();
501     size_t dst_written = dst.size();
502     bool had_replacements;
503     uint32_t result = encoder_encode_from_utf8(
504         this,
505         null_to_bogus<const uint8_t>(
506             reinterpret_cast<const uint8_t*>(src.data())),
507         &src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last,
508         &had_replacements);
509     return {result, src_read, dst_written, had_replacements};
510   }
511 
512   /**
513    * Incrementally encode into byte stream from UTF-8 _without replacement_.
514    *
515    * See the documentation of the class for documentation for `encode_*`
516    * methods collectively.
517    */
518   inline std::tuple<uint32_t, size_t, size_t>
encode_from_utf8_without_replacement(std::string_view src,gsl::span<uint8_t> dst,bool last)519   encode_from_utf8_without_replacement(std::string_view src,
520                                        gsl::span<uint8_t> dst, bool last) {
521     size_t src_read = src.size();
522     size_t dst_written = dst.size();
523     uint32_t result = encoder_encode_from_utf8_without_replacement(
524         this,
525         null_to_bogus<const uint8_t>(
526             reinterpret_cast<const uint8_t*>(src.data())),
527         &src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
528     return {result, src_read, dst_written};
529   }
530 
531   /**
532    * Query the worst-case output size when encoding from UTF-16 with
533    * replacement.
534    *
535    * Returns the size of the output buffer in bytes that will not overflow
536    * given the current state of the encoder and `u16_length` number of
537    * additional input code units if there are no unmappable characters in
538    * the input or `SIZE_MAX` if `size_t` would overflow.
539    */
max_buffer_length_from_utf16_if_no_unmappables(size_t u16_length)540   inline std::optional<size_t> max_buffer_length_from_utf16_if_no_unmappables(
541       size_t u16_length) const {
542     size_t val = encoder_max_buffer_length_from_utf16_if_no_unmappables(
543         this, u16_length);
544     if (val == SIZE_MAX) {
545       return std::nullopt;
546     }
547     return val;
548   }
549 
550   /**
551    * Query the worst-case output size when encoding from UTF-16 without
552    * replacement.
553    *
554    * Returns the size of the output buffer in bytes that will not overflow
555    * given the current state of the encoder and `u16_length` number of
556    * additional input code units or `SIZE_MAX` if `size_t` would overflow.
557    */
max_buffer_length_from_utf16_without_replacement(size_t u16_length)558   inline std::optional<size_t> max_buffer_length_from_utf16_without_replacement(
559       size_t u16_length) const {
560     size_t val = encoder_max_buffer_length_from_utf16_without_replacement(
561         this, u16_length);
562     if (val == SIZE_MAX) {
563       return std::nullopt;
564     }
565     return val;
566   }
567 
568   /**
569    * Incrementally encode into byte stream from UTF-16 with unmappable
570    * characters replaced with HTML (decimal) numeric character references.
571    *
572    * See the documentation of the class for documentation for `encode_*`
573    * methods collectively.
574    */
encode_from_utf16(std::u16string_view src,gsl::span<uint8_t> dst,bool last)575   inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf16(
576       std::u16string_view src, gsl::span<uint8_t> dst, bool last) {
577     size_t src_read = src.size();
578     size_t dst_written = dst.size();
579     bool had_replacements;
580     uint32_t result = encoder_encode_from_utf16(
581         this, null_to_bogus<const char16_t>(src.data()), &src_read,
582         null_to_bogus<uint8_t>(dst.data()), &dst_written, last,
583         &had_replacements);
584     return {result, src_read, dst_written, had_replacements};
585   }
586 
587   /**
588    * Incrementally encode into byte stream from UTF-16 _without replacement_.
589    *
590    * See the documentation of the class for documentation for `encode_*`
591    * methods collectively.
592    */
593   inline std::tuple<uint32_t, size_t, size_t>
encode_from_utf16_without_replacement(std::u16string_view src,gsl::span<uint8_t> dst,bool last)594   encode_from_utf16_without_replacement(std::u16string_view src,
595                                         gsl::span<uint8_t> dst, bool last) {
596     size_t src_read = src.size();
597     size_t dst_written = dst.size();
598     uint32_t result = encoder_encode_from_utf16_without_replacement(
599         this, null_to_bogus<const char16_t>(src.data()), &src_read,
600         null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
601     return {result, src_read, dst_written};
602   }
603 
604  private:
605   /**
606    * Replaces `nullptr` with a bogus pointer suitable for use as part of a
607    * zero-length Rust slice.
608    */
609   template <class T>
null_to_bogus(T * ptr)610   static inline T* null_to_bogus(T* ptr) {
611     return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
612   }
613 
614   Encoder() = delete;
615   Encoder(const Encoder&) = delete;
616   Encoder& operator=(const Encoder&) = delete;
617 };
618 
619 /**
620  * An encoding as defined in the Encoding Standard
621  * (https://encoding.spec.whatwg.org/).
622  *
623  * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
624  * sequence and, in most cases, vice versa. Each encoding has a name, an output
625  * encoding, and one or more labels.
626  *
627  * _Labels_ are ASCII-case-insensitive strings that are used to identify an
628  * encoding in formats and protocols. The _name_ of the encoding is the
629  * preferred label in the case appropriate for returning from the
630  * `characterSet` property of the `Document` DOM interface, except for
631  * the replacement encoding whose name is not one of its labels.
632  *
633  * The _output encoding_ is the encoding used for form submission and URL
634  * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
635  * UTF-16LE and UTF-16BE encodings and the encoding itself for other
636  * encodings.
637  *
638  * # Streaming vs. Non-Streaming
639  *
640  * When you have the entire input in a single buffer, you can use the
641  * methods `decode()`, `decode_with_bom_removal()`,
642  * `decode_without_bom_handling()`,
643  * `decode_without_bom_handling_and_without_replacement()` and
644  * `encode()`. Unlike the rest of the API, these methods perform heap
 * allocations. You should use the `Decoder` and `Encoder` objects when your
 * input is split into multiple buffers or when you want to control the
 * allocation of the output buffers.
648  *
649  * # Instances
650  *
651  * All instances of `Encoding` are statically allocated and have the process's
652  * lifetime. There is precisely one unique `Encoding` instance for each
653  * encoding defined in the Encoding Standard.
654  *
655  * To obtain a reference to a particular encoding whose identity you know at
656  * compile time, use a `static` that refers to encoding. There is a `static`
657  * for each encoding. The `static`s are named in all caps with hyphens
658  * replaced with underscores and with `_ENCODING` appended to the
659  * name. For example, if you know at compile time that you will want to
660  * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
661  *
662  * If you don't know what encoding you need at compile time and need to
663  * dynamically get an encoding by label, use `Encoding::for_label()`.
664  *
665  * Instances of `Encoding` can be compared with `==`.
666  */
667 class Encoding final {
668  public:
669   /**
670    * Implements the _get an encoding_ algorithm
671    * (https://encoding.spec.whatwg.org/#concept-encoding-get).
672    *
673    * If, after ASCII-lowercasing and removing leading and trailing
674    * whitespace, the argument matches a label defined in the Encoding
675    * Standard, `const Encoding*` representing the corresponding
676    * encoding is returned. If there is no match, `nullptr` is returned.
677    *
678    * This is the right method to use if the action upon the method returning
679    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
680    * instead. When the action upon the method returning `nullptr` is not to
681    * proceed with a fallback but to refuse processing,
682    * `for_label_no_replacement()` is more appropriate.
683    */
for_label(gsl::cstring_span<> label)684   static inline const Encoding* for_label(gsl::cstring_span<> label) {
685     return encoding_for_label(
686         null_to_bogus<const uint8_t>(
687             reinterpret_cast<const uint8_t*>(label.data())),
688         label.length());
689   }
690 
691   /**
692    * This method behaves the same as `for_label()`, except when `for_label()`
693    * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
694    *
695    * This method is useful in scenarios where a fatal error is required
696    * upon invalid label, because in those cases the caller typically wishes
697    * to treat the labels that map to the replacement encoding as fatal
698    * errors, too.
699    *
700    * It is not OK to use this method when the action upon the method returning
701    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
702    * such a case, the `for_label()` method should be used instead in order to
   * avoid unsafe fallback for labels that `for_label()` maps to
705    * `REPLACEMENT_ENCODING`.
706    */
for_label_no_replacement(gsl::cstring_span<> label)707   static inline const Encoding* for_label_no_replacement(
708       gsl::cstring_span<> label) {
709     return encoding_for_label_no_replacement(
710         null_to_bogus<const uint8_t>(
711             reinterpret_cast<const uint8_t*>(label.data())),
712         label.length());
713   }
714 
715   /**
716    * Performs non-incremental BOM sniffing.
717    *
718    * The argument must either be a buffer representing the entire input
719    * stream (non-streaming case) or a buffer representing at least the first
720    * three bytes of the input stream (streaming case).
721    *
   * Returns a `std::optional` wrapping `make_tuple(UTF_8_ENCODING, 3)`,
   * `make_tuple(UTF_16LE_ENCODING, 2)` or `make_tuple(UTF_16BE_ENCODING, 2)`
   * if the argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or
   * `std::nullopt` otherwise.
726    */
727   static inline std::optional<
728       std::tuple<gsl::not_null<const Encoding*>, size_t>>
for_bom(gsl::span<const uint8_t> buffer)729   for_bom(gsl::span<const uint8_t> buffer) {
730     size_t len = buffer.size();
731     const Encoding* encoding =
732         encoding_for_bom(null_to_bogus(buffer.data()), &len);
733     if (encoding) {
734       return std::make_tuple(gsl::not_null<const Encoding*>(encoding), len);
735     }
736     return std::nullopt;
737   }
738 
739   /**
740    * Returns the name of this encoding.
741    *
742    * This name is appropriate to return as-is from the DOM
743    * `document.characterSet` property.
744    */
name()745   inline std::string name() const {
746     std::string name(ENCODING_NAME_MAX_LENGTH, '\0');
747     // http://herbsutter.com/2008/04/07/cringe-not-vectors-are-guaranteed-to-be-contiguous/#comment-483
748     size_t length = encoding_name(this, reinterpret_cast<uint8_t*>(&name[0]));
749     name.resize(length);
750     return name;
751   }
752 
753   /**
754    * Checks whether the _output encoding_ of this encoding can encode every
755    * Unicode code point. (Only true if the output encoding is UTF-8.)
756    */
can_encode_everything()757   inline bool can_encode_everything() const {
758     return encoding_can_encode_everything(this);
759   }
760 
761   /**
762    * Checks whether the bytes 0x00...0x7F map exclusively to the characters
763    * U+0000...U+007F and vice versa.
764    */
is_ascii_compatible()765   inline bool is_ascii_compatible() const {
766     return encoding_is_ascii_compatible(this);
767   }
768 
769   /**
770    * Checks whether this encoding maps one byte to one Basic Multilingual
771    * Plane code point (i.e. byte length equals decoded UTF-16 length) and
772    * vice versa (for mappable characters).
773    *
774    * `true` iff this encoding is on the list of Legacy single-byte
775    * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
776    * in the spec or x-user-defined.
777    */
is_single_byte()778   inline bool is_single_byte() const { return encoding_is_single_byte(this); }
779 
780   /**
781    * Returns the _output encoding_ of this encoding. This is UTF-8 for
782    * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
783    */
output_encoding()784   inline gsl::not_null<const Encoding*> output_encoding() const {
785     return gsl::not_null<const Encoding*>(encoding_output_encoding(this));
786   }
787 
788   /**
789    * Decode complete input to `std::string` _with BOM sniffing_ and with
790    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
791    * entire input is available as a single buffer (i.e. the end of the
792    * buffer marks the end of the stream).
793    *
794    * This method implements the (non-streaming version of) the
795    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
796    *
797    * The second item in the returned tuple is the encoding that was actually
798    * used (which may differ from this encoding thanks to BOM sniffing).
799    *
800    * The third item in the returned tuple indicates whether there were
801    * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
802    *
803    * _Note:_ It is wrong to use this when the input buffer represents only
804    * a segment of the input instead of the whole input. Use `new_decoder()`
805    * when decoding segmented input.
806    */
decode(gsl::span<const uint8_t> bytes)807   inline std::tuple<std::string, gsl::not_null<const Encoding*>, bool> decode(
808       gsl::span<const uint8_t> bytes) const {
809     auto opt = Encoding::for_bom(bytes);
810     const Encoding* encoding;
811     if (opt) {
812       size_t bom_length;
813       std::tie(encoding, bom_length) = *opt;
814       bytes = bytes.subspan(bom_length);
815     } else {
816       encoding = this;
817     }
818     auto [str, had_errors] = encoding->decode_without_bom_handling(bytes);
819     return {str, gsl::not_null<const Encoding*>(encoding), had_errors};
820   }
821 
822   /**
823    * Decode complete input to `std::string` _with BOM removal_ and with
824    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
825    * entire input is available as a single buffer (i.e. the end of the
826    * buffer marks the end of the stream).
827    *
828    * When invoked on `UTF_8`, this method implements the (non-streaming
829    * version of) the _UTF-8 decode_
830    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
831    *
832    * The second item in the returned pair indicates whether there were
833    * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
834    *
835    * _Note:_ It is wrong to use this when the input buffer represents only
836    * a segment of the input instead of the whole input. Use
837    * `new_decoder_with_bom_removal()` when decoding segmented input.
838    */
decode_with_bom_removal(gsl::span<const uint8_t> bytes)839   inline std::tuple<std::string, bool> decode_with_bom_removal(
840       gsl::span<const uint8_t> bytes) const {
841     if (this == UTF_8_ENCODING && bytes.size() >= 3 &&
842         (gsl::as_bytes(bytes.first<3>()) ==
843          gsl::as_bytes(gsl::make_span("\xEF\xBB\xBF")))) {
844       bytes = bytes.subspan(3, bytes.size() - 3);
845     } else if (this == UTF_16LE_ENCODING && bytes.size() >= 2 &&
846                (gsl::as_bytes(bytes.first<2>()) ==
847                 gsl::as_bytes(gsl::make_span("\xFF\xFE")))) {
848       bytes = bytes.subspan(2, bytes.size() - 2);
849     } else if (this == UTF_16BE_ENCODING && bytes.size() >= 2 &&
850                (gsl::as_bytes(bytes.first<2>()) ==
851                 gsl::as_bytes(gsl::make_span("\xFE\xFF")))) {
852       bytes = bytes.subspan(2, bytes.size() - 2);
853     }
854     return decode_without_bom_handling(bytes);
855   }
856 
857   /**
858    * Decode complete input to `std::string` _without BOM handling_ and
859    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
860    * the entire input is available as a single buffer (i.e. the end of the
861    * buffer marks the end of the stream).
862    *
863    * When invoked on `UTF_8`, this method implements the (non-streaming
864    * version of) the _UTF-8 decode without BOM_
865    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
866    *
867    * The second item in the returned pair indicates whether there were
868    * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
869    *
870    * _Note:_ It is wrong to use this when the input buffer represents only
871    * a segment of the input instead of the whole input. Use
872    * `new_decoder_without_bom_handling()` when decoding segmented input.
873    */
decode_without_bom_handling(gsl::span<const uint8_t> bytes)874   inline std::tuple<std::string, bool> decode_without_bom_handling(
875       gsl::span<const uint8_t> bytes) const {
876     auto decoder = new_decoder_without_bom_handling();
877     auto needed = decoder->max_utf8_buffer_length(bytes.size());
878     if (!needed) {
879       throw std::overflow_error("Overflow in buffer size computation.");
880     }
881     std::string string(needed.value(), '\0');
882     const auto [result, read, written, had_errors] = decoder->decode_to_utf8(
883         bytes,
884         gsl::make_span(reinterpret_cast<uint8_t*>(&string[0]), string.size()),
885         true);
886     assert(read == static_cast<size_t>(bytes.size()));
887     assert(written <= static_cast<size_t>(string.size()));
888     assert(result == INPUT_EMPTY);
889     string.resize(written);
890     return {string, had_errors};
891   }
892 
893   /**
894    * Decode complete input to `std::string` _without BOM handling_ and
895    * _with malformed sequences treated as fatal_ when the entire input is
896    * available as a single buffer (i.e. the end of the buffer marks the end
897    * of the stream).
898    *
899    * When invoked on `UTF_8`, this method implements the (non-streaming
900    * version of) the _UTF-8 decode without BOM or fail_
901    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
902    * spec concept.
903    *
904    * Returns `std::nullopt` if a malformed sequence was encountered and the result
905    * of the decode as `std::optional<std::string>` otherwise.
906    *
907    * _Note:_ It is wrong to use this when the input buffer represents only
908    * a segment of the input instead of the whole input. Use
909    * `new_decoder_without_bom_handling()` when decoding segmented input.
910    */
911   inline std::optional<std::string>
decode_without_bom_handling_and_without_replacement(gsl::span<const uint8_t> bytes)912   decode_without_bom_handling_and_without_replacement(
913       gsl::span<const uint8_t> bytes) const {
914     auto decoder = new_decoder_without_bom_handling();
915     auto needed =
916         decoder->max_utf8_buffer_length_without_replacement(bytes.size());
917     if (!needed) {
918       throw std::overflow_error("Overflow in buffer size computation.");
919     }
920     std::string string(needed.value(), '\0');
921     const auto [result, read, written] =
922         decoder->decode_to_utf8_without_replacement(
923             bytes,
924             gsl::make_span(reinterpret_cast<uint8_t*>(&string[0]),
925                            string.size()),
926             true);
927     assert(result != OUTPUT_FULL);
928     if (result == INPUT_EMPTY) {
929       assert(read == static_cast<size_t>(bytes.size()));
930       assert(written <= static_cast<size_t>(string.size()));
931       string.resize(written);
932       return string;
933     }
934     return std::nullopt;
935   }
936 
937   /**
938    * Decode complete input to `std::u16string` _with BOM sniffing_ and with
939    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
940    * entire input is available as a single buffer (i.e. the end of the
941    * buffer marks the end of the stream).
942    *
943    * This method implements the (non-streaming version of) the
944    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
945    *
946    * The second item in the returned tuple is the encoding that was actually
947    * used (which may differ from this encoding thanks to BOM sniffing).
948    *
949    * The third item in the returned tuple indicates whether there were
950    * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
951    *
952    * _Note:_ It is wrong to use this when the input buffer represents only
953    * a segment of the input instead of the whole input. Use `new_decoder()`
954    * when decoding segmented input.
955    */
956   inline std::tuple<std::u16string, gsl::not_null<const Encoding*>, bool>
decode16(gsl::span<const uint8_t> bytes)957   decode16(gsl::span<const uint8_t> bytes) const {
958     auto opt = Encoding::for_bom(bytes);
959     const Encoding* encoding;
960     if (opt) {
961       size_t bom_length;
962       std::tie(encoding, bom_length) = *opt;
963       bytes = bytes.subspan(bom_length);
964     } else {
965       encoding = this;
966     }
967     auto [str, had_errors] = encoding->decode16_without_bom_handling(bytes);
968     return {str, gsl::not_null<const Encoding*>(encoding), had_errors};
969   }
970 
971   /**
972    * Decode complete input to `std::u16string` _with BOM removal_ and with
973    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
974    * entire input is available as a single buffer (i.e. the end of the
975    * buffer marks the end of the stream).
976    *
977    * When invoked on `UTF_8`, this method implements the (non-streaming
978    * version of) the _UTF-8 decode_
979    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
980    *
981    * The second item in the returned pair indicates whether there were
982    * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
983    *
984    * _Note:_ It is wrong to use this when the input buffer represents only
985    * a segment of the input instead of the whole input. Use
986    * `new_decoder_with_bom_removal()` when decoding segmented input.
987    */
decode16_with_bom_removal(gsl::span<const uint8_t> bytes)988   inline std::tuple<std::u16string, bool> decode16_with_bom_removal(
989       gsl::span<const uint8_t> bytes) const {
990     if (this == UTF_8_ENCODING && bytes.size() >= 3 &&
991         (gsl::as_bytes(bytes.first<3>()) ==
992          gsl::as_bytes(gsl::make_span("\xEF\xBB\xBF")))) {
993       bytes = bytes.subspan(3, bytes.size() - 3);
994     } else if (this == UTF_16LE_ENCODING && bytes.size() >= 2 &&
995                (gsl::as_bytes(bytes.first<2>()) ==
996                 gsl::as_bytes(gsl::make_span("\xFF\xFE")))) {
997       bytes = bytes.subspan(2, bytes.size() - 2);
998     } else if (this == UTF_16BE_ENCODING && bytes.size() >= 2 &&
999                (gsl::as_bytes(bytes.first<2>()) ==
1000                 gsl::as_bytes(gsl::make_span("\xFE\xFF")))) {
1001       bytes = bytes.subspan(2, bytes.size() - 2);
1002     }
1003     return decode16_without_bom_handling(bytes);
1004   }
1005 
1006   /**
1007    * Decode complete input to `std::u16string` _without BOM handling_ and
1008    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
1009    * the entire input is available as a single buffer (i.e. the end of the
1010    * buffer marks the end of the stream).
1011    *
1012    * When invoked on `UTF_8`, this method implements the (non-streaming
1013    * version of) the _UTF-8 decode without BOM_
1014    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
1015    *
1016    * The second item in the returned pair indicates whether there were
1017    * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
1018    *
1019    * _Note:_ It is wrong to use this when the input buffer represents only
1020    * a segment of the input instead of the whole input. Use
1021    * `new_decoder_without_bom_handling()` when decoding segmented input.
1022    */
decode16_without_bom_handling(gsl::span<const uint8_t> bytes)1023   inline std::tuple<std::u16string, bool> decode16_without_bom_handling(
1024       gsl::span<const uint8_t> bytes) const {
1025     auto decoder = new_decoder_without_bom_handling();
1026     auto needed = decoder->max_utf16_buffer_length(bytes.size());
1027     if (!needed) {
1028       throw std::overflow_error("Overflow in buffer size computation.");
1029     }
1030     std::u16string string(needed.value(), '\0');
1031     const auto [result, read, written, had_errors] = decoder->decode_to_utf16(
1032         bytes, gsl::make_span(&string[0], string.size()), true);
1033     assert(read == static_cast<size_t>(bytes.size()));
1034     assert(written <= static_cast<size_t>(string.size()));
1035     assert(result == INPUT_EMPTY);
1036     string.resize(written);
1037     return {string, had_errors};
1038   }
1039 
1040   /**
1041    * Decode complete input to `std::u16string` _without BOM handling_ and
1042    * _with malformed sequences treated as fatal_ when the entire input is
1043    * available as a single buffer (i.e. the end of the buffer marks the end
1044    * of the stream).
1045    *
1046    * When invoked on `UTF_8`, this method implements the (non-streaming
1047    * version of) the _UTF-8 decode without BOM or fail_
1048    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
1049    * spec concept.
1050    *
1051    * Returns `std::nullopt` if a malformed sequence was encountered and the result
1052    * of the decode as `std::optional<std::u16string>` otherwise.
1053    *
1054    * _Note:_ It is wrong to use this when the input buffer represents only
1055    * a segment of the input instead of the whole input. Use
1056    * `new_decoder_without_bom_handling()` when decoding segmented input.
1057    */
1058   inline std::optional<std::u16string>
decode16_without_bom_handling_and_without_replacement(gsl::span<const uint8_t> bytes)1059   decode16_without_bom_handling_and_without_replacement(
1060       gsl::span<const uint8_t> bytes) const {
1061     auto decoder = new_decoder_without_bom_handling();
1062     auto needed = decoder->max_utf16_buffer_length(bytes.size());
1063     if (!needed) {
1064       throw std::overflow_error("Overflow in buffer size computation.");
1065     }
1066     std::u16string string(needed.value(), '\0');
1067     const auto [result, read, written] =
1068         decoder->decode_to_utf16_without_replacement(
1069             bytes, gsl::make_span(&string[0], string.size()), true);
1070     assert(result != OUTPUT_FULL);
1071     if (result == INPUT_EMPTY) {
1072       assert(read == static_cast<size_t>(bytes.size()));
1073       assert(written <= static_cast<size_t>(string.size()));
1074       string.resize(written);
1075       return string;
1076     }
1077     return std::nullopt;
1078   }
1079 
1080   /**
1081    * Encode complete input to `std::vector<uint8_t>` with unmappable characters
1082    * replaced with decimal numeric character references when the entire input
1083    * is available as a single buffer (i.e. the end of the buffer marks the
1084    * end of the stream).
1085    *
1086    * This method implements the (non-streaming version of) the
1087    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
1088    *
1089    * The second item in the returned tuple is the encoding that was actually
1090    * used (which may differ from this encoding thanks to some encodings
1091    * having UTF-8 as their output encoding).
1092    *
1093    * The third item in the returned tuple indicates whether there were
1094    * unmappable characters (that were replaced with HTML numeric character
1095    * references).
1096    *
1097    * _Note:_ It is wrong to use this when the input buffer represents only
1098    * a segment of the input instead of the whole input. Use `new_encoder()`
1099    * when encoding segmented output.
1100    */
1101   inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool>
encode(std::string_view string)1102   encode(std::string_view string) const {
1103     auto output_enc = output_encoding();
1104     if (output_enc == UTF_8_ENCODING) {
1105       std::vector<uint8_t> vec(string.size());
1106       std::memcpy(&vec[0], string.data(), string.size());
1107     }
1108     auto encoder = output_enc->new_encoder();
1109     auto needed =
1110         encoder->max_buffer_length_from_utf8_if_no_unmappables(string.size());
1111     if (!needed) {
1112       throw std::overflow_error("Overflow in buffer size computation.");
1113     }
1114     std::vector<uint8_t> vec(needed.value());
1115     bool total_had_errors = false;
1116     size_t total_read = 0;
1117     size_t total_written = 0;
1118     for (;;) {
1119       const auto [result, read, written, had_errors] =
1120           encoder->encode_from_utf8(string.substr(total_read),
1121                                     gsl::make_span(vec).subspan(total_written),
1122                                     true);
1123       total_read += read;
1124       total_written += written;
1125       total_had_errors |= had_errors;
1126       if (result == INPUT_EMPTY) {
1127         assert(total_read == static_cast<size_t>(string.size()));
1128         assert(total_written <= static_cast<size_t>(vec.size()));
1129         vec.resize(total_written);
1130         return {vec, gsl::not_null<const Encoding*>(output_enc),
1131                 total_had_errors};
1132       }
1133       auto needed = encoder->max_buffer_length_from_utf8_if_no_unmappables(
1134           string.size() - total_read);
1135       if (!needed) {
1136         throw std::overflow_error("Overflow in buffer size computation.");
1137       }
1138       vec.resize(total_written + needed.value());
1139     }
1140   }
1141 
1142   /**
1143    * Encode complete input to `std::vector<uint8_t>` with unmappable characters
1144    * replaced with decimal numeric character references when the entire input
1145    * is available as a single buffer (i.e. the end of the buffer marks the
1146    * end of the stream).
1147    *
1148    * This method implements the (non-streaming version of) the
1149    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
1150    *
1151    * The second item in the returned tuple is the encoding that was actually
1152    * used (which may differ from this encoding thanks to some encodings
1153    * having UTF-8 as their output encoding).
1154    *
1155    * The third item in the returned tuple indicates whether there were
1156    * unmappable characters (that were replaced with HTML numeric character
1157    * references).
1158    *
1159    * _Note:_ It is wrong to use this when the input buffer represents only
1160    * a segment of the input instead of the whole input. Use `new_encoder()`
1161    * when encoding segmented output.
1162    */
1163   inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool>
encode(std::u16string_view string)1164   encode(std::u16string_view string) const {
1165     auto output_enc = output_encoding();
1166     auto encoder = output_enc->new_encoder();
1167     auto needed =
1168         encoder->max_buffer_length_from_utf16_if_no_unmappables(string.size());
1169     if (!needed) {
1170       throw std::overflow_error("Overflow in buffer size computation.");
1171     }
1172     std::vector<uint8_t> vec(needed.value());
1173     bool total_had_errors = false;
1174     size_t total_read = 0;
1175     size_t total_written = 0;
1176     for (;;) {
1177       const auto [result, read, written, had_errors] =
1178           encoder->encode_from_utf16(string.substr(total_read),
1179                                      gsl::make_span(vec).subspan(total_written),
1180                                      true);
1181       total_read += read;
1182       total_written += written;
1183       total_had_errors |= had_errors;
1184       if (result == INPUT_EMPTY) {
1185         assert(total_read == static_cast<size_t>(string.size()));
1186         assert(total_written <= static_cast<size_t>(vec.size()));
1187         vec.resize(total_written);
1188         return {vec, gsl::not_null<const Encoding*>(output_enc),
1189                 total_had_errors};
1190       }
1191       auto needed = encoder->max_buffer_length_from_utf16_if_no_unmappables(
1192           string.size() - total_read);
1193       if (!needed) {
1194         throw std::overflow_error("Overflow in buffer size computation.");
1195       }
1196       vec.resize(total_written + needed.value());
1197     }
1198   }
1199 
1200   /**
1201    * Instantiates a new decoder for this encoding with BOM sniffing enabled.
1202    *
1203    * BOM sniffing may cause the returned decoder to morph into a decoder
1204    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
1205    */
new_decoder()1206   inline std::unique_ptr<Decoder> new_decoder() const {
1207     return std::unique_ptr<Decoder>(encoding_new_decoder(this));
1208   }
1209 
1210   /**
1211    * Instantiates a new decoder for this encoding with BOM sniffing enabled
1212    * into memory occupied by a previously-instantiated decoder.
1213    *
1214    * BOM sniffing may cause the returned decoder to morph into a decoder
1215    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
1216    */
new_decoder_into(Decoder & decoder)1217   inline void new_decoder_into(Decoder& decoder) const {
1218     encoding_new_decoder_into(this, &decoder);
1219   }
1220 
1221   /**
1222    * Instantiates a new decoder for this encoding with BOM removal.
1223    *
1224    * If the input starts with bytes that are the BOM for this encoding,
1225    * those bytes are removed. However, the decoder never morphs into a
1226    * decoder for another encoding: A BOM for another encoding is treated as
1227    * (potentially malformed) input to the decoding algorithm for this
1228    * encoding.
1229    */
new_decoder_with_bom_removal()1230   inline std::unique_ptr<Decoder> new_decoder_with_bom_removal() const {
1231     return std::unique_ptr<Decoder>(
1232         encoding_new_decoder_with_bom_removal(this));
1233   }
1234 
1235   /**
1236    * Instantiates a new decoder for this encoding with BOM removal
1237    * into memory occupied by a previously-instantiated decoder.
1238    *
1239    * If the input starts with bytes that are the BOM for this encoding,
1240    * those bytes are removed. However, the decoder never morphs into a
1241    * decoder for another encoding: A BOM for another encoding is treated as
1242    * (potentially malformed) input to the decoding algorithm for this
1243    * encoding.
1244    */
new_decoder_with_bom_removal_into(Decoder & decoder)1245   inline void new_decoder_with_bom_removal_into(Decoder& decoder) const {
1246     encoding_new_decoder_with_bom_removal_into(this, &decoder);
1247   }
1248 
1249   /**
1250    * Instantiates a new decoder for this encoding with BOM handling disabled.
1251    *
1252    * If the input starts with bytes that look like a BOM, those bytes are
1253    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
1254    * for another encoding.)
1255    *
1256    * _Note:_ If the caller has performed BOM sniffing on its own but has not
1257    * removed the BOM, the caller should use `new_decoder_with_bom_removal()`
1258    * instead of this method to cause the BOM to be removed.
1259    */
new_decoder_without_bom_handling()1260   inline std::unique_ptr<Decoder> new_decoder_without_bom_handling() const {
1261     return std::unique_ptr<Decoder>(
1262         encoding_new_decoder_without_bom_handling(this));
1263   }
1264 
1265   /**
1266    * Instantiates a new decoder for this encoding with BOM handling disabled
1267    * into memory occupied by a previously-instantiated decoder.
1268    *
1269    * If the input starts with bytes that look like a BOM, those bytes are
1270    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
1271    * for another encoding.)
1272    *
1273    * _Note:_ If the caller has performed BOM sniffing on its own but has not
1274    * removed the BOM, the caller should use
1275    * `new_decoder_with_bom_removal_into()`
1276    * instead of this method to cause the BOM to be removed.
1277    */
new_decoder_without_bom_handling_into(Decoder & decoder)1278   inline void new_decoder_without_bom_handling_into(Decoder& decoder) const {
1279     encoding_new_decoder_without_bom_handling_into(this, &decoder);
1280   }
1281 
1282   /**
1283    * Instantiates a new encoder for the output encoding of this encoding.
1284    */
new_encoder()1285   inline std::unique_ptr<Encoder> new_encoder() const {
1286     return std::unique_ptr<Encoder>(encoding_new_encoder(this));
1287   }
1288 
1289   /**
1290    * Instantiates a new encoder for the output encoding of this encoding
1291    * into memory occupied by a previously-instantiated encoder.
1292    */
new_encoder_into(Encoder & encoder)1293   inline void new_encoder_into(Encoder& encoder) const {
1294     encoding_new_encoder_into(this, &encoder);
1295   }
1296 
1297   /**
1298    * Validates UTF-8.
1299    *
1300    * Returns the index of the first byte that makes the input malformed as
1301    * UTF-8 or the length of the input if the input is entirely valid.
1302    */
utf8_valid_up_to(gsl::span<const uint8_t> buffer)1303   static inline size_t utf8_valid_up_to(gsl::span<const uint8_t> buffer) {
1304     return encoding_utf8_valid_up_to(
1305         null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
1306   }
1307 
1308   /**
1309    * Validates ASCII.
1310    *
1311    * Returns the index of the first byte that makes the input malformed as
1312    * ASCII or the length of the input if the input is entirely valid.
1313    */
ascii_valid_up_to(gsl::span<const uint8_t> buffer)1314   static inline size_t ascii_valid_up_to(gsl::span<const uint8_t> buffer) {
1315     return encoding_ascii_valid_up_to(
1316         null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
1317   }
1318 
1319   /**
1320    * Validates ISO-2022-JP ASCII-state data.
1321    *
1322    * Returns the index of the first byte that makes the input not
1323    * representable in the ASCII state of ISO-2022-JP or the length of the
1324    * input if the input is entirely representable in the ASCII state of
1325    * ISO-2022-JP.
1326    */
iso_2022_jp_ascii_valid_up_to(gsl::span<const uint8_t> buffer)1327   static inline size_t iso_2022_jp_ascii_valid_up_to(
1328       gsl::span<const uint8_t> buffer) {
1329     return encoding_iso_2022_jp_ascii_valid_up_to(
1330         null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
1331   }
1332 
1333  private:
1334   /**
1335    * Replaces `nullptr` with a bogus pointer suitable for use as part of a
1336    * zero-length Rust slice.
1337    */
template <class T>
static inline T* null_to_bogus(T* ptr) {
  if (ptr) {
    // Real pointers pass through untouched.
    return ptr;
  }
  // Use alignof(T) as the address of the stand-in: it is never zero and is
  // by construction a multiple of T's alignment.
  return reinterpret_cast<T*>(alignof(T));
}
1342 
1343   Encoding() = delete;
1344   Encoding(const Encoding&) = delete;
1345   Encoding& operator=(const Encoding&) = delete;
1346   ~Encoding() = delete;
1347 };
1348 
1349 };  // namespace encoding_rs
1350 
1351 #endif  // encoding_rs_cpp_h_
1352