1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11 // "top-level directory" in the above notice refers to
12 // third_party/rust/encoding_c/.
13 
14 #ifndef mozilla_Encoding_h
15 #define mozilla_Encoding_h
16 
17 #include "mozilla/CheckedInt.h"
18 #include "mozilla/Maybe.h"
19 #include "mozilla/NotNull.h"
20 #include "mozilla/Span.h"
21 #include "mozilla/Tuple.h"
22 #include "nsString.h"
23 
// Forward declarations of the opaque types that are substituted into
// encoding_rs.h through the ENCODING_RS_* macros defined right below.
namespace mozilla {
class Encoding;
class Decoder;
class Encoder;
}  // namespace mozilla
29 
30 #define ENCODING_RS_ENCODING mozilla::Encoding
31 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
32   mozilla::NotNull<const mozilla::Encoding*>
33 #define ENCODING_RS_ENCODER mozilla::Encoder
34 #define ENCODING_RS_DECODER mozilla::Decoder
35 
36 #include "encoding_rs.h"
37 
38 extern "C" {
39 
40 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
41                                              uint8_t const* src, size_t src_len,
42                                              nsAString* dst);
43 
44 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
45     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
46     nsAString* dst);
47 
48 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
49     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
50     nsAString* dst);
51 
52 nsresult
53 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
54     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
55     nsAString* dst);
56 
57 nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
58                                             char16_t const* src, size_t src_len,
59                                             nsACString* dst);
60 
61 nsresult mozilla_encoding_decode_to_nscstring(
62     mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
63 
64 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
65     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
66 
67 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
68     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
69 
70 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
71     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
72     nsACString* dst, size_t already_validated);
73 
74 nsresult
75 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
76     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
77 
78 nsresult mozilla_encoding_encode_from_nscstring(
79     mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
80 
81 }  // extern "C"
82 
83 namespace mozilla {
84 
85 /**
86  * Return value from `Decoder`/`Encoder` to indicate that input
87  * was exhausted.
88  */
// C++-named alias of INPUT_EMPTY (presumably supplied by encoding_rs.h, which
// is included above -- confirm).
const uint32_t kInputEmpty = INPUT_EMPTY;

/**
 * Return value from `Decoder`/`Encoder` to indicate that output
 * space was insufficient.
 */
// C++-named alias of OUTPUT_FULL (presumably supplied by encoding_rs.h, which
// is included above -- confirm).
const uint32_t kOutputFull = OUTPUT_FULL;
96 
97 /**
98  * An encoding as defined in the Encoding Standard
99  * (https://encoding.spec.whatwg.org/).
100  *
101  * See https://docs.rs/encoding_rs/ for the Rust API docs.
102  *
103  * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
104  * sequence and, in most cases, vice versa. Each encoding has a name, an output
105  * encoding, and one or more labels.
106  *
107  * _Labels_ are ASCII-case-insensitive strings that are used to identify an
108  * encoding in formats and protocols. The _name_ of the encoding is the
109  * preferred label in the case appropriate for returning from the
110  * `characterSet` property of the `Document` DOM interface, except for
111  * the replacement encoding whose name is not one of its labels.
112  *
113  * The _output encoding_ is the encoding used for form submission and URL
114  * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
115  * UTF-16LE and UTF-16BE encodings and the encoding itself for other
116  * encodings.
117  *
118  * # Streaming vs. Non-Streaming
119  *
120  * When you have the entire input in a single buffer, you can use the
121  * methods `Decode()`, `DecodeWithBOMRemoval()`,
122  * `DecodeWithoutBOMHandling()`,
123  * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
 * `NewEncoder()` methods), these methods perform heap allocations. You should
 * use the `Decoder` and `Encoder` objects when your input is split into
 * multiple buffers or when you want to control the allocation of the output
 * buffers.
128  *
129  * # Instances
130  *
131  * All instances of `Encoding` are statically allocated and have the process's
132  * lifetime. There is precisely one unique `Encoding` instance for each
133  * encoding defined in the Encoding Standard.
134  *
 * To obtain a reference to a particular encoding whose identity you know at
 * compile time, use a `static` that refers to the encoding. There is a
 * `static` for each encoding. The `static`s are named in all caps with
 * hyphens replaced with underscores and with `_ENCODING` appended to the
 * name. For example, if you know at compile time that you will want to
 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
141  *
142  * If you don't know what encoding you need at compile time and need to
 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
144  *
145  * Pointers to `Encoding` can be compared with `==` to check for the sameness
146  * of two encodings.
147  *
148  * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
149  * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
150  * `const mozilla::Encoding*` in the C signature and
 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
152  */
153 class Encoding final {
154  public:
155   /**
156    * Implements the _get an encoding_ algorithm
157    * (https://encoding.spec.whatwg.org/#concept-encoding-get).
158    *
159    * If, after ASCII-lowercasing and removing leading and trailing
160    * whitespace, the argument matches a label defined in the Encoding
161    * Standard, `const Encoding*` representing the corresponding
162    * encoding is returned. If there is no match, `nullptr` is returned.
163    *
164    * This is the right method to use if the action upon the method returning
165    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
166    * instead. When the action upon the method returning `nullptr` is not to
167    * proceed with a fallback but to refuse processing,
168    * `ForLabelNoReplacement()` is more appropriate.
169    */
ForLabel(Span<const char> aLabel)170   static inline const Encoding* ForLabel(Span<const char> aLabel) {
171     return encoding_for_label(
172         reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
173   }
174 
175   /**
176    * `nsAString` argument version. See above for docs.
177    */
  static inline const Encoding* ForLabel(const nsAString& aLabel) {
    // Convert the UTF-16 label to UTF-8 and delegate to the
    // `Span<const char>` overload. The converted string is a temporary that
    // lives until the end of the full expression, i.e. across the lookup.
    return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
  }
181 
182   /**
183    * This method behaves the same as `ForLabel()`, except when `ForLabel()`
184    * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
185    *
186    * This method is useful in scenarios where a fatal error is required
187    * upon invalid label, because in those cases the caller typically wishes
188    * to treat the labels that map to the replacement encoding as fatal
189    * errors, too.
190    *
191    * It is not OK to use this method when the action upon the method returning
192    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
193    * such a case, the `ForLabel()` method should be used instead in order to
194    * avoid unsafe fallback for labels that `ForLabel()` maps to
195    * `REPLACEMENT_ENCODING`.
196    */
ForLabelNoReplacement(Span<const char> aLabel)197   static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
198     return encoding_for_label_no_replacement(
199         reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
200   }
201 
202   /**
203    * `nsAString` argument version. See above for docs.
204    */
  static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
    // Convert the UTF-16 label to UTF-8 and delegate to the
    // `Span<const char>` overload. The converted string is a temporary that
    // lives until the end of the full expression, i.e. across the lookup.
    return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
  }
208 
209   /**
210    * Performs non-incremental BOM sniffing.
211    *
212    * The argument must either be a buffer representing the entire input
213    * stream (non-streaming case) or a buffer representing at least the first
214    * three bytes of the input stream (streaming case).
215    *
216    * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
   * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
218    * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
219    */
ForBOM(Span<const uint8_t> aBuffer)220   static inline Tuple<const Encoding*, size_t> ForBOM(
221       Span<const uint8_t> aBuffer) {
222     size_t len = aBuffer.Length();
223     const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
224     return MakeTuple(encoding, len);
225   }
226 
227   /**
228    * Writes the name of this encoding into `aName`.
229    *
230    * This name is appropriate to return as-is from the DOM
231    * `document.characterSet` property.
232    */
Name(nsACString & aName)233   inline void Name(nsACString& aName) const {
234     aName.SetLength(ENCODING_NAME_MAX_LENGTH);
235     size_t length =
236         encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
237     aName.SetLength(length);  // truncation is the 64-bit case is OK
238   }
239 
240   /**
241    * Checks whether the _output encoding_ of this encoding can encode every
242    * Unicode code point. (Only true if the output encoding is UTF-8.)
243    */
CanEncodeEverything()244   inline bool CanEncodeEverything() const {
245     return encoding_can_encode_everything(this);
246   }
247 
248   /**
249    * Checks whether this encoding maps one byte to one Basic Multilingual
250    * Plane code point (i.e. byte length equals decoded UTF-16 length) and
251    * vice versa (for mappable characters).
252    *
253    * `true` iff this encoding is on the list of Legacy single-byte
254    * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
255    * in the spec or x-user-defined.
256    */
IsSingleByte()257   inline bool IsSingleByte() const { return encoding_is_single_byte(this); }
258 
259   /**
260    * Checks whether the bytes 0x00...0x7F map exclusively to the characters
261    * U+0000...U+007F and vice versa.
262    */
IsAsciiCompatible()263   inline bool IsAsciiCompatible() const {
264     return encoding_is_ascii_compatible(this);
265   }
266 
267   /**
268    * Checks whether this is a Japanese legacy encoding.
269    */
IsJapaneseLegacy()270   inline bool IsJapaneseLegacy() const {
271     return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING ||
272            this == ISO_2022_JP_ENCODING;
273   }
274 
275   /**
276    * Returns the _output encoding_ of this encoding. This is UTF-8 for
277    * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
278    */
OutputEncoding()279   inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
280     return WrapNotNull(encoding_output_encoding(this));
281   }
282 
283   /**
284    * Decode complete input to `nsACString` _with BOM sniffing_ and with
285    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
286    * entire input is available as a single buffer (i.e. the end of the
287    * buffer marks the end of the stream).
288    *
289    * This method implements the (non-streaming version of) the
290    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
291    *
292    * The second item in the returned tuple is the encoding that was actually
293    * used (which may differ from this encoding thanks to BOM sniffing).
294    *
295    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
296    * if there were malformed sequences (that were replaced with the
297    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
298    * tuple.
299    *
300    * The backing buffer of the string isn't copied if the input buffer
301    * is heap-allocated and decoding from UTF-8 and the input is valid
302    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
303    * the input is valid ASCII or decoding from ISO-2022-JP and the
304    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
305    * the same string as both arguments.
306    *
307    * _Note:_ It is wrong to use this when the input buffer represents only
308    * a segment of the input instead of the whole input. Use `NewDecoder()`
309    * when decoding segmented input.
310    */
Decode(const nsACString & aBytes,nsACString & aOut)311   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
312       const nsACString& aBytes, nsACString& aOut) const {
313     const Encoding* encoding = this;
314     const nsACString* bytes = &aBytes;
315     nsACString* out = &aOut;
316     nsresult rv;
317     if (bytes == out) {
318       nsAutoCString temp(aBytes);
319       rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
320     } else {
321       rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
322     }
323     return MakeTuple(rv, WrapNotNull(encoding));
324   }
325 
326   /**
327    * Decode complete input to `nsAString` _with BOM sniffing_ and with
328    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
329    * entire input is available as a single buffer (i.e. the end of the
330    * buffer marks the end of the stream).
331    *
332    * This method implements the (non-streaming version of) the
333    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
334    *
335    * The second item in the returned tuple is the encoding that was actually
336    * used (which may differ from this encoding thanks to BOM sniffing).
337    *
338    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
339    * if there were malformed sequences (that were replaced with the
340    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
341    * tuple.
342    *
343    * _Note:_ It is wrong to use this when the input buffer represents only
344    * a segment of the input instead of the whole input. Use `NewDecoder()`
345    * when decoding segmented input.
346    */
Decode(Span<const uint8_t> aBytes,nsAString & aOut)347   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
348       Span<const uint8_t> aBytes, nsAString& aOut) const {
349     const Encoding* encoding = this;
350     nsresult rv = mozilla_encoding_decode_to_nsstring(
351         &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
352     return MakeTuple(rv, WrapNotNull(encoding));
353   }
354 
355   /**
356    * Decode complete input to `nsACString` _with BOM removal_ and with
357    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
358    * entire input is available as a single buffer (i.e. the end of the
359    * buffer marks the end of the stream).
360    *
361    * When invoked on `UTF_8`, this method implements the (non-streaming
362    * version of) the _UTF-8 decode_
363    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
364    *
365    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
366    * if there were malformed sequences (that were replaced with the
367    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
368    *
369    * The backing buffer of the string isn't copied if the input buffer
370    * is heap-allocated and decoding from UTF-8 and the input is valid
371    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
372    * the input is valid ASCII or decoding from ISO-2022-JP and the
373    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
374    * the same string as both arguments.
375    *
376    * _Note:_ It is wrong to use this when the input buffer represents only
377    * a segment of the input instead of the whole input. Use
378    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
379    */
DecodeWithBOMRemoval(const nsACString & aBytes,nsACString & aOut)380   inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
381                                        nsACString& aOut) const {
382     const nsACString* bytes = &aBytes;
383     nsACString* out = &aOut;
384     if (bytes == out) {
385       nsAutoCString temp(aBytes);
386       return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp,
387                                                                    out);
388     }
389     return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes,
390                                                                  out);
391   }
392 
393   /**
394    * Decode complete input to `nsAString` _with BOM removal_ and with
395    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
396    * entire input is available as a single buffer (i.e. the end of the
397    * buffer marks the end of the stream).
398    *
399    * When invoked on `UTF_8`, this method implements the (non-streaming
400    * version of) the _UTF-8 decode_
401    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
402    *
403    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
404    * if there were malformed sequences (that were replaced with the
405    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
406    *
407    * _Note:_ It is wrong to use this when the input buffer represents only
408    * a segment of the input instead of the whole input. Use
409    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
410    */
DecodeWithBOMRemoval(Span<const uint8_t> aBytes,nsAString & aOut)411   inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
412                                        nsAString& aOut) const {
413     return mozilla_encoding_decode_to_nsstring_with_bom_removal(
414         this, aBytes.Elements(), aBytes.Length(), &aOut);
415   }
416 
417   /**
418    * Decode complete input to `nsACString` _without BOM handling_ and
419    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
420    * the entire input is available as a single buffer (i.e. the end of the
421    * buffer marks the end of the stream).
422    *
423    * When invoked on `UTF_8`, this method implements the (non-streaming
424    * version of) the _UTF-8 decode without BOM_
425    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
426    *
427    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
428    * if there were malformed sequences (that were replaced with the
429    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
430    *
431    * The backing buffer of the string isn't copied if the input buffer
432    * is heap-allocated and decoding from UTF-8 and the input is valid
433    * UTF-8, decoding from an ASCII-compatible encoding and the input
434    * is valid ASCII or decoding from ISO-2022-JP and the input stays
435    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
436    * as both arguments.
437    *
438    * _Note:_ It is wrong to use this when the input buffer represents only
439    * a segment of the input instead of the whole input. Use
440    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
441    */
DecodeWithoutBOMHandling(const nsACString & aBytes,nsACString & aOut)442   inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
443                                            nsACString& aOut) const {
444     const nsACString* bytes = &aBytes;
445     nsACString* out = &aOut;
446     if (bytes == out) {
447       nsAutoCString temp(aBytes);
448       return mozilla_encoding_decode_to_nscstring_without_bom_handling(
449           this, &temp, out);
450     }
451     return mozilla_encoding_decode_to_nscstring_without_bom_handling(
452         this, bytes, out);
453   }
454 
455   /**
456    * Decode complete input to `nsAString` _without BOM handling_ and
457    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
458    * the entire input is available as a single buffer (i.e. the end of the
459    * buffer marks the end of the stream).
460    *
461    * When invoked on `UTF_8`, this method implements the (non-streaming
462    * version of) the _UTF-8 decode without BOM_
463    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
464    *
465    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
466    * if there were malformed sequences (that were replaced with the
467    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
468    *
469    * _Note:_ It is wrong to use this when the input buffer represents only
470    * a segment of the input instead of the whole input. Use
471    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
472    */
DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,nsAString & aOut)473   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
474                                            nsAString& aOut) const {
475     return mozilla_encoding_decode_to_nsstring_without_bom_handling(
476         this, aBytes.Elements(), aBytes.Length(), &aOut);
477   }
478 
479   /**
480    * Decode complete input to `nsACString` _without BOM handling_ and
481    * _with malformed sequences treated as fatal_ when the entire input is
482    * available as a single buffer (i.e. the end of the buffer marks the end
483    * of the stream).
484    *
485    * When invoked on `UTF_8`, this method implements the (non-streaming
486    * version of) the _UTF-8 decode without BOM or fail_
487    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
488    * spec concept.
489    *
490    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
491    * if a malformed sequence was encountered and `NS_OK` otherwise.
492    *
493    * The backing buffer of the string isn't copied if the input buffer
494    * is heap-allocated and decoding from UTF-8 and the input is valid
495    * UTF-8, decoding from an ASCII-compatible encoding and the input
496    * is valid ASCII or decoding from ISO-2022-JP and the input stays
497    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
498    * as both arguments.
499    *
500    * _Note:_ It is wrong to use this when the input buffer represents only
501    * a segment of the input instead of the whole input. Use
502    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
503    */
DecodeWithoutBOMHandlingAndWithoutReplacement(const nsACString & aBytes,nsACString & aOut)504   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
505       const nsACString& aBytes, nsACString& aOut) const {
506     const nsACString* bytes = &aBytes;
507     nsACString* out = &aOut;
508     if (bytes == out) {
509       nsAutoCString temp(aBytes);
510       return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
511           this, &temp, out);
512     }
513     return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
514         this, bytes, out);
515   }
516 
517   /**
518    * Decode complete input to `nsACString` _without BOM handling_ and
519    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
520    * the entire input is available as a single buffer (i.e. the end of the
521    * buffer marks the end of the stream) _asserting that a number of bytes
522    * from the start are already known to be valid UTF-8_.
523    *
524    * The use case for this method is avoiding copying when dealing with
525    * input that has a UTF-8 BOM. _When in doubt, do not use this method._
526    *
527    * When invoked on `UTF_8`, this method implements the (non-streaming
528    * version of) the _UTF-8 decode without BOM_
529    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
530    *
531    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
532    * if there were malformed sequences (that were replaced with the
533    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
534    *
535    * _Note:_ It is wrong to use this when the input buffer represents only
536    * a segment of the input instead of the whole input. Use
537    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
538    *
539    * # Safety
540    *
541    * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
542    * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
543    */
DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,nsACString & aOut,size_t aAlreadyValidated)544   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
545                                            nsACString& aOut,
546                                            size_t aAlreadyValidated) const {
547     return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
548         this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
549   }
550 
551   /**
552    * Decode complete input to `nsAString` _without BOM handling_ and
553    * _with malformed sequences treated as fatal_ when the entire input is
554    * available as a single buffer (i.e. the end of the buffer marks the end
555    * of the stream).
556    *
557    * When invoked on `UTF_8`, this method implements the (non-streaming
558    * version of) the _UTF-8 decode without BOM or fail_
559    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
560    * spec concept.
561    *
562    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
563    * if a malformed sequence was encountered and `NS_OK` otherwise.
564    *
565    * _Note:_ It is wrong to use this when the input buffer represents only
566    * a segment of the input instead of the whole input. Use
567    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
568    */
DecodeWithoutBOMHandlingAndWithoutReplacement(Span<const uint8_t> aBytes,nsAString & aOut)569   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
570       Span<const uint8_t> aBytes, nsAString& aOut) const {
571     return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
572         this, aBytes.Elements(), aBytes.Length(), &aOut);
573   }
574 
575   /**
576    * Encode complete input to `nsACString` with unmappable characters
577    * replaced with decimal numeric character references when the entire input
578    * is available as a single buffer (i.e. the end of the buffer marks the
579    * end of the stream).
580    *
581    * This method implements the (non-streaming version of) the
582    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
583    *
584    * The second item in the returned tuple is the encoding that was actually
585    * used (which may differ from this encoding thanks to some encodings
586    * having UTF-8 as their output encoding).
587    *
588    * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
589    * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
590    * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
591    * replaced with numeric character references) and `NS_OK` otherwise.
592    *
593    * The backing buffer of the string isn't copied if the input buffer
594    * is heap-allocated and encoding to UTF-8 and the input is valid
595    * UTF-8, encoding to an ASCII-compatible encoding and the input
   * is valid ASCII or encoding to ISO-2022-JP and the input stays
597    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
598    * as both arguments.
599    *
600    * _Note:_ It is wrong to use this when the input buffer represents only
601    * a segment of the input instead of the whole input. Use `NewEncoder()`
602    * when encoding segmented output.
603    */
Encode(const nsACString & aString,nsACString & aOut)604   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
605       const nsACString& aString, nsACString& aOut) const {
606     const Encoding* encoding = this;
607     const nsACString* string = &aString;
608     nsACString* out = &aOut;
609     nsresult rv;
610     if (string == out) {
611       nsAutoCString temp(aString);
612       rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
613     } else {
614       rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
615     }
616     return MakeTuple(rv, WrapNotNull(encoding));
617   }
618 
619   /**
620    * Encode complete input to `nsACString` with unmappable characters
621    * replaced with decimal numeric character references when the entire input
622    * is available as a single buffer (i.e. the end of the buffer marks the
623    * end of the stream).
624    *
625    * This method implements the (non-streaming version of) the
626    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
627    *
628    * The second item in the returned tuple is the encoding that was actually
629    * used (which may differ from this encoding thanks to some encodings
630    * having UTF-8 as their output encoding).
631    *
632    * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
633    * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
   * were replaced with numeric character references) and `NS_OK` otherwise.
   *
636    * _Note:_ It is wrong to use this when the input buffer represents only
637    * a segment of the input instead of the whole input. Use `NewEncoder()`
638    * when encoding segmented output.
639    */
Encode(Span<const char16_t> aString,nsACString & aOut)640   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
641       Span<const char16_t> aString, nsACString& aOut) const {
642     const Encoding* encoding = this;
643     nsresult rv = mozilla_encoding_encode_from_utf16(
644         &encoding, aString.Elements(), aString.Length(), &aOut);
645     return MakeTuple(rv, WrapNotNull(encoding));
646   }
647 
648   /**
649    * Instantiates a new decoder for this encoding with BOM sniffing enabled.
650    *
651    * BOM sniffing may cause the returned decoder to morph into a decoder
652    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
653    */
NewDecoder()654   inline UniquePtr<Decoder> NewDecoder() const {
655     UniquePtr<Decoder> decoder(encoding_new_decoder(this));
656     return decoder;
657   }
658 
659   /**
660    * Instantiates a new decoder for this encoding with BOM sniffing enabled
661    * into memory occupied by a previously-instantiated decoder.
662    *
663    * BOM sniffing may cause the returned decoder to morph into a decoder
664    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
665    */
NewDecoderInto(Decoder & aDecoder)666   inline void NewDecoderInto(Decoder& aDecoder) const {
667     encoding_new_decoder_into(this, &aDecoder);
668   }
669 
670   /**
671    * Instantiates a new decoder for this encoding with BOM removal.
672    *
673    * If the input starts with bytes that are the BOM for this encoding,
674    * those bytes are removed. However, the decoder never morphs into a
675    * decoder for another encoding: A BOM for another encoding is treated as
676    * (potentially malformed) input to the decoding algorithm for this
677    * encoding.
678    */
NewDecoderWithBOMRemoval()679   inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
680     UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
681     return decoder;
682   }
683 
684   /**
685    * Instantiates a new decoder for this encoding with BOM removal
686    * into memory occupied by a previously-instantiated decoder.
687    *
688    * If the input starts with bytes that are the BOM for this encoding,
689    * those bytes are removed. However, the decoder never morphs into a
690    * decoder for another encoding: A BOM for another encoding is treated as
691    * (potentially malformed) input to the decoding algorithm for this
692    * encoding.
693    */
NewDecoderWithBOMRemovalInto(Decoder & aDecoder)694   inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
695     encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
696   }
697 
698   /**
699    * Instantiates a new decoder for this encoding with BOM handling disabled.
700    *
701    * If the input starts with bytes that look like a BOM, those bytes are
702    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
703    * for another encoding.)
704    *
705    * _Note:_ If the caller has performed BOM sniffing on its own but has not
706    * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
707    * instead of this method to cause the BOM to be removed.
708    */
NewDecoderWithoutBOMHandling()709   inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
710     UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
711     return decoder;
712   }
713 
714   /**
715    * Instantiates a new decoder for this encoding with BOM handling disabled
716    * into memory occupied by a previously-instantiated decoder.
717    *
718    * If the input starts with bytes that look like a BOM, those bytes are
719    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
720    * for another encoding.)
721    *
722    * _Note:_ If the caller has performed BOM sniffing on its own but has not
723    * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
724    * instead of this method to cause the BOM to be removed.
725    */
NewDecoderWithoutBOMHandlingInto(Decoder & aDecoder)726   inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
727     encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
728   }
729 
730   /**
731    * Instantiates a new encoder for the output encoding of this encoding.
732    */
NewEncoder()733   inline UniquePtr<Encoder> NewEncoder() const {
734     UniquePtr<Encoder> encoder(encoding_new_encoder(this));
735     return encoder;
736   }
737 
738   /**
739    * Instantiates a new encoder for the output encoding of this encoding
740    * into memory occupied by a previously-instantiated encoder.
741    */
NewEncoderInto(Encoder & aEncoder)742   inline void NewEncoderInto(Encoder& aEncoder) const {
743     encoding_new_encoder_into(this, &aEncoder);
744   }
745 
746   /**
747    * Validates UTF-8.
748    *
749    * Returns the index of the first byte that makes the input malformed as
750    * UTF-8 or the length of the input if the input is entirely valid.
751    */
UTF8ValidUpTo(Span<const uint8_t> aBuffer)752   static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
753     return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
754   }
755 
756   /**
757    * Validates ASCII.
758    *
759    * Returns the index of the first byte that makes the input malformed as
760    * ASCII or the length of the input if the input is entirely valid.
761    */
ASCIIValidUpTo(Span<const uint8_t> aBuffer)762   static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
763     return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
764   }
765 
766   /**
767    * Validates ISO-2022-JP ASCII-state data.
768    *
769    * Returns the index of the first byte that makes the input not
770    * representable in the ASCII state of ISO-2022-JP or the length of the
771    * input if the input is entirely representable in the ASCII state of
772    * ISO-2022-JP.
773    */
ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer)774   static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
775     return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
776                                                   aBuffer.Length());
777   }
778 
 private:
  // Construction, copying, assignment and destruction are all deleted:
  // C++ code can only refer to `Encoding` objects through pointers or
  // references and can never create or destroy one itself.
  Encoding() = delete;
  Encoding(const Encoding&) = delete;
  Encoding& operator=(const Encoding&) = delete;
  ~Encoding() = delete;
784 };
785 
786 /**
787  * A converter that decodes a byte stream into Unicode according to a
788  * character encoding in a streaming (incremental) manner.
789  *
790  * The various `Decode*` methods take an input buffer (`aSrc`) and an output
791  * buffer `aDst` both of which are caller-allocated. There are variants for
792  * both UTF-8 and UTF-16 output buffers.
793  *
794  * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
795  * into `aDst` until one of the following three things happens:
796  *
797  * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
798  *    variants only).
799  *
800  * 2. The output buffer has been filled so near capacity that the decoder
801  *    cannot be sure that processing an additional byte of input wouldn't
802  *    cause so much output that the output buffer would overflow.
803  *
804  * 3. All the input bytes have been processed.
805  *
 * The `Decode*` method then returns a tuple of a status indicating which one
807  * of the three reasons to return happened, how many input bytes were read,
808  * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
809  * when decoding to UTF-16) were written, and in the case of the
810  * variants performing replacement, a boolean indicating whether an error was
811  * replaced with the REPLACEMENT CHARACTER during the call.
812  *
813  * The number of bytes "written" is what's logically written. Garbage may be
814  * written in the output buffer beyond the point logically written to.
815  *
816  * In the case of the `*WithoutReplacement` variants, the status is a
817  * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
 * listed above).
820  *
821  * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
827  *
828  * In the case of methods whose name does not end with
829  * `*WithoutReplacement`, malformed sequences are automatically replaced
830  * with the REPLACEMENT CHARACTER and errors do not cause the methods to
831  * return early.
832  *
833  * When decoding to UTF-8, the output buffer must have at least 4 bytes of
834  * space. When decoding to UTF-16, the output buffer must have at least two
835  * UTF-16 code units (`char16_t`) of space.
836  *
837  * When decoding to UTF-8 without replacement, the methods are guaranteed
838  * not to return indicating that more output space is needed if the length
839  * of the output buffer is at least the length returned by
840  * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
841  * with replacement, the length of the output buffer that guarantees the
842  * methods not to return indicating that more output space is needed is given
843  * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
844  * or without replacement, the length of the output buffer that guarantees
845  * the methods not to return indicating that more output space is needed is
846  * given by `MaxUTF16BufferLength()`.
847  *
848  * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
849  * and the output after each `Decode*` call is guaranteed to consist of
850  * complete characters. (I.e. the code unit sequence for the last character is
851  * guaranteed not to be split across output buffers.)
852  *
853  * The boolean argument `aLast` indicates that the end of the stream is reached
854  * when all the bytes in `aSrc` have been consumed.
855  *
856  * A `Decoder` object can be used to incrementally decode a byte stream.
857  *
858  * During the processing of a single stream, the caller must call `Decode*`
859  * zero or more times with `aLast` set to `false` and then call `Decode*` at
860  * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
861  * the processing of the stream has ended. Otherwise, the caller must call
862  * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
863  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
864  *
865  * Once the stream has ended, the `Decoder` object must not be used anymore.
866  * That is, you need to create another one to process another stream.
867  *
868  * When the decoder returns `kOutputFull` or the decoder returns a malformed
869  * result and the caller does not wish to treat it as a fatal error, the input
870  * buffer `aSrc` may not have been completely consumed. In that case, the caller
871  * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
872  * call.
873  *
874  * # Infinite loops
875  *
876  * When converting with a fixed-size output buffer whose size is too small to
877  * accommodate one character of output, an infinite loop ensues. When
878  * converting with a fixed-size output buffer, it generally makes sense to
 * make the buffer fairly large (e.g. a couple of kilobytes).
880  */
881 class Decoder final {
882  public:
883   ~Decoder() = default;
delete(void * aDecoder)884   static void operator delete(void* aDecoder) {
885     decoder_free(reinterpret_cast<Decoder*>(aDecoder));
886   }
887 
888   /**
889    * The `Encoding` this `Decoder` is for.
890    *
891    * BOM sniffing can change the return value of this method during the life
892    * of the decoder.
893    */
Encoding()894   inline NotNull<const mozilla::Encoding*> Encoding() const {
895     return WrapNotNull(decoder_encoding(this));
896   }
897 
898   /**
899    * Query the worst-case UTF-8 output size _with replacement_.
900    *
901    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
902    * that will not overflow given the current state of the decoder and
903    * `aByteLength` number of additional input bytes when decoding with
904    * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
905    * sequence.
906    */
MaxUTF8BufferLength(size_t aByteLength)907   inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
908     CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
909     if (max.value() == std::numeric_limits<size_t>::max()) {
910       // Mark invalid by overflowing
911       max++;
912       MOZ_ASSERT(!max.isValid());
913     }
914     return max;
915   }
916 
917   /**
918    * Query the worst-case UTF-8 output size _without replacement_.
919    *
920    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
921    * that will not overflow given the current state of the decoder and
922    * `aByteLength` number of additional input bytes when decoding without
923    * replacement error handling.
924    *
925    * Note that this value may be too small for the `WithReplacement` case.
926    * Use `MaxUTF8BufferLength()` for that case.
927    */
MaxUTF8BufferLengthWithoutReplacement(size_t aByteLength)928   inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
929       size_t aByteLength) const {
930     CheckedInt<size_t> max(
931         decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
932     if (max.value() == std::numeric_limits<size_t>::max()) {
933       // Mark invalid by overflowing
934       max++;
935       MOZ_ASSERT(!max.isValid());
936     }
937     return max;
938   }
939 
940   /**
941    * Incrementally decode a byte stream into UTF-8 with malformed sequences
942    * replaced with the REPLACEMENT CHARACTER.
943    *
944    * See the documentation of the class for documentation for `Decode*`
945    * methods collectively.
946    */
DecodeToUTF8(Span<const uint8_t> aSrc,Span<uint8_t> aDst,bool aLast)947   inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
948       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
949     size_t srcRead = aSrc.Length();
950     size_t dstWritten = aDst.Length();
951     bool hadReplacements;
952     uint32_t result =
953         decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
954                                &dstWritten, aLast, &hadReplacements);
955     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
956   }
957 
958   /**
959    * Incrementally decode a byte stream into UTF-8 _without replacement_.
960    *
961    * See the documentation of the class for documentation for `Decode*`
962    * methods collectively.
963    */
DecodeToUTF8WithoutReplacement(Span<const uint8_t> aSrc,Span<uint8_t> aDst,bool aLast)964   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
965       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
966     size_t srcRead = aSrc.Length();
967     size_t dstWritten = aDst.Length();
968     uint32_t result = decoder_decode_to_utf8_without_replacement(
969         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
970     return MakeTuple(result, srcRead, dstWritten);
971   }
972 
973   /**
974    * Query the worst-case UTF-16 output size (with or without replacement).
975    *
976    * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
977    * that will not overflow given the current state of the decoder and
978    * `aByteLength` number of additional input bytes.
979    *
980    * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
981    * return value of this method applies also in the
982    * `_without_replacement` case.
983    */
MaxUTF16BufferLength(size_t aU16Length)984   inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
985     CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
986     if (max.value() == std::numeric_limits<size_t>::max()) {
987       // Mark invalid by overflowing
988       max++;
989       MOZ_ASSERT(!max.isValid());
990     }
991     return max;
992   }
993 
994   /**
995    * Incrementally decode a byte stream into UTF-16 with malformed sequences
996    * replaced with the REPLACEMENT CHARACTER.
997    *
998    * See the documentation of the class for documentation for `Decode*`
999    * methods collectively.
1000    */
DecodeToUTF16(Span<const uint8_t> aSrc,Span<char16_t> aDst,bool aLast)1001   inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
1002       Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1003     size_t srcRead = aSrc.Length();
1004     size_t dstWritten = aDst.Length();
1005     bool hadReplacements;
1006     uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
1007                                               aDst.Elements(), &dstWritten,
1008                                               aLast, &hadReplacements);
1009     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1010   }
1011 
1012   /**
1013    * Incrementally decode a byte stream into UTF-16 _without replacement_.
1014    *
1015    * See the documentation of the class for documentation for `Decode*`
1016    * methods collectively.
1017    */
DecodeToUTF16WithoutReplacement(Span<const uint8_t> aSrc,Span<char16_t> aDst,bool aLast)1018   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1019       Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1020     size_t srcRead = aSrc.Length();
1021     size_t dstWritten = aDst.Length();
1022     uint32_t result = decoder_decode_to_utf16_without_replacement(
1023         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1024     return MakeTuple(result, srcRead, dstWritten);
1025   }
1026 
1027   /**
1028    * Checks for compatibility with storing Unicode scalar values as unsigned
1029    * bytes taking into account the state of the decoder.
1030    *
1031    * Returns `mozilla::Nothing()` if the decoder is not in a neutral state,
1032    * including waiting for the BOM, or if the encoding is never
1033    * Latin1-byte-compatible.
1034    *
1035    * Otherwise returns the index of the first byte whose unsigned value doesn't
1036    * directly correspond to the decoded Unicode scalar value, or the length
1037    * of the input if all bytes in the input decode directly to scalar values
1038    * corresponding to the unsigned byte values.
1039    *
1040    * Does not change the state of the decoder.
1041    *
1042    * Do not use this unless you are supporting SpiderMonkey-style string
1043    * storage optimizations.
1044    */
Latin1ByteCompatibleUpTo(Span<const uint8_t> aBuffer)1045   inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo(
1046       Span<const uint8_t> aBuffer) const {
1047     size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(),
1048                                                        aBuffer.Length());
1049     if (upTo == std::numeric_limits<size_t>::max()) {
1050       return mozilla::Nothing();
1051     }
1052     return mozilla::Some(upTo);
1053   }
1054 
1055  private:
1056   Decoder() = delete;
1057   Decoder(const Decoder&) = delete;
1058   Decoder& operator=(const Decoder&) = delete;
1059 };
1060 
1061 /**
1062  * A converter that encodes a Unicode stream into bytes according to a
1063  * character encoding in a streaming (incremental) manner.
1064  *
1065  * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1066  * buffer `aDst` both of which are caller-allocated. There are variants for
1067  * both UTF-8 and UTF-16 input buffers.
1068  *
 * An `Encode*` method encodes characters from `aSrc` into bytes stored into
 * `aDst` until one of the following three things happens:
1071  *
1072  * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1073  *    only).
1074  *
 * 2. The output buffer has been filled so near capacity that the encoder
1076  *    cannot be sure that processing an additional character of input wouldn't
1077  *    cause so much output that the output buffer would overflow.
1078  *
1079  * 3. All the input characters have been processed.
1080  *
 * The `Encode*` method then returns a tuple of a status indicating which one
1082  * of the three reasons to return happened, how many input code units (`uint8_t`
1083  * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1084  * how many output bytes were written, and in the case of the variants that
1085  * perform replacement, a boolean indicating whether an unmappable
1086  * character was replaced with a numeric character reference during the call.
1087  *
1088  * The number of bytes "written" is what's logically written. Garbage may be
1089  * written in the output buffer beyond the point logically written to.
1090  *
1091  * In the case of the methods whose name ends with
1092  * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
 * to the three cases listed above).
1095  *
1096  * In the case of methods whose name does not end with
1097  * `*WithoutReplacement`, unmappable characters are automatically replaced
1098  * with the corresponding numeric character references and unmappable
1099  * characters do not cause the methods to return early.
1100  *
1101  * When encoding from UTF-8 without replacement, the methods are guaranteed
1102  * not to return indicating that more output space is needed if the length
1103  * of the output buffer is at least the length returned by
1104  * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1105  * UTF-8 with replacement, the length of the output buffer that guarantees the
1106  * methods not to return indicating that more output space is needed in the
1107  * absence of unmappable characters is given by
1108  * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1109  * UTF-16 without replacement, the methods are guaranteed not to return
1110  * indicating that more output space is needed if the length of the output
1111  * buffer is at least the length returned by
1112  * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
 * from UTF-16 with replacement, the length of the output buffer that
1114  * guarantees the methods not to return indicating that more output space is
1115  * needed in the absence of unmappable characters is given by
1116  * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1117  * When encoding with replacement, applications are not expected to size the
1118  * buffer for the worst case ahead of time but to resize the buffer if there
1119  * are unmappable characters. This is why max length queries are only available
1120  * for the case where there are no unmappable characters.
1121  *
1122  * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1123  * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1124  * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1125  * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1126  * surrogate pairs are not split across input buffer boundaries.
1127  *
1128  * After an `Encode*` call returns, the output produced so far, taken as a
1129  * whole from the start of the stream, is guaranteed to consist of a valid
1130  * byte sequence in the target encoding. (I.e. the code unit sequence for a
1131  * character is guaranteed not to be split across output buffers. However, due
1132  * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1133  * from the start for it to be valid. For other encodings, the validity holds
1134  * on a per-output buffer basis.)
1135  *
1136  * The boolean argument `aLast` indicates that the end of the stream is reached
1137  * when all the characters in `aSrc` have been consumed. This argument is needed
1138  * for ISO-2022-JP and is ignored for other encodings.
1139  *
 * An `Encoder` object can be used to incrementally encode a Unicode stream.
1141  *
1142  * During the processing of a single stream, the caller must call `Encode*`
1143  * zero or more times with `aLast` set to `false` and then call `Encode*` at
1144  * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1145  * the processing of the stream has ended. Otherwise, the caller must call
1146  * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1147  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1148  *
1149  * Once the stream has ended, the `Encoder` object must not be used anymore.
1150  * That is, you need to create another one to process another stream.
1151  *
1152  * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1153  * result and the caller does not wish to treat it as a fatal error, the input
1154  * buffer `aSrc` may not have been completely consumed. In that case, the caller
1155  * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1156  * call.
1157  *
1158  * # Infinite loops
1159  *
1160  * When converting with a fixed-size output buffer whose size is too small to
1161  * accommodate one character of output, an infinite loop ensues. When
1162  * converting with a fixed-size output buffer, it generally makes sense to
 * make the buffer fairly large (e.g. a couple of kilobytes).
1164  */
1165 class Encoder final {
1166  public:
1167   ~Encoder() = default;
1168 
delete(void * aEncoder)1169   static void operator delete(void* aEncoder) {
1170     encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1171   }
1172 
1173   /**
1174    * The `Encoding` this `Encoder` is for.
1175    */
Encoding()1176   inline NotNull<const mozilla::Encoding*> Encoding() const {
1177     return WrapNotNull(encoder_encoding(this));
1178   }
1179 
1180   /**
1181    * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1182    * ASCII state and `false` otherwise.
1183    */
HasPendingState()1184   inline bool HasPendingState() const {
1185     return encoder_has_pending_state(this);
1186   }
1187 
1188   /**
1189    * Query the worst-case output size when encoding from UTF-8 with
1190    * replacement.
1191    *
1192    * Returns the size of the output buffer in bytes that will not overflow
1193    * given the current state of the encoder and `aByteLength` number of
1194    * additional input code units if there are no unmappable characters in
1195    * the input.
1196    */
MaxBufferLengthFromUTF8IfNoUnmappables(size_t aByteLength)1197   inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1198       size_t aByteLength) const {
1199     CheckedInt<size_t> max(
1200         encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1201                                                               aByteLength));
1202     if (max.value() == std::numeric_limits<size_t>::max()) {
1203       // Mark invalid by overflowing
1204       max++;
1205       MOZ_ASSERT(!max.isValid());
1206     }
1207     return max;
1208   }
1209 
1210   /**
1211    * Query the worst-case output size when encoding from UTF-8 without
1212    * replacement.
1213    *
1214    * Returns the size of the output buffer in bytes that will not overflow
1215    * given the current state of the encoder and `aByteLength` number of
1216    * additional input code units.
1217    */
MaxBufferLengthFromUTF8WithoutReplacement(size_t aByteLength)1218   inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1219       size_t aByteLength) const {
1220     CheckedInt<size_t> max(
1221         encoder_max_buffer_length_from_utf8_without_replacement(this,
1222                                                                 aByteLength));
1223     if (max.value() == std::numeric_limits<size_t>::max()) {
1224       // Mark invalid by overflowing
1225       max++;
1226       MOZ_ASSERT(!max.isValid());
1227     }
1228     return max;
1229   }
1230 
1231   /**
1232    * Incrementally encode into byte stream from UTF-8 with unmappable
1233    * characters replaced with HTML (decimal) numeric character references.
1234    *
1235    * See the documentation of the class for documentation for `Encode*`
1236    * methods collectively.
1237    *
1238    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1239    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1240    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1241    */
EncodeFromUTF8(Span<const uint8_t> aSrc,Span<uint8_t> aDst,bool aLast)1242   inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1243       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1244     size_t srcRead = aSrc.Length();
1245     size_t dstWritten = aDst.Length();
1246     bool hadReplacements;
1247     uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1248                                                aDst.Elements(), &dstWritten,
1249                                                aLast, &hadReplacements);
1250     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1251   }
1252 
1253   /**
1254    * Incrementally encode into byte stream from UTF-8 _without replacement_.
1255    *
1256    * See the documentation of the class for documentation for `Encode*`
1257    * methods collectively.
1258    *
1259    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1260    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1261    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1262    */
EncodeFromUTF8WithoutReplacement(Span<const uint8_t> aSrc,Span<uint8_t> aDst,bool aLast)1263   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1264       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1265     size_t srcRead = aSrc.Length();
1266     size_t dstWritten = aDst.Length();
1267     uint32_t result = encoder_encode_from_utf8_without_replacement(
1268         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1269     return MakeTuple(result, srcRead, dstWritten);
1270   }
1271 
1272   /**
1273    * Query the worst-case output size when encoding from UTF-16 with
1274    * replacement.
1275    *
1276    * Returns the size of the output buffer in bytes that will not overflow
1277    * given the current state of the encoder and `aU16Length` number of
1278    * additional input code units if there are no unmappable characters in
1279    * the input.
1280    */
MaxBufferLengthFromUTF16IfNoUnmappables(size_t aU16Length)1281   inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1282       size_t aU16Length) const {
1283     CheckedInt<size_t> max(
1284         encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1285                                                                aU16Length));
1286     if (max.value() == std::numeric_limits<size_t>::max()) {
1287       // Mark invalid by overflowing
1288       max++;
1289       MOZ_ASSERT(!max.isValid());
1290     }
1291     return max;
1292   }
1293 
1294   /**
1295    * Query the worst-case output size when encoding from UTF-16 without
1296    * replacement.
1297    *
1298    * Returns the size of the output buffer in bytes that will not overflow
1299    * given the current state of the encoder and `aU16Length` number of
1300    * additional input code units.
1301    */
MaxBufferLengthFromUTF16WithoutReplacement(size_t aU16Length)1302   inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1303       size_t aU16Length) const {
1304     CheckedInt<size_t> max(
1305         encoder_max_buffer_length_from_utf16_without_replacement(this,
1306                                                                  aU16Length));
1307     if (max.value() == std::numeric_limits<size_t>::max()) {
1308       // Mark invalid by overflowing
1309       max++;
1310       MOZ_ASSERT(!max.isValid());
1311     }
1312     return max;
1313   }
1314 
1315   /**
1316    * Incrementally encode into byte stream from UTF-16 with unmappable
1317    * characters replaced with HTML (decimal) numeric character references.
1318    *
1319    * See the documentation of the class for documentation for `Encode*`
1320    * methods collectively.
1321    */
EncodeFromUTF16(Span<const char16_t> aSrc,Span<uint8_t> aDst,bool aLast)1322   inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1323       Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1324     size_t srcRead = aSrc.Length();
1325     size_t dstWritten = aDst.Length();
1326     bool hadReplacements;
1327     uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1328                                                 aDst.Elements(), &dstWritten,
1329                                                 aLast, &hadReplacements);
1330     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1331   }
1332 
1333   /**
1334    * Incrementally encode into byte stream from UTF-16 _without replacement_.
1335    *
1336    * See the documentation of the class for documentation for `Encode*`
1337    * methods collectively.
1338    */
EncodeFromUTF16WithoutReplacement(Span<const char16_t> aSrc,Span<uint8_t> aDst,bool aLast)1339   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1340       Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1341     size_t srcRead = aSrc.Length();
1342     size_t dstWritten = aDst.Length();
1343     uint32_t result = encoder_encode_from_utf16_without_replacement(
1344         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1345     return MakeTuple(result, srcRead, dstWritten);
1346   }
1347 
1348  private:
1349   Encoder() = delete;
1350   Encoder(const Encoder&) = delete;
1351   Encoder& operator=(const Encoder&) = delete;
1352 };
1353 
1354 };  // namespace mozilla
1355 
1356 #endif  // mozilla_Encoding_h
1357