1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 #![cfg_attr(
11     feature = "cargo-clippy",
12     allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14 
15 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17 //! Gecko-oriented means that converting to and from UTF-16 is supported in
18 //! addition to converting to and from UTF-8, that the performance and
19 //! streamability goals are browser-oriented, and that FFI-friendliness is a
20 //! goal.
21 //!
22 //! Additionally, the `mem` module provides functions that are useful for
23 //! applications that need to be able to deal with legacy in-memory
24 //! representations of Unicode.
25 //!
26 //! For expectation setting, please be sure to read the sections
27 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29 //!
30 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31 //! design and internals of the crate.
32 //!
33 //! # Availability
34 //!
35 //! The code is available under the
36 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38 //! See the
39 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40 //! file for details.
41 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43 //!
44 //! # Integration with `std::io`
45 //!
46 //! This crate doesn't implement traits from `std::io`. However, the case of
47 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50 //!
51 //! # Examples
52 //!
53 //! Example programs:
54 //!
55 //! * [Rust](https://github.com/hsivonen/recode_rs)
56 //! * [C](https://github.com/hsivonen/recode_c)
57 //! * [C++](https://github.com/hsivonen/recode_cpp)
58 //!
59 //! Decode using the non-streaming API:
60 //!
61 //! ```
62 //! use encoding_rs::*;
63 //!
64 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
65 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
66 //!
67 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
68 //! assert_eq!(&cow[..], expectation);
69 //! assert_eq!(encoding_used, SHIFT_JIS);
70 //! assert!(!had_errors);
71 //! ```
72 //!
73 //! Decode using the streaming API with minimal `unsafe`:
74 //!
75 //! ```
76 //! use encoding_rs::*;
77 //!
78 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
79 //!
80 //! // Use an array of byte slices to demonstrate content arriving piece by
81 //! // piece from the network.
82 //! let bytes: [&'static [u8]; 4] = [b"\x83",
83 //!                                  b"n\x83\x8D\x81",
84 //!                                  b"[\x81E\x83\x8F\x81[\x83",
85 //!                                  b"\x8B\x83h"];
86 //!
87 //! // Very short output buffer to demonstrate the output buffer getting full.
88 //! // Normally, you'd use something like `[0u8; 2048]`.
89 //! let mut buffer_bytes = [0u8; 8];
90 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
91 //!
92 //! // How many bytes in the buffer currently hold significant data.
93 //! let mut bytes_in_buffer = 0usize;
94 //!
95 //! // Collect the output to a string for demonstration purposes.
96 //! let mut output = String::new();
97 //!
98 //! // The `Decoder`
99 //! let mut decoder = SHIFT_JIS.new_decoder();
100 //!
101 //! // Track whether we see errors.
102 //! let mut total_had_errors = false;
103 //!
104 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
105 //! // use of a fixed-size buffer; normally when the output of an incremental
106 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
107 //! // avoid the intermediate buffer).
108 //! for input in &bytes[..] {
109 //!     // The number of bytes already read from current `input` in total.
110 //!     let mut total_read_from_current_input = 0usize;
111 //!
112 //!     loop {
113 //!         let (result, read, written, had_errors) =
114 //!             decoder.decode_to_str(&input[total_read_from_current_input..],
115 //!                                   &mut buffer[bytes_in_buffer..],
116 //!                                   false);
117 //!         total_read_from_current_input += read;
118 //!         bytes_in_buffer += written;
119 //!         total_had_errors |= had_errors;
120 //!         match result {
121 //!             CoderResult::InputEmpty => {
122 //!                 // We have consumed the current input buffer. Break out of
123 //!                 // the inner loop to get the next input buffer from the
124 //!                 // outer loop.
125 //!                 break;
126 //!             },
127 //!             CoderResult::OutputFull => {
128 //!                 // Write the current buffer out and consider the buffer
129 //!                 // empty.
130 //!                 output.push_str(&buffer[..bytes_in_buffer]);
131 //!                 bytes_in_buffer = 0usize;
132 //!                 continue;
133 //!             }
134 //!         }
135 //!     }
136 //! }
137 //!
138 //! // Process EOF
139 //! loop {
140 //!     let (result, _, written, had_errors) =
141 //!         decoder.decode_to_str(b"",
142 //!                               &mut buffer[bytes_in_buffer..],
143 //!                               true);
144 //!     bytes_in_buffer += written;
145 //!     total_had_errors |= had_errors;
146 //!     // Write the current buffer out and consider the buffer empty.
147 //!     // Need to do this here for both `match` arms, because we exit the
148 //!     // loop on `CoderResult::InputEmpty`.
149 //!     output.push_str(&buffer[..bytes_in_buffer]);
150 //!     bytes_in_buffer = 0usize;
151 //!     match result {
152 //!         CoderResult::InputEmpty => {
153 //!             // Done!
154 //!             break;
155 //!         },
156 //!         CoderResult::OutputFull => {
157 //!             continue;
158 //!         }
159 //!     }
160 //! }
161 //!
162 //! assert_eq!(&output[..], expectation);
163 //! assert!(!total_had_errors);
164 //! ```
165 //!
166 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
167 //!
168 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
169 //! __so this crate does not provide encoders for those encodings__!
170 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
171 //! so you get a UTF-8 encoder if you request an encoder for them.
172 //!
173 //! Additionally, the Encoding Standard factors BOM handling into wrapper
174 //! algorithms so that BOM handling isn't part of the definition of the
175 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
176 //! Standard define BOM handling or lack thereof as part of the encoding
177 //! scheme.
178 //!
179 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
180 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
181 //! the Unicode Standard.
182 //!
183 //! When used with the `_with_bom_removal` entry points, the UTF-8
184 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
185 //! Standard.
186 //!
187 //! This crate does not provide a mode that matches the UTF-16 _encoding
188 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
189 //! the entry points without `_bom_` qualifiers is the closest match,
190 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
191 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
192 //! Unicode Standard.
193 //!
194 //! The UTF-32 family of Unicode encoding schemes is not supported
195 //! by this crate. The Encoding Standard doesn't define any UTF-32
196 //! family encodings, since they aren't necessary for consuming Web
197 //! content.
198 //!
199 //! ## ISO-8859-1
200 //!
201 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
202 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
203 //! byte value to the same Unicode scalar value is not available via
204 //! `Encoding` in this crate.
205 //!
206 //! However, the functions whose name starts with `convert` and contains
207 //! `latin1` in the `mem` module support such conversions, which are known as
208 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
209 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
210 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
211 //!
212 //! ## Web / Browser Focus
213 //!
214 //! Both in terms of scope and performance, the focus is on the Web. For scope,
215 //! this means that encoding_rs implements the Encoding Standard fully and
216 //! doesn't implement encodings that are not specified in the Encoding
217 //! Standard. For performance, this means that decoding performance is
218 //! important as well as performance for encoding into UTF-8 or encoding the
219 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
220 //! be encoded into legacy encodings in only two places in the Web platform: in
221 //! the query part of URLs, in which case it's a matter of relatively rare
222 //! error handling, and in form submission, in which case the user action and
223 //! networking tend to hide the performance of the encoder.
224 //!
225 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
226 //! encodings enables smaller code size thanks to the encoder side using the
227 //! decode-optimized data tables without having encode-optimized data tables at
228 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
229 //! multiplication operations.
230 //!
231 //! Additionally, performance is a non-goal for the ASCII-incompatible
232 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
233 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
234 //! of implementation.
235 //!
236 //! Despite the browser focus, the hope is that non-browser applications
237 //! that wish to consume Web content or submit Web forms in a Web-compatible
238 //! way will find encoding_rs useful. While encoding_rs does not try to match
239 //! Windows behavior, many of the encodings are close enough to legacy
240 //! encodings implemented by Windows that applications that need to consume
241 //! data in legacy Windows encodings may find encoding_rs useful. The
242 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
243 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
244 //!
245 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
246 //! to the encodings defined in the Encoding Standard. The
247 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
248 //! UTF-7 decoding for email purposes.
249 //!
250 //! For single-byte DOS encodings beyond the ones supported by the Encoding
251 //! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
252 //!
253 //! # Preparing Text for the Encoders
254 //!
255 //! Normalizing text into Unicode Normalization Form C prior to encoding text
256 //! into a legacy encoding minimizes unmappable characters. Text can be
257 //! normalized to Unicode Normalization Form C using the
258 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
259 //!
260 //! The exception is windows-1258, which after normalizing to Unicode
261 //! Normalization Form C requires tone marks to be decomposed in order to
262 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
263 //! using the [`detone`](https://crates.io/crates/detone) crate.
264 //!
265 //! # Streaming & Non-Streaming; Rust & C/C++
266 //!
267 //! The API in Rust has two modes of operation: streaming and non-streaming.
268 //! The streaming API is the foundation of the implementation and should be
269 //! used when processing data that arrives piecemeal from an i/o stream. The
270 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
271 //! to C callers. The non-streaming part of the API is for Rust callers only and
272 //! is smart about borrowing instead of copying when possible. When
273 //! streamability is not needed, the non-streaming API should be preferred in
274 //! order to avoid copying data when a borrow suffices.
275 //!
276 //! There is no analogous C API exposed via FFI, mainly because C doesn't have
277 //! standard types for growable byte buffers and Unicode strings that know
278 //! their length.
279 //!
280 //! The C API (header file generated at `target/include/encoding_rs.h` when
281 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
282 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
283 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
284 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
285 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
286 //! as part of Mozilla [bug 1261841][4].
287 //!
288 //! The `Encoding` type is common to both the streaming and non-streaming
289 //! modes. In the streaming mode, decoding operations are performed with a
290 //! `Decoder` and encoding operations with an `Encoder` object obtained via
291 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
292 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
293 //! and `Encoder` objects are not used at all.
294 //!
295 //! [1]: https://github.com/hsivonen/encoding_c
296 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
297 //! [3]: https://github.com/Microsoft/GSL/
298 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
299 //!
300 //! # Memory management
301 //!
302 //! The streaming mode never performs heap allocations (even the methods
303 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
304 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
305 //! is, the streaming mode uses caller-allocated buffers exclusively.
306 //!
307 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
308 //! perform heap allocations but only to allocate the backing buffer of the
309 //! `Vec<u8>` or the `String`.
310 //!
311 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
312 //! `Drop` cleanup.
313 //!
314 //! # Buffer reading and writing behavior
315 //!
316 //! Based on experience gained with the `java.nio.charset` encoding converter
317 //! API and with the Gecko uconv encoding converter API, the buffer reading
318 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
319 //! fully drained but output buffers are not always fully filled.
320 //!
321 //! When reading from an input buffer, encoding_rs always consumes all input
322 //! up to the next error or to the end of the buffer. In particular, when
323 //! decoding, even if the input buffer ends in the middle of a byte sequence
324 //! for a character, the decoder consumes all input. This has the benefit that
325 //! the caller of the API can always fill the next buffer from the start from
326 //! whatever source the bytes come from and never has to first copy the last
327 //! bytes of the previous buffer to the start of the next buffer. However, when
328 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
329 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
330 //! boundaries falling in the middle of a surrogate pair result in both
331 //! surrogates being treated individually as unpaired surrogates.
332 //!
333 //! Additionally, decoders guarantee that they can be fed even one byte at a
334 //! time and encoders guarantee that they can be fed even one code point at a
335 //! time. This has the benefit of not placing restrictions on the size of
336 //! the chunks in which the content arrives, e.g. from the network.
337 //!
338 //! When writing into an output buffer, encoding_rs makes sure that the code
339 //! unit sequence for a character is never split across output buffer
340 //! boundaries. This may result in wasted space at the end of an output buffer,
341 //! but the advantages are that the output side of both decoders and encoders
342 //! is greatly simplified compared to designs that attempt to fill output
343 //! buffers exactly even when that entails splitting a code unit sequence and
344 //! when encoding_rs methods return to the caller, the output produced thus
345 //! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
346 //! the output needs to be considered as a whole, because the latest output
347 //! buffer taken alone might not be valid if the transition away
348 //! from the ASCII state occurred in an earlier output buffer. However, since
349 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
350 //! state as being in error despite the encoder generating a transition to the
351 //! ASCII state at the end, the claim about the partial output taken as a whole
352 //! being valid is true even for ISO-2022-JP.)
353 //!
354 //! # Error Reporting
355 //!
356 //! Based on experience gained with the `java.nio.charset` encoding converter
357 //! API and with the Gecko uconv encoding converter API, the error reporting
358 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
359 //! that leave it up to the caller to extract the erroneous bytes from the
360 //! input stream if the caller wishes to do so but encoder errors provide the
361 //! code point associated with the error without requiring the caller to
362 //! extract it from the input on its own.
363 //!
364 //! On the encoder side, an error is always triggered by the most recently
365 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
366 //! caller. Also, it's very typical for the caller to wish to do something with
367 //! this data: generate a numeric escape for the character. Additionally, the
368 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
369 //! certain cases, so requiring the caller to extract the character from the
370 //! input buffer would require the caller to handle ISO-2022-JP details.
371 //! Furthermore, requiring the caller to extract the character from the input
372 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
373 //! the job of an encoding conversion library.
374 //!
375 //! On the decoder side, errors are triggered in more complex ways. For
376 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
377 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
378 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
379 //! be the ones most recently pushed to the decoder and the error might not even
380 //! be in the current buffer.
381 //!
382 //! Some encoding conversion APIs address the problem by not acknowledging
383 //! trailing bytes of an input buffer as consumed if it's still possible for
384 //! future bytes to cause the trailing bytes to be in error. This way, error
385 //! reporting can always refer to the most recently pushed buffer. This has the
386 //! problem that the caller of the API has to copy the unconsumed trailing
387 //! bytes to the start of the next buffer before being able to fill the rest
388 //! of the next buffer. This is annoying, error-prone and inefficient.
389 //!
390 //! A possible solution would be making the decoder remember recently consumed
391 //! bytes in order to be able to include a copy of the erroneous bytes when
392 //! reporting an error. This has two problems: First, callers are rarely
393 //! interested in the erroneous bytes, so attempts to identify them are most
394 //! often just overhead anyway. Second, the rare applications that are
395 //! interested typically care about the location of the error in the input
396 //! stream.
397 //!
398 //! To keep the API convenient for common uses and the overhead low while making
399 //! it possible to develop applications, such as HTML validators, that care
400 //! about which bytes were in error, encoding_rs reports the length of the
401 //! erroneous sequence and the number of bytes consumed after the erroneous
402 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
403 //! this makes it possible for callers that care about the erroneous bytes to
404 //! locate them.
405 //!
406 //! # No Convenience API for Custom Replacements
407 //!
408 //! The Web Platform and, therefore, the Encoding Standard supports only one
409 //! error recovery mode for decoders and only one error recovery mode for
410 //! encoders. The supported error recovery mode for decoders is emitting the
411 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
412 //! encoders is emitting an HTML decimal numeric character reference for
413 //! unmappable characters.
414 //!
415 //! Since encoding_rs is Web-focused, these are the only error recovery modes
416 //! for which convenient support is provided. Moreover, on the decoder side,
417 //! there aren't really good alternatives to emitting the REPLACEMENT CHARACTER
418 //! on error (other than treating errors as fatal). In particular, simply
419 //! ignoring errors is a
420 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
421 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
422 //! callers to ignore errors.
423 //!
424 //! On the encoder side, there are plausible alternatives for HTML decimal
425 //! numeric character references. For example, when outputting CSS, CSS-style
426 //! escapes would seem to make sense. However, instead of facilitating the
427 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
428 //! position that you shouldn't generate output in encodings other than UTF-8,
429 //! except where backward compatibility with interacting with the legacy Web
430 //! requires it. The legacy Web requires it only when parsing the query strings
431 //! of URLs and when submitting forms, and those two both use HTML decimal
432 //! numeric character references.
433 //!
434 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
435 //! numeric character references easy, it does make them _possible_.
436 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
437 //! for unmappable characters, is implemented on top of
438 //! `encode_from_utf8_without_replacement()`. Applications that really, really
439 //! want other replacement schemes for unmappable characters can likewise
440 //! implement them on top of `encode_from_utf8_without_replacement()`.
441 //!
442 //! # No Extensibility by Design
443 //!
444 //! The set of encodings supported by encoding_rs is not extensible by design.
445 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
446 //! rather than `trait`s. encoding_rs takes the design position that all future
447 //! text interchange should be done using UTF-8, which can represent all of
448 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
449 //! Standard and encoding_rs that can represent all of Unicode and that has
450 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
451 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
452 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
453 //! other than being able to consume legacy content.
454 //!
455 //! Considering that UTF-8 can represent all of Unicode and is already supported
456 //! by all Web browsers, introducing a new encoding wouldn't add to the
457 //! expressiveness but would add to compatibility problems. In that sense,
458 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
459 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
460 //! the Web Platform. On the other hand, the set of legacy encodings that must
461 //! be supported for a Web browser to be able to be successful is not going to
462 //! expand. Empirically, the set of encodings specified in the Encoding Standard
463 //! is already sufficient and the set of legacy encodings won't grow
464 //! retroactively.
465 //!
466 //! Since extensibility doesn't make sense considering the Web focus of
467 //! encoding_rs and adding encodings to Web clients would be actively harmful,
468 //! it makes sense to make the set of encodings that encoding_rs supports
469 //! non-extensible and to take the (admittedly small) benefits arising from
470 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
471 //!  of time, which enables stack allocation thereof.
472 //!
473 //! This does have downsides for applications that might want to put encoding_rs
474 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
475 //! needed for Web uses. The needs of such applications should not complicate
476 //! encoding_rs itself, though. It is up to those applications to provide a
477 //! framework that delegates the operations with encodings that encoding_rs
478 //! supports to encoding_rs and operations with other encodings to something
479 //! else (as opposed to encoding_rs itself providing an extensibility
480 //! framework).
481 //!
482 //! # Panics
483 //!
484 //! Methods in encoding_rs can panic if the API is used against the requirements
485 //! stated in the documentation, if a state that's supposed to be impossible
486 //! is reached due to an internal bug or on integer overflow. When used
487 //! according to documentation with buffer sizes that stay below integer
488 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
489 //!
490 //! Panics arising from API misuse aren't documented beyond this on individual
491 //! methods.
492 //!
493 //! # At-Risk Parts of the API
494 //!
495 //! The foreseeable source of partially backward-incompatible API change is the
496 //! way the instances of `Encoding` are made available.
497 //!
498 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
499 //! initialized with `static`s of type `&'static Encoding`, the non-reference
500 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
501 //!
502 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
503 //! unique when the constant is used in different crates, the reference-typed
504 //! `static`s for the encoding instances will be changed from `static` to
505 //! `const` and the non-reference-typed `_INIT` instances will be removed.
506 //!
507 //! # Mapping Spec Concepts onto the API
508 //!
509 //! <table>
510 //! <thead>
511 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
512 //! </thead>
513 //! <tbody>
514 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
515 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
522 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
523 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
524 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
525 //! </tbody>
526 //! </table>
527 //!
528 //! # Compatibility with the rust-encoding API
529 //!
530 //! The crate
531 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
532 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
533 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
534 //!
535 //! # Mapping rust-encoding concepts to encoding_rs concepts
536 //!
537 //! The following table provides a mapping from rust-encoding constructs to
538 //! encoding_rs ones.
539 //!
540 //! <table>
541 //! <thead>
542 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
543 //! </thead>
544 //! <tbody>
545 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
546 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
547 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
548 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
549 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
550 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
551 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
552 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
553 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
554 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
555 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
556 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
557 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
558 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
559 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
560 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
561 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
562 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
566 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
567 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
568 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
570 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
571 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
572 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
573 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
574 //! </tbody>
575 //! </table>
576 //!
577 //! # Relationship with Windows Code Pages
578 //!
579 //! Despite the Web and browser focus, the encodings defined by the Encoding
580 //! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the encodings
//! that have a closely related Windows code page, the number of the closest
584 //! code page, a column indicating whether Windows maps unassigned code points
585 //! to the Unicode Private Use Area instead of U+FFFD and a remark number
586 //! indicating remarks in the list after the table.
587 //!
588 //! <table>
589 //! <thead>
590 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
591 //! </thead>
592 //! <tbody>
593 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
594 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
595 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
596 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
597 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
598 //! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
599 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
600 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
601 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
602 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
603 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
604 //! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
605 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
606 //! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
607 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
608 //! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
609 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
610 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
611 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
612 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
613 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
614 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
615 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
616 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
617 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
618 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
619 //! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
620 //! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
621 //! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
622 //! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
623 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
624 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
625 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
626 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
627 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
628 //! </tbody>
629 //! </table>
630 //!
631 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
632 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
633 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
634 //!    which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
635 //!    decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
636 //!    LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
637 //!    instead of U+2019 RIGHT SINGLE QUOTATION MARK.
//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to PUA instead
//!    of LRM and RLM.
640 //! 5. Remarks from the previous item apply.
641 //!
642 //! The differences between this crate and Windows in the case of multibyte encodings
643 //! are not yet fully documented here. The lack of remarks above should not be taken
//! as an indication of a lack of differences.
645 //!
646 //! # Notable Differences from IANA Naming
647 //!
648 //! In some cases, the Encoding Standard specifies the popular unextended encoding
649 //! name where in IANA terms one of the other labels would be more precise considering
650 //! the extensions that the Encoding Standard has unified into the encoding.
651 //!
652 //! <table>
653 //! <thead>
654 //! <tr><th>Encoding</th><th>IANA</th></tr>
655 //! </thead>
656 //! <tbody>
657 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
658 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
659 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
660 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
661 //! </tbody>
662 //! </table>
663 //!
664 //! In other cases where the Encoding Standard unifies unextended and extended
665 //! variants of an encoding, the encoding gets the name of the extended
666 //! variant.
667 //!
668 //! <table>
669 //! <thead>
670 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
671 //! </thead>
672 //! <tbody>
673 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
674 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
675 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
676 //! </tbody>
677 //! </table>
678 //!
679 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
680 //! for discussion about the UTF-16 family.
681 
682 #![no_std]
683 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
684 
685 #[cfg_attr(test, macro_use)]
686 extern crate alloc;
687 extern crate core;
688 #[macro_use]
689 extern crate cfg_if;
690 
691 #[cfg(all(
692     feature = "simd-accel",
693     any(
694         target_feature = "sse2",
695         all(target_endian = "little", target_arch = "aarch64"),
696         all(target_endian = "little", target_feature = "neon")
697     )
698 ))]
699 #[macro_use(shuffle)]
700 extern crate packed_simd;
701 
702 #[cfg(feature = "serde")]
703 extern crate serde;
704 
705 #[cfg(all(test, feature = "serde"))]
706 extern crate bincode;
707 #[cfg(all(test, feature = "serde"))]
708 #[macro_use]
709 extern crate serde_derive;
710 #[cfg(all(test, feature = "serde"))]
711 extern crate serde_json;
712 
713 #[macro_use]
714 mod macros;
715 
716 #[cfg(all(
717     feature = "simd-accel",
718     any(
719         target_feature = "sse2",
720         all(target_endian = "little", target_arch = "aarch64"),
721         all(target_endian = "little", target_feature = "neon")
722     )
723 ))]
724 mod simd_funcs;
725 
726 #[cfg(test)]
727 mod testing;
728 
729 mod big5;
730 mod euc_jp;
731 mod euc_kr;
732 mod gb18030;
733 mod iso_2022_jp;
734 mod replacement;
735 mod shift_jis;
736 mod single_byte;
737 mod utf_16;
738 mod utf_8;
739 mod x_user_defined;
740 
741 mod ascii;
742 mod data;
743 mod handles;
744 mod variant;
745 
746 pub mod mem;
747 
748 use crate::ascii::ascii_valid_up_to;
749 use crate::ascii::iso_2022_jp_ascii_valid_up_to;
750 use crate::utf_8::utf8_valid_up_to;
751 use crate::variant::*;
752 
753 use alloc::borrow::Cow;
754 use alloc::string::String;
755 use alloc::vec::Vec;
756 use core::cmp::Ordering;
757 use core::hash::Hash;
758 use core::hash::Hasher;
759 
760 #[cfg(feature = "serde")]
761 use serde::de::Visitor;
762 #[cfg(feature = "serde")]
763 use serde::{Deserialize, Deserializer, Serialize, Serializer};
764 
/// Length in bytes of the longest possible numeric character reference
/// (NCR), `&#1114111;`, i.e. U+10FFFF written in decimal.
///
/// This has to be the max length of an NCR instead of max
/// minus one, because we can't rely on getting the minus
/// one from the space reserved for the current unmappable,
/// because the ISO-2022-JP encoder can fill up that space
/// with a state transition escape.
const NCR_EXTRA: usize = 10; // &#1114111;
771 
772 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
773 // Instead, please regenerate using generate-encoding-data.py
774 
/// Length in bytes of the longest label, `cseucpkdfmtjapanese`.
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
776 
777 /// The initializer for the [Big5](static.BIG5.html) encoding.
778 ///
779 /// For use only for taking the address of this form when
780 /// Rust prohibits the use of the non-`_INIT` form directly,
781 /// such as in initializers of other `static`s. If in doubt,
782 /// use the corresponding non-`_INIT` reference-typed `static`.
783 ///
784 /// This part of the public API will go away if Rust changes
785 /// to make the referent of `pub const FOO: &'static Encoding`
786 /// unique cross-crate or if Rust starts allowing static arrays
787 /// to be initialized with `pub static FOO: &'static Encoding`
788 /// items.
789 pub static BIG5_INIT: Encoding = Encoding {
790     name: "Big5",
791     variant: VariantEncoding::Big5,
792 };
793 
794 /// The Big5 encoding.
795 ///
796 /// This is Big5 with HKSCS with mappings to more recent Unicode assignments
797 /// instead of the Private Use Area code points that have been used historically.
798 /// It is believed to be able to decode existing Web content in a way that makes
799 /// sense.
800 ///
801 /// To avoid form submissions generating data that Web servers don't understand,
802 /// the encoder doesn't use the HKSCS byte sequences that precede the unextended
803 /// Big5 in the lexical order.
804 ///
805 /// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
806 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
807 ///
808 /// This encoding is designed to be suited for decoding the Windows code page 950
809 /// and its HKSCS patched "951" variant such that the text makes sense, given
810 /// assignments that Unicode has made after those encodings used Private Use
811 /// Area characters.
812 ///
813 /// This will change from `static` to `const` if Rust changes
814 /// to make the referent of `pub const FOO: &'static Encoding`
815 /// unique cross-crate, so don't take the address of this
816 /// `static`.
817 pub static BIG5: &'static Encoding = &BIG5_INIT;
818 
819 /// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
820 ///
821 /// For use only for taking the address of this form when
822 /// Rust prohibits the use of the non-`_INIT` form directly,
823 /// such as in initializers of other `static`s. If in doubt,
824 /// use the corresponding non-`_INIT` reference-typed `static`.
825 ///
826 /// This part of the public API will go away if Rust changes
827 /// to make the referent of `pub const FOO: &'static Encoding`
828 /// unique cross-crate or if Rust starts allowing static arrays
829 /// to be initialized with `pub static FOO: &'static Encoding`
830 /// items.
831 pub static EUC_JP_INIT: Encoding = Encoding {
832     name: "EUC-JP",
833     variant: VariantEncoding::EucJp,
834 };
835 
836 /// The EUC-JP encoding.
837 ///
838 /// This is the legacy Unix encoding for Japanese.
839 ///
840 /// For compatibility with Web servers that don't expect three-byte sequences
841 /// in form submissions, the encoder doesn't generate three-byte sequences.
842 /// That is, the JIS X 0212 support is decode-only.
843 ///
844 /// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
845 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
846 ///
847 /// This encoding roughly matches the Windows code page 20932. There are error
848 /// handling differences and a handful of 2-byte sequences that decode differently.
849 /// Additionall, Windows doesn't support 3-byte sequences.
850 ///
851 /// This will change from `static` to `const` if Rust changes
852 /// to make the referent of `pub const FOO: &'static Encoding`
853 /// unique cross-crate, so don't take the address of this
854 /// `static`.
855 pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
856 
857 /// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
858 ///
859 /// For use only for taking the address of this form when
860 /// Rust prohibits the use of the non-`_INIT` form directly,
861 /// such as in initializers of other `static`s. If in doubt,
862 /// use the corresponding non-`_INIT` reference-typed `static`.
863 ///
864 /// This part of the public API will go away if Rust changes
865 /// to make the referent of `pub const FOO: &'static Encoding`
866 /// unique cross-crate or if Rust starts allowing static arrays
867 /// to be initialized with `pub static FOO: &'static Encoding`
868 /// items.
869 pub static EUC_KR_INIT: Encoding = Encoding {
870     name: "EUC-KR",
871     variant: VariantEncoding::EucKr,
872 };
873 
874 /// The EUC-KR encoding.
875 ///
876 /// This is the Korean encoding for Windows. It extends the Unix legacy encoding
877 /// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
878 /// Classic), with all the characters from the Hangul Syllables block of Unicode.
879 ///
880 /// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
881 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
882 ///
883 /// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
884 /// to U+0080 and some byte sequences that are error per the Encoding Standard to
885 /// the question mark or the Private Use Area.
886 ///
887 /// This will change from `static` to `const` if Rust changes
888 /// to make the referent of `pub const FOO: &'static Encoding`
889 /// unique cross-crate, so don't take the address of this
890 /// `static`.
891 pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
892 
893 /// The initializer for the [GBK](static.GBK.html) encoding.
894 ///
895 /// For use only for taking the address of this form when
896 /// Rust prohibits the use of the non-`_INIT` form directly,
897 /// such as in initializers of other `static`s. If in doubt,
898 /// use the corresponding non-`_INIT` reference-typed `static`.
899 ///
900 /// This part of the public API will go away if Rust changes
901 /// to make the referent of `pub const FOO: &'static Encoding`
902 /// unique cross-crate or if Rust starts allowing static arrays
903 /// to be initialized with `pub static FOO: &'static Encoding`
904 /// items.
905 pub static GBK_INIT: Encoding = Encoding {
906     name: "GBK",
907     variant: VariantEncoding::Gbk,
908 };
909 
910 /// The GBK encoding.
911 ///
912 /// The decoder for this encoding is the same as the decoder for gb18030.
913 /// The encoder side of this encoding is GBK with Windows code page 936 euro
914 /// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
915 /// Unicode block as well as a handful of ideographs from the CJK Unified
916 /// Ideographs Extension A and CJK Compatibility Ideographs blocks.
917 ///
918 /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
919 /// unified with the gb18030 encoder in the Encoding Standard out of concern
920 /// that servers that expect GBK form submissions might not be able to handle
921 /// the four-byte sequences.
922 ///
923 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
924 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
925 ///
926 /// The encoder of this encoding roughly matches the Windows code page 936.
927 /// The decoder side is a superset.
928 ///
929 /// This will change from `static` to `const` if Rust changes
930 /// to make the referent of `pub const FOO: &'static Encoding`
931 /// unique cross-crate, so don't take the address of this
932 /// `static`.
933 pub static GBK: &'static Encoding = &GBK_INIT;
934 
935 /// The initializer for the [IBM866](static.IBM866.html) encoding.
936 ///
937 /// For use only for taking the address of this form when
938 /// Rust prohibits the use of the non-`_INIT` form directly,
939 /// such as in initializers of other `static`s. If in doubt,
940 /// use the corresponding non-`_INIT` reference-typed `static`.
941 ///
942 /// This part of the public API will go away if Rust changes
943 /// to make the referent of `pub const FOO: &'static Encoding`
944 /// unique cross-crate or if Rust starts allowing static arrays
945 /// to be initialized with `pub static FOO: &'static Encoding`
946 /// items.
947 pub static IBM866_INIT: Encoding = Encoding {
948     name: "IBM866",
949     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
950 };
951 
952 /// The IBM866 encoding.
953 ///
954 /// This the most notable one of the DOS Cyrillic code pages. It has the same
955 /// box drawing characters as code page 437, so it can be used for decoding
956 /// DOS-era ASCII + box drawing data.
957 ///
958 /// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
959 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
960 ///
961 /// This encoding matches the Windows code page 866.
962 ///
963 /// This will change from `static` to `const` if Rust changes
964 /// to make the referent of `pub const FOO: &'static Encoding`
965 /// unique cross-crate, so don't take the address of this
966 /// `static`.
967 pub static IBM866: &'static Encoding = &IBM866_INIT;
968 
969 /// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
970 ///
971 /// For use only for taking the address of this form when
972 /// Rust prohibits the use of the non-`_INIT` form directly,
973 /// such as in initializers of other `static`s. If in doubt,
974 /// use the corresponding non-`_INIT` reference-typed `static`.
975 ///
976 /// This part of the public API will go away if Rust changes
977 /// to make the referent of `pub const FOO: &'static Encoding`
978 /// unique cross-crate or if Rust starts allowing static arrays
979 /// to be initialized with `pub static FOO: &'static Encoding`
980 /// items.
981 pub static ISO_2022_JP_INIT: Encoding = Encoding {
982     name: "ISO-2022-JP",
983     variant: VariantEncoding::Iso2022Jp,
984 };
985 
986 /// The ISO-2022-JP encoding.
987 ///
988 /// This the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
989 /// byte range to encode non-Basic Latin characters. It's the only encoding
990 /// supported by this crate whose encoder is stateful.
991 ///
992 /// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
993 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
994 ///
995 /// This encoding roughly matches the Windows code page 50220. Notably, Windows
996 /// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
997 /// error handling.
998 ///
999 /// This will change from `static` to `const` if Rust changes
1000 /// to make the referent of `pub const FOO: &'static Encoding`
1001 /// unique cross-crate, so don't take the address of this
1002 /// `static`.
1003 pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
1004 
1005 /// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1006 ///
1007 /// For use only for taking the address of this form when
1008 /// Rust prohibits the use of the non-`_INIT` form directly,
1009 /// such as in initializers of other `static`s. If in doubt,
1010 /// use the corresponding non-`_INIT` reference-typed `static`.
1011 ///
1012 /// This part of the public API will go away if Rust changes
1013 /// to make the referent of `pub const FOO: &'static Encoding`
1014 /// unique cross-crate or if Rust starts allowing static arrays
1015 /// to be initialized with `pub static FOO: &'static Encoding`
1016 /// items.
1017 pub static ISO_8859_10_INIT: Encoding = Encoding {
1018     name: "ISO-8859-10",
1019     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1020 };
1021 
1022 /// The ISO-8859-10 encoding.
1023 ///
1024 /// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1025 /// is also known as Latin 6.
1026 ///
1027 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1028 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1029 ///
1030 /// The Windows code page number for this encoding is 28600, but kernel32.dll
1031 /// does not support this encoding.
1032 ///
1033 /// This will change from `static` to `const` if Rust changes
1034 /// to make the referent of `pub const FOO: &'static Encoding`
1035 /// unique cross-crate, so don't take the address of this
1036 /// `static`.
1037 pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1038 
1039 /// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1040 ///
1041 /// For use only for taking the address of this form when
1042 /// Rust prohibits the use of the non-`_INIT` form directly,
1043 /// such as in initializers of other `static`s. If in doubt,
1044 /// use the corresponding non-`_INIT` reference-typed `static`.
1045 ///
1046 /// This part of the public API will go away if Rust changes
1047 /// to make the referent of `pub const FOO: &'static Encoding`
1048 /// unique cross-crate or if Rust starts allowing static arrays
1049 /// to be initialized with `pub static FOO: &'static Encoding`
1050 /// items.
1051 pub static ISO_8859_13_INIT: Encoding = Encoding {
1052     name: "ISO-8859-13",
1053     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1054 };
1055 
1056 /// The ISO-8859-13 encoding.
1057 ///
1058 /// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1059 /// is also known as Latin 7.
1060 ///
1061 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1062 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1063 ///
1064 /// This encoding matches the Windows code page 28603, except Windows decodes
1065 /// unassigned code points to the Private Use Area of Unicode.
1066 ///
1067 /// This will change from `static` to `const` if Rust changes
1068 /// to make the referent of `pub const FOO: &'static Encoding`
1069 /// unique cross-crate, so don't take the address of this
1070 /// `static`.
1071 pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1072 
1073 /// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1074 ///
1075 /// For use only for taking the address of this form when
1076 /// Rust prohibits the use of the non-`_INIT` form directly,
1077 /// such as in initializers of other `static`s. If in doubt,
1078 /// use the corresponding non-`_INIT` reference-typed `static`.
1079 ///
1080 /// This part of the public API will go away if Rust changes
1081 /// to make the referent of `pub const FOO: &'static Encoding`
1082 /// unique cross-crate or if Rust starts allowing static arrays
1083 /// to be initialized with `pub static FOO: &'static Encoding`
1084 /// items.
1085 pub static ISO_8859_14_INIT: Encoding = Encoding {
1086     name: "ISO-8859-14",
1087     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1088 };
1089 
1090 /// The ISO-8859-14 encoding.
1091 ///
1092 /// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1093 /// is also known as Latin 8.
1094 ///
1095 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1096 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1097 ///
1098 /// The Windows code page number for this encoding is 28604, but kernel32.dll
1099 /// does not support this encoding.
1100 ///
1101 /// This will change from `static` to `const` if Rust changes
1102 /// to make the referent of `pub const FOO: &'static Encoding`
1103 /// unique cross-crate, so don't take the address of this
1104 /// `static`.
1105 pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1106 
1107 /// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1108 ///
1109 /// For use only for taking the address of this form when
1110 /// Rust prohibits the use of the non-`_INIT` form directly,
1111 /// such as in initializers of other `static`s. If in doubt,
1112 /// use the corresponding non-`_INIT` reference-typed `static`.
1113 ///
1114 /// This part of the public API will go away if Rust changes
1115 /// to make the referent of `pub const FOO: &'static Encoding`
1116 /// unique cross-crate or if Rust starts allowing static arrays
1117 /// to be initialized with `pub static FOO: &'static Encoding`
1118 /// items.
1119 pub static ISO_8859_15_INIT: Encoding = Encoding {
1120     name: "ISO-8859-15",
1121     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1122 };
1123 
1124 /// The ISO-8859-15 encoding.
1125 ///
1126 /// This is the revised Western European part of the ISO/IEC 8859 encoding
1127 /// family. This encoding is also known as Latin 9.
1128 ///
1129 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1130 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1131 ///
1132 /// This encoding matches the Windows code page 28605.
1133 ///
1134 /// This will change from `static` to `const` if Rust changes
1135 /// to make the referent of `pub const FOO: &'static Encoding`
1136 /// unique cross-crate, so don't take the address of this
1137 /// `static`.
1138 pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1139 
1140 /// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1141 ///
1142 /// For use only for taking the address of this form when
1143 /// Rust prohibits the use of the non-`_INIT` form directly,
1144 /// such as in initializers of other `static`s. If in doubt,
1145 /// use the corresponding non-`_INIT` reference-typed `static`.
1146 ///
1147 /// This part of the public API will go away if Rust changes
1148 /// to make the referent of `pub const FOO: &'static Encoding`
1149 /// unique cross-crate or if Rust starts allowing static arrays
1150 /// to be initialized with `pub static FOO: &'static Encoding`
1151 /// items.
pub static ISO_8859_16_INIT: Encoding = Encoding {
    name: "ISO-8859-16",
    // Single-byte variant backed by the `iso_8859_16` table.
    // NOTE(review): `0x00DF, 95, 4` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
};
1156 
1157 /// The ISO-8859-16 encoding.
1158 ///
1159 /// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1160 /// family. This encoding is also known as Latin 10.
1161 ///
1162 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1163 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1164 ///
1165 /// The Windows code page number for this encoding is 28606, but kernel32.dll
1166 /// does not support this encoding.
1167 ///
1168 /// This will change from `static` to `const` if Rust changes
1169 /// to make the referent of `pub const FOO: &'static Encoding`
1170 /// unique cross-crate, so don't take the address of this
1171 /// `static`.
1172 pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1173 
1174 /// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1175 ///
1176 /// For use only for taking the address of this form when
1177 /// Rust prohibits the use of the non-`_INIT` form directly,
1178 /// such as in initializers of other `static`s. If in doubt,
1179 /// use the corresponding non-`_INIT` reference-typed `static`.
1180 ///
1181 /// This part of the public API will go away if Rust changes
1182 /// to make the referent of `pub const FOO: &'static Encoding`
1183 /// unique cross-crate or if Rust starts allowing static arrays
1184 /// to be initialized with `pub static FOO: &'static Encoding`
1185 /// items.
pub static ISO_8859_2_INIT: Encoding = Encoding {
    name: "ISO-8859-2",
    // Single-byte variant backed by the `iso_8859_2` table.
    // NOTE(review): `0x00DF, 95, 1` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
};
1190 
1191 /// The ISO-8859-2 encoding.
1192 ///
1193 /// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1194 ///
1195 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1196 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1197 ///
1198 /// This encoding matches the Windows code page 28592.
1199 ///
1200 /// This will change from `static` to `const` if Rust changes
1201 /// to make the referent of `pub const FOO: &'static Encoding`
1202 /// unique cross-crate, so don't take the address of this
1203 /// `static`.
1204 pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1205 
1206 /// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1207 ///
1208 /// For use only for taking the address of this form when
1209 /// Rust prohibits the use of the non-`_INIT` form directly,
1210 /// such as in initializers of other `static`s. If in doubt,
1211 /// use the corresponding non-`_INIT` reference-typed `static`.
1212 ///
1213 /// This part of the public API will go away if Rust changes
1214 /// to make the referent of `pub const FOO: &'static Encoding`
1215 /// unique cross-crate or if Rust starts allowing static arrays
1216 /// to be initialized with `pub static FOO: &'static Encoding`
1217 /// items.
pub static ISO_8859_3_INIT: Encoding = Encoding {
    name: "ISO-8859-3",
    // Single-byte variant backed by the `iso_8859_3` table.
    // NOTE(review): `0x00DF, 95, 4` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
};
1222 
1223 /// The ISO-8859-3 encoding.
1224 ///
1225 /// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1226 ///
1227 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1228 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1229 ///
1230 /// This encoding matches the Windows code page 28593.
1231 ///
1232 /// This will change from `static` to `const` if Rust changes
1233 /// to make the referent of `pub const FOO: &'static Encoding`
1234 /// unique cross-crate, so don't take the address of this
1235 /// `static`.
1236 pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1237 
1238 /// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1239 ///
1240 /// For use only for taking the address of this form when
1241 /// Rust prohibits the use of the non-`_INIT` form directly,
1242 /// such as in initializers of other `static`s. If in doubt,
1243 /// use the corresponding non-`_INIT` reference-typed `static`.
1244 ///
1245 /// This part of the public API will go away if Rust changes
1246 /// to make the referent of `pub const FOO: &'static Encoding`
1247 /// unique cross-crate or if Rust starts allowing static arrays
1248 /// to be initialized with `pub static FOO: &'static Encoding`
1249 /// items.
pub static ISO_8859_4_INIT: Encoding = Encoding {
    name: "ISO-8859-4",
    // Single-byte variant backed by the `iso_8859_4` table.
    // NOTE(review): `0x00DF, 95, 1` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
};
1254 
1255 /// The ISO-8859-4 encoding.
1256 ///
1257 /// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1258 ///
1259 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1260 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1261 ///
1262 /// This encoding matches the Windows code page 28594.
1263 ///
1264 /// This will change from `static` to `const` if Rust changes
1265 /// to make the referent of `pub const FOO: &'static Encoding`
1266 /// unique cross-crate, so don't take the address of this
1267 /// `static`.
1268 pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1269 
1270 /// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1271 ///
1272 /// For use only for taking the address of this form when
1273 /// Rust prohibits the use of the non-`_INIT` form directly,
1274 /// such as in initializers of other `static`s. If in doubt,
1275 /// use the corresponding non-`_INIT` reference-typed `static`.
1276 ///
1277 /// This part of the public API will go away if Rust changes
1278 /// to make the referent of `pub const FOO: &'static Encoding`
1279 /// unique cross-crate or if Rust starts allowing static arrays
1280 /// to be initialized with `pub static FOO: &'static Encoding`
1281 /// items.
pub static ISO_8859_5_INIT: Encoding = Encoding {
    name: "ISO-8859-5",
    // Single-byte variant backed by the `iso_8859_5` table.
    // NOTE(review): `0x040E, 46, 66` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
};
1286 
1287 /// The ISO-8859-5 encoding.
1288 ///
1289 /// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1290 ///
1291 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1292 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1293 ///
1294 /// This encoding matches the Windows code page 28595.
1295 ///
1296 /// This will change from `static` to `const` if Rust changes
1297 /// to make the referent of `pub const FOO: &'static Encoding`
1298 /// unique cross-crate, so don't take the address of this
1299 /// `static`.
1300 pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1301 
1302 /// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1303 ///
1304 /// For use only for taking the address of this form when
1305 /// Rust prohibits the use of the non-`_INIT` form directly,
1306 /// such as in initializers of other `static`s. If in doubt,
1307 /// use the corresponding non-`_INIT` reference-typed `static`.
1308 ///
1309 /// This part of the public API will go away if Rust changes
1310 /// to make the referent of `pub const FOO: &'static Encoding`
1311 /// unique cross-crate or if Rust starts allowing static arrays
1312 /// to be initialized with `pub static FOO: &'static Encoding`
1313 /// items.
pub static ISO_8859_6_INIT: Encoding = Encoding {
    name: "ISO-8859-6",
    // Single-byte variant backed by the `iso_8859_6` table.
    // NOTE(review): `0x0621, 65, 26` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
};
1318 
1319 /// The ISO-8859-6 encoding.
1320 ///
1321 /// This is the Arabic part of the ISO/IEC 8859 encoding family.
1322 ///
1323 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1324 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1325 ///
1326 /// This encoding matches the Windows code page 28596, except Windows decodes
1327 /// unassigned code points to the Private Use Area of Unicode.
1328 ///
1329 /// This will change from `static` to `const` if Rust changes
1330 /// to make the referent of `pub const FOO: &'static Encoding`
1331 /// unique cross-crate, so don't take the address of this
1332 /// `static`.
1333 pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1334 
1335 /// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1336 ///
1337 /// For use only for taking the address of this form when
1338 /// Rust prohibits the use of the non-`_INIT` form directly,
1339 /// such as in initializers of other `static`s. If in doubt,
1340 /// use the corresponding non-`_INIT` reference-typed `static`.
1341 ///
1342 /// This part of the public API will go away if Rust changes
1343 /// to make the referent of `pub const FOO: &'static Encoding`
1344 /// unique cross-crate or if Rust starts allowing static arrays
1345 /// to be initialized with `pub static FOO: &'static Encoding`
1346 /// items.
pub static ISO_8859_7_INIT: Encoding = Encoding {
    name: "ISO-8859-7",
    // Single-byte variant backed by the `iso_8859_7` table.
    // NOTE(review): `0x03A3, 83, 44` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
};
1351 
1352 /// The ISO-8859-7 encoding.
1353 ///
1354 /// This is the Greek part of the ISO/IEC 8859 encoding family.
1355 ///
1356 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1357 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1358 ///
1359 /// This encoding roughly matches the Windows code page 28597. Windows decodes
1360 /// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1361 /// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1362 /// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1363 /// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1364 /// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1365 ///
1366 /// This will change from `static` to `const` if Rust changes
1367 /// to make the referent of `pub const FOO: &'static Encoding`
1368 /// unique cross-crate, so don't take the address of this
1369 /// `static`.
1370 pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1371 
1372 /// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1373 ///
1374 /// For use only for taking the address of this form when
1375 /// Rust prohibits the use of the non-`_INIT` form directly,
1376 /// such as in initializers of other `static`s. If in doubt,
1377 /// use the corresponding non-`_INIT` reference-typed `static`.
1378 ///
1379 /// This part of the public API will go away if Rust changes
1380 /// to make the referent of `pub const FOO: &'static Encoding`
1381 /// unique cross-crate or if Rust starts allowing static arrays
1382 /// to be initialized with `pub static FOO: &'static Encoding`
1383 /// items.
pub static ISO_8859_8_INIT: Encoding = Encoding {
    name: "ISO-8859-8",
    // Single-byte variant backed by the `iso_8859_8` table.
    // NOTE(review): `0x05D0, 96, 27` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1388 
1389 /// The ISO-8859-8 encoding.
1390 ///
1391 /// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1392 ///
1393 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1394 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1395 ///
1396 /// This encoding roughly matches the Windows code page 28598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
1398 /// Area instead of LRM and RLM. Windows decodes unassigned code points to
1399 /// the private use area.
1400 ///
1401 /// This will change from `static` to `const` if Rust changes
1402 /// to make the referent of `pub const FOO: &'static Encoding`
1403 /// unique cross-crate, so don't take the address of this
1404 /// `static`.
1405 pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1406 
1407 /// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1408 ///
1409 /// For use only for taking the address of this form when
1410 /// Rust prohibits the use of the non-`_INIT` form directly,
1411 /// such as in initializers of other `static`s. If in doubt,
1412 /// use the corresponding non-`_INIT` reference-typed `static`.
1413 ///
1414 /// This part of the public API will go away if Rust changes
1415 /// to make the referent of `pub const FOO: &'static Encoding`
1416 /// unique cross-crate or if Rust starts allowing static arrays
1417 /// to be initialized with `pub static FOO: &'static Encoding`
1418 /// items.
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
    name: "ISO-8859-8-I",
    // Deliberately reuses the `iso_8859_8` table and the same numeric arguments as
    // ISO-8859-8; only the `name` differs between the two descriptors.
    // NOTE(review): `0x05D0, 96, 27` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1423 
1424 /// The ISO-8859-8-I encoding.
1425 ///
1426 /// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1427 ///
1428 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1429 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1430 ///
1431 /// This encoding roughly matches the Windows code page 38598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
1433 /// Area instead of LRM and RLM. Windows decodes unassigned code points to
1434 /// the private use area.
1435 ///
1436 /// This will change from `static` to `const` if Rust changes
1437 /// to make the referent of `pub const FOO: &'static Encoding`
1438 /// unique cross-crate, so don't take the address of this
1439 /// `static`.
1440 pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1441 
1442 /// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1443 ///
1444 /// For use only for taking the address of this form when
1445 /// Rust prohibits the use of the non-`_INIT` form directly,
1446 /// such as in initializers of other `static`s. If in doubt,
1447 /// use the corresponding non-`_INIT` reference-typed `static`.
1448 ///
1449 /// This part of the public API will go away if Rust changes
1450 /// to make the referent of `pub const FOO: &'static Encoding`
1451 /// unique cross-crate or if Rust starts allowing static arrays
1452 /// to be initialized with `pub static FOO: &'static Encoding`
1453 /// items.
pub static KOI8_R_INIT: Encoding = Encoding {
    name: "KOI8-R",
    // Single-byte variant backed by the `koi8_r` table.
    // NOTE(review): `0x044E, 64, 1` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
};
1458 
1459 /// The KOI8-R encoding.
1460 ///
1461 /// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1462 ///
1463 /// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1464 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1465 ///
1466 /// This encoding matches the Windows code page 20866.
1467 ///
1468 /// This will change from `static` to `const` if Rust changes
1469 /// to make the referent of `pub const FOO: &'static Encoding`
1470 /// unique cross-crate, so don't take the address of this
1471 /// `static`.
1472 pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1473 
1474 /// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1475 ///
1476 /// For use only for taking the address of this form when
1477 /// Rust prohibits the use of the non-`_INIT` form directly,
1478 /// such as in initializers of other `static`s. If in doubt,
1479 /// use the corresponding non-`_INIT` reference-typed `static`.
1480 ///
1481 /// This part of the public API will go away if Rust changes
1482 /// to make the referent of `pub const FOO: &'static Encoding`
1483 /// unique cross-crate or if Rust starts allowing static arrays
1484 /// to be initialized with `pub static FOO: &'static Encoding`
1485 /// items.
pub static KOI8_U_INIT: Encoding = Encoding {
    name: "KOI8-U",
    // Single-byte variant backed by the `koi8_u` table.
    // NOTE(review): `0x044E, 64, 1` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
};
1490 
1491 /// The KOI8-U encoding.
1492 ///
1493 /// This is an encoding for Ukrainian adapted from KOI8-R.
1494 ///
1495 /// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1496 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1497 ///
1498 /// This encoding matches the Windows code page 21866.
1499 ///
1500 /// This will change from `static` to `const` if Rust changes
1501 /// to make the referent of `pub const FOO: &'static Encoding`
1502 /// unique cross-crate, so don't take the address of this
1503 /// `static`.
1504 pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1505 
1506 /// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1507 ///
1508 /// For use only for taking the address of this form when
1509 /// Rust prohibits the use of the non-`_INIT` form directly,
1510 /// such as in initializers of other `static`s. If in doubt,
1511 /// use the corresponding non-`_INIT` reference-typed `static`.
1512 ///
1513 /// This part of the public API will go away if Rust changes
1514 /// to make the referent of `pub const FOO: &'static Encoding`
1515 /// unique cross-crate or if Rust starts allowing static arrays
1516 /// to be initialized with `pub static FOO: &'static Encoding`
1517 /// items.
pub static SHIFT_JIS_INIT: Encoding = Encoding {
    name: "Shift_JIS",
    // Data-less variant: no table reference or numeric parameters are stored here.
    variant: VariantEncoding::ShiftJis,
};
1522 
1523 /// The Shift_JIS encoding.
1524 ///
1525 /// This is the Japanese encoding for Windows.
1526 ///
1527 /// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1528 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1529 ///
1530 /// This encoding matches the Windows code page 932, except Windows decodes some byte
/// sequences that are errors per the Encoding Standard to the question mark or the
1532 /// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1533 ///
1534 /// This will change from `static` to `const` if Rust changes
1535 /// to make the referent of `pub const FOO: &'static Encoding`
1536 /// unique cross-crate, so don't take the address of this
1537 /// `static`.
1538 pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1539 
1540 /// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1541 ///
1542 /// For use only for taking the address of this form when
1543 /// Rust prohibits the use of the non-`_INIT` form directly,
1544 /// such as in initializers of other `static`s. If in doubt,
1545 /// use the corresponding non-`_INIT` reference-typed `static`.
1546 ///
1547 /// This part of the public API will go away if Rust changes
1548 /// to make the referent of `pub const FOO: &'static Encoding`
1549 /// unique cross-crate or if Rust starts allowing static arrays
1550 /// to be initialized with `pub static FOO: &'static Encoding`
1551 /// items.
pub static UTF_16BE_INIT: Encoding = Encoding {
    name: "UTF-16BE",
    // Data-less variant: no table reference or numeric parameters are stored here.
    variant: VariantEncoding::Utf16Be,
};
1556 
1557 /// The UTF-16BE encoding.
1558 ///
1559 /// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
1561 /// mark the big endian byte order is assumed.
1562 ///
1563 /// There is no corresponding encoder in this crate or in the Encoding
1564 /// Standard. The output encoding of this encoding is UTF-8.
1565 ///
1566 /// This encoding matches the Windows code page 1201.
1567 ///
1568 /// This will change from `static` to `const` if Rust changes
1569 /// to make the referent of `pub const FOO: &'static Encoding`
1570 /// unique cross-crate, so don't take the address of this
1571 /// `static`.
1572 pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1573 
1574 /// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1575 ///
1576 /// For use only for taking the address of this form when
1577 /// Rust prohibits the use of the non-`_INIT` form directly,
1578 /// such as in initializers of other `static`s. If in doubt,
1579 /// use the corresponding non-`_INIT` reference-typed `static`.
1580 ///
1581 /// This part of the public API will go away if Rust changes
1582 /// to make the referent of `pub const FOO: &'static Encoding`
1583 /// unique cross-crate or if Rust starts allowing static arrays
1584 /// to be initialized with `pub static FOO: &'static Encoding`
1585 /// items.
pub static UTF_16LE_INIT: Encoding = Encoding {
    name: "UTF-16LE",
    // Data-less variant: no table reference or numeric parameters are stored here.
    variant: VariantEncoding::Utf16Le,
};
1590 
1591 /// The UTF-16LE encoding.
1592 ///
1593 /// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
1595 /// mark the little endian byte order is assumed.
1596 ///
1597 /// There is no corresponding encoder in this crate or in the Encoding
1598 /// Standard. The output encoding of this encoding is UTF-8.
1599 ///
1600 /// This encoding matches the Windows code page 1200.
1601 ///
1602 /// This will change from `static` to `const` if Rust changes
1603 /// to make the referent of `pub const FOO: &'static Encoding`
1604 /// unique cross-crate, so don't take the address of this
1605 /// `static`.
1606 pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1607 
1608 /// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1609 ///
1610 /// For use only for taking the address of this form when
1611 /// Rust prohibits the use of the non-`_INIT` form directly,
1612 /// such as in initializers of other `static`s. If in doubt,
1613 /// use the corresponding non-`_INIT` reference-typed `static`.
1614 ///
1615 /// This part of the public API will go away if Rust changes
1616 /// to make the referent of `pub const FOO: &'static Encoding`
1617 /// unique cross-crate or if Rust starts allowing static arrays
1618 /// to be initialized with `pub static FOO: &'static Encoding`
1619 /// items.
pub static UTF_8_INIT: Encoding = Encoding {
    name: "UTF-8",
    // Data-less variant: no table reference or numeric parameters are stored here.
    variant: VariantEncoding::Utf8,
};
1624 
1625 /// The UTF-8 encoding.
1626 ///
/// This is the encoding that should be used for all new development. It can
/// represent all of Unicode.
1629 ///
1630 /// This encoding matches the Windows code page 65001, except Windows differs
1631 /// in the number of errors generated for some erroneous byte sequences.
1632 ///
1633 /// This will change from `static` to `const` if Rust changes
1634 /// to make the referent of `pub const FOO: &'static Encoding`
1635 /// unique cross-crate, so don't take the address of this
1636 /// `static`.
1637 pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1638 
1639 /// The initializer for the [gb18030](static.GB18030.html) encoding.
1640 ///
1641 /// For use only for taking the address of this form when
1642 /// Rust prohibits the use of the non-`_INIT` form directly,
1643 /// such as in initializers of other `static`s. If in doubt,
1644 /// use the corresponding non-`_INIT` reference-typed `static`.
1645 ///
1646 /// This part of the public API will go away if Rust changes
1647 /// to make the referent of `pub const FOO: &'static Encoding`
1648 /// unique cross-crate or if Rust starts allowing static arrays
1649 /// to be initialized with `pub static FOO: &'static Encoding`
1650 /// items.
pub static GB18030_INIT: Encoding = Encoding {
    // Lower-case on purpose: this string is the encoding's name, not the Rust identifier.
    name: "gb18030",
    // Data-less variant: no table reference or numeric parameters are stored here.
    variant: VariantEncoding::Gb18030,
};
1655 
1656 /// The gb18030 encoding.
1657 ///
1658 /// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
1659 /// maps to U+3000 for compatibility with existing Web content. As a result,
1660 /// this encoding can represent all of Unicode except for the private-use
1661 /// character U+E5E5.
1662 ///
1663 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1664 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1665 ///
1666 /// This encoding matches the Windows code page 54936.
1667 ///
1668 /// This will change from `static` to `const` if Rust changes
1669 /// to make the referent of `pub const FOO: &'static Encoding`
1670 /// unique cross-crate, so don't take the address of this
1671 /// `static`.
1672 pub static GB18030: &'static Encoding = &GB18030_INIT;
1673 
1674 /// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1675 ///
1676 /// For use only for taking the address of this form when
1677 /// Rust prohibits the use of the non-`_INIT` form directly,
1678 /// such as in initializers of other `static`s. If in doubt,
1679 /// use the corresponding non-`_INIT` reference-typed `static`.
1680 ///
1681 /// This part of the public API will go away if Rust changes
1682 /// to make the referent of `pub const FOO: &'static Encoding`
1683 /// unique cross-crate or if Rust starts allowing static arrays
1684 /// to be initialized with `pub static FOO: &'static Encoding`
1685 /// items.
pub static MACINTOSH_INIT: Encoding = Encoding {
    name: "macintosh",
    // Single-byte variant backed by the `macintosh` table.
    // NOTE(review): `0x00CD, 106, 3` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
};
1690 
1691 /// The macintosh encoding.
1692 ///
1693 /// This is the MacRoman encoding from Mac OS Classic.
1694 ///
1695 /// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1696 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1697 ///
1698 /// This encoding matches the Windows code page 10000, except Windows decodes
1699 /// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1700 ///
1701 /// This will change from `static` to `const` if Rust changes
1702 /// to make the referent of `pub const FOO: &'static Encoding`
1703 /// unique cross-crate, so don't take the address of this
1704 /// `static`.
1705 pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1706 
1707 /// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1708 ///
1709 /// For use only for taking the address of this form when
1710 /// Rust prohibits the use of the non-`_INIT` form directly,
1711 /// such as in initializers of other `static`s. If in doubt,
1712 /// use the corresponding non-`_INIT` reference-typed `static`.
1713 ///
1714 /// This part of the public API will go away if Rust changes
1715 /// to make the referent of `pub const FOO: &'static Encoding`
1716 /// unique cross-crate or if Rust starts allowing static arrays
1717 /// to be initialized with `pub static FOO: &'static Encoding`
1718 /// items.
pub static REPLACEMENT_INIT: Encoding = Encoding {
    name: "replacement",
    // Data-less variant: no table reference or numeric parameters are stored here.
    variant: VariantEncoding::Replacement,
};
1723 
1724 /// The replacement encoding.
1725 ///
1726 /// This decode-only encoding decodes all non-zero-length streams to a single
1727 /// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1728 /// ASCII-compatible fallback encoding (typically windows-1252) for some
1729 /// encodings that are no longer supported by the Web Platform and that
1730 /// would be dangerous to treat as ASCII-compatible.
1731 ///
1732 /// There is no corresponding encoder. The output encoding of this encoding
1733 /// is UTF-8.
1734 ///
1735 /// This encoding does not have a Windows code page number.
1736 ///
1737 /// This will change from `static` to `const` if Rust changes
1738 /// to make the referent of `pub const FOO: &'static Encoding`
1739 /// unique cross-crate, so don't take the address of this
1740 /// `static`.
1741 pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1742 
1743 /// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1744 ///
1745 /// For use only for taking the address of this form when
1746 /// Rust prohibits the use of the non-`_INIT` form directly,
1747 /// such as in initializers of other `static`s. If in doubt,
1748 /// use the corresponding non-`_INIT` reference-typed `static`.
1749 ///
1750 /// This part of the public API will go away if Rust changes
1751 /// to make the referent of `pub const FOO: &'static Encoding`
1752 /// unique cross-crate or if Rust starts allowing static arrays
1753 /// to be initialized with `pub static FOO: &'static Encoding`
1754 /// items.
pub static WINDOWS_1250_INIT: Encoding = Encoding {
    name: "windows-1250",
    // Single-byte variant backed by the `windows_1250` table.
    // NOTE(review): `0x00DC, 92, 2` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
};
1759 
1760 /// The windows-1250 encoding.
1761 ///
1762 /// This is the Central European encoding for Windows.
1763 ///
1764 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1765 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1766 ///
1767 /// This encoding matches the Windows code page 1250.
1768 ///
1769 /// This will change from `static` to `const` if Rust changes
1770 /// to make the referent of `pub const FOO: &'static Encoding`
1771 /// unique cross-crate, so don't take the address of this
1772 /// `static`.
1773 pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1774 
1775 /// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1776 ///
1777 /// For use only for taking the address of this form when
1778 /// Rust prohibits the use of the non-`_INIT` form directly,
1779 /// such as in initializers of other `static`s. If in doubt,
1780 /// use the corresponding non-`_INIT` reference-typed `static`.
1781 ///
1782 /// This part of the public API will go away if Rust changes
1783 /// to make the referent of `pub const FOO: &'static Encoding`
1784 /// unique cross-crate or if Rust starts allowing static arrays
1785 /// to be initialized with `pub static FOO: &'static Encoding`
1786 /// items.
pub static WINDOWS_1251_INIT: Encoding = Encoding {
    name: "windows-1251",
    // Single-byte variant backed by the `windows_1251` table.
    // NOTE(review): `0x0410, 64, 64` look like a contiguous-run hint (first code point,
    // table index, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
};
1791 
1792 /// The windows-1251 encoding.
1793 ///
1794 /// This is the Cyrillic encoding for Windows.
1795 ///
1796 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1797 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1798 ///
1799 /// This encoding matches the Windows code page 1251.
1800 ///
1801 /// This will change from `static` to `const` if Rust changes
1802 /// to make the referent of `pub const FOO: &'static Encoding`
1803 /// unique cross-crate, so don't take the address of this
1804 /// `static`.
1805 pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1806 
/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1252_INIT: Encoding = Encoding {
    name: "windows-1252",
    // Single-byte variant reading the `windows_1252` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
};
1823 
/// The windows-1252 encoding.
///
/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
/// which is known as Latin 1.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
///
/// This encoding matches the Windows code page 1252.
///
/// This `static` is a reference to
/// [`WINDOWS_1252_INIT`](static.WINDOWS_1252_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1839 
/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1253_INIT: Encoding = Encoding {
    name: "windows-1253",
    // Single-byte variant reading the `windows_1253` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
};
1856 
/// The windows-1253 encoding.
///
/// This is the Greek encoding for Windows. It is mostly an extension of
/// ISO-8859-7, but U+0386 is mapped to a different byte.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
///
/// This encoding matches the Windows code page 1253, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1873 
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    name: "windows-1254",
    // Single-byte variant reading the `windows_1254` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1890 
/// The windows-1254 encoding.
///
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.
///
/// This `static` is a reference to
/// [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1906 
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    name: "windows-1255",
    // Single-byte variant reading the `windows_1255` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1923 
/// The windows-1255 encoding.
///
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1940 
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    name: "windows-1256",
    // Single-byte variant reading the `windows_1256` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1957 
/// The windows-1256 encoding.
///
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.
///
/// This `static` is a reference to
/// [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1972 
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    name: "windows-1257",
    // Single-byte variant reading the `windows_1257` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
1989 
/// The windows-1257 encoding.
///
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
2005 
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    name: "windows-1258",
    // Single-byte variant reading the `windows_1258` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2022 
/// The windows-1258 encoding.
///
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in the Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.
///
/// This `static` is a reference to
/// [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2042 
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    name: "windows-874",
    // Single-byte variant reading the `windows_874` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2059 
/// The windows-874 encoding.
///
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2075 
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    name: "x-mac-cyrillic",
    // Single-byte variant reading the `x_mac_cyrillic` table of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in that table (first code point, table offset, length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2092 
/// The x-mac-cyrillic encoding.
///
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.
///
/// This `static` is a reference to
/// [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2107 
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    name: "x-user-defined",
    // Unit variant: unlike the single-byte encodings, no table or
    // parameters are passed for this encoding.
    variant: VariantEncoding::UserDefined,
};
2124 
/// The x-user-defined encoding.
///
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
///
/// This `static` is a reference to
/// [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2139 
/// Encoding labels, all in ASCII lowercase.
///
/// The entries are ordered first by length and then, among labels of equal
/// length, by comparing their bytes from the last byte towards the first
/// (for example `"866"`, `"mac"`, `"koi"`, `"gbk"`). Entry _i_ pairs with
/// entry _i_ of `ENCODINGS_IN_LABEL_SORT`, so the two arrays must be kept
/// in sync.
///
/// NOTE(review): presumably the label lookup code (not visible from here)
/// relies on this exact ordering when searching; this is generated data, so
/// do not reorder or extend it by hand.
static LABELS_SORTED: [&'static str; 219] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2361 
/// The encoding for each entry of `LABELS_SORTED`, in the same order.
///
/// Entry _i_ is the `_INIT` form of the encoding named by
/// `LABELS_SORTED[i]` (for example, index 0 pairs `"l1"` with
/// `WINDOWS_1252_INIT`). Both arrays have 219 entries and must be changed
/// together. Several labels map to the same encoding, so the same `_INIT`
/// item appears many times.
///
/// The `_INIT` forms are used here because the reference-typed `static`s
/// cannot be used to initialize the items of a `static` array.
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2583 
2584 // END GENERATED CODE
2585 
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    /// The name of the encoding, e.g. `"windows-1252"`.
    name: &'static str,
    /// Which `VariantEncoding` this encoding is, together with any data
    /// that variant carries.
    variant: VariantEncoding,
}
2652 
2653 impl Encoding {
2654     /// Implements the
2655     /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2656     /// algorithm.
2657     ///
2658     /// If, after ASCII-lowercasing and removing leading and trailing
2659     /// whitespace, the argument matches a label defined in the Encoding
2660     /// Standard, `Some(&'static Encoding)` representing the corresponding
2661     /// encoding is returned. If there is no match, `None` is returned.
2662     ///
2663     /// This is the right method to use if the action upon the method returning
2664     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2665     /// When the action upon the method returning `None` is not to proceed with
2666     /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2667     /// appropriate.
2668     ///
2669     /// The argument is of type `&[u8]` instead of `&str` to save callers
2670     /// that are extracting the label from a non-UTF-8 protocol the trouble
2671     /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2672     /// on it.)
2673     ///
2674     /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2675     pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2676         let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2677         let mut trimmed_pos = 0usize;
2678         let mut iter = label.into_iter();
2679         // before
2680         loop {
2681             match iter.next() {
2682                 None => {
2683                     return None;
2684                 }
2685                 Some(byte) => {
2686                     // The characters used in labels are:
2687                     // a-z (except q, but excluding it below seems excessive)
2688                     // 0-9
2689                     // . _ - :
2690                     match *byte {
2691                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2692                             continue;
2693                         }
2694                         b'A'..=b'Z' => {
2695                             trimmed[trimmed_pos] = *byte + 0x20u8;
2696                             trimmed_pos = 1usize;
2697                             break;
2698                         }
2699                         b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2700                             trimmed[trimmed_pos] = *byte;
2701                             trimmed_pos = 1usize;
2702                             break;
2703                         }
2704                         _ => {
2705                             return None;
2706                         }
2707                     }
2708                 }
2709             }
2710         }
2711         // inside
2712         loop {
2713             match iter.next() {
2714                 None => {
2715                     break;
2716                 }
2717                 Some(byte) => {
2718                     match *byte {
2719                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2720                             break;
2721                         }
2722                         b'A'..=b'Z' => {
2723                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2724                                 // There's no encoding with a label this long
2725                                 return None;
2726                             }
2727                             trimmed[trimmed_pos] = *byte + 0x20u8;
2728                             trimmed_pos += 1usize;
2729                             continue;
2730                         }
2731                         b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2732                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2733                                 // There's no encoding with a label this long
2734                                 return None;
2735                             }
2736                             trimmed[trimmed_pos] = *byte;
2737                             trimmed_pos += 1usize;
2738                             continue;
2739                         }
2740                         _ => {
2741                             return None;
2742                         }
2743                     }
2744                 }
2745             }
2746         }
2747         // after
2748         loop {
2749             match iter.next() {
2750                 None => {
2751                     break;
2752                 }
2753                 Some(byte) => {
2754                     match *byte {
2755                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2756                             continue;
2757                         }
2758                         _ => {
2759                             // There's no label with space in the middle
2760                             return None;
2761                         }
2762                     }
2763                 }
2764             }
2765         }
2766         let candidate = &trimmed[..trimmed_pos];
2767         match LABELS_SORTED.binary_search_by(|probe| {
2768             let bytes = probe.as_bytes();
2769             let c = bytes.len().cmp(&candidate.len());
2770             if c != Ordering::Equal {
2771                 return c;
2772             }
2773             let probe_iter = bytes.iter().rev();
2774             let candidate_iter = candidate.iter().rev();
2775             probe_iter.cmp(candidate_iter)
2776         }) {
2777             Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2778             Err(_) => None,
2779         }
2780     }
2781 
2782     /// This method behaves the same as `for_label()`, except when `for_label()`
2783     /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2784     ///
2785     /// This method is useful in scenarios where a fatal error is required
2786     /// upon invalid label, because in those cases the caller typically wishes
2787     /// to treat the labels that map to the replacement encoding as fatal
2788     /// errors, too.
2789     ///
2790     /// It is not OK to use this method when the action upon the method returning
2791     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2792     /// case, the `for_label()` method should be used instead in order to avoid
2793     /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2794     ///
2795     /// Available via the C wrapper.
2796     #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2797     pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2798         match Encoding::for_label(label) {
2799             None => None,
2800             Some(encoding) => {
2801                 if encoding == REPLACEMENT {
2802                     None
2803                 } else {
2804                     Some(encoding)
2805                 }
2806             }
2807         }
2808     }
2809 
2810     /// Performs non-incremental BOM sniffing.
2811     ///
2812     /// The argument must either be a buffer representing the entire input
2813     /// stream (non-streaming case) or a buffer representing at least the first
2814     /// three bytes of the input stream (streaming case).
2815     ///
2816     /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2817     /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2818     /// or UTF-16BE BOM or `None` otherwise.
2819     ///
2820     /// Available via the C wrapper.
2821     #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2822     pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2823         if buffer.starts_with(b"\xEF\xBB\xBF") {
2824             Some((UTF_8, 3))
2825         } else if buffer.starts_with(b"\xFF\xFE") {
2826             Some((UTF_16LE, 2))
2827         } else if buffer.starts_with(b"\xFE\xFF") {
2828             Some((UTF_16BE, 2))
2829         } else {
2830             None
2831         }
2832     }
2833 
    /// Returns the name of this encoding.
    ///
    /// This name is appropriate to return as-is from the DOM
    /// `document.characterSet` property.
    ///
    /// The returned `&'static str` is the `name` field stored in this
    /// `Encoding`; nothing is allocated or copied.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn name(&'static self) -> &'static str {
        self.name
    }
2844 
    /// Checks whether the _output encoding_ of this encoding can encode every
    /// `char`. (Only true if the output encoding is UTF-8.)
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn can_encode_everything(&'static self) -> bool {
        // The comparison is made against the *output* encoding, so encodings
        // whose `output_encoding()` is UTF-8 (UTF-16LE, UTF-16BE and
        // replacement) also answer `true`, not just UTF-8 itself.
        self.output_encoding() == UTF_8
    }
2853 
2854     /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2855     /// U+0000...U+007F and vice versa.
2856     ///
2857     /// Available via the C wrapper.
2858     #[inline]
is_ascii_compatible(&'static self) -> bool2859     pub fn is_ascii_compatible(&'static self) -> bool {
2860         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2861     }
2862 
    /// Checks whether this encoding maps one byte to one Basic Multilingual
    /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
    /// vice versa (for mappable characters).
    ///
    /// `true` iff this encoding is on the list of [Legacy single-byte
    /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
    /// in the spec or x-user-defined.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn is_single_byte(&'static self) -> bool {
        // The answer is a property of the encoding's variant, so simply
        // forward the question to it.
        self.variant.is_single_byte()
    }
2876 
2877     /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2878     /// U+0000...U+007F and vice versa.
2879     #[inline]
is_potentially_borrowable(&'static self) -> bool2880     fn is_potentially_borrowable(&'static self) -> bool {
2881         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2882     }
2883 
2884     /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2885     /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2886     ///
2887     /// Available via the C wrapper.
2888     #[inline]
output_encoding(&'static self) -> &'static Encoding2889     pub fn output_encoding(&'static self) -> &'static Encoding {
2890         if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2891             UTF_8
2892         } else {
2893             self
2894         }
2895     }
2896 
2897     /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2898     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2899     /// entire input is available as a single buffer (i.e. the end of the
2900     /// buffer marks the end of the stream).
2901     ///
2902     /// This method implements the (non-streaming version of) the
2903     /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2904     ///
2905     /// The second item in the returned tuple is the encoding that was actually
2906     /// used (which may differ from this encoding thanks to BOM sniffing).
2907     ///
2908     /// The third item in the returned tuple indicates whether there were
2909     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2910     ///
2911     /// _Note:_ It is wrong to use this when the input buffer represents only
2912     /// a segment of the input instead of the whole input. Use `new_decoder()`
2913     /// when decoding segmented input.
2914     ///
2915     /// This method performs a one or two heap allocations for the backing
2916     /// buffer of the `String` when unable to borrow. (One allocation if not
2917     /// errors and potentially another one in the presence of errors.) The
2918     /// first allocation assumes jemalloc and may not be optimal with
2919     /// allocators that do not use power-of-two buckets. A borrow is performed
2920     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2921     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2922     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2923     /// transitions.
2924     ///
2925     /// # Panics
2926     ///
2927     /// If the size calculation for a heap-allocated backing buffer overflows
2928     /// `usize`.
2929     ///
2930     /// Available to Rust only.
2931     #[inline]
decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool)2932     pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2933         let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2934             Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2935             None => (self, bytes),
2936         };
2937         let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2938         (cow, encoding, had_errors)
2939     }
2940 
2941     /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2942     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2943     /// entire input is available as a single buffer (i.e. the end of the
2944     /// buffer marks the end of the stream).
2945     ///
2946     /// When invoked on `UTF_8`, this method implements the (non-streaming
2947     /// version of) the
2948     /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2949     /// concept.
2950     ///
2951     /// The second item in the returned pair indicates whether there were
2952     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2953     ///
2954     /// _Note:_ It is wrong to use this when the input buffer represents only
2955     /// a segment of the input instead of the whole input. Use
2956     /// `new_decoder_with_bom_removal()` when decoding segmented input.
2957     ///
2958     /// This method performs a one or two heap allocations for the backing
2959     /// buffer of the `String` when unable to borrow. (One allocation if not
2960     /// errors and potentially another one in the presence of errors.) The
2961     /// first allocation assumes jemalloc and may not be optimal with
2962     /// allocators that do not use power-of-two buckets. A borrow is performed
2963     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2964     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2965     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2966     /// transitions.
2967     ///
2968     /// # Panics
2969     ///
2970     /// If the size calculation for a heap-allocated backing buffer overflows
2971     /// `usize`.
2972     ///
2973     /// Available to Rust only.
2974     #[inline]
decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)2975     pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2976         let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2977             &bytes[3..]
2978         } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2979             || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2980         {
2981             &bytes[2..]
2982         } else {
2983             bytes
2984         };
2985         self.decode_without_bom_handling(without_bom)
2986     }
2987 
    /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
    /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
    /// the entire input is available as a single buffer (i.e. the end of the
    /// buffer marks the end of the stream).
    ///
    /// When invoked on `UTF_8`, this method implements the (non-streaming
    /// version of) the
    /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
    /// spec concept.
    ///
    /// The second item in the returned pair indicates whether there were
    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
    ///
    /// _Note:_ It is wrong to use this when the input buffer represents only
    /// a segment of the input instead of the whole input. Use
    /// `new_decoder_without_bom_handling()` when decoding segmented input.
    ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if there
    /// are no errors and potentially another one in the presence of errors.)
    /// The first allocation assumes jemalloc and may not be optimal with
    /// allocators that do not use power-of-two buckets. A borrow is performed
    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
    /// ISO-2022-JP and the input is entirely in the ASCII state without state
    /// transitions.
    ///
    /// # Panics
    ///
    /// If the size calculation for a heap-allocated backing buffer overflows
    /// `usize`.
    ///
    /// Available to Rust only.
    pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
        // Set up the decoder, the output `String` and the number of input
        // bytes already accounted for. For potentially borrowable encodings
        // the leading run of input that can be used verbatim as output is
        // measured first.
        let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
            let valid_up_to = if self == UTF_8 {
                utf8_valid_up_to(bytes)
            } else if self == ISO_2022_JP {
                iso_2022_jp_ascii_valid_up_to(bytes)
            } else {
                ascii_valid_up_to(bytes)
            };
            if valid_up_to == bytes.len() {
                // The whole input passed validation: borrow it instead of
                // allocating.
                // SAFETY: the validator selected above accepted every byte,
                // i.e. the input is valid UTF-8 or ASCII-only (ASCII being a
                // subset of UTF-8). For ISO-2022-JP this presumes that the
                // ASCII-state check only accepts ASCII bytes — defined
                // outside this view.
                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
                return (Cow::Borrowed(str), false);
            }
            let decoder = self.new_decoder_without_bom_handling();

            // Capacity is the smaller of (a) the no-replacement estimate
            // passed through `checked_next_power_of_two` and (b) the
            // estimate that accounts for replacements; both include the
            // already-validated prefix.
            let rounded_without_replacement = checked_next_power_of_two(checked_add(
                valid_up_to,
                decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
            ));
            let with_replacement = checked_add(
                valid_up_to,
                decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
            );
            // The `unwrap()` is the documented panic on size overflow.
            let mut string = String::with_capacity(
                checked_min(rounded_without_replacement, with_replacement).unwrap(),
            );
            // Copy the validated prefix straight into the output.
            // SAFETY: relies on the capacity requested above being at least
            // `valid_up_to` (both estimates add to `valid_up_to`; the
            // `checked_*` helpers are defined outside this view) and on the
            // first `valid_up_to` bytes having passed validation above.
            // NOTE(review): `set_len` runs before the bytes are copied in;
            // nothing reads the buffer in between.
            unsafe {
                let vec = string.as_mut_vec();
                vec.set_len(valid_up_to);
                core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
            }
            (decoder, string, valid_up_to)
        } else {
            // No prefix can be reused for these encodings, so decoding starts
            // at offset 0 with an empty `String`.
            let decoder = self.new_decoder_without_bom_handling();
            let rounded_without_replacement = checked_next_power_of_two(
                decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
            );
            let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
            let string = String::with_capacity(
                checked_min(rounded_without_replacement, with_replacement).unwrap(),
            );
            (decoder, string, 0)
        };

        // Decode the remainder, growing the buffer if the first allocation
        // turns out to be too small.
        let mut total_had_errors = false;
        loop {
            // `true` marks this as the last chunk: the buffer holds the
            // complete input.
            let (result, read, had_errors) =
                decoder.decode_to_string(&bytes[total_read..], &mut string, true);
            total_read += read;
            total_had_errors |= had_errors;
            match result {
                CoderResult::InputEmpty => {
                    debug_assert_eq!(total_read, bytes.len());
                    return (Cow::Owned(string), total_had_errors);
                }
                CoderResult::OutputFull => {
                    // Allocate for the worst case. That is, we should come
                    // here at most once per invocation of this method.
                    let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
                    string.reserve(needed.unwrap());
                }
            }
        }
    }
3085 
    /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
    /// _with malformed sequences treated as fatal_ when the entire input is
    /// available as a single buffer (i.e. the end of the buffer marks the end
    /// of the stream).
    ///
    /// When invoked on `UTF_8`, this method implements the (non-streaming
    /// version of) the
    /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
    /// spec concept.
    ///
    /// Returns `None` if a malformed sequence was encountered and the result
    /// of the decode as `Some(Cow<'a, str>)` otherwise.
    ///
    /// _Note:_ It is wrong to use this when the input buffer represents only
    /// a segment of the input instead of the whole input. Use
    /// `new_decoder_without_bom_handling()` when decoding segmented input.
    ///
    /// This method performs a single heap allocation for the backing
    /// buffer of the `String` when unable to borrow. A borrow is performed if
    /// decoding UTF-8 and the input is valid UTF-8, if decoding an
    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
    /// ISO-2022-JP and the input is entirely in the ASCII state without state
    /// transitions.
    ///
    /// # Panics
    ///
    /// If the size calculation for a heap-allocated backing buffer overflows
    /// `usize`.
    ///
    /// Available to Rust only.
    pub fn decode_without_bom_handling_and_without_replacement<'a>(
        &'static self,
        bytes: &'a [u8],
    ) -> Option<Cow<'a, str>> {
        // For UTF-8, validation alone decides the outcome: fully valid input
        // is borrowed and anything else is malformed, so no decoder or
        // allocation is needed.
        if self == UTF_8 {
            let valid_up_to = utf8_valid_up_to(bytes);
            if valid_up_to == bytes.len() {
                // SAFETY: `utf8_valid_up_to` returned the full length, i.e.
                // the entire slice is valid UTF-8.
                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
                return Some(Cow::Borrowed(str));
            }
            return None;
        }
        // Set up the decoder, the output `String` and the part of the input
        // that still has to go through the decoder.
        let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
            let valid_up_to = if self == ISO_2022_JP {
                iso_2022_jp_ascii_valid_up_to(bytes)
            } else {
                ascii_valid_up_to(bytes)
            };
            if valid_up_to == bytes.len() {
                // SAFETY: the validator accepted every byte, so the input is
                // ASCII-only (ASCII being a subset of UTF-8). For ISO-2022-JP
                // this presumes that the ASCII-state check only accepts ASCII
                // bytes — defined outside this view.
                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
                return Some(Cow::Borrowed(str));
            }
            let decoder = self.new_decoder_without_bom_handling();
            // Space for the validated prefix plus the decoder's estimate for
            // the rest; the `unwrap()` is the documented panic on overflow.
            let mut string = String::with_capacity(
                checked_add(
                    valid_up_to,
                    decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
                )
                .unwrap(),
            );
            // Copy the validated prefix straight into the output.
            // SAFETY: relies on the capacity requested above being at least
            // `valid_up_to` (`checked_add` is defined outside this view) and
            // on the first `valid_up_to` bytes having passed validation above.
            // NOTE(review): `set_len` runs before the bytes are copied in;
            // nothing reads the buffer in between.
            unsafe {
                let vec = string.as_mut_vec();
                vec.set_len(valid_up_to);
                core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
            }
            (decoder, string, &bytes[valid_up_to..])
        } else {
            // No prefix can be reused for these encodings; decode everything.
            let decoder = self.new_decoder_without_bom_handling();
            let string = String::with_capacity(
                decoder
                    .max_utf8_buffer_length_without_replacement(bytes.len())
                    .unwrap(),
            );
            (decoder, string, bytes)
        };
        // A single call with `last == true` suffices because the buffer was
        // sized up front for the whole remaining input.
        let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
        match result {
            DecoderResult::InputEmpty => {
                debug_assert_eq!(read, input.len());
                Some(Cow::Owned(string))
            }
            // Any malformed sequence is fatal for this method.
            DecoderResult::Malformed(_, _) => None,
            // The output was sized from the decoder's own
            // `max_utf8_buffer_length_without_replacement` estimate, so it is
            // not expected to fill up.
            DecoderResult::OutputFull => unreachable!(),
        }
    }
3171 
3172     /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3173     /// replaced with decimal numeric character references when the entire input
3174     /// is available as a single buffer (i.e. the end of the buffer marks the
3175     /// end of the stream).
3176     ///
3177     /// This method implements the (non-streaming version of) the
3178     /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3179     /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3180     /// spec concept, it is slightly more efficient to use
3181     /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3182     /// method on `UTF_8`.
3183     ///
3184     /// The second item in the returned tuple is the encoding that was actually
3185     /// used (which may differ from this encoding thanks to some encodings
3186     /// having UTF-8 as their output encoding).
3187     ///
3188     /// The third item in the returned tuple indicates whether there were
3189     /// unmappable characters (that were replaced with HTML numeric character
3190     /// references).
3191     ///
3192     /// _Note:_ It is wrong to use this when the input buffer represents only
3193     /// a segment of the input instead of the whole input. Use `new_encoder()`
3194     /// when encoding segmented output.
3195     ///
3196     /// When encoding to UTF-8 or when encoding an ASCII-only input to a
3197     /// ASCII-compatible encoding, this method returns a borrow of the input
3198     /// without a heap allocation. Otherwise, this method performs a single
3199     /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3200     /// unmappable characters and potentially multiple heap allocations if
3201     /// there are. These allocations are tuned for jemalloc and may not be
3202     /// optimal when using a different allocator that doesn't use power-of-two
3203     /// buckets.
3204     ///
3205     /// # Panics
3206     ///
3207     /// If the size calculation for a heap-allocated backing buffer overflows
3208     /// `usize`.
3209     ///
3210     /// Available to Rust only.
encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool)3211     pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3212         let output_encoding = self.output_encoding();
3213         if output_encoding == UTF_8 {
3214             return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3215         }
3216         debug_assert!(output_encoding.is_potentially_borrowable());
3217         let bytes = string.as_bytes();
3218         let valid_up_to = if output_encoding == ISO_2022_JP {
3219             iso_2022_jp_ascii_valid_up_to(bytes)
3220         } else {
3221             ascii_valid_up_to(bytes)
3222         };
3223         if valid_up_to == bytes.len() {
3224             return (Cow::Borrowed(bytes), output_encoding, false);
3225         }
3226         let mut encoder = output_encoding.new_encoder();
3227         let mut vec: Vec<u8> = Vec::with_capacity(
3228             (checked_add(
3229                 valid_up_to,
3230                 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3231             ))
3232             .unwrap()
3233             .next_power_of_two(),
3234         );
3235         unsafe {
3236             vec.set_len(valid_up_to);
3237             core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3238         }
3239         let mut total_read = valid_up_to;
3240         let mut total_had_errors = false;
3241         loop {
3242             let (result, read, had_errors) =
3243                 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3244             total_read += read;
3245             total_had_errors |= had_errors;
3246             match result {
3247                 CoderResult::InputEmpty => {
3248                     debug_assert_eq!(total_read, string.len());
3249                     return (Cow::Owned(vec), output_encoding, total_had_errors);
3250                 }
3251                 CoderResult::OutputFull => {
3252                     // reserve_exact wants to know how much more on top of current
3253                     // length--not current capacity.
3254                     let needed = encoder
3255                         .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3256                     let rounded = (checked_add(vec.capacity(), needed))
3257                         .unwrap()
3258                         .next_power_of_two();
3259                     let additional = rounded - vec.len();
3260                     vec.reserve_exact(additional);
3261                 }
3262             }
3263         }
3264     }
3265 
new_variant_decoder(&'static self) -> VariantDecoder3266     fn new_variant_decoder(&'static self) -> VariantDecoder {
3267         self.variant.new_variant_decoder()
3268     }
3269 
    /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
    ///
    /// BOM sniffing may cause the returned decoder to morph into a decoder
    /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn new_decoder(&'static self) -> Decoder {
        // Differs from the other `new_decoder*()` constructors only in the
        // `BomHandling` mode handed to `Decoder::new`.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
    }
3280 
    /// Instantiates a new decoder for this encoding with BOM removal.
    ///
    /// If the input starts with bytes that are the BOM for this encoding,
    /// those bytes are removed. However, the decoder never morphs into a
    /// decoder for another encoding: A BOM for another encoding is treated as
    /// (potentially malformed) input to the decoding algorithm for this
    /// encoding.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
        // Differs from the other `new_decoder*()` constructors only in the
        // `BomHandling` mode handed to `Decoder::new`.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
    }
3294 
    /// Instantiates a new decoder for this encoding with BOM handling disabled.
    ///
    /// If the input starts with bytes that look like a BOM, those bytes are
    /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
    /// for another encoding.)
    ///
    /// _Note:_ If the caller has performed BOM sniffing on its own but has not
    /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
    /// instead of this method to cause the BOM to be removed.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
        // Differs from the other `new_decoder*()` constructors only in the
        // `BomHandling` mode handed to `Decoder::new`.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
    }
3310 
3311     /// Instantiates a new encoder for the output encoding of this encoding.
3312     ///
3313     /// Available via the C wrapper.
3314     #[inline]
new_encoder(&'static self) -> Encoder3315     pub fn new_encoder(&'static self) -> Encoder {
3316         let enc = self.output_encoding();
3317         enc.variant.new_encoder(enc)
3318     }
3319 
    /// Validates UTF-8.
    ///
    /// Returns the index of the first byte that makes the input malformed as
    /// UTF-8 or the length of the slice if the slice is entirely valid.
    ///
    /// This is currently faster than the corresponding standard library
    /// functionality. If this implementation gets upstreamed to the standard
    /// library, this method may be removed in the future.
    ///
    /// Available via the C wrapper.
    pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
        // Not a recursive call: an unqualified path inside an `impl` does not
        // resolve to associated functions, so this invokes the free function
        // of the same name that is in scope in this module.
        utf8_valid_up_to(bytes)
    }
3333 
    /// Validates ASCII.
    ///
    /// Returns the index of the first byte that makes the input malformed as
    /// ASCII or the length of the slice if the slice is entirely valid.
    ///
    /// Available via the C wrapper.
    pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
        // Not a recursive call: an unqualified path inside an `impl` does not
        // resolve to associated functions, so this invokes the free function
        // of the same name that is in scope in this module.
        ascii_valid_up_to(bytes)
    }
3343 
    /// Validates ISO-2022-JP ASCII-state data.
    ///
    /// Returns the index of the first byte that makes the input not
    /// representable in the ASCII state of ISO-2022-JP or the length of the
    /// slice if the slice is entirely representable in the ASCII state of
    /// ISO-2022-JP.
    ///
    /// Available via the C wrapper.
    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
        // Not a recursive call: an unqualified path inside an `impl` does not
        // resolve to associated functions, so this invokes the free function
        // of the same name that is in scope in this module.
        iso_2022_jp_ascii_valid_up_to(bytes)
    }
3355 }
3356 
3357 impl PartialEq for Encoding {
3358     #[inline]
eq(&self, other: &Encoding) -> bool3359     fn eq(&self, other: &Encoding) -> bool {
3360         (self as *const Encoding) == (other as *const Encoding)
3361     }
3362 }
3363 
// `PartialEq` for `Encoding` compares object addresses, which is a total
// equivalence relation, so the marker trait `Eq` can be implemented as well.
impl Eq for Encoding {}
3365 
#[cfg(test)]
impl PartialOrd for Encoding {
    /// Orders encodings by object address (test builds only).
    ///
    /// Delegates to the `Ord` implementation so that the two orderings can
    /// never disagree; the result is therefore always `Some`.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
3372 
#[cfg(test)]
impl Ord for Encoding {
    /// Total order over encodings by object address (test builds only).
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs = self as *const Encoding as usize;
        let rhs = other as *const Encoding as usize;
        lhs.cmp(&rhs)
    }
}
3379 
3380 impl Hash for Encoding {
3381     #[inline]
hash<H: Hasher>(&self, state: &mut H)3382     fn hash<H: Hasher>(&self, state: &mut H) {
3383         (self as *const Encoding).hash(state);
3384     }
3385 }
3386 
3387 impl core::fmt::Debug for Encoding {
3388     #[inline]
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result3389     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
3390         write!(f, "Encoding {{ {} }}", self.name)
3391     }
3392 }
3393 
// Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    /// Serializes the encoding as a string holding its `name` field.
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.serialize_str(self.name)
    }
}
3404 
// Stateless serde visitor whose `Visitor` impl turns a string holding an
// encoding label into a `&'static Encoding`. Only compiled with the `serde`
// feature.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3407 
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Looks `value` up as an encoding label via `Encoding::for_label` and
    /// reports a custom error naming the label when the lookup fails.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(alloc::format!("invalid encoding label: {}", value)))
    }
}
3427 
// Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    /// Deserializes a string and resolves it with `EncodingVisitor`, i.e. the
    /// string is interpreted as an encoding label.
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_str(EncodingVisitor)
    }
}
3437 
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// `Decoder::new` picks the initial state from the requested `BomHandling`
/// mode: `AtStart` when sniffing, one of the `At*Start` states when only the
/// BOM of the decoder's own encoding (UTF-8, UTF-16BE or UTF-16LE) is to be
/// removed, and `Converting` in every other case.
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE.
    AtUtf16LeStart,
    /// The decoder has seen EF.
    SeenUtf8First,
    /// The decoder has seen EF, BB.
    SeenUtf8Second,
    /// The decoder has seen FE.
    SeenUtf16BeFirst,
    /// The decoder has seen FF.
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen.
    Converting,
    /// EOF has been seen.
    ///
    /// The buffer-length query methods of `Decoder` panic in this state.
    Finished,
}
3466 
/// Communicate the BOM handling mode.
///
/// Passed to `Decoder::new`, which maps the mode to the decoder's initial
/// `DecoderLifeCycle` state.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM
    ///
    /// The decoder starts in `DecoderLifeCycle::Converting`.
    Off,
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM
    ///
    /// The decoder starts in `DecoderLifeCycle::AtStart`.
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding
    ///
    /// The decoder starts in the `At*Start` state matching its encoding when
    /// that encoding is UTF-8, UTF-16BE or UTF-16LE, and in
    /// `DecoderLifeCycle::Converting` otherwise.
    Remove,
}
3477 
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// Marked `#[must_use]`: ignoring the result means not knowing whether the
/// input was fully consumed or more output space is needed.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3497 
/// Result of a (potentially partial) decode operation without replacement.
///
/// Marked `#[must_use]`: ignoring the result means missing malformed input
/// as well as the need for more input or more output space.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3534 
/// A converter that decodes a byte stream into Unicode according to a
/// character encoding in a streaming (incremental) manner.
///
/// The various `decode_*` methods take an input buffer (`src`) and an output
/// buffer (`dst`), both of which are caller-allocated. There are variants for
/// both UTF-8 and UTF-16 output buffers.
///
/// A `decode_*` method decodes bytes from `src` into Unicode characters stored
/// into `dst` until one of the following three things happens:
///
/// 1. A malformed byte sequence is encountered (`*_without_replacement`
///    variants only).
///
/// 2. The output buffer has been filled so near capacity that the decoder
///    cannot be sure that processing an additional byte of input wouldn't
///    cause so much output that the output buffer would overflow.
///
/// 3. All the input bytes have been processed.
///
/// The `decode_*` method then returns a tuple of a status indicating which one
/// of the three reasons to return happened, how many input bytes were read,
/// how many output code units (`u8` when decoding into UTF-8 and `u16`
/// when decoding to UTF-16) were written (except when decoding into `String`,
/// whose length change indicates this), and in the case of the
/// variants performing replacement, a boolean indicating whether an error was
/// replaced with the REPLACEMENT CHARACTER during the call.
///
/// The number of bytes "written" is what's logically written. Garbage may be
/// written in the output buffer beyond the point logically written to.
/// Therefore, if you wish to decode into an `&mut str`, you should use the
/// methods that take an `&mut str` argument instead of the ones that take an
/// `&mut [u8]` argument. The former take care of overwriting the trailing
/// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
/// latter don't.
///
/// In the case of the `*_without_replacement` variants, the status is a
/// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
/// `InputEmpty` corresponding to the three cases listed above).
///
/// In the case of methods whose name does not end with
/// `*_without_replacement`, malformed sequences are automatically replaced
/// with the REPLACEMENT CHARACTER and errors do not cause the methods to
/// return early.
///
/// When decoding to UTF-8, the output buffer must have at least 4 bytes of
/// space. When decoding to UTF-16, the output buffer must have at least two
/// UTF-16 code units (`u16`) of space.
///
/// When decoding to UTF-8 without replacement, the methods are guaranteed
/// not to return indicating that more output space is needed if the length
/// of the output buffer is at least the length returned by
/// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
/// with replacement, the length of the output buffer that guarantees the
/// methods not to return indicating that more output space is needed is given
/// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
/// or without replacement, the length of the output buffer that guarantees
/// the methods not to return indicating that more output space is needed is
/// given by [`max_utf16_buffer_length()`][4].
///
/// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
/// and the output after each `decode_*` call is guaranteed to consist of
/// complete characters. (I.e. the code unit sequence for the last character is
/// guaranteed not to be split across output buffers.)
///
/// The boolean argument `last` indicates that the end of the stream is reached
/// when all the bytes in `src` have been consumed.
///
/// A `Decoder` object can be used to incrementally decode a byte stream.
///
/// During the processing of a single stream, the caller must call `decode_*`
/// zero or more times with `last` set to `false` and then call `decode_*` at
/// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
/// the processing of the stream has ended. Otherwise, the caller must call
/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
/// a fatal error).
///
/// Once the stream has ended, the `Decoder` object must not be used anymore.
/// That is, you need to create another one to process another stream.
///
/// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
/// the caller does not wish to treat it as a fatal error, the input buffer
/// `src` may not have been completely consumed. In that case, the caller must
/// pass the unconsumed contents of `src` to `decode_*` again upon the next
/// call.
///
/// [1]: enum.DecoderResult.html
/// [2]: #method.max_utf8_buffer_length_without_replacement
/// [3]: #method.max_utf8_buffer_length
/// [4]: #method.max_utf16_buffer_length
///
/// # Infinite loops
///
/// When converting with a fixed-size output buffer whose size is too small to
/// accommodate one character or (when applicable) one numeric character
/// reference of output, an infinite loop ensues. When converting with a
/// fixed-size output buffer, it generally makes sense to make the buffer
/// fairly large (e.g. couple of kilobytes).
pub struct Decoder {
    // The encoding reported by `encoding()`.
    encoding: &'static Encoding,
    // Encoding-specific decoder that the buffer-length queries delegate to.
    variant: VariantDecoder,
    // Current position in the BOM-handling / conversion life cycle.
    life_cycle: DecoderLifeCycle,
}
3637 
3638 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3639     fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3640         Decoder {
3641             encoding: enc,
3642             variant: decoder,
3643             life_cycle: match sniffing {
3644                 BomHandling::Off => DecoderLifeCycle::Converting,
3645                 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3646                 BomHandling::Remove => {
3647                     if enc == UTF_8 {
3648                         DecoderLifeCycle::AtUtf8Start
3649                     } else if enc == UTF_16BE {
3650                         DecoderLifeCycle::AtUtf16BeStart
3651                     } else if enc == UTF_16LE {
3652                         DecoderLifeCycle::AtUtf16LeStart
3653                     } else {
3654                         DecoderLifeCycle::Converting
3655                     }
3656                 }
3657             },
3658         }
3659     }
3660 
    /// The `Encoding` this `Decoder` is for.
    ///
    /// Returns the current value of the decoder's `encoding` field.
    ///
    /// BOM sniffing can change the return value of this method during the life
    /// of the decoder.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
3671 
3672     /// Query the worst-case UTF-8 output size _with replacement_.
3673     ///
3674     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3675     /// that will not overflow given the current state of the decoder and
3676     /// `byte_length` number of additional input bytes when decoding with
3677     /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3678     /// sequence or `None` if `usize` would overflow.
3679     ///
3680     /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3681     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3682         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3683         // BOM getting pushed to the underlying decoder.
3684         match self.life_cycle {
3685             DecoderLifeCycle::Converting
3686             | DecoderLifeCycle::AtUtf8Start
3687             | DecoderLifeCycle::AtUtf16LeStart
3688             | DecoderLifeCycle::AtUtf16BeStart => {
3689                 return self.variant.max_utf8_buffer_length(byte_length);
3690             }
3691             DecoderLifeCycle::AtStart => {
3692                 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3693                     if let Some(utf16_bom) = checked_add(
3694                         1,
3695                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3696                     ) {
3697                         let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3698                         let encoding = self.encoding();
3699                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3700                             // No need to consider the internal state of the underlying decoder,
3701                             // because it is at start, because no data has reached it yet.
3702                             return Some(utf_bom);
3703                         } else if let Some(non_bom) =
3704                             self.variant.max_utf8_buffer_length(byte_length)
3705                         {
3706                             return Some(core::cmp::max(utf_bom, non_bom));
3707                         }
3708                     }
3709                 }
3710             }
3711             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3712                 // Add two bytes even when only one byte has been seen,
3713                 // because the one byte can become a lead byte in multibyte
3714                 // decoders, but only after the decoder has been queried
3715                 // for max length, so the decoder's own logic for adding
3716                 // one for a pending lead cannot work.
3717                 if let Some(sum) = byte_length.checked_add(2) {
3718                     if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3719                         if self.encoding() == UTF_8 {
3720                             // No need to consider the internal state of the underlying decoder,
3721                             // because it is at start, because no data has reached it yet.
3722                             return Some(utf8_bom);
3723                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3724                             return Some(core::cmp::max(utf8_bom, non_bom));
3725                         }
3726                     }
3727                 }
3728             }
3729             DecoderLifeCycle::ConvertingWithPendingBB => {
3730                 if let Some(sum) = byte_length.checked_add(2) {
3731                     return self.variant.max_utf8_buffer_length(sum);
3732                 }
3733             }
3734             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3735                 // Add two bytes even when only one byte has been seen,
3736                 // because the one byte can become a lead byte in multibyte
3737                 // decoders, but only after the decoder has been queried
3738                 // for max length, so the decoder's own logic for adding
3739                 // one for a pending lead cannot work.
3740                 if let Some(sum) = byte_length.checked_add(2) {
3741                     if let Some(utf16_bom) =
3742                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3743                     {
3744                         let encoding = self.encoding();
3745                         if encoding == UTF_16LE || encoding == UTF_16BE {
3746                             // No need to consider the internal state of the underlying decoder,
3747                             // because it is at start, because no data has reached it yet.
3748                             return Some(utf16_bom);
3749                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3750                             return Some(core::cmp::max(utf16_bom, non_bom));
3751                         }
3752                     }
3753                 }
3754             }
3755             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3756         }
3757         None
3758     }
3759 
3760     /// Query the worst-case UTF-8 output size _without replacement_.
3761     ///
3762     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3763     /// that will not overflow given the current state of the decoder and
3764     /// `byte_length` number of additional input bytes when decoding without
3765     /// replacement error handling or `None` if `usize` would overflow.
3766     ///
3767     /// Note that this value may be too small for the `_with_replacement` case.
3768     /// Use `max_utf8_buffer_length()` for that case.
3769     ///
3770     /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3771     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3772         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3773         // BOM getting pushed to the underlying decoder.
3774         match self.life_cycle {
3775             DecoderLifeCycle::Converting
3776             | DecoderLifeCycle::AtUtf8Start
3777             | DecoderLifeCycle::AtUtf16LeStart
3778             | DecoderLifeCycle::AtUtf16BeStart => {
3779                 return self
3780                     .variant
3781                     .max_utf8_buffer_length_without_replacement(byte_length);
3782             }
3783             DecoderLifeCycle::AtStart => {
3784                 if let Some(utf8_bom) = byte_length.checked_add(3) {
3785                     if let Some(utf16_bom) = checked_add(
3786                         1,
3787                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3788                     ) {
3789                         let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3790                         let encoding = self.encoding();
3791                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3792                             // No need to consider the internal state of the underlying decoder,
3793                             // because it is at start, because no data has reached it yet.
3794                             return Some(utf_bom);
3795                         } else if let Some(non_bom) = self
3796                             .variant
3797                             .max_utf8_buffer_length_without_replacement(byte_length)
3798                         {
3799                             return Some(core::cmp::max(utf_bom, non_bom));
3800                         }
3801                     }
3802                 }
3803             }
3804             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3805                 // Add two bytes even when only one byte has been seen,
3806                 // because the one byte can become a lead byte in multibyte
3807                 // decoders, but only after the decoder has been queried
3808                 // for max length, so the decoder's own logic for adding
3809                 // one for a pending lead cannot work.
3810                 if let Some(sum) = byte_length.checked_add(2) {
3811                     if let Some(utf8_bom) = sum.checked_add(3) {
3812                         if self.encoding() == UTF_8 {
3813                             // No need to consider the internal state of the underlying decoder,
3814                             // because it is at start, because no data has reached it yet.
3815                             return Some(utf8_bom);
3816                         } else if let Some(non_bom) =
3817                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3818                         {
3819                             return Some(core::cmp::max(utf8_bom, non_bom));
3820                         }
3821                     }
3822                 }
3823             }
3824             DecoderLifeCycle::ConvertingWithPendingBB => {
3825                 if let Some(sum) = byte_length.checked_add(2) {
3826                     return self.variant.max_utf8_buffer_length_without_replacement(sum);
3827                 }
3828             }
3829             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3830                 // Add two bytes even when only one byte has been seen,
3831                 // because the one byte can become a lead byte in multibyte
3832                 // decoders, but only after the decoder has been queried
3833                 // for max length, so the decoder's own logic for adding
3834                 // one for a pending lead cannot work.
3835                 if let Some(sum) = byte_length.checked_add(2) {
3836                     if let Some(utf16_bom) =
3837                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3838                     {
3839                         let encoding = self.encoding();
3840                         if encoding == UTF_16LE || encoding == UTF_16BE {
3841                             // No need to consider the internal state of the underlying decoder,
3842                             // because it is at start, because no data has reached it yet.
3843                             return Some(utf16_bom);
3844                         } else if let Some(non_bom) =
3845                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3846                         {
3847                             return Some(core::cmp::max(utf16_bom, non_bom));
3848                         }
3849                     }
3850                 }
3851             }
3852             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3853         }
3854         None
3855     }
3856 
3857     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3858     /// replaced with the REPLACEMENT CHARACTER.
3859     ///
3860     /// See the documentation of the struct for documentation for `decode_*`
3861     /// methods collectively.
3862     ///
3863     /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3864     pub fn decode_to_utf8(
3865         &mut self,
3866         src: &[u8],
3867         dst: &mut [u8],
3868         last: bool,
3869     ) -> (CoderResult, usize, usize, bool) {
3870         let mut had_errors = false;
3871         let mut total_read = 0usize;
3872         let mut total_written = 0usize;
3873         loop {
3874             let (result, read, written) = self.decode_to_utf8_without_replacement(
3875                 &src[total_read..],
3876                 &mut dst[total_written..],
3877                 last,
3878             );
3879             total_read += read;
3880             total_written += written;
3881             match result {
3882                 DecoderResult::InputEmpty => {
3883                     return (
3884                         CoderResult::InputEmpty,
3885                         total_read,
3886                         total_written,
3887                         had_errors,
3888                     );
3889                 }
3890                 DecoderResult::OutputFull => {
3891                     return (
3892                         CoderResult::OutputFull,
3893                         total_read,
3894                         total_written,
3895                         had_errors,
3896                     );
3897                 }
3898                 DecoderResult::Malformed(_, _) => {
3899                     had_errors = true;
3900                     // There should always be space for the U+FFFD, because
3901                     // otherwise we'd have gotten OutputFull already.
3902                     // XXX: is the above comment actually true for UTF-8 itself?
3903                     // TODO: Consider having fewer bound checks here.
3904                     dst[total_written] = 0xEFu8;
3905                     total_written += 1;
3906                     dst[total_written] = 0xBFu8;
3907                     total_written += 1;
3908                     dst[total_written] = 0xBDu8;
3909                     total_written += 1;
3910                 }
3911             }
3912         }
3913     }
3914 
3915     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3916     /// replaced with the REPLACEMENT CHARACTER with type system signaling
3917     /// of UTF-8 validity.
3918     ///
3919     /// This methods calls `decode_to_utf8` and then zeroes
3920     /// out up to three bytes that aren't logically part of the write in order
3921     /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3922     ///
3923     /// See the documentation of the struct for documentation for `decode_*`
3924     /// methods collectively.
3925     ///
3926     /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3927     pub fn decode_to_str(
3928         &mut self,
3929         src: &[u8],
3930         dst: &mut str,
3931         last: bool,
3932     ) -> (CoderResult, usize, usize, bool) {
3933         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3934         let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3935         let len = bytes.len();
3936         let mut trail = written;
3937         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3938         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3939         // encodings to avoid overwriting here.
3940         if self.encoding != UTF_8 {
3941             let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3942             while trail < max {
3943                 bytes[trail] = 0;
3944                 trail += 1;
3945             }
3946         }
3947         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3948             bytes[trail] = 0;
3949             trail += 1;
3950         }
3951         (result, read, written, replaced)
3952     }
3953 
    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
    /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
    ///
    /// Like the others, this method follows the logic that the output buffer is
    /// caller-allocated. This method treats the capacity of the `String` as
    /// the output limit. That is, this method guarantees not to cause a
    /// reallocation of the backing buffer of `String`.
    ///
    /// The return value is a tuple that contains the `CoderResult`, the
    /// number of bytes read and a boolean indicating whether replacements
    /// were done. The number of bytes written is signaled via the length of
    /// the `String` changing.
    ///
    /// See the documentation of the struct for documentation for `decode_*`
    /// methods collectively.
    ///
    /// Available to Rust only.
    pub fn decode_to_string(
        &mut self,
        src: &[u8],
        dst: &mut String,
        last: bool,
    ) -> (CoderResult, usize, bool) {
        // NOTE(review): while `decode_to_utf8` runs, the vector's length
        // equals its capacity, so bytes this method never initialized are
        // exposed as `&mut [u8]`. If `decode_to_utf8` panics, the length is
        // not restored. Confirm that both are acceptable here.
        unsafe {
            let vec = dst.as_mut_vec();
            let old_len = vec.len();
            let capacity = vec.capacity();
            // Make the spare capacity addressable as the output buffer.
            vec.set_len(capacity);
            let (result, read, written, replaced) =
                self.decode_to_utf8(src, &mut vec[old_len..], last);
            // Keep only the previous content plus what was logically written.
            vec.set_len(old_len + written);
            (result, read, replaced)
        }
    }
3988 
    // NOTE(review): `public_decode_function!` is defined outside this part of
    // the file. Presumably it expands to the public method named by the first
    // identifier (`decode_to_utf8_without_replacement`, which `decode_to_utf8`
    // calls), implemented on top of the listed helper methods, with `u8` as
    // the output code unit type. Confirm against the macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf8_without_replacement,
                            decode_to_utf8_raw,
                            decode_to_utf8_checking_end,
                            decode_to_utf8_after_one_potential_bom_byte,
                            decode_to_utf8_after_two_potential_bom_bytes,
                            decode_to_utf8_checking_end_with_offset,
                            u8);
4005 
4006     /// Incrementally decode a byte stream into UTF-8 with type system signaling
4007     /// of UTF-8 validity.
4008     ///
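    /// This method calls `decode_to_utf8_without_replacement` and then zeroes
    /// out bytes just past the logical end of the write (for non-UTF-8
    /// decoders, up to `MAX_STRIDE_SIZE` bytes of trailing garbage, followed by
    /// any UTF-8 continuation bytes) in order to retain the UTF-8 validity even
    /// for the unwritten part of the buffer.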
4012     ///
4013     /// See the documentation of the struct for documentation for `decode_*`
4014     /// methods collectively.
4015     ///
4016     /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)4017     pub fn decode_to_str_without_replacement(
4018         &mut self,
4019         src: &[u8],
4020         dst: &mut str,
4021         last: bool,
4022     ) -> (DecoderResult, usize, usize) {
4023         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4024         let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4025         let len = bytes.len();
4026         let mut trail = written;
4027         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4028         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4029         // encodings to avoid overwriting here.
4030         if self.encoding != UTF_8 {
4031             let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4032             while trail < max {
4033                 bytes[trail] = 0;
4034                 trail += 1;
4035             }
4036         }
4037         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4038             bytes[trail] = 0;
4039             trail += 1;
4040         }
4041         (result, read, written)
4042     }
4043 
4044     /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4045     ///
4046     /// Like the others, this method follows the logic that the output buffer is
4047     /// caller-allocated. This method treats the capacity of the `String` as
4048     /// the output limit. That is, this method guarantees not to cause a
4049     /// reallocation of the backing buffer of `String`.
4050     ///
4051     /// The return value is a pair that contains the `DecoderResult` and the
4052     /// number of bytes read. The number of bytes written is signaled via
4053     /// the length of the `String` changing.
4054     ///
4055     /// See the documentation of the struct for documentation for `decode_*`
4056     /// methods collectively.
4057     ///
4058     /// Available to Rust only.
decode_to_string_without_replacement( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (DecoderResult, usize)4059     pub fn decode_to_string_without_replacement(
4060         &mut self,
4061         src: &[u8],
4062         dst: &mut String,
4063         last: bool,
4064     ) -> (DecoderResult, usize) {
4065         unsafe {
4066             let vec = dst.as_mut_vec();
4067             let old_len = vec.len();
4068             let capacity = vec.capacity();
4069             vec.set_len(capacity);
4070             let (result, read, written) =
4071                 self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4072             vec.set_len(old_len + written);
4073             (result, read)
4074         }
4075     }
4076 
4077     /// Query the worst-case UTF-16 output size (with or without replacement).
4078     ///
4079     /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4080     /// that will not overflow given the current state of the decoder and
4081     /// `byte_length` number of additional input bytes or `None` if `usize`
4082     /// would overflow.
4083     ///
4084     /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4085     /// return value of this method applies also in the
4086     /// `_without_replacement` case.
4087     ///
4088     /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4089     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4090         // Need to consider a) the decoder morphing due to the BOM and b) a partial
4091         // BOM getting pushed to the underlying decoder.
4092         match self.life_cycle {
4093             DecoderLifeCycle::Converting
4094             | DecoderLifeCycle::AtUtf8Start
4095             | DecoderLifeCycle::AtUtf16LeStart
4096             | DecoderLifeCycle::AtUtf16BeStart => {
4097                 return self.variant.max_utf16_buffer_length(byte_length);
4098             }
4099             DecoderLifeCycle::AtStart => {
4100                 if let Some(utf8_bom) = byte_length.checked_add(1) {
4101                     if let Some(utf16_bom) =
4102                         checked_add(1, checked_div(byte_length.checked_add(1), 2))
4103                     {
4104                         let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
4105                         let encoding = self.encoding();
4106                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4107                             // No need to consider the internal state of the underlying decoder,
4108                             // because it is at start, because no data has reached it yet.
4109                             return Some(utf_bom);
4110                         } else if let Some(non_bom) =
4111                             self.variant.max_utf16_buffer_length(byte_length)
4112                         {
4113                             return Some(core::cmp::max(utf_bom, non_bom));
4114                         }
4115                     }
4116                 }
4117             }
4118             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4119                 // Add two bytes even when only one byte has been seen,
4120                 // because the one byte can become a lead byte in multibyte
4121                 // decoders, but only after the decoder has been queried
4122                 // for max length, so the decoder's own logic for adding
4123                 // one for a pending lead cannot work.
4124                 if let Some(sum) = byte_length.checked_add(2) {
4125                     if let Some(utf8_bom) = sum.checked_add(1) {
4126                         if self.encoding() == UTF_8 {
4127                             // No need to consider the internal state of the underlying decoder,
4128                             // because it is at start, because no data has reached it yet.
4129                             return Some(utf8_bom);
4130                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4131                             return Some(core::cmp::max(utf8_bom, non_bom));
4132                         }
4133                     }
4134                 }
4135             }
4136             DecoderLifeCycle::ConvertingWithPendingBB => {
4137                 if let Some(sum) = byte_length.checked_add(2) {
4138                     return self.variant.max_utf16_buffer_length(sum);
4139                 }
4140             }
4141             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4142                 // Add two bytes even when only one byte has been seen,
4143                 // because the one byte can become a lead byte in multibyte
4144                 // decoders, but only after the decoder has been queried
4145                 // for max length, so the decoder's own logic for adding
4146                 // one for a pending lead cannot work.
4147                 if let Some(sum) = byte_length.checked_add(2) {
4148                     if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4149                         let encoding = self.encoding();
4150                         if encoding == UTF_16LE || encoding == UTF_16BE {
4151                             // No need to consider the internal state of the underlying decoder,
4152                             // because it is at start, because no data has reached it yet.
4153                             return Some(utf16_bom);
4154                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4155                             return Some(core::cmp::max(utf16_bom, non_bom));
4156                         }
4157                     }
4158                 }
4159             }
4160             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4161         }
4162         None
4163     }
4164 
4165     /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4166     /// replaced with the REPLACEMENT CHARACTER.
4167     ///
4168     /// See the documentation of the struct for documentation for `decode_*`
4169     /// methods collectively.
4170     ///
4171     /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4172     pub fn decode_to_utf16(
4173         &mut self,
4174         src: &[u8],
4175         dst: &mut [u16],
4176         last: bool,
4177     ) -> (CoderResult, usize, usize, bool) {
4178         let mut had_errors = false;
4179         let mut total_read = 0usize;
4180         let mut total_written = 0usize;
4181         loop {
4182             let (result, read, written) = self.decode_to_utf16_without_replacement(
4183                 &src[total_read..],
4184                 &mut dst[total_written..],
4185                 last,
4186             );
4187             total_read += read;
4188             total_written += written;
4189             match result {
4190                 DecoderResult::InputEmpty => {
4191                     return (
4192                         CoderResult::InputEmpty,
4193                         total_read,
4194                         total_written,
4195                         had_errors,
4196                     );
4197                 }
4198                 DecoderResult::OutputFull => {
4199                     return (
4200                         CoderResult::OutputFull,
4201                         total_read,
4202                         total_written,
4203                         had_errors,
4204                     );
4205                 }
4206                 DecoderResult::Malformed(_, _) => {
4207                     had_errors = true;
4208                     // There should always be space for the U+FFFD, because
4209                     // otherwise we'd have gotten OutputFull already.
4210                     dst[total_written] = 0xFFFD;
4211                     total_written += 1;
4212                 }
4213             }
4214         }
4215     }
4216 
    // NOTE(review): the definition of `public_decode_function!` is outside
    // this excerpt. Presumably the first identifier below names the public
    // method that the macro generates, the following identifiers name the
    // helper methods it dispatches to, and the trailing `u16` is the output
    // code unit type -- confirm against the macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf16_without_replacement,
                            decode_to_utf16_raw,
                            decode_to_utf16_checking_end,
                            decode_to_utf16_after_one_potential_bom_byte,
                            decode_to_utf16_after_two_potential_bom_bytes,
                            decode_to_utf16_checking_end_with_offset,
                            u16);
4233 
4234     /// Checks for compatibility with storing Unicode scalar values as unsigned
4235     /// bytes taking into account the state of the decoder.
4236     ///
4237     /// Returns `None` if the decoder is not in a neutral state, including waiting
4238     /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4239     ///
4240     /// Otherwise returns the index of the first byte whose unsigned value doesn't
4241     /// directly correspond to the decoded Unicode scalar value, or the length
4242     /// of the input if all bytes in the input decode directly to scalar values
4243     /// corresponding to the unsigned byte values.
4244     ///
4245     /// Does not change the state of the decoder.
4246     ///
4247     /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4248     /// storage optimizations.
4249     ///
4250     /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4251     pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4252         match self.life_cycle {
4253             DecoderLifeCycle::Converting => {
4254                 return self.variant.latin1_byte_compatible_up_to(bytes);
4255             }
4256             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4257             _ => None,
4258         }
4259     }
4260 }
4261 
4262 /// Result of a (potentially partial) encode operation without replacement.
4263 #[must_use]
4264 #[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    ///
    /// The payload is the character that could not be mapped.
    Unmappable(char),
}
4287 
4288 impl EncoderResult {
unmappable_from_bmp(bmp: u16) -> EncoderResult4289     fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
4290         EncoderResult::Unmappable(::core::char::from_u32(u32::from(bmp)).unwrap())
4291     }
4292 }
4293 
4294 /// A converter that encodes a Unicode stream into bytes according to a
4295 /// character encoding in a streaming (incremental) manner.
4296 ///
4297 /// The various `encode_*` methods take an input buffer (`src`) and an output
4298 /// buffer `dst` both of which are caller-allocated. There are variants for
4299 /// both UTF-8 and UTF-16 input buffers.
4300 ///
/// An `encode_*` method encodes characters from `src` into bytes stored
/// into `dst` until one of the following three things happens:
4303 ///
4304 /// 1. An unmappable character is encountered (`*_without_replacement` variants
4305 ///    only).
4306 ///
/// 2. The output buffer has been filled so near capacity that the encoder
4308 ///    cannot be sure that processing an additional character of input wouldn't
4309 ///    cause so much output that the output buffer would overflow.
4310 ///
4311 /// 3. All the input characters have been processed.
4312 ///
4313 /// The `encode_*` method then returns tuple of a status indicating which one
4314 /// of the three reasons to return happened, how many input code units (`u8`
4315 /// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4316 /// how many output bytes were written (except when encoding into `Vec<u8>`,
4317 /// whose length change indicates this), and in the case of the variants that
4318 /// perform replacement, a boolean indicating whether an unmappable
4319 /// character was replaced with a numeric character reference during the call.
4320 ///
4321 /// The number of bytes "written" is what's logically written. Garbage may be
4322 /// written in the output buffer beyond the point logically written to.
4323 ///
4324 /// In the case of the methods whose name ends with
4325 /// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4326 /// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4327 /// the three cases listed above).
4328 ///
4329 /// In the case of methods whose name does not end with
4330 /// `*_without_replacement`, unmappable characters are automatically replaced
4331 /// with the corresponding numeric character references and unmappable
4332 /// characters do not cause the methods to return early.
4333 ///
4334 /// When encoding from UTF-8 without replacement, the methods are guaranteed
4335 /// not to return indicating that more output space is needed if the length
4336 /// of the output buffer is at least the length returned by
4337 /// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4338 /// UTF-8 with replacement, the length of the output buffer that guarantees the
4339 /// methods not to return indicating that more output space is needed in the
4340 /// absence of unmappable characters is given by
4341 /// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4342 /// UTF-16 without replacement, the methods are guaranteed not to return
4343 /// indicating that more output space is needed if the length of the output
4344 /// buffer is at least the length returned by
4345 /// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4347 /// guarantees the methods not to return indicating that more output space is
4348 /// needed in the absence of unmappable characters is given by
4349 /// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4350 /// When encoding with replacement, applications are not expected to size the
4351 /// buffer for the worst case ahead of time but to resize the buffer if there
4352 /// are unmappable characters. This is why max length queries are only available
4353 /// for the case where there are no unmappable characters.
4354 ///
4355 /// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4356 /// calling from Rust, the type system takes care of this.) When encoding from
4357 /// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4358 /// CHARACTERS. Therefore, in order for astral characters not to turn into a
4359 /// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4360 /// are not split across input buffer boundaries.
4361 ///
4362 /// After an `encode_*` call returns, the output produced so far, taken as a
4363 /// whole from the start of the stream, is guaranteed to consist of a valid
4364 /// byte sequence in the target encoding. (I.e. the code unit sequence for a
4365 /// character is guaranteed not to be split across output buffers. However, due
4366 /// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4367 /// from the start for it to be valid. For other encodings, the validity holds
4368 /// on a per-output buffer basis.)
4369 ///
4370 /// The boolean argument `last` indicates that the end of the stream is reached
4371 /// when all the characters in `src` have been consumed. This argument is needed
4372 /// for ISO-2022-JP and is ignored for other encodings.
4373 ///
/// An `Encoder` object can be used to incrementally encode a Unicode stream
/// into bytes.
4375 ///
4376 /// During the processing of a single stream, the caller must call `encode_*`
4377 /// zero or more times with `last` set to `false` and then call `encode_*` at
4378 /// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4379 /// the processing of the stream has ended. Otherwise, the caller must call
4380 /// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4381 /// as a fatal error).
4382 ///
4383 /// Once the stream has ended, the `Encoder` object must not be used anymore.
4384 /// That is, you need to create another one to process another stream.
4385 ///
4386 /// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4387 /// and the caller does not wish to treat it as a fatal error, the input buffer
4388 /// `src` may not have been completely consumed. In that case, the caller must
4389 /// pass the unconsumed contents of `src` to `encode_*` again upon the next
4390 /// call.
4391 ///
4392 /// [1]: enum.EncoderResult.html
4393 /// [2]: #method.max_buffer_length_from_utf8_without_replacement
4394 /// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4395 /// [4]: #method.max_buffer_length_from_utf16_without_replacement
4396 /// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4397 ///
4398 /// # Infinite loops
4399 ///
4400 /// When converting with a fixed-size output buffer whose size is too small to
4401 /// accommodate one character of output, an infinite loop ensues. When
4402 /// converting with a fixed-size output buffer, it generally makes sense to
4403 /// make the buffer fairly large (e.g. couple of kilobytes).
pub struct Encoder {
    // The encoding this encoder is for; handed back unchanged by `encoding()`.
    encoding: &'static Encoding,
    // The encoding-specific encoder that the methods of this struct delegate
    // the actual work to.
    variant: VariantEncoder,
}
4408 
4409 impl Encoder {
new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder4410     fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4411         Encoder {
4412             encoding: enc,
4413             variant: encoder,
4414         }
4415     }
4416 
4417     /// The `Encoding` this `Encoder` is for.
4418     #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        // Plain accessor: hands back the `'static` reference stored in the
        // `encoding` field.
        self.encoding
    }
4422 
4423     /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
4424     /// ASCII state and `false` otherwise.
4425     #[inline]
    pub fn has_pending_state(&self) -> bool {
        // Pure delegation to the encoding-specific encoder held in `variant`.
        self.variant.has_pending_state()
    }
4429 
4430     /// Query the worst-case output size when encoding from UTF-8 with
4431     /// replacement.
4432     ///
4433     /// Returns the size of the output buffer in bytes that will not overflow
4434     /// given the current state of the encoder and `byte_length` number of
4435     /// additional input code units if there are no unmappable characters in
4436     /// the input or `None` if `usize` would overflow.
4437     ///
4438     /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4439     pub fn max_buffer_length_from_utf8_if_no_unmappables(
4440         &self,
4441         byte_length: usize,
4442     ) -> Option<usize> {
4443         checked_add(
4444             if self.encoding().can_encode_everything() {
4445                 0
4446             } else {
4447                 NCR_EXTRA
4448             },
4449             self.max_buffer_length_from_utf8_without_replacement(byte_length),
4450         )
4451     }
4452 
4453     /// Query the worst-case output size when encoding from UTF-8 without
4454     /// replacement.
4455     ///
4456     /// Returns the size of the output buffer in bytes that will not overflow
4457     /// given the current state of the encoder and `byte_length` number of
4458     /// additional input code units or `None` if `usize` would overflow.
4459     ///
4460     /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        // Pure delegation: the bound is computed by the encoding-specific
        // encoder in `variant`; `byte_length` is forwarded unchanged.
        self.variant
            .max_buffer_length_from_utf8_without_replacement(byte_length)
    }
4468 
4469     /// Incrementally encode into byte stream from UTF-8 with unmappable
4470     /// characters replaced with HTML (decimal) numeric character references.
4471     ///
4472     /// See the documentation of the struct for documentation for `encode_*`
4473     /// methods collectively.
4474     ///
4475     /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4476     pub fn encode_from_utf8(
4477         &mut self,
4478         src: &str,
4479         dst: &mut [u8],
4480         last: bool,
4481     ) -> (CoderResult, usize, usize, bool) {
4482         let dst_len = dst.len();
4483         let effective_dst_len = if self.encoding().can_encode_everything() {
4484             dst_len
4485         } else {
4486             if dst_len < NCR_EXTRA {
4487                 if src.is_empty() && !(last && self.has_pending_state()) {
4488                     return (CoderResult::InputEmpty, 0, 0, false);
4489                 }
4490                 return (CoderResult::OutputFull, 0, 0, false);
4491             }
4492             dst_len - NCR_EXTRA
4493         };
4494         let mut had_unmappables = false;
4495         let mut total_read = 0usize;
4496         let mut total_written = 0usize;
4497         loop {
4498             let (result, read, written) = self.encode_from_utf8_without_replacement(
4499                 &src[total_read..],
4500                 &mut dst[total_written..effective_dst_len],
4501                 last,
4502             );
4503             total_read += read;
4504             total_written += written;
4505             match result {
4506                 EncoderResult::InputEmpty => {
4507                     return (
4508                         CoderResult::InputEmpty,
4509                         total_read,
4510                         total_written,
4511                         had_unmappables,
4512                     );
4513                 }
4514                 EncoderResult::OutputFull => {
4515                     return (
4516                         CoderResult::OutputFull,
4517                         total_read,
4518                         total_written,
4519                         had_unmappables,
4520                     );
4521                 }
4522                 EncoderResult::Unmappable(unmappable) => {
4523                     had_unmappables = true;
4524                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4525                     debug_assert_ne!(self.encoding(), UTF_16BE);
4526                     debug_assert_ne!(self.encoding(), UTF_16LE);
4527                     // Additionally, Iso2022JpEncoder is responsible for
4528                     // transitioning to ASCII when returning with Unmappable.
4529                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4530                     if total_written >= effective_dst_len {
4531                         if total_read == src.len() && !(last && self.has_pending_state()) {
4532                             return (
4533                                 CoderResult::InputEmpty,
4534                                 total_read,
4535                                 total_written,
4536                                 had_unmappables,
4537                             );
4538                         }
4539                         return (
4540                             CoderResult::OutputFull,
4541                             total_read,
4542                             total_written,
4543                             had_unmappables,
4544                         );
4545                     }
4546                 }
4547             }
4548         }
4549     }
4550 
4551     /// Incrementally encode into byte stream from UTF-8 with unmappable
4552     /// characters replaced with HTML (decimal) numeric character references.
4553     ///
4554     /// See the documentation of the struct for documentation for `encode_*`
4555     /// methods collectively.
4556     ///
4557     /// Available to Rust only.
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4558     pub fn encode_from_utf8_to_vec(
4559         &mut self,
4560         src: &str,
4561         dst: &mut Vec<u8>,
4562         last: bool,
4563     ) -> (CoderResult, usize, bool) {
4564         unsafe {
4565             let old_len = dst.len();
4566             let capacity = dst.capacity();
4567             dst.set_len(capacity);
4568             let (result, read, written, replaced) =
4569                 self.encode_from_utf8(src, &mut dst[old_len..], last);
4570             dst.set_len(old_len + written);
4571             (result, read, replaced)
4572         }
4573     }
4574 
4575     /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4576     ///
4577     /// See the documentation of the struct for documentation for `encode_*`
4578     /// methods collectively.
4579     ///
4580     /// Available via the C wrapper.
    pub fn encode_from_utf8_without_replacement(
        &mut self,
        src: &str,
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        // Pure delegation: all three arguments are forwarded unchanged to the
        // encoding-specific encoder in `variant`, and its result tuple is
        // returned as-is.
        self.variant.encode_from_utf8_raw(src, dst, last)
    }
4589 
4590     /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4591     ///
4592     /// See the documentation of the struct for documentation for `encode_*`
4593     /// methods collectively.
4594     ///
4595     /// Available to Rust only.
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4596     pub fn encode_from_utf8_to_vec_without_replacement(
4597         &mut self,
4598         src: &str,
4599         dst: &mut Vec<u8>,
4600         last: bool,
4601     ) -> (EncoderResult, usize) {
4602         unsafe {
4603             let old_len = dst.len();
4604             let capacity = dst.capacity();
4605             dst.set_len(capacity);
4606             let (result, read, written) =
4607                 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4608             dst.set_len(old_len + written);
4609             (result, read)
4610         }
4611     }
4612 
4613     /// Query the worst-case output size when encoding from UTF-16 with
4614     /// replacement.
4615     ///
4616     /// Returns the size of the output buffer in bytes that will not overflow
4617     /// given the current state of the encoder and `u16_length` number of
4618     /// additional input code units if there are no unmappable characters in
4619     /// the input or `None` if `usize` would overflow.
4620     ///
4621     /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4622     pub fn max_buffer_length_from_utf16_if_no_unmappables(
4623         &self,
4624         u16_length: usize,
4625     ) -> Option<usize> {
4626         checked_add(
4627             if self.encoding().can_encode_everything() {
4628                 0
4629             } else {
4630                 NCR_EXTRA
4631             },
4632             self.max_buffer_length_from_utf16_without_replacement(u16_length),
4633         )
4634     }
4635 
4636     /// Query the worst-case output size when encoding from UTF-16 without
4637     /// replacement.
4638     ///
4639     /// Returns the size of the output buffer in bytes that will not overflow
4640     /// given the current state of the encoder and `u16_length` number of
4641     /// additional input code units or `None` if `usize` would overflow.
4642     ///
4643     /// Available via the C wrapper.
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>4644     pub fn max_buffer_length_from_utf16_without_replacement(
4645         &self,
4646         u16_length: usize,
4647     ) -> Option<usize> {
4648         self.variant
4649             .max_buffer_length_from_utf16_without_replacement(u16_length)
4650     }
4651 
    /// Incrementally encode into byte stream from UTF-16 with unmappable
    /// characters replaced with HTML (decimal) numeric character references.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Returns, in order: the `CoderResult`, the number of `u16` code units
    /// read from `src`, the number of bytes written to `dst`, and whether any
    /// unmappable character was replaced with a numeric character reference
    /// during this call.
    ///
    /// For encodings that cannot encode everything, the last `NCR_EXTRA`
    /// bytes of `dst` are held back as headroom for a numeric character
    /// reference; if `dst` is shorter than `NCR_EXTRA`, nothing is read or
    /// written.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf16(
        &mut self,
        src: &[u16],
        dst: &mut [u8],
        last: bool,
    ) -> (CoderResult, usize, usize, bool) {
        let dst_len = dst.len();
        // The raw encoder is only allowed to fill `dst` up to this length so
        // that an NCR always fits after it reports an unmappable.
        let effective_dst_len = if self.encoding().can_encode_everything() {
            dst_len
        } else {
            if dst_len < NCR_EXTRA {
                // Not enough room to guarantee space for an NCR. Report
                // `InputEmpty` only when there is genuinely nothing to do
                // (no input and no pending state to flush at the end).
                if src.is_empty() && !(last && self.has_pending_state()) {
                    return (CoderResult::InputEmpty, 0, 0, false);
                }
                return (CoderResult::OutputFull, 0, 0, false);
            }
            dst_len - NCR_EXTRA
        };
        let mut had_unmappables = false;
        let mut total_read = 0usize;
        let mut total_written = 0usize;
        // Each iteration resumes the raw encoder after the previously
        // replaced unmappable; the loop exits by returning.
        loop {
            let (result, read, written) = self.encode_from_utf16_without_replacement(
                &src[total_read..],
                &mut dst[total_written..effective_dst_len],
                last,
            );
            total_read += read;
            total_written += written;
            match result {
                EncoderResult::InputEmpty => {
                    return (
                        CoderResult::InputEmpty,
                        total_read,
                        total_written,
                        had_unmappables,
                    );
                }
                EncoderResult::OutputFull => {
                    return (
                        CoderResult::OutputFull,
                        total_read,
                        total_written,
                        had_unmappables,
                    );
                }
                EncoderResult::Unmappable(unmappable) => {
                    had_unmappables = true;
                    // Holds because the raw encoder was limited to
                    // `effective_dst_len`, i.e. `NCR_EXTRA` short of the end.
                    debug_assert!(dst.len() - total_written >= NCR_EXTRA);
                    // There are no UTF-16 encoders and even if there were,
                    // they'd never have unmappables.
                    debug_assert_ne!(self.encoding(), UTF_16BE);
                    debug_assert_ne!(self.encoding(), UTF_16LE);
                    // Additionally, Iso2022JpEncoder is responsible for
                    // transitioning to ASCII when returning with Unmappable
                    // from the jis0208 state. That is, when we encode
                    // ISO-2022-JP and come here, the encoder is in either the
                    // ASCII or the Roman state. We are allowed to generate any
                    // printable ASCII excluding \ and ~.
                    // The NCR is written against the full `dst`, so it may
                    // use the held-back headroom.
                    total_written += write_ncr(unmappable, &mut dst[total_written..]);
                    if total_written >= effective_dst_len {
                        // The NCR reached into the headroom, so the raw
                        // encoder cannot be given any more output space in
                        // this call. Decide whether the input is done.
                        if total_read == src.len() && !(last && self.has_pending_state()) {
                            return (
                                CoderResult::InputEmpty,
                                total_read,
                                total_written,
                                had_unmappables,
                            );
                        }
                        return (
                            CoderResult::OutputFull,
                            total_read,
                            total_written,
                            had_unmappables,
                        );
                    }
                }
            }
        }
    }
4739 
4740     /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4741     ///
4742     /// See the documentation of the struct for documentation for `encode_*`
4743     /// methods collectively.
4744     ///
4745     /// Available via the C wrapper.
encode_from_utf16_without_replacement( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4746     pub fn encode_from_utf16_without_replacement(
4747         &mut self,
4748         src: &[u16],
4749         dst: &mut [u8],
4750         last: bool,
4751     ) -> (EncoderResult, usize, usize) {
4752         self.variant.encode_from_utf16_raw(src, dst, last)
4753     }
4754 }
4755 
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#`, the decimal scalar value of `unmappable` and `;` to the
/// start of `dst` and returns the number of bytes written, which is the
/// number of decimal digits plus 3 (at most 10 for any `char`).
///
/// Scalar values below 10 are not expected from the encoders (see
/// https://github.com/whatwg/encoding/issues/15), but they are still
/// formatted correctly as a 4-byte NCR instead of leaving a byte of the
/// output unwritten.
///
/// # Panics
///
/// Panics if `dst` is shorter than the NCR to be written.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let mut number = unmappable as u32;
    // len is the number of decimal digits needed to represent unmappable plus
    // 3 (the length of "&#" and ";").
    let len = if number >= 1_000_000u32 {
        10usize
    } else if number >= 100_000u32 {
        9usize
    } else if number >= 10_000u32 {
        8usize
    } else if number >= 1_000u32 {
        7usize
    } else if number >= 100u32 {
        6usize
    } else if number >= 10u32 {
        5usize
    } else {
        4usize
    };
    debug_assert!(len <= dst.len());
    let ncr = &mut dst[..len];
    ncr[0] = b'&';
    ncr[1] = b'#';
    ncr[len - 1] = b';';
    // Fill the digit slots from the least significant digit backwards.
    for slot in ncr[2..len - 1].iter_mut().rev() {
        *slot = (number % 10) as u8 + b'0';
        number /= 10;
    }
    len
}
4794 
/// Returns `true` if `i` lies in the half-open range `start..end`.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    // Values below `start` wrap around to large offsets, so a single
    // unsigned comparison checks both bounds.
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset < width
}
4799 
/// Returns `true` if `i` lies in the half-open range `start..end`.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    // Values below `start` wrap around to large offsets, so a single
    // unsigned comparison checks both bounds.
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset < width
}
4804 
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    // Values below `start` wrap around to large offsets, so a single
    // unsigned comparison checks both bounds.
    let span = end - start;
    let offset = i.wrapping_sub(start);
    offset <= span
}
4809 
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    // Values below `start` wrap around to large offsets, so a single
    // unsigned comparison checks both bounds.
    let span = end - start;
    let offset = i.wrapping_sub(start);
    offset <= span
}
4814 
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    // Values below `start` wrap around to large offsets, so a single
    // unsigned comparison checks both bounds.
    let span = end - start;
    let offset = i.wrapping_sub(start);
    offset <= span
}
4819 
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    // Values below `start` wrap around to large offsets, so a single
    // unsigned comparison checks both bounds.
    let span = end - start;
    let offset = i.wrapping_sub(start);
    offset <= span
}
4824 
/// Adds `num` to the value in `opt`, yielding `None` if `opt` is `None` or
/// the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4833 
/// Adds two optional values, yielding `None` if either operand is `None` or
/// the sum overflows `usize`.
#[inline(always)]
fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => b.checked_add(a),
        _ => None,
    }
}
4842 
/// Multiplies the value in `opt` by `num`, yielding `None` if `opt` is
/// `None` or the product overflows `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4851 
/// Divides the value in `opt` by `num`, yielding `None` if `opt` is `None`
/// or `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4860 
/// Rounds the value in `opt` up to the next power of two.
///
/// Yields `None` if `opt` is `None` or if the next power of two does not fit
/// in `usize`, so that an overflowing size is reported the same way as by
/// the other `checked_*` helpers instead of panicking (debug builds) or
/// wrapping to zero (release builds).
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4865 
/// Returns the smaller of two optional values, treating `None` as "no
/// value": if only one operand is `Some`, it is returned as-is, and the
/// result is `None` only when both operands are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(a.min(b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4878 
4879 // ############## TESTS ###############
4880 
/// Test-only record that places a `&'static Encoding` field next to ordinary
/// fields. Compiled only for test builds with the `serde` feature enabled.
// NOTE(review): presumably exercised by serde round-trip tests elsewhere in
// this file; the usage is not visible from this section.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    enc: &'static Encoding,
}
4888 
4889 #[cfg(test)]
4890 mod test_labels_names;
4891 
4892 #[cfg(test)]
4893 mod tests {
4894     use super::*;
4895     use alloc::borrow::Cow;
4896 
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4897     fn sniff_to_utf16(
4898         initial_encoding: &'static Encoding,
4899         expected_encoding: &'static Encoding,
4900         bytes: &[u8],
4901         expect: &[u16],
4902         breaks: &[usize],
4903     ) {
4904         let mut decoder = initial_encoding.new_decoder();
4905 
4906         let mut dest: Vec<u16> =
4907             Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4908         let capacity = dest.capacity();
4909         dest.resize(capacity, 0u16);
4910 
4911         let mut total_written = 0usize;
4912         let mut start = 0usize;
4913         for br in breaks {
4914             let (result, read, written, _) =
4915                 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4916             total_written += written;
4917             assert_eq!(read, *br - start);
4918             match result {
4919                 CoderResult::InputEmpty => {}
4920                 CoderResult::OutputFull => {
4921                     unreachable!();
4922                 }
4923             }
4924             start = *br;
4925         }
4926         let (result, read, written, _) =
4927             decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4928         total_written += written;
4929         match result {
4930             CoderResult::InputEmpty => {}
4931             CoderResult::OutputFull => {
4932                 unreachable!();
4933             }
4934         }
4935         assert_eq!(read, bytes.len() - start);
4936         assert_eq!(total_written, expect.len());
4937         assert_eq!(&dest[..total_written], expect);
4938         assert_eq!(decoder.encoding(), expected_encoding);
4939     }
4940 
4941     // Any copyright to the test code below this comment is dedicated to the
4942     // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4943 
4944     #[test]
test_bom_sniffing()4945     fn test_bom_sniffing() {
4946         // ASCII
4947         sniff_to_utf16(
4948             WINDOWS_1252,
4949             WINDOWS_1252,
4950             b"\x61\x62",
4951             &[0x0061u16, 0x0062u16],
4952             &[],
4953         );
4954         // UTF-8
4955         sniff_to_utf16(
4956             WINDOWS_1252,
4957             UTF_8,
4958             b"\xEF\xBB\xBF\x61\x62",
4959             &[0x0061u16, 0x0062u16],
4960             &[],
4961         );
4962         sniff_to_utf16(
4963             WINDOWS_1252,
4964             UTF_8,
4965             b"\xEF\xBB\xBF\x61\x62",
4966             &[0x0061u16, 0x0062u16],
4967             &[1],
4968         );
4969         sniff_to_utf16(
4970             WINDOWS_1252,
4971             UTF_8,
4972             b"\xEF\xBB\xBF\x61\x62",
4973             &[0x0061u16, 0x0062u16],
4974             &[2],
4975         );
4976         sniff_to_utf16(
4977             WINDOWS_1252,
4978             UTF_8,
4979             b"\xEF\xBB\xBF\x61\x62",
4980             &[0x0061u16, 0x0062u16],
4981             &[3],
4982         );
4983         sniff_to_utf16(
4984             WINDOWS_1252,
4985             UTF_8,
4986             b"\xEF\xBB\xBF\x61\x62",
4987             &[0x0061u16, 0x0062u16],
4988             &[4],
4989         );
4990         sniff_to_utf16(
4991             WINDOWS_1252,
4992             UTF_8,
4993             b"\xEF\xBB\xBF\x61\x62",
4994             &[0x0061u16, 0x0062u16],
4995             &[2, 3],
4996         );
4997         sniff_to_utf16(
4998             WINDOWS_1252,
4999             UTF_8,
5000             b"\xEF\xBB\xBF\x61\x62",
5001             &[0x0061u16, 0x0062u16],
5002             &[1, 2],
5003         );
5004         sniff_to_utf16(
5005             WINDOWS_1252,
5006             UTF_8,
5007             b"\xEF\xBB\xBF\x61\x62",
5008             &[0x0061u16, 0x0062u16],
5009             &[1, 3],
5010         );
5011         sniff_to_utf16(
5012             WINDOWS_1252,
5013             UTF_8,
5014             b"\xEF\xBB\xBF\x61\x62",
5015             &[0x0061u16, 0x0062u16],
5016             &[1, 2, 3, 4],
5017         );
5018         sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
5019         // Not UTF-8
5020         sniff_to_utf16(
5021             WINDOWS_1252,
5022             WINDOWS_1252,
5023             b"\xEF\xBB\x61\x62",
5024             &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5025             &[],
5026         );
5027         sniff_to_utf16(
5028             WINDOWS_1252,
5029             WINDOWS_1252,
5030             b"\xEF\xBB\x61\x62",
5031             &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5032             &[1],
5033         );
5034         sniff_to_utf16(
5035             WINDOWS_1252,
5036             WINDOWS_1252,
5037             b"\xEF\x61\x62",
5038             &[0x00EFu16, 0x0061u16, 0x0062u16],
5039             &[],
5040         );
5041         sniff_to_utf16(
5042             WINDOWS_1252,
5043             WINDOWS_1252,
5044             b"\xEF\x61\x62",
5045             &[0x00EFu16, 0x0061u16, 0x0062u16],
5046             &[1],
5047         );
5048         sniff_to_utf16(
5049             WINDOWS_1252,
5050             WINDOWS_1252,
5051             b"\xEF\xBB",
5052             &[0x00EFu16, 0x00BBu16],
5053             &[],
5054         );
5055         sniff_to_utf16(
5056             WINDOWS_1252,
5057             WINDOWS_1252,
5058             b"\xEF\xBB",
5059             &[0x00EFu16, 0x00BBu16],
5060             &[1],
5061         );
5062         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5063         // Not UTF-16
5064         sniff_to_utf16(
5065             WINDOWS_1252,
5066             WINDOWS_1252,
5067             b"\xFE\x61\x62",
5068             &[0x00FEu16, 0x0061u16, 0x0062u16],
5069             &[],
5070         );
5071         sniff_to_utf16(
5072             WINDOWS_1252,
5073             WINDOWS_1252,
5074             b"\xFE\x61\x62",
5075             &[0x00FEu16, 0x0061u16, 0x0062u16],
5076             &[1],
5077         );
5078         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5079         sniff_to_utf16(
5080             WINDOWS_1252,
5081             WINDOWS_1252,
5082             b"\xFF\x61\x62",
5083             &[0x00FFu16, 0x0061u16, 0x0062u16],
5084             &[],
5085         );
5086         sniff_to_utf16(
5087             WINDOWS_1252,
5088             WINDOWS_1252,
5089             b"\xFF\x61\x62",
5090             &[0x00FFu16, 0x0061u16, 0x0062u16],
5091             &[1],
5092         );
5093         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5094         // UTF-16
5095         sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5096         sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5097         sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5098         sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5099     }
5100 
5101     #[test]
test_output_encoding()5102     fn test_output_encoding() {
5103         assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5104         assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5105         assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5106         assert_eq!(UTF_8.output_encoding(), UTF_8);
5107         assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5108         assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5109         assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5110         assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5111         assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5112         assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5113     }
5114 
5115     #[test]
test_label_resolution()5116     fn test_label_resolution() {
5117         assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5118         assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5119         assert_eq!(
5120             Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5121             Some(UTF_8)
5122         );
5123         assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5124         assert_eq!(Encoding::for_label(b"bogus"), None);
5125         assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5126     }
5127 
5128     #[test]
test_decode_valid_windows_1257_to_cow()5129     fn test_decode_valid_windows_1257_to_cow() {
5130         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5131         match cow {
5132             Cow::Borrowed(_) => unreachable!(),
5133             Cow::Owned(s) => {
5134                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5135             }
5136         }
5137         assert_eq!(encoding, WINDOWS_1257);
5138         assert!(!had_errors);
5139     }
5140 
5141     #[test]
test_decode_invalid_windows_1257_to_cow()5142     fn test_decode_invalid_windows_1257_to_cow() {
5143         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5144         match cow {
5145             Cow::Borrowed(_) => unreachable!(),
5146             Cow::Owned(s) => {
5147                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5148             }
5149         }
5150         assert_eq!(encoding, WINDOWS_1257);
5151         assert!(had_errors);
5152     }
5153 
5154     #[test]
test_decode_ascii_only_windows_1257_to_cow()5155     fn test_decode_ascii_only_windows_1257_to_cow() {
5156         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5157         match cow {
5158             Cow::Borrowed(s) => {
5159                 assert_eq!(s, "abc");
5160             }
5161             Cow::Owned(_) => unreachable!(),
5162         }
5163         assert_eq!(encoding, WINDOWS_1257);
5164         assert!(!had_errors);
5165     }
5166 
5167     #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow()5168     fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5169         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5170         match cow {
5171             Cow::Borrowed(s) => {
5172                 assert_eq!(s, "\u{20AC}\u{00E4}");
5173             }
5174             Cow::Owned(_) => unreachable!(),
5175         }
5176         assert_eq!(encoding, UTF_8);
5177         assert!(!had_errors);
5178     }
5179 
5180     #[test]
test_decode_bomful_invalid_utf8_as_windows_1257_to_cow()5181     fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5182         let (cow, encoding, had_errors) =
5183             WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5184         match cow {
5185             Cow::Borrowed(_) => unreachable!(),
5186             Cow::Owned(s) => {
5187                 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5188             }
5189         }
5190         assert_eq!(encoding, UTF_8);
5191         assert!(had_errors);
5192     }
5193 
5194     #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow()5195     fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5196         let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5197         match cow {
5198             Cow::Borrowed(s) => {
5199                 assert_eq!(s, "\u{20AC}\u{00E4}");
5200             }
5201             Cow::Owned(_) => unreachable!(),
5202         }
5203         assert_eq!(encoding, UTF_8);
5204         assert!(!had_errors);
5205     }
5206 
5207     #[test]
test_decode_bomful_invalid_utf8_as_utf_8_to_cow()5208     fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5209         let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5210         match cow {
5211             Cow::Borrowed(_) => unreachable!(),
5212             Cow::Owned(s) => {
5213                 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5214             }
5215         }
5216         assert_eq!(encoding, UTF_8);
5217         assert!(had_errors);
5218     }
5219 
5220     #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal()5221     fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5222         let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5223         match cow {
5224             Cow::Borrowed(s) => {
5225                 assert_eq!(s, "\u{20AC}\u{00E4}");
5226             }
5227             Cow::Owned(_) => unreachable!(),
5228         }
5229         assert!(!had_errors);
5230     }
5231 
5232     #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal()5233     fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5234         let (cow, had_errors) =
5235             WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5236         match cow {
5237             Cow::Borrowed(_) => unreachable!(),
5238             Cow::Owned(s) => {
5239                 assert_eq!(
5240                     s,
5241                     "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5242                 );
5243             }
5244         }
5245         assert!(!had_errors);
5246     }
5247 
5248     #[test]
test_decode_valid_windows_1257_to_cow_with_bom_removal()5249     fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
5250         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
5251         match cow {
5252             Cow::Borrowed(_) => unreachable!(),
5253             Cow::Owned(s) => {
5254                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5255             }
5256         }
5257         assert!(!had_errors);
5258     }
5259 
5260     #[test]
test_decode_invalid_windows_1257_to_cow_with_bom_removal()5261     fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
5262         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
5263         match cow {
5264             Cow::Borrowed(_) => unreachable!(),
5265             Cow::Owned(s) => {
5266                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5267             }
5268         }
5269         assert!(had_errors);
5270     }
5271 
5272     #[test]
test_decode_ascii_only_windows_1257_to_cow_with_bom_removal()5273     fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
5274         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
5275         match cow {
5276             Cow::Borrowed(s) => {
5277                 assert_eq!(s, "abc");
5278             }
5279             Cow::Owned(_) => unreachable!(),
5280         }
5281         assert!(!had_errors);
5282     }
5283 
5284     #[test]
test_decode_bomful_valid_utf8_to_cow_without_bom_handling()5285     fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
5286         let (cow, had_errors) =
5287             UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5288         match cow {
5289             Cow::Borrowed(s) => {
5290                 assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5291             }
5292             Cow::Owned(_) => unreachable!(),
5293         }
5294         assert!(!had_errors);
5295     }
5296 
5297     #[test]
test_decode_bomful_invalid_utf8_to_cow_without_bom_handling()5298     fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
5299         let (cow, had_errors) =
5300             UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5301         match cow {
5302             Cow::Borrowed(_) => unreachable!(),
5303             Cow::Owned(s) => {
5304                 assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
5305             }
5306         }
5307         assert!(had_errors);
5308     }
5309 
5310     #[test]
test_decode_valid_windows_1257_to_cow_without_bom_handling()5311     fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
5312         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
5313         match cow {
5314             Cow::Borrowed(_) => unreachable!(),
5315             Cow::Owned(s) => {
5316                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5317             }
5318         }
5319         assert!(!had_errors);
5320     }
5321 
5322     #[test]
test_decode_invalid_windows_1257_to_cow_without_bom_handling()5323     fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
5324         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
5325         match cow {
5326             Cow::Borrowed(_) => unreachable!(),
5327             Cow::Owned(s) => {
5328                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5329             }
5330         }
5331         assert!(had_errors);
5332     }
5333 
5334     #[test]
test_decode_ascii_only_windows_1257_to_cow_without_bom_handling()5335     fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
5336         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
5337         match cow {
5338             Cow::Borrowed(s) => {
5339                 assert_eq!(s, "abc");
5340             }
5341             Cow::Owned(_) => unreachable!(),
5342         }
5343         assert!(!had_errors);
5344     }
5345 
5346     #[test]
test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement()5347     fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5348         match UTF_8.decode_without_bom_handling_and_without_replacement(
5349             b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
5350         ) {
5351             Some(cow) => match cow {
5352                 Cow::Borrowed(s) => {
5353                     assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5354                 }
5355                 Cow::Owned(_) => unreachable!(),
5356             },
5357             None => unreachable!(),
5358         }
5359     }
5360 
5361     #[test]
test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement()5362     fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5363         assert!(UTF_8
5364             .decode_without_bom_handling_and_without_replacement(
5365                 b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4"
5366             )
5367             .is_none());
5368     }
5369 
5370     #[test]
test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement()5371     fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5372         match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
5373             Some(cow) => match cow {
5374                 Cow::Borrowed(_) => unreachable!(),
5375                 Cow::Owned(s) => {
5376                     assert_eq!(s, "abc\u{20AC}\u{00E4}");
5377                 }
5378             },
5379             None => unreachable!(),
5380         }
5381     }
5382 
5383     #[test]
test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement()5384     fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5385         assert!(WINDOWS_1257
5386             .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4")
5387             .is_none());
5388     }
5389 
5390     #[test]
test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement()5391     fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5392         match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
5393             Some(cow) => match cow {
5394                 Cow::Borrowed(s) => {
5395                     assert_eq!(s, "abc");
5396                 }
5397                 Cow::Owned(_) => unreachable!(),
5398             },
5399             None => unreachable!(),
5400         }
5401     }
5402 
5403     #[test]
test_encode_ascii_only_windows_1257_to_cow()5404     fn test_encode_ascii_only_windows_1257_to_cow() {
5405         let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
5406         match cow {
5407             Cow::Borrowed(s) => {
5408                 assert_eq!(s, b"abc");
5409             }
5410             Cow::Owned(_) => unreachable!(),
5411         }
5412         assert_eq!(encoding, WINDOWS_1257);
5413         assert!(!had_errors);
5414     }
5415 
5416     #[test]
test_encode_valid_windows_1257_to_cow()5417     fn test_encode_valid_windows_1257_to_cow() {
5418         let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
5419         match cow {
5420             Cow::Borrowed(_) => unreachable!(),
5421             Cow::Owned(s) => {
5422                 assert_eq!(s, b"abc\x80\xE4");
5423             }
5424         }
5425         assert_eq!(encoding, WINDOWS_1257);
5426         assert!(!had_errors);
5427     }
5428 
5429     #[test]
test_utf16_space_with_one_bom_byte()5430     fn test_utf16_space_with_one_bom_byte() {
5431         let mut decoder = UTF_16LE.new_decoder();
5432         let mut dst = [0u16; 12];
5433         {
5434             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5435             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5436             assert_eq!(result, CoderResult::InputEmpty);
5437         }
5438         {
5439             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5440             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5441             assert_eq!(result, CoderResult::InputEmpty);
5442         }
5443     }
5444 
5445     #[test]
test_utf8_space_with_one_bom_byte()5446     fn test_utf8_space_with_one_bom_byte() {
5447         let mut decoder = UTF_8.new_decoder();
5448         let mut dst = [0u16; 12];
5449         {
5450             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5451             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5452             assert_eq!(result, CoderResult::InputEmpty);
5453         }
5454         {
5455             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5456             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5457             assert_eq!(result, CoderResult::InputEmpty);
5458         }
5459     }
5460 
5461     #[test]
test_utf16_space_with_two_bom_bytes()5462     fn test_utf16_space_with_two_bom_bytes() {
5463         let mut decoder = UTF_16LE.new_decoder();
5464         let mut dst = [0u16; 12];
5465         {
5466             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5467             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5468             assert_eq!(result, CoderResult::InputEmpty);
5469         }
5470         {
5471             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5472             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5473             assert_eq!(result, CoderResult::InputEmpty);
5474         }
5475         {
5476             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5477             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5478             assert_eq!(result, CoderResult::InputEmpty);
5479         }
5480     }
5481 
5482     #[test]
test_utf8_space_with_two_bom_bytes()5483     fn test_utf8_space_with_two_bom_bytes() {
5484         let mut decoder = UTF_8.new_decoder();
5485         let mut dst = [0u16; 12];
5486         {
5487             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5488             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5489             assert_eq!(result, CoderResult::InputEmpty);
5490         }
5491         {
5492             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5493             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5494             assert_eq!(result, CoderResult::InputEmpty);
5495         }
5496         {
5497             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5498             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5499             assert_eq!(result, CoderResult::InputEmpty);
5500         }
5501     }
5502 
5503     #[test]
test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call()5504     fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
5505         let mut decoder = UTF_16LE.new_decoder();
5506         let mut dst = [0u16; 12];
5507         {
5508             let needed = decoder.max_utf16_buffer_length(2).unwrap();
5509             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
5510             assert_eq!(result, CoderResult::InputEmpty);
5511         }
5512     }
5513 
5514     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8()5515     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
5516         let mut dst = [0u8; 8];
5517         let mut encoder = ISO_2022_JP.new_encoder();
5518         {
5519             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], false);
5520             assert_eq!(result, CoderResult::InputEmpty);
5521         }
5522         {
5523             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], true);
5524             assert_eq!(result, CoderResult::InputEmpty);
5525         }
5526     }
5527 
5528     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf8()5529     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
5530         let mut dst = [0u8; 16];
5531         let mut encoder = ISO_2022_JP.new_encoder();
5532         {
5533             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
5534             assert_eq!(result, CoderResult::InputEmpty);
5535         }
5536         {
5537             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
5538             assert_eq!(result, CoderResult::InputEmpty);
5539         }
5540         {
5541             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
5542             assert_eq!(result, CoderResult::OutputFull);
5543         }
5544     }
5545 
5546     #[test]
test_buffer_end_iso_2022_jp_from_utf8()5547     fn test_buffer_end_iso_2022_jp_from_utf8() {
5548         let mut dst = [0u8; 18];
5549         {
5550             let mut encoder = ISO_2022_JP.new_encoder();
5551             let (result, _, _, _) =
5552                 encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], false);
5553             assert_eq!(result, CoderResult::InputEmpty);
5554         }
5555         {
5556             let mut encoder = ISO_2022_JP.new_encoder();
5557             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], true);
5558             assert_eq!(result, CoderResult::OutputFull);
5559         }
5560         {
5561             let mut encoder = ISO_2022_JP.new_encoder();
5562             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], false);
5563             assert_eq!(result, CoderResult::InputEmpty);
5564         }
5565         {
5566             let mut encoder = ISO_2022_JP.new_encoder();
5567             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], true);
5568             assert_eq!(result, CoderResult::InputEmpty);
5569         }
5570     }
5571 
5572     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16()5573     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
5574         let mut dst = [0u8; 8];
5575         let mut encoder = ISO_2022_JP.new_encoder();
5576         {
5577             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], false);
5578             assert_eq!(result, CoderResult::InputEmpty);
5579         }
5580         {
5581             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], true);
5582             assert_eq!(result, CoderResult::InputEmpty);
5583         }
5584     }
5585 
5586     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf16()5587     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
5588         let mut dst = [0u8; 16];
5589         let mut encoder = ISO_2022_JP.new_encoder();
5590         {
5591             let (result, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
5592             assert_eq!(result, CoderResult::InputEmpty);
5593         }
5594         {
5595             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], false);
5596             assert_eq!(result, CoderResult::InputEmpty);
5597         }
5598         {
5599             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], true);
5600             assert_eq!(result, CoderResult::OutputFull);
5601         }
5602     }
5603 
5604     #[test]
test_buffer_end_iso_2022_jp_from_utf16()5605     fn test_buffer_end_iso_2022_jp_from_utf16() {
5606         let mut dst = [0u8; 18];
5607         {
5608             let mut encoder = ISO_2022_JP.new_encoder();
5609             let (result, _, _, _) =
5610                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], false);
5611             assert_eq!(result, CoderResult::InputEmpty);
5612         }
5613         {
5614             let mut encoder = ISO_2022_JP.new_encoder();
5615             let (result, _, _, _) =
5616                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], true);
5617             assert_eq!(result, CoderResult::OutputFull);
5618         }
5619         {
5620             let mut encoder = ISO_2022_JP.new_encoder();
5621             let (result, _, _, _) =
5622                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], false);
5623             assert_eq!(result, CoderResult::InputEmpty);
5624         }
5625         {
5626             let mut encoder = ISO_2022_JP.new_encoder();
5627             let (result, _, _, _) =
5628                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], true);
5629             assert_eq!(result, CoderResult::InputEmpty);
5630         }
5631     }
5632 
5633     #[test]
test_buffer_end_utf16be()5634     fn test_buffer_end_utf16be() {
5635         let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
5636         let mut dest = [0u8; 4];
5637 
5638         assert_eq!(
5639             decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, false),
5640             (CoderResult::InputEmpty, 2, 0, false)
5641         );
5642 
5643         let _ = decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, true);
5644     }
5645 
5646     #[test]
test_hash()5647     fn test_hash() {
5648         let mut encodings = ::alloc::collections::btree_set::BTreeSet::new();
5649         encodings.insert(UTF_8);
5650         encodings.insert(ISO_2022_JP);
5651         assert!(encodings.contains(UTF_8));
5652         assert!(encodings.contains(ISO_2022_JP));
5653         assert!(!encodings.contains(WINDOWS_1252));
5654         encodings.remove(ISO_2022_JP);
5655         assert!(!encodings.contains(ISO_2022_JP));
5656     }
5657 
5658     #[test]
test_iso_2022_jp_ncr_extra_from_utf16()5659     fn test_iso_2022_jp_ncr_extra_from_utf16() {
5660         let mut dst = [0u8; 17];
5661         {
5662             let mut encoder = ISO_2022_JP.new_encoder();
5663             let (result, _, _, _) =
5664                 encoder.encode_from_utf16(&[0x3041u16, 0xFFFFu16], &mut dst[..], true);
5665             assert_eq!(result, CoderResult::OutputFull);
5666         }
5667     }
5668 
5669     #[test]
test_iso_2022_jp_ncr_extra_from_utf8()5670     fn test_iso_2022_jp_ncr_extra_from_utf8() {
5671         let mut dst = [0u8; 17];
5672         {
5673             let mut encoder = ISO_2022_JP.new_encoder();
5674             let (result, _, _, _) =
5675                 encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
5676             assert_eq!(result, CoderResult::OutputFull);
5677         }
5678     }
5679 
5680     #[test]
test_max_length_with_bom_to_utf8()5681     fn test_max_length_with_bom_to_utf8() {
5682         let mut output = [0u8; 20];
5683         let mut decoder = REPLACEMENT.new_decoder();
5684         let input = b"\xEF\xBB\xBFA";
5685         {
5686             let needed = decoder
5687                 .max_utf8_buffer_length_without_replacement(input.len())
5688                 .unwrap();
5689             let (result, read, written) =
5690                 decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);
5691             assert_eq!(result, DecoderResult::InputEmpty);
5692             assert_eq!(read, input.len());
5693             assert_eq!(written, 1);
5694             assert_eq!(output[0], 0x41);
5695         }
5696     }
5697 
5698     #[cfg(feature = "serde")]
5699     #[test]
test_serde()5700     fn test_serde() {
5701         let demo = Demo {
5702             num: 42,
5703             name: "foo".into(),
5704             enc: UTF_8,
5705         };
5706 
5707         let serialized = serde_json::to_string(&demo).unwrap();
5708 
5709         let deserialized: Demo = serde_json::from_str(&serialized).unwrap();
5710         assert_eq!(deserialized, demo);
5711 
5712         let bincoded = bincode::serialize(&demo).unwrap();
5713         let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap();
5714         assert_eq!(debincoded, demo);
5715     }
5716 
5717     #[test]
test_is_single_byte()5718     fn test_is_single_byte() {
5719         assert!(!BIG5.is_single_byte());
5720         assert!(!EUC_JP.is_single_byte());
5721         assert!(!EUC_KR.is_single_byte());
5722         assert!(!GB18030.is_single_byte());
5723         assert!(!GBK.is_single_byte());
5724         assert!(!REPLACEMENT.is_single_byte());
5725         assert!(!SHIFT_JIS.is_single_byte());
5726         assert!(!UTF_8.is_single_byte());
5727         assert!(!UTF_16BE.is_single_byte());
5728         assert!(!UTF_16LE.is_single_byte());
5729         assert!(!ISO_2022_JP.is_single_byte());
5730 
5731         assert!(IBM866.is_single_byte());
5732         assert!(ISO_8859_2.is_single_byte());
5733         assert!(ISO_8859_3.is_single_byte());
5734         assert!(ISO_8859_4.is_single_byte());
5735         assert!(ISO_8859_5.is_single_byte());
5736         assert!(ISO_8859_6.is_single_byte());
5737         assert!(ISO_8859_7.is_single_byte());
5738         assert!(ISO_8859_8.is_single_byte());
5739         assert!(ISO_8859_10.is_single_byte());
5740         assert!(ISO_8859_13.is_single_byte());
5741         assert!(ISO_8859_14.is_single_byte());
5742         assert!(ISO_8859_15.is_single_byte());
5743         assert!(ISO_8859_16.is_single_byte());
5744         assert!(ISO_8859_8_I.is_single_byte());
5745         assert!(KOI8_R.is_single_byte());
5746         assert!(KOI8_U.is_single_byte());
5747         assert!(MACINTOSH.is_single_byte());
5748         assert!(WINDOWS_874.is_single_byte());
5749         assert!(WINDOWS_1250.is_single_byte());
5750         assert!(WINDOWS_1251.is_single_byte());
5751         assert!(WINDOWS_1252.is_single_byte());
5752         assert!(WINDOWS_1253.is_single_byte());
5753         assert!(WINDOWS_1254.is_single_byte());
5754         assert!(WINDOWS_1255.is_single_byte());
5755         assert!(WINDOWS_1256.is_single_byte());
5756         assert!(WINDOWS_1257.is_single_byte());
5757         assert!(WINDOWS_1258.is_single_byte());
5758         assert!(X_MAC_CYRILLIC.is_single_byte());
5759         assert!(X_USER_DEFINED.is_single_byte());
5760     }
5761 
5762     #[test]
test_latin1_byte_compatible_up_to()5763     fn test_latin1_byte_compatible_up_to() {
5764         let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";
5765         assert_eq!(
5766             BIG5.new_decoder_without_bom_handling()
5767                 .latin1_byte_compatible_up_to(buffer)
5768                 .unwrap(),
5769             1
5770         );
5771         assert_eq!(
5772             EUC_JP
5773                 .new_decoder_without_bom_handling()
5774                 .latin1_byte_compatible_up_to(buffer)
5775                 .unwrap(),
5776             1
5777         );
5778         assert_eq!(
5779             EUC_KR
5780                 .new_decoder_without_bom_handling()
5781                 .latin1_byte_compatible_up_to(buffer)
5782                 .unwrap(),
5783             1
5784         );
5785         assert_eq!(
5786             GB18030
5787                 .new_decoder_without_bom_handling()
5788                 .latin1_byte_compatible_up_to(buffer)
5789                 .unwrap(),
5790             1
5791         );
5792         assert_eq!(
5793             GBK.new_decoder_without_bom_handling()
5794                 .latin1_byte_compatible_up_to(buffer)
5795                 .unwrap(),
5796             1
5797         );
5798         assert!(REPLACEMENT
5799             .new_decoder_without_bom_handling()
5800             .latin1_byte_compatible_up_to(buffer)
5801             .is_none());
5802         assert_eq!(
5803             SHIFT_JIS
5804                 .new_decoder_without_bom_handling()
5805                 .latin1_byte_compatible_up_to(buffer)
5806                 .unwrap(),
5807             1
5808         );
5809         assert_eq!(
5810             UTF_8
5811                 .new_decoder_without_bom_handling()
5812                 .latin1_byte_compatible_up_to(buffer)
5813                 .unwrap(),
5814             1
5815         );
5816         assert!(UTF_16BE
5817             .new_decoder_without_bom_handling()
5818             .latin1_byte_compatible_up_to(buffer)
5819             .is_none());
5820         assert!(UTF_16LE
5821             .new_decoder_without_bom_handling()
5822             .latin1_byte_compatible_up_to(buffer)
5823             .is_none());
5824         assert_eq!(
5825             ISO_2022_JP
5826                 .new_decoder_without_bom_handling()
5827                 .latin1_byte_compatible_up_to(buffer)
5828                 .unwrap(),
5829             1
5830         );
5831 
5832         assert_eq!(
5833             IBM866
5834                 .new_decoder_without_bom_handling()
5835                 .latin1_byte_compatible_up_to(buffer)
5836                 .unwrap(),
5837             1
5838         );
5839         assert_eq!(
5840             ISO_8859_2
5841                 .new_decoder_without_bom_handling()
5842                 .latin1_byte_compatible_up_to(buffer)
5843                 .unwrap(),
5844             2
5845         );
5846         assert_eq!(
5847             ISO_8859_3
5848                 .new_decoder_without_bom_handling()
5849                 .latin1_byte_compatible_up_to(buffer)
5850                 .unwrap(),
5851             2
5852         );
5853         assert_eq!(
5854             ISO_8859_4
5855                 .new_decoder_without_bom_handling()
5856                 .latin1_byte_compatible_up_to(buffer)
5857                 .unwrap(),
5858             2
5859         );
5860         assert_eq!(
5861             ISO_8859_5
5862                 .new_decoder_without_bom_handling()
5863                 .latin1_byte_compatible_up_to(buffer)
5864                 .unwrap(),
5865             2
5866         );
5867         assert_eq!(
5868             ISO_8859_6
5869                 .new_decoder_without_bom_handling()
5870                 .latin1_byte_compatible_up_to(buffer)
5871                 .unwrap(),
5872             2
5873         );
5874         assert_eq!(
5875             ISO_8859_7
5876                 .new_decoder_without_bom_handling()
5877                 .latin1_byte_compatible_up_to(buffer)
5878                 .unwrap(),
5879             2
5880         );
5881         assert_eq!(
5882             ISO_8859_8
5883                 .new_decoder_without_bom_handling()
5884                 .latin1_byte_compatible_up_to(buffer)
5885                 .unwrap(),
5886             3
5887         );
5888         assert_eq!(
5889             ISO_8859_10
5890                 .new_decoder_without_bom_handling()
5891                 .latin1_byte_compatible_up_to(buffer)
5892                 .unwrap(),
5893             2
5894         );
5895         assert_eq!(
5896             ISO_8859_13
5897                 .new_decoder_without_bom_handling()
5898                 .latin1_byte_compatible_up_to(buffer)
5899                 .unwrap(),
5900             4
5901         );
5902         assert_eq!(
5903             ISO_8859_14
5904                 .new_decoder_without_bom_handling()
5905                 .latin1_byte_compatible_up_to(buffer)
5906                 .unwrap(),
5907             4
5908         );
5909         assert_eq!(
5910             ISO_8859_15
5911                 .new_decoder_without_bom_handling()
5912                 .latin1_byte_compatible_up_to(buffer)
5913                 .unwrap(),
5914             6
5915         );
5916         assert_eq!(
5917             ISO_8859_16
5918                 .new_decoder_without_bom_handling()
5919                 .latin1_byte_compatible_up_to(buffer)
5920                 .unwrap(),
5921             4
5922         );
5923         assert_eq!(
5924             ISO_8859_8_I
5925                 .new_decoder_without_bom_handling()
5926                 .latin1_byte_compatible_up_to(buffer)
5927                 .unwrap(),
5928             3
5929         );
5930         assert_eq!(
5931             KOI8_R
5932                 .new_decoder_without_bom_handling()
5933                 .latin1_byte_compatible_up_to(buffer)
5934                 .unwrap(),
5935             1
5936         );
5937         assert_eq!(
5938             KOI8_U
5939                 .new_decoder_without_bom_handling()
5940                 .latin1_byte_compatible_up_to(buffer)
5941                 .unwrap(),
5942             1
5943         );
5944         assert_eq!(
5945             MACINTOSH
5946                 .new_decoder_without_bom_handling()
5947                 .latin1_byte_compatible_up_to(buffer)
5948                 .unwrap(),
5949             1
5950         );
5951         assert_eq!(
5952             WINDOWS_874
5953                 .new_decoder_without_bom_handling()
5954                 .latin1_byte_compatible_up_to(buffer)
5955                 .unwrap(),
5956             2
5957         );
5958         assert_eq!(
5959             WINDOWS_1250
5960                 .new_decoder_without_bom_handling()
5961                 .latin1_byte_compatible_up_to(buffer)
5962                 .unwrap(),
5963             4
5964         );
5965         assert_eq!(
5966             WINDOWS_1251
5967                 .new_decoder_without_bom_handling()
5968                 .latin1_byte_compatible_up_to(buffer)
5969                 .unwrap(),
5970             1
5971         );
5972         assert_eq!(
5973             WINDOWS_1252
5974                 .new_decoder_without_bom_handling()
5975                 .latin1_byte_compatible_up_to(buffer)
5976                 .unwrap(),
5977             5
5978         );
5979         assert_eq!(
5980             WINDOWS_1253
5981                 .new_decoder_without_bom_handling()
5982                 .latin1_byte_compatible_up_to(buffer)
5983                 .unwrap(),
5984             3
5985         );
5986         assert_eq!(
5987             WINDOWS_1254
5988                 .new_decoder_without_bom_handling()
5989                 .latin1_byte_compatible_up_to(buffer)
5990                 .unwrap(),
5991             4
5992         );
5993         assert_eq!(
5994             WINDOWS_1255
5995                 .new_decoder_without_bom_handling()
5996                 .latin1_byte_compatible_up_to(buffer)
5997                 .unwrap(),
5998             3
5999         );
6000         assert_eq!(
6001             WINDOWS_1256
6002                 .new_decoder_without_bom_handling()
6003                 .latin1_byte_compatible_up_to(buffer)
6004                 .unwrap(),
6005             1
6006         );
6007         assert_eq!(
6008             WINDOWS_1257
6009                 .new_decoder_without_bom_handling()
6010                 .latin1_byte_compatible_up_to(buffer)
6011                 .unwrap(),
6012             4
6013         );
6014         assert_eq!(
6015             WINDOWS_1258
6016                 .new_decoder_without_bom_handling()
6017                 .latin1_byte_compatible_up_to(buffer)
6018                 .unwrap(),
6019             4
6020         );
6021         assert_eq!(
6022             X_MAC_CYRILLIC
6023                 .new_decoder_without_bom_handling()
6024                 .latin1_byte_compatible_up_to(buffer)
6025                 .unwrap(),
6026             1
6027         );
6028         assert_eq!(
6029             X_USER_DEFINED
6030                 .new_decoder_without_bom_handling()
6031                 .latin1_byte_compatible_up_to(buffer)
6032                 .unwrap(),
6033             1
6034         );
6035 
6036         assert!(UTF_8
6037             .new_decoder()
6038             .latin1_byte_compatible_up_to(buffer)
6039             .is_none());
6040 
6041         let mut decoder = UTF_8.new_decoder();
6042         let mut output = [0u16; 4];
6043         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6044         assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());
6045         let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
6046         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));
6047         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6048         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
6049     }
6050 }
6051