1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 #![cfg_attr(
11     feature = "cargo-clippy",
12     allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14 
15 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17 //! Gecko-oriented means that converting to and from UTF-16 is supported in
18 //! addition to converting to and from UTF-8, that the performance and
19 //! streamability goals are browser-oriented, and that FFI-friendliness is a
20 //! goal.
21 //!
22 //! Additionally, the `mem` module provides functions that are useful for
23 //! applications that need to be able to deal with legacy in-memory
24 //! representations of Unicode.
25 //!
26 //! For expectation setting, please be sure to read the sections
27 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29 //!
30 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31 //! design and internals of the crate.
32 //!
33 //! # Availability
34 //!
35 //! The code is available under the
36 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38 //! See the
39 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40 //! file for details.
41 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43 //!
44 //! # Integration with `std::io`
45 //!
46 //! This crate doesn't implement traits from `std::io`. However, the case of
47 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50 //!
51 //! # Examples
52 //!
53 //! Example programs:
54 //!
55 //! * [Rust](https://github.com/hsivonen/recode_rs)
56 //! * [C](https://github.com/hsivonen/recode_c)
57 //! * [C++](https://github.com/hsivonen/recode_cpp)
58 //!
59 //! Decode using the non-streaming API:
60 //!
61 //! ```
62 //! #[cfg(feature = "alloc")] {
63 //! use encoding_rs::*;
64 //!
65 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
66 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
67 //!
68 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
69 //! assert_eq!(&cow[..], expectation);
70 //! assert_eq!(encoding_used, SHIFT_JIS);
71 //! assert!(!had_errors);
72 //! }
73 //! ```
74 //!
75 //! Decode using the streaming API with minimal `unsafe`:
76 //!
77 //! ```
78 //! use encoding_rs::*;
79 //!
80 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
81 //!
82 //! // Use an array of byte slices to demonstrate content arriving piece by
83 //! // piece from the network.
84 //! let bytes: [&'static [u8]; 4] = [b"\x83",
85 //!                                  b"n\x83\x8D\x81",
86 //!                                  b"[\x81E\x83\x8F\x81[\x83",
87 //!                                  b"\x8B\x83h"];
88 //!
89 //! // Very short output buffer to demonstrate the output buffer getting full.
90 //! // Normally, you'd use something like `[0u8; 2048]`.
91 //! let mut buffer_bytes = [0u8; 8];
92 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
93 //!
94 //! // How many bytes in the buffer currently hold significant data.
95 //! let mut bytes_in_buffer = 0usize;
96 //!
97 //! // Collect the output to a string for demonstration purposes.
98 //! let mut output = String::new();
99 //!
100 //! // The `Decoder`
101 //! let mut decoder = SHIFT_JIS.new_decoder();
102 //!
103 //! // Track whether we see errors.
104 //! let mut total_had_errors = false;
105 //!
106 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
107 //! // use of a fixed-size buffer; normally when the output of an incremental
108 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
109 //! // avoid the intermediate buffer).
110 //! for input in &bytes[..] {
111 //!     // The number of bytes already read from current `input` in total.
112 //!     let mut total_read_from_current_input = 0usize;
113 //!
114 //!     loop {
115 //!         let (result, read, written, had_errors) =
116 //!             decoder.decode_to_str(&input[total_read_from_current_input..],
117 //!                                   &mut buffer[bytes_in_buffer..],
118 //!                                   false);
119 //!         total_read_from_current_input += read;
120 //!         bytes_in_buffer += written;
121 //!         total_had_errors |= had_errors;
122 //!         match result {
123 //!             CoderResult::InputEmpty => {
124 //!                 // We have consumed the current input buffer. Break out of
125 //!                 // the inner loop to get the next input buffer from the
126 //!                 // outer loop.
127 //!                 break;
128 //!             },
129 //!             CoderResult::OutputFull => {
130 //!                 // Write the current buffer out and consider the buffer
131 //!                 // empty.
132 //!                 output.push_str(&buffer[..bytes_in_buffer]);
133 //!                 bytes_in_buffer = 0usize;
134 //!                 continue;
135 //!             }
136 //!         }
137 //!     }
138 //! }
139 //!
140 //! // Process EOF
141 //! loop {
142 //!     let (result, _, written, had_errors) =
143 //!         decoder.decode_to_str(b"",
144 //!                               &mut buffer[bytes_in_buffer..],
145 //!                               true);
146 //!     bytes_in_buffer += written;
147 //!     total_had_errors |= had_errors;
148 //!     // Write the current buffer out and consider the buffer empty.
149 //!     // Need to do this here for both `match` arms, because we exit the
150 //!     // loop on `CoderResult::InputEmpty`.
151 //!     output.push_str(&buffer[..bytes_in_buffer]);
152 //!     bytes_in_buffer = 0usize;
153 //!     match result {
154 //!         CoderResult::InputEmpty => {
155 //!             // Done!
156 //!             break;
157 //!         },
158 //!         CoderResult::OutputFull => {
159 //!             continue;
160 //!         }
161 //!     }
162 //! }
163 //!
164 //! assert_eq!(&output[..], expectation);
165 //! assert!(!total_had_errors);
166 //! ```
167 //!
168 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
169 //!
170 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
171 //! __so this crate does not provide encoders for those encodings__!
172 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
173 //! so you get a UTF-8 encoder if you request an encoder for them.
174 //!
175 //! Additionally, the Encoding Standard factors BOM handling into wrapper
176 //! algorithms so that BOM handling isn't part of the definition of the
177 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
178 //! Standard define BOM handling or lack thereof as part of the encoding
179 //! scheme.
180 //!
181 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
182 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
183 //! the Unicode Standard.
184 //!
185 //! When used with the `_with_bom_removal` entry points, the UTF-8
186 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
187 //! Standard.
188 //!
189 //! This crate does not provide a mode that matches the UTF-16 _encoding
190 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
191 //! the entry points without `_bom_` qualifiers is the closest match,
192 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
193 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
194 //! Unicode Standard.
195 //!
196 //! The UTF-32 family of Unicode encoding schemes is not supported
197 //! by this crate. The Encoding Standard doesn't define any UTF-32
198 //! family encodings, since they aren't necessary for consuming Web
199 //! content.
200 //!
201 //! ## ISO-8859-1
202 //!
203 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
204 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
205 //! byte value to the same Unicode scalar value is not available via
206 //! `Encoding` in this crate.
207 //!
208 //! However, the functions whose name starts with `convert` and contains
209 //! `latin1` in the `mem` module support such conversions, which are known as
210 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
211 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
212 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
213 //!
214 //! ## Web / Browser Focus
215 //!
216 //! Both in terms of scope and performance, the focus is on the Web. For scope,
217 //! this means that encoding_rs implements the Encoding Standard fully and
218 //! doesn't implement encodings that are not specified in the Encoding
219 //! Standard. For performance, this means that decoding performance is
220 //! important as well as performance for encoding into UTF-8 or encoding the
221 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
222 //! be encoded into legacy encodings in only two places in the Web platform: in
223 //! the query part of URLs, in which case it's a matter of relatively rare
224 //! error handling, and in form submission, in which case the user action and
225 //! networking tend to hide the performance of the encoder.
226 //!
227 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
228 //! encodings enables smaller code size thanks to the encoder side using the
229 //! decode-optimized data tables without having encode-optimized data tables at
230 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
231 //! multiplication operations.
232 //!
233 //! Additionally, performance is a non-goal for the ASCII-incompatible
234 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
235 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
236 //! of implementation.
237 //!
238 //! Despite the browser focus, the hope is that non-browser applications
239 //! that wish to consume Web content or submit Web forms in a Web-compatible
240 //! way will find encoding_rs useful. While encoding_rs does not try to match
241 //! Windows behavior, many of the encodings are close enough to legacy
242 //! encodings implemented by Windows that applications that need to consume
243 //! data in legacy Windows encodings may find encoding_rs useful. The
244 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
245 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
246 //!
247 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
248 //! to the encodings defined in the Encoding Standard. The
249 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
250 //! UTF-7 decoding for email purposes.
251 //!
252 //! For single-byte DOS encodings beyond the ones supported by the Encoding
253 //! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
254 //!
255 //! # Preparing Text for the Encoders
256 //!
257 //! Normalizing text into Unicode Normalization Form C prior to encoding text
258 //! into a legacy encoding minimizes unmappable characters. Text can be
259 //! normalized to Unicode Normalization Form C using the
260 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
261 //!
262 //! The exception is windows-1258, which after normalizing to Unicode
263 //! Normalization Form C requires tone marks to be decomposed in order to
264 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
265 //! using the [`detone`](https://crates.io/crates/detone) crate.
266 //!
267 //! # Streaming & Non-Streaming; Rust & C/C++
268 //!
269 //! The API in Rust has two modes of operation: streaming and non-streaming.
270 //! The streaming API is the foundation of the implementation and should be
271 //! used when processing data that arrives piecemeal from an i/o stream. The
272 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
273 //! to C callers. The non-streaming part of the API is for Rust callers only and
274 //! is smart about borrowing instead of copying when possible. When
275 //! streamability is not needed, the non-streaming API should be preferred in
276 //! order to avoid copying data when a borrow suffices.
277 //!
278 //! There is no analogous C API exposed via FFI, mainly because C doesn't have
279 //! standard types for growable byte buffers and Unicode strings that know
280 //! their length.
281 //!
282 //! The C API (header file generated at `target/include/encoding_rs.h` when
283 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
284 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
285 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
286 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
287 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
288 //! as part of Mozilla [bug 1261841][4].
289 //!
290 //! The `Encoding` type is common to both the streaming and non-streaming
291 //! modes. In the streaming mode, decoding operations are performed with a
292 //! `Decoder` and encoding operations with an `Encoder` object obtained via
293 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
294 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
295 //! and `Encoder` objects are not used at all.
296 //!
297 //! [1]: https://github.com/hsivonen/encoding_c
298 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
299 //! [3]: https://github.com/Microsoft/GSL/
300 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
301 //!
302 //! # Memory management
303 //!
304 //! The streaming mode never performs heap allocations (even the methods
305 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
306 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
307 //! is, the streaming mode uses caller-allocated buffers exclusively.
308 //!
309 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
310 //! perform heap allocations but only to allocate the backing buffer of the
311 //! `Vec<u8>` or the `String`.
312 //!
313 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
314 //! `Drop` cleanup.
315 //!
316 //! # Buffer reading and writing behavior
317 //!
318 //! Based on experience gained with the `java.nio.charset` encoding converter
319 //! API and with the Gecko uconv encoding converter API, the buffer reading
320 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
321 //! fully drained but output buffers are not always fully filled.
322 //!
323 //! When reading from an input buffer, encoding_rs always consumes all input
324 //! up to the next error or to the end of the buffer. In particular, when
325 //! decoding, even if the input buffer ends in the middle of a byte sequence
326 //! for a character, the decoder consumes all input. This has the benefit that
327 //! the caller of the API can always fill the next buffer from the start from
328 //! whatever source the bytes come from and never has to first copy the last
329 //! bytes of the previous buffer to the start of the next buffer. However, when
330 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
331 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
332 //! boundaries falling in the middle of a surrogate pair result in both
333 //! surrogates being treated individually as unpaired surrogates.
334 //!
335 //! Additionally, decoders guarantee that they can be fed even one byte at a
336 //! time and encoders guarantee that they can be fed even one code point at a
337 //! time. This has the benefit of not placing restrictions on the size of
338 //! chunks in which the content arrives, e.g. from the network.
339 //!
340 //! When writing into an output buffer, encoding_rs makes sure that the code
341 //! unit sequence for a character is never split across output buffer
342 //! boundaries. This may result in wasted space at the end of an output buffer,
343 //! but the advantages are that the output side of both decoders and encoders
344 //! is greatly simplified compared to designs that attempt to fill output
345 //! buffers exactly even when that entails splitting a code unit sequence and
346 //! when encoding_rs methods return to the caller, the output produced thus
347 //! far is always valid taken as whole. (In the case of encoding to ISO-2022-JP,
348 //! the output needs to be considered as a whole, because the latest output
349 //! buffer might not be valid taken alone if the transition away
350 //! from the ASCII state occurred in an earlier output buffer. However, since
351 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
352 //! state as being in error despite the encoder generating a transition to the
353 //! ASCII state at the end, the claim about the partial output taken as a whole
354 //! being valid is true even for ISO-2022-JP.)
355 //!
356 //! # Error Reporting
357 //!
358 //! Based on experience gained with the `java.nio.charset` encoding converter
359 //! API and with the Gecko uconv encoding converter API, the error reporting
360 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
361 //! that leave it up to the caller to extract the erroneous bytes from the
362 //! input stream if the caller wishes to do so but encoder errors provide the
363 //! code point associated with the error without requiring the caller to
364 //! extract it from the input on its own.
365 //!
366 //! On the encoder side, an error is always triggered by the most recently
367 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
368 //! caller. Also, it's very typical for the caller to wish to do something with
369 //! this data: generate a numeric escape for the character. Additionally, the
370 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
371 //! certain cases, so requiring the caller to extract the character from the
372 //! input buffer would require the caller to handle ISO-2022-JP details.
373 //! Furthermore, requiring the caller to extract the character from the input
374 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
375 //! the job of an encoding conversion library.
376 //!
377 //! On the decoder side, errors are triggered in more complex ways. For
378 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
379 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
380 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
381 //! be the ones most recently pushed to the decoder and the error might not even
382 //! be in the current buffer.
383 //!
384 //! Some encoding conversion APIs address the problem by not acknowledging
385 //! trailing bytes of an input buffer as consumed if it's still possible for
386 //! future bytes to cause the trailing bytes to be in error. This way, error
387 //! reporting can always refer to the most recently pushed buffer. This has the
388 //! problem that the caller of the API has to copy the unconsumed trailing
389 //! bytes to the start of the next buffer before being able to fill the rest
390 //! of the next buffer. This is annoying, error-prone and inefficient.
391 //!
392 //! A possible solution would be making the decoder remember recently consumed
393 //! bytes in order to be able to include a copy of the erroneous bytes when
394 //! reporting an error. This has two problems: First, callers are rarely
395 //! interested in the erroneous bytes, so attempts to identify them are most
396 //! often just overhead anyway. Second, the rare applications that are
397 //! interested typically care about the location of the error in the input
398 //! stream.
399 //!
400 //! To keep the API convenient for common uses and the overhead low while making
401 //! it possible to develop applications, such as HTML validators, that care
402 //! about which bytes were in error, encoding_rs reports the length of the
403 //! erroneous sequence and the number of bytes consumed after the erroneous
404 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
405 //! this makes it possible for callers that care about the erroneous bytes to
406 //! locate them.
407 //!
408 //! # No Convenience API for Custom Replacements
409 //!
410 //! The Web Platform and, therefore, the Encoding Standard supports only one
411 //! error recovery mode for decoders and only one error recovery mode for
412 //! encoders. The supported error recovery mode for decoders is emitting the
413 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
414 //! encoders is emitting an HTML decimal numeric character reference for
415 //! unmappable characters.
416 //!
417 //! Since encoding_rs is Web-focused, these are the only error recovery modes
418 //! for which convenient support is provided. Moreover, on the decoder side,
419 //! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
420 //! on error (other than treating errors as fatal). In particular, simply
421 //! ignoring errors is a
422 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
423 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
424 //! callers to ignore errors.
425 //!
426 //! On the encoder side, there are plausible alternatives for HTML decimal
427 //! numeric character references. For example, when outputting CSS, CSS-style
428 //! escapes would seem to make sense. However, instead of facilitating the
429 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
430 //! position that you shouldn't generate output in encodings other than UTF-8,
431 //! except where backward compatibility with interacting with the legacy Web
432 //! requires it. The legacy Web requires it only when parsing the query strings
433 //! of URLs and when submitting forms, and those two both use HTML decimal
434 //! numeric character references.
435 //!
436 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
437 //! numeric character references easy, it does make them _possible_.
438 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
439 //! for unmappable characters, is implemented on top of
440 //! `encode_from_utf8_without_replacement()`. Applications that really, really
441 //! want other replacement schemes for unmappable characters can likewise
442 //! implement them on top of `encode_from_utf8_without_replacement()`.
443 //!
444 //! # No Extensibility by Design
445 //!
446 //! The set of encodings supported by encoding_rs is not extensible by design.
447 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
448 //! rather than `trait`s. encoding_rs takes the design position that all future
449 //! text interchange should be done using UTF-8, which can represent all of
450 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
451 //! Standard and encoding_rs that can represent all of Unicode and that has
452 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
453 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
454 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
455 //! other than being able to consume legacy content.
456 //!
457 //! Considering that UTF-8 can represent all of Unicode and is already supported
458 //! by all Web browsers, introducing a new encoding wouldn't add to the
459 //! expressiveness but would add to compatibility problems. In that sense,
460 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
461 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
462 //! the Web Platform. On the other hand, the set of legacy encodings that must
463 //! be supported for a Web browser to be able to be successful is not going to
464 //! expand. Empirically, the set of encodings specified in the Encoding Standard
465 //! is already sufficient and the set of legacy encodings won't grow
466 //! retroactively.
467 //!
468 //! Since extensibility doesn't make sense considering the Web focus of
469 //! encoding_rs and adding encodings to Web clients would be actively harmful,
470 //! it makes sense to make the set of encodings that encoding_rs supports
471 //! non-extensible and to take the (admittedly small) benefits arising from
472 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
473 //! of time, which enables stack allocation thereof.
474 //!
475 //! This does have downsides for applications that might want to put encoding_rs
476 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
477 //! needed for Web uses. The needs of such applications should not complicate
478 //! encoding_rs itself, though. It is up to those applications to provide a
479 //! framework that delegates the operations with encodings that encoding_rs
480 //! supports to encoding_rs and operations with other encodings to something
481 //! else (as opposed to encoding_rs itself providing an extensibility
482 //! framework).
483 //!
484 //! # Panics
485 //!
486 //! Methods in encoding_rs can panic if the API is used against the requirements
487 //! stated in the documentation, if a state that's supposed to be impossible
488 //! is reached due to an internal bug or on integer overflow. When used
489 //! according to documentation with buffer sizes that stay below integer
490 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
491 //!
492 //! Panics arising from API misuse aren't documented beyond this on individual
493 //! methods.
494 //!
495 //! # At-Risk Parts of the API
496 //!
497 //! The foreseeable source of partially backward-incompatible API change is the
498 //! way the instances of `Encoding` are made available.
499 //!
500 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
501 //! initialized with `static`s of type `&'static Encoding`, the non-reference
502 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
503 //!
504 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
505 //! unique when the constant is used in different crates, the reference-typed
506 //! `static`s for the encoding instances will be changed from `static` to
507 //! `const` and the non-reference-typed `_INIT` instances will be removed.
508 //!
509 //! # Mapping Spec Concepts onto the API
510 //!
511 //! <table>
512 //! <thead>
513 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
514 //! </thead>
515 //! <tbody>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
522 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
523 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
524 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
525 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
526 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
527 //! </tbody>
528 //! </table>
529 //!
530 //! # Compatibility with the rust-encoding API
531 //!
532 //! The crate
533 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
534 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
535 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
536 //!
537 //! # Mapping rust-encoding concepts to encoding_rs concepts
538 //!
539 //! The following table provides a mapping from rust-encoding constructs to
540 //! encoding_rs ones.
541 //!
542 //! <table>
543 //! <thead>
544 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
545 //! </thead>
546 //! <tbody>
547 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
548 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
549 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
550 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
551 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
552 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
553 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
554 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
555 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
556 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
557 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
558 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
559 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
560 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
561 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
562 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
563 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
564 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
567 //! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the `Malformed` result as fatal).</td></tr>
568 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
569 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
570 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
571 //! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the `Unmappable` result as fatal).</td></tr>
572 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
573 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
574 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
575 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
576 //! </tbody>
577 //! </table>
578 //!
579 //! # Relationship with Windows Code Pages
580 //!
581 //! Despite the Web and browser focus, the encodings defined by the Encoding
582 //! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the
//! encodings that have a closely related Windows code page, the number of the
//! closest code page, a column indicating whether Windows maps unassigned code
//! points to the Unicode Private Use Area instead of U+FFFD and a remark number
//! indicating remarks in the list after the table.
589 //!
590 //! <table>
591 //! <thead>
592 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
593 //! </thead>
594 //! <tbody>
595 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
596 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
597 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
598 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
599 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
600 //! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
601 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
602 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
603 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
604 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
605 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
606 //! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
607 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
608 //! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
609 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
610 //! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
611 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
612 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
613 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
614 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
615 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
616 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
617 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
618 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
619 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
620 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
621 //! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
622 //! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
623 //! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
624 //! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
625 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
626 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
627 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
628 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
629 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
630 //! </tbody>
631 //! </table>
632 //!
633 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
634 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
635 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
636 //!    which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
637 //!    decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
638 //!    LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
639 //!    instead of U+2019 RIGHT SINGLE QUOTATION MARK.
//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to PUA instead
//!    of LRM and RLM, respectively.
642 //! 5. Remarks from the previous item apply.
643 //!
644 //! The differences between this crate and Windows in the case of multibyte encodings
645 //! are not yet fully documented here. The lack of remarks above should not be taken
646 //! as indication of lack of differences.
647 //!
648 //! # Notable Differences from IANA Naming
649 //!
650 //! In some cases, the Encoding Standard specifies the popular unextended encoding
651 //! name where in IANA terms one of the other labels would be more precise considering
652 //! the extensions that the Encoding Standard has unified into the encoding.
653 //!
654 //! <table>
655 //! <thead>
656 //! <tr><th>Encoding</th><th>IANA</th></tr>
657 //! </thead>
658 //! <tbody>
659 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
660 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
661 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
662 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
663 //! </tbody>
664 //! </table>
665 //!
666 //! In other cases where the Encoding Standard unifies unextended and extended
667 //! variants of an encoding, the encoding gets the name of the extended
668 //! variant.
669 //!
670 //! <table>
671 //! <thead>
672 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
673 //! </thead>
674 //! <tbody>
675 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
676 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
677 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
678 //! </tbody>
679 //! </table>
680 //!
681 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
682 //! for discussion about the UTF-16 family.
683 
684 #![no_std]
685 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
686 
687 #[cfg(feature = "alloc")]
688 #[cfg_attr(test, macro_use)]
689 extern crate alloc;
690 
691 extern crate core;
692 #[macro_use]
693 extern crate cfg_if;
694 
695 #[cfg(all(
696     feature = "simd-accel",
697     any(
698         target_feature = "sse2",
699         all(target_endian = "little", target_arch = "aarch64"),
700         all(target_endian = "little", target_feature = "neon")
701     )
702 ))]
703 #[macro_use(shuffle)]
704 extern crate packed_simd;
705 
706 #[cfg(feature = "serde")]
707 extern crate serde;
708 
709 #[cfg(all(test, feature = "serde"))]
710 extern crate bincode;
711 #[cfg(all(test, feature = "serde"))]
712 #[macro_use]
713 extern crate serde_derive;
714 #[cfg(all(test, feature = "serde"))]
715 extern crate serde_json;
716 
717 #[macro_use]
718 mod macros;
719 
720 #[cfg(all(
721     feature = "simd-accel",
722     any(
723         target_feature = "sse2",
724         all(target_endian = "little", target_arch = "aarch64"),
725         all(target_endian = "little", target_feature = "neon")
726     )
727 ))]
728 mod simd_funcs;
729 
730 #[cfg(all(test, feature = "alloc"))]
731 mod testing;
732 
733 mod big5;
734 mod euc_jp;
735 mod euc_kr;
736 mod gb18030;
737 mod iso_2022_jp;
738 mod replacement;
739 mod shift_jis;
740 mod single_byte;
741 mod utf_16;
742 mod utf_8;
743 mod x_user_defined;
744 
745 mod ascii;
746 mod data;
747 mod handles;
748 mod variant;
749 
750 pub mod mem;
751 
752 use crate::ascii::ascii_valid_up_to;
753 use crate::ascii::iso_2022_jp_ascii_valid_up_to;
754 use crate::utf_8::utf8_valid_up_to;
755 use crate::variant::*;
756 
757 #[cfg(feature = "alloc")]
758 use alloc::borrow::Cow;
759 #[cfg(feature = "alloc")]
760 use alloc::string::String;
761 #[cfg(feature = "alloc")]
762 use alloc::vec::Vec;
763 use core::cmp::Ordering;
764 use core::hash::Hash;
765 use core::hash::Hasher;
766 
767 #[cfg(feature = "serde")]
768 use serde::de::Visitor;
769 #[cfg(feature = "serde")]
770 use serde::{Deserialize, Deserializer, Serialize, Serializer};
771 
/// Output space to budget for one numeric character reference (NCR).
///
/// This is the full maximum length of an NCR rather than the maximum
/// minus one. The "minus one" cannot be counted on to come out of the
/// space reserved for the current unmappable, because the ISO-2022-JP
/// encoder can fill up that space with a state transition escape.
///
/// The longest NCR is `&#1114111;` (ten bytes), which refers to U+10FFFF.
const NCR_EXTRA: usize = 10; // &#1114111;
778 
779 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
780 // Instead, please regenerate using generate-encoding-data.py
781 
/// Length, in bytes, of the longest encoding label, which is
/// `cseucpkdfmtjapanese`.
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
783 
784 /// The initializer for the [Big5](static.BIG5.html) encoding.
785 ///
786 /// For use only for taking the address of this form when
787 /// Rust prohibits the use of the non-`_INIT` form directly,
788 /// such as in initializers of other `static`s. If in doubt,
789 /// use the corresponding non-`_INIT` reference-typed `static`.
790 ///
791 /// This part of the public API will go away if Rust changes
792 /// to make the referent of `pub const FOO: &'static Encoding`
793 /// unique cross-crate or if Rust starts allowing static arrays
794 /// to be initialized with `pub static FOO: &'static Encoding`
795 /// items.
796 pub static BIG5_INIT: Encoding = Encoding {
797     name: "Big5",
798     variant: VariantEncoding::Big5,
799 };
800 
801 /// The Big5 encoding.
802 ///
803 /// This is Big5 with HKSCS with mappings to more recent Unicode assignments
804 /// instead of the Private Use Area code points that have been used historically.
805 /// It is believed to be able to decode existing Web content in a way that makes
806 /// sense.
807 ///
808 /// To avoid form submissions generating data that Web servers don't understand,
809 /// the encoder doesn't use the HKSCS byte sequences that precede the unextended
810 /// Big5 in the lexical order.
811 ///
812 /// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
813 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
814 ///
815 /// This encoding is designed to be suited for decoding the Windows code page 950
816 /// and its HKSCS patched "951" variant such that the text makes sense, given
817 /// assignments that Unicode has made after those encodings used Private Use
818 /// Area characters.
819 ///
820 /// This will change from `static` to `const` if Rust changes
821 /// to make the referent of `pub const FOO: &'static Encoding`
822 /// unique cross-crate, so don't take the address of this
823 /// `static`.
824 pub static BIG5: &'static Encoding = &BIG5_INIT;
825 
826 /// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
827 ///
828 /// For use only for taking the address of this form when
829 /// Rust prohibits the use of the non-`_INIT` form directly,
830 /// such as in initializers of other `static`s. If in doubt,
831 /// use the corresponding non-`_INIT` reference-typed `static`.
832 ///
833 /// This part of the public API will go away if Rust changes
834 /// to make the referent of `pub const FOO: &'static Encoding`
835 /// unique cross-crate or if Rust starts allowing static arrays
836 /// to be initialized with `pub static FOO: &'static Encoding`
837 /// items.
838 pub static EUC_JP_INIT: Encoding = Encoding {
839     name: "EUC-JP",
840     variant: VariantEncoding::EucJp,
841 };
842 
843 /// The EUC-JP encoding.
844 ///
845 /// This is the legacy Unix encoding for Japanese.
846 ///
847 /// For compatibility with Web servers that don't expect three-byte sequences
848 /// in form submissions, the encoder doesn't generate three-byte sequences.
849 /// That is, the JIS X 0212 support is decode-only.
850 ///
851 /// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
852 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
853 ///
854 /// This encoding roughly matches the Windows code page 20932. There are error
855 /// handling differences and a handful of 2-byte sequences that decode differently.
856 /// Additionall, Windows doesn't support 3-byte sequences.
857 ///
858 /// This will change from `static` to `const` if Rust changes
859 /// to make the referent of `pub const FOO: &'static Encoding`
860 /// unique cross-crate, so don't take the address of this
861 /// `static`.
862 pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
863 
864 /// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
865 ///
866 /// For use only for taking the address of this form when
867 /// Rust prohibits the use of the non-`_INIT` form directly,
868 /// such as in initializers of other `static`s. If in doubt,
869 /// use the corresponding non-`_INIT` reference-typed `static`.
870 ///
871 /// This part of the public API will go away if Rust changes
872 /// to make the referent of `pub const FOO: &'static Encoding`
873 /// unique cross-crate or if Rust starts allowing static arrays
874 /// to be initialized with `pub static FOO: &'static Encoding`
875 /// items.
876 pub static EUC_KR_INIT: Encoding = Encoding {
877     name: "EUC-KR",
878     variant: VariantEncoding::EucKr,
879 };
880 
881 /// The EUC-KR encoding.
882 ///
883 /// This is the Korean encoding for Windows. It extends the Unix legacy encoding
884 /// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
885 /// Classic), with all the characters from the Hangul Syllables block of Unicode.
886 ///
887 /// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
888 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
889 ///
890 /// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
891 /// to U+0080 and some byte sequences that are error per the Encoding Standard to
892 /// the question mark or the Private Use Area.
893 ///
894 /// This will change from `static` to `const` if Rust changes
895 /// to make the referent of `pub const FOO: &'static Encoding`
896 /// unique cross-crate, so don't take the address of this
897 /// `static`.
898 pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
899 
900 /// The initializer for the [GBK](static.GBK.html) encoding.
901 ///
902 /// For use only for taking the address of this form when
903 /// Rust prohibits the use of the non-`_INIT` form directly,
904 /// such as in initializers of other `static`s. If in doubt,
905 /// use the corresponding non-`_INIT` reference-typed `static`.
906 ///
907 /// This part of the public API will go away if Rust changes
908 /// to make the referent of `pub const FOO: &'static Encoding`
909 /// unique cross-crate or if Rust starts allowing static arrays
910 /// to be initialized with `pub static FOO: &'static Encoding`
911 /// items.
912 pub static GBK_INIT: Encoding = Encoding {
913     name: "GBK",
914     variant: VariantEncoding::Gbk,
915 };
916 
917 /// The GBK encoding.
918 ///
919 /// The decoder for this encoding is the same as the decoder for gb18030.
920 /// The encoder side of this encoding is GBK with Windows code page 936 euro
921 /// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
922 /// Unicode block as well as a handful of ideographs from the CJK Unified
923 /// Ideographs Extension A and CJK Compatibility Ideographs blocks.
924 ///
925 /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
926 /// unified with the gb18030 encoder in the Encoding Standard out of concern
927 /// that servers that expect GBK form submissions might not be able to handle
928 /// the four-byte sequences.
929 ///
930 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
931 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
932 ///
933 /// The encoder of this encoding roughly matches the Windows code page 936.
934 /// The decoder side is a superset.
935 ///
936 /// This will change from `static` to `const` if Rust changes
937 /// to make the referent of `pub const FOO: &'static Encoding`
938 /// unique cross-crate, so don't take the address of this
939 /// `static`.
940 pub static GBK: &'static Encoding = &GBK_INIT;
941 
942 /// The initializer for the [IBM866](static.IBM866.html) encoding.
943 ///
944 /// For use only for taking the address of this form when
945 /// Rust prohibits the use of the non-`_INIT` form directly,
946 /// such as in initializers of other `static`s. If in doubt,
947 /// use the corresponding non-`_INIT` reference-typed `static`.
948 ///
949 /// This part of the public API will go away if Rust changes
950 /// to make the referent of `pub const FOO: &'static Encoding`
951 /// unique cross-crate or if Rust starts allowing static arrays
952 /// to be initialized with `pub static FOO: &'static Encoding`
953 /// items.
954 pub static IBM866_INIT: Encoding = Encoding {
955     name: "IBM866",
956     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
957 };
958 
959 /// The IBM866 encoding.
960 ///
961 /// This the most notable one of the DOS Cyrillic code pages. It has the same
962 /// box drawing characters as code page 437, so it can be used for decoding
963 /// DOS-era ASCII + box drawing data.
964 ///
965 /// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
966 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
967 ///
968 /// This encoding matches the Windows code page 866.
969 ///
970 /// This will change from `static` to `const` if Rust changes
971 /// to make the referent of `pub const FOO: &'static Encoding`
972 /// unique cross-crate, so don't take the address of this
973 /// `static`.
974 pub static IBM866: &'static Encoding = &IBM866_INIT;
975 
976 /// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
977 ///
978 /// For use only for taking the address of this form when
979 /// Rust prohibits the use of the non-`_INIT` form directly,
980 /// such as in initializers of other `static`s. If in doubt,
981 /// use the corresponding non-`_INIT` reference-typed `static`.
982 ///
983 /// This part of the public API will go away if Rust changes
984 /// to make the referent of `pub const FOO: &'static Encoding`
985 /// unique cross-crate or if Rust starts allowing static arrays
986 /// to be initialized with `pub static FOO: &'static Encoding`
987 /// items.
988 pub static ISO_2022_JP_INIT: Encoding = Encoding {
989     name: "ISO-2022-JP",
990     variant: VariantEncoding::Iso2022Jp,
991 };
992 
993 /// The ISO-2022-JP encoding.
994 ///
995 /// This the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
996 /// byte range to encode non-Basic Latin characters. It's the only encoding
997 /// supported by this crate whose encoder is stateful.
998 ///
999 /// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
1000 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
1001 ///
1002 /// This encoding roughly matches the Windows code page 50220. Notably, Windows
1003 /// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
1004 /// error handling.
1005 ///
1006 /// This will change from `static` to `const` if Rust changes
1007 /// to make the referent of `pub const FOO: &'static Encoding`
1008 /// unique cross-crate, so don't take the address of this
1009 /// `static`.
1010 pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
1011 
1012 /// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1013 ///
1014 /// For use only for taking the address of this form when
1015 /// Rust prohibits the use of the non-`_INIT` form directly,
1016 /// such as in initializers of other `static`s. If in doubt,
1017 /// use the corresponding non-`_INIT` reference-typed `static`.
1018 ///
1019 /// This part of the public API will go away if Rust changes
1020 /// to make the referent of `pub const FOO: &'static Encoding`
1021 /// unique cross-crate or if Rust starts allowing static arrays
1022 /// to be initialized with `pub static FOO: &'static Encoding`
1023 /// items.
1024 pub static ISO_8859_10_INIT: Encoding = Encoding {
1025     name: "ISO-8859-10",
1026     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1027 };
1028 
1029 /// The ISO-8859-10 encoding.
1030 ///
1031 /// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1032 /// is also known as Latin 6.
1033 ///
1034 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1035 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1036 ///
1037 /// The Windows code page number for this encoding is 28600, but kernel32.dll
1038 /// does not support this encoding.
1039 ///
1040 /// This will change from `static` to `const` if Rust changes
1041 /// to make the referent of `pub const FOO: &'static Encoding`
1042 /// unique cross-crate, so don't take the address of this
1043 /// `static`.
1044 pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1045 
1046 /// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1047 ///
1048 /// For use only for taking the address of this form when
1049 /// Rust prohibits the use of the non-`_INIT` form directly,
1050 /// such as in initializers of other `static`s. If in doubt,
1051 /// use the corresponding non-`_INIT` reference-typed `static`.
1052 ///
1053 /// This part of the public API will go away if Rust changes
1054 /// to make the referent of `pub const FOO: &'static Encoding`
1055 /// unique cross-crate or if Rust starts allowing static arrays
1056 /// to be initialized with `pub static FOO: &'static Encoding`
1057 /// items.
1058 pub static ISO_8859_13_INIT: Encoding = Encoding {
1059     name: "ISO-8859-13",
1060     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1061 };
1062 
1063 /// The ISO-8859-13 encoding.
1064 ///
1065 /// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1066 /// is also known as Latin 7.
1067 ///
1068 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1069 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1070 ///
1071 /// This encoding matches the Windows code page 28603, except Windows decodes
1072 /// unassigned code points to the Private Use Area of Unicode.
1073 ///
1074 /// This will change from `static` to `const` if Rust changes
1075 /// to make the referent of `pub const FOO: &'static Encoding`
1076 /// unique cross-crate, so don't take the address of this
1077 /// `static`.
1078 pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1079 
1080 /// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1081 ///
1082 /// For use only for taking the address of this form when
1083 /// Rust prohibits the use of the non-`_INIT` form directly,
1084 /// such as in initializers of other `static`s. If in doubt,
1085 /// use the corresponding non-`_INIT` reference-typed `static`.
1086 ///
1087 /// This part of the public API will go away if Rust changes
1088 /// to make the referent of `pub const FOO: &'static Encoding`
1089 /// unique cross-crate or if Rust starts allowing static arrays
1090 /// to be initialized with `pub static FOO: &'static Encoding`
1091 /// items.
1092 pub static ISO_8859_14_INIT: Encoding = Encoding {
1093     name: "ISO-8859-14",
1094     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1095 };
1096 
1097 /// The ISO-8859-14 encoding.
1098 ///
1099 /// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1100 /// is also known as Latin 8.
1101 ///
1102 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1103 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1104 ///
1105 /// The Windows code page number for this encoding is 28604, but kernel32.dll
1106 /// does not support this encoding.
1107 ///
1108 /// This will change from `static` to `const` if Rust changes
1109 /// to make the referent of `pub const FOO: &'static Encoding`
1110 /// unique cross-crate, so don't take the address of this
1111 /// `static`.
1112 pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1113 
1114 /// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1115 ///
1116 /// For use only for taking the address of this form when
1117 /// Rust prohibits the use of the non-`_INIT` form directly,
1118 /// such as in initializers of other `static`s. If in doubt,
1119 /// use the corresponding non-`_INIT` reference-typed `static`.
1120 ///
1121 /// This part of the public API will go away if Rust changes
1122 /// to make the referent of `pub const FOO: &'static Encoding`
1123 /// unique cross-crate or if Rust starts allowing static arrays
1124 /// to be initialized with `pub static FOO: &'static Encoding`
1125 /// items.
1126 pub static ISO_8859_15_INIT: Encoding = Encoding {
1127     name: "ISO-8859-15",
1128     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1129 };
1130 
1131 /// The ISO-8859-15 encoding.
1132 ///
1133 /// This is the revised Western European part of the ISO/IEC 8859 encoding
1134 /// family. This encoding is also known as Latin 9.
1135 ///
1136 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1137 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1138 ///
1139 /// This encoding matches the Windows code page 28605.
1140 ///
1141 /// This will change from `static` to `const` if Rust changes
1142 /// to make the referent of `pub const FOO: &'static Encoding`
1143 /// unique cross-crate, so don't take the address of this
1144 /// `static`.
1145 pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1146 
/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_16_INIT: Encoding = Encoding {
    name: "ISO-8859-16",
    // Single-byte encoding that uses the `iso_8859_16` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00DF, 95, 4` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
};
1163 
/// The ISO-8859-16 encoding.
///
/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 10.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
///
/// The Windows code page number for this encoding is 28606, but kernel32.dll
/// does not support this encoding.
///
/// This `static` is a reference to
/// [`ISO_8859_16_INIT`](static.ISO_8859_16_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1180 
/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_2_INIT: Encoding = Encoding {
    name: "ISO-8859-2",
    // Single-byte encoding that uses the `iso_8859_2` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00DF, 95, 1` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
};
1197 
/// The ISO-8859-2 encoding.
///
/// This is the Central European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 2.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
///
/// This encoding matches the Windows code page 28592.
///
/// This `static` is a reference to
/// [`ISO_8859_2_INIT`](static.ISO_8859_2_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1212 
/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_3_INIT: Encoding = Encoding {
    name: "ISO-8859-3",
    // Single-byte encoding that uses the `iso_8859_3` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00DF, 95, 4` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
};
1229 
/// The ISO-8859-3 encoding.
///
/// This is the South European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 3.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
///
/// This encoding matches the Windows code page 28593.
///
/// This `static` is a reference to
/// [`ISO_8859_3_INIT`](static.ISO_8859_3_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1244 
/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_4_INIT: Encoding = Encoding {
    name: "ISO-8859-4",
    // Single-byte encoding that uses the `iso_8859_4` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00DF, 95, 1` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
};
1261 
/// The ISO-8859-4 encoding.
///
/// This is the North European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 4.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
///
/// This encoding matches the Windows code page 28594.
///
/// This `static` is a reference to
/// [`ISO_8859_4_INIT`](static.ISO_8859_4_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1276 
/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_5_INIT: Encoding = Encoding {
    name: "ISO-8859-5",
    // Single-byte encoding that uses the `iso_8859_5` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x040E, 46, 66` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
};
1293 
/// The ISO-8859-5 encoding.
///
/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
///
/// This encoding matches the Windows code page 28595.
///
/// This `static` is a reference to
/// [`ISO_8859_5_INIT`](static.ISO_8859_5_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1308 
/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_6_INIT: Encoding = Encoding {
    name: "ISO-8859-6",
    // Single-byte encoding that uses the `iso_8859_6` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x0621, 65, 26` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
};
1325 
/// The ISO-8859-6 encoding.
///
/// This is the Arabic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
///
/// This encoding matches the Windows code page 28596, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`ISO_8859_6_INIT`](static.ISO_8859_6_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1341 
/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_7_INIT: Encoding = Encoding {
    name: "ISO-8859-7",
    // Single-byte encoding that uses the `iso_8859_7` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x03A3, 83, 44` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
};
1358 
/// The ISO-8859-7 encoding.
///
/// This is the Greek part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
///
/// This encoding roughly matches the Windows code page 28597. Windows decodes
/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
///
/// This `static` is a reference to
/// [`ISO_8859_7_INIT`](static.ISO_8859_7_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1378 
/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_INIT: Encoding = Encoding {
    name: "ISO-8859-8",
    // Single-byte encoding that uses the `iso_8859_8` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x05D0, 96, 27` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1395 
/// The ISO-8859-8 encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 28598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the Private Use Area.
///
/// This `static` is a reference to
/// [`ISO_8859_8_INIT`](static.ISO_8859_8_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1413 
/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
    name: "ISO-8859-8-I",
    // Deliberately the same `iso_8859_8` table and numeric arguments as
    // `ISO_8859_8_INIT`: within this initializer only `name` differs
    // between the two encodings.
    // NOTE(review): the meaning of the trailing `0x05D0, 96, 27` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1430 
/// The ISO-8859-8-I encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 38598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the Private Use Area.
///
/// This `static` is a reference to
/// [`ISO_8859_8_I_INIT`](static.ISO_8859_8_I_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1448 
/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_R_INIT: Encoding = Encoding {
    name: "KOI8-R",
    // Single-byte encoding that uses the `koi8_r` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x044E, 64, 1` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
};
1465 
/// The KOI8-R encoding.
///
/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
///
/// This encoding matches the Windows code page 20866.
///
/// This `static` is a reference to
/// [`KOI8_R_INIT`](static.KOI8_R_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1480 
/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_U_INIT: Encoding = Encoding {
    name: "KOI8-U",
    // Single-byte encoding that uses the `koi8_u` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x044E, 64, 1` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
};
1497 
/// The KOI8-U encoding.
///
/// This is an encoding for Ukrainian adapted from KOI8-R.
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
///
/// This encoding matches the Windows code page 21866.
///
/// This `static` is a reference to
/// [`KOI8_U_INIT`](static.KOI8_U_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1512 
/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static SHIFT_JIS_INIT: Encoding = Encoding {
    name: "Shift_JIS",
    // Unit variant: no lookup table or numeric parameters are attached here.
    variant: VariantEncoding::ShiftJis,
};
1529 
/// The Shift_JIS encoding.
///
/// This is the Japanese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
///
/// This encoding matches the Windows code page 932, except Windows decodes some byte
/// sequences that are errors per the Encoding Standard to the question mark or the
/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
///
/// This `static` is a reference to
/// [`SHIFT_JIS_INIT`](static.SHIFT_JIS_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1546 
/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16BE_INIT: Encoding = Encoding {
    name: "UTF-16BE",
    // Unit variant: no lookup table or numeric parameters are attached here.
    variant: VariantEncoding::Utf16Be,
};
1563 
/// The UTF-16BE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the big-endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1201.
///
/// This `static` is a reference to
/// [`UTF_16BE_INIT`](static.UTF_16BE_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1580 
/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16LE_INIT: Encoding = Encoding {
    name: "UTF-16LE",
    // Unit variant: no lookup table or numeric parameters are attached here.
    variant: VariantEncoding::Utf16Le,
};
1597 
/// The UTF-16LE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the little-endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1200.
///
/// This `static` is a reference to
/// [`UTF_16LE_INIT`](static.UTF_16LE_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1614 
/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_8_INIT: Encoding = Encoding {
    name: "UTF-8",
    // Unit variant: no lookup table or numeric parameters are attached here.
    variant: VariantEncoding::Utf8,
};
1631 
/// The UTF-8 encoding.
///
/// This is the encoding that should be used for all new development. It can
/// represent all of Unicode.
///
/// This encoding matches the Windows code page 65001, except Windows differs
/// in the number of errors generated for some erroneous byte sequences.
///
/// This `static` is a reference to
/// [`UTF_8_INIT`](static.UTF_8_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1645 
/// The initializer for the [gb18030](static.GB18030.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GB18030_INIT: Encoding = Encoding {
    name: "gb18030",
    // Unit variant: no lookup table or numeric parameters are attached here.
    variant: VariantEncoding::Gb18030,
};
1662 
/// The gb18030 encoding.
///
/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
/// maps to U+3000 for compatibility with existing Web content. As a result,
/// this encoding can represent all of Unicode except for the private-use
/// character U+E5E5.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// This encoding matches the Windows code page 54936.
///
/// This `static` is a reference to
/// [`GB18030_INIT`](static.GB18030_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static GB18030: &'static Encoding = &GB18030_INIT;
1680 
/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static MACINTOSH_INIT: Encoding = Encoding {
    name: "macintosh",
    // Single-byte encoding that uses the `macintosh` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00CD, 106, 3` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
};
1697 
/// The macintosh encoding.
///
/// This is the MacRoman encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
///
/// This encoding matches the Windows code page 10000, except Windows decodes
/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
///
/// This `static` is a reference to
/// [`MACINTOSH_INIT`](static.MACINTOSH_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1713 
/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static REPLACEMENT_INIT: Encoding = Encoding {
    name: "replacement",
    // Unit variant: no lookup table or numeric parameters are attached here.
    variant: VariantEncoding::Replacement,
};
1730 
/// The replacement encoding.
///
/// This decode-only encoding decodes all non-zero-length streams to a single
/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
/// ASCII-compatible fallback encoding (typically windows-1252) for some
/// encodings that are no longer supported by the Web Platform and that
/// would be dangerous to treat as ASCII-compatible.
///
/// There is no corresponding encoder. The output encoding of this encoding
/// is UTF-8.
///
/// This encoding does not have a Windows code page number.
///
/// This `static` is a reference to
/// [`REPLACEMENT_INIT`](static.REPLACEMENT_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1749 
/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1250_INIT: Encoding = Encoding {
    name: "windows-1250",
    // Single-byte encoding that uses the `windows_1250` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00DC, 92, 2` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
};
1766 
/// The windows-1250 encoding.
///
/// This is the Central European encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
///
/// This encoding matches the Windows code page 1250.
///
/// This `static` is a reference to
/// [`WINDOWS_1250_INIT`](static.WINDOWS_1250_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1781 
/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1251_INIT: Encoding = Encoding {
    name: "windows-1251",
    // Single-byte encoding that uses the `windows_1251` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x0410, 64, 64` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
};
1798 
/// The windows-1251 encoding.
///
/// This is the Cyrillic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
///
/// This encoding matches the Windows code page 1251.
///
/// This `static` is a reference to
/// [`WINDOWS_1251_INIT`](static.WINDOWS_1251_INIT.html), which holds the
/// `Encoding` value itself.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1813 
/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1252_INIT: Encoding = Encoding {
    name: "windows-1252",
    // Single-byte encoding that uses the `windows_1252` table from
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00A0, 32, 96` is not
    // visible here; it looks like (code point, table offset, length) of a
    // contiguous run in that table. Confirm against
    // `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
};
1830 
/// The windows-1252 encoding.
///
/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
/// which is known as Latin 1.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
///
/// This encoding matches the Windows code page 1252.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`WINDOWS_1252_INIT`](static.WINDOWS_1252_INIT.html).
pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1846 
/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1253_INIT: Encoding = Encoding {
    name: "windows-1253",
    // Backed by the `windows_1253` table in `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the trailing arguments (0x03A3, 83, 44) presumably
    // describe a contiguous run in that table (starting code point, table
    // offset, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
};
1863 
/// The windows-1253 encoding.
///
/// This is the Greek encoding for Windows. It is mostly an extension of
/// ISO-8859-7, but U+0386 is mapped to a different byte.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
///
/// This encoding matches the Windows code page 1253, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html).
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1880 
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    name: "windows-1254",
    // Backed by the `windows_1254` table in `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the trailing arguments (0x00DF, 95, 17) presumably
    // describe a contiguous run in that table (starting code point, table
    // offset, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1897 
/// The windows-1254 encoding.
///
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html).
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1913 
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    name: "windows-1255",
    // Backed by the `windows_1255` table in `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the trailing arguments (0x05D0, 96, 27) presumably
    // describe a contiguous run in that table (starting code point, table
    // offset, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1930 
/// The windows-1255 encoding.
///
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html).
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1947 
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    name: "windows-1256",
    // Backed by the `windows_1256` table in `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the trailing arguments (0x0621, 65, 22) presumably
    // describe a contiguous run in that table (starting code point, table
    // offset, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1964 
/// The windows-1256 encoding.
///
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html).
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1979 
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    name: "windows-1257",
    // Backed by the `windows_1257` table in `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the trailing arguments (0x00DF, 95, 1) presumably
    // describe a contiguous run in that table (starting code point, table
    // offset, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
1996 
/// The windows-1257 encoding.
///
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html).
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
2012 
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    name: "windows-1258",
    // Backed by the `windows_1258` table in `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the trailing arguments (0x00DF, 95, 4) presumably
    // describe a contiguous run in that table (starting code point, table
    // offset, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2029 
/// The windows-1258 encoding.
///
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in the Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html).
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2049 
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    name: "windows-874",
    // Backed by the `windows_874` table in `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the trailing arguments (0x0E01, 33, 58) presumably
    // describe a contiguous run in that table (starting code point, table
    // offset, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2066 
/// The windows-874 encoding.
///
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html).
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2082 
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    name: "x-mac-cyrillic",
    // Backed by the `x_mac_cyrillic` table in `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the trailing arguments (0x0430, 96, 31) presumably
    // describe a contiguous run in that table (starting code point, table
    // offset, run length); confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2099 
/// The x-mac-cyrillic encoding.
///
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html).
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2114 
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    name: "x-user-defined",
    // Unlike the single-byte encodings initialized nearby, this variant
    // carries no table reference or numeric arguments.
    variant: VariantEncoding::UserDefined,
};
2131 
/// The x-user-defined encoding.
///
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
///
/// The referent of this `static` is
/// [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html).
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2146 
/// Lowercase encoding labels, one per entry.
///
/// The entries are ordered by ascending length; labels of equal length are
/// ordered by comparing their bytes from the last byte towards the first
/// (e.g. `"866"`, `"mac"`, `"koi"`, `"gbk"`).
/// NOTE(review): this ordering is presumably what the lookup in
/// `Encoding::for_label` relies on; confirm before reordering by hand.
///
/// The entry at index `i` corresponds to the entry at index `i` of
/// `ENCODINGS_IN_LABEL_SORT`, so the two arrays must be kept in sync.
///
/// This array is part of the generated section of this file (it precedes the
/// `END GENERATED CODE` marker).
static LABELS_SORTED: [&'static str; 228] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ucs-2",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "unicode",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "csunicode",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "unicodefffe",
    "unicodefeff",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "unicode20utf8",
    "unicode11utf8",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso-10646-ucs-2",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "x-unicode20utf8",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2377 
/// The encoding for each label in `LABELS_SORTED`, in the same order.
///
/// The entry at index `i` is the encoding that the label at index `i` of
/// `LABELS_SORTED` maps to (e.g. index 0 pairs `"l1"` with
/// `WINDOWS_1252_INIT`), so the two arrays must be kept in sync. Several
/// labels may map to the same encoding, which is why entries repeat.
///
/// The `_INIT` forms are used because this is a `static` array initializer.
///
/// This array is part of the generated section of this file (it precedes the
/// `END GENERATED CODE` marker).
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 228] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &UTF_16LE_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &UTF_16LE_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &UTF_8_INIT,
    &UTF_8_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2608 
2609 // END GENERATED CODE
2610 
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    /// The name of the encoding, e.g. `"windows-1252"`.
    name: &'static str,
    /// Which kind of encoding this is, together with any per-encoding data
    /// (e.g. `VariantEncoding::SingleByte(..)` carries a table reference).
    variant: VariantEncoding,
}
2677 
2678 impl Encoding {
2679     /// Implements the
2680     /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2681     /// algorithm.
2682     ///
2683     /// If, after ASCII-lowercasing and removing leading and trailing
2684     /// whitespace, the argument matches a label defined in the Encoding
2685     /// Standard, `Some(&'static Encoding)` representing the corresponding
2686     /// encoding is returned. If there is no match, `None` is returned.
2687     ///
2688     /// This is the right method to use if the action upon the method returning
2689     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2690     /// When the action upon the method returning `None` is not to proceed with
2691     /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2692     /// appropriate.
2693     ///
2694     /// The argument is of type `&[u8]` instead of `&str` to save callers
2695     /// that are extracting the label from a non-UTF-8 protocol the trouble
2696     /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2697     /// on it.)
2698     ///
2699     /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2700     pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2701         let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2702         let mut trimmed_pos = 0usize;
2703         let mut iter = label.into_iter();
2704         // before
2705         loop {
2706             match iter.next() {
2707                 None => {
2708                     return None;
2709                 }
2710                 Some(byte) => {
2711                     // The characters used in labels are:
2712                     // a-z (except q, but excluding it below seems excessive)
2713                     // 0-9
2714                     // . _ - :
2715                     match *byte {
2716                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2717                             continue;
2718                         }
2719                         b'A'..=b'Z' => {
2720                             trimmed[trimmed_pos] = *byte + 0x20u8;
2721                             trimmed_pos = 1usize;
2722                             break;
2723                         }
2724                         b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2725                             trimmed[trimmed_pos] = *byte;
2726                             trimmed_pos = 1usize;
2727                             break;
2728                         }
2729                         _ => {
2730                             return None;
2731                         }
2732                     }
2733                 }
2734             }
2735         }
2736         // inside
2737         loop {
2738             match iter.next() {
2739                 None => {
2740                     break;
2741                 }
2742                 Some(byte) => {
2743                     match *byte {
2744                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2745                             break;
2746                         }
2747                         b'A'..=b'Z' => {
2748                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2749                                 // There's no encoding with a label this long
2750                                 return None;
2751                             }
2752                             trimmed[trimmed_pos] = *byte + 0x20u8;
2753                             trimmed_pos += 1usize;
2754                             continue;
2755                         }
2756                         b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2757                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2758                                 // There's no encoding with a label this long
2759                                 return None;
2760                             }
2761                             trimmed[trimmed_pos] = *byte;
2762                             trimmed_pos += 1usize;
2763                             continue;
2764                         }
2765                         _ => {
2766                             return None;
2767                         }
2768                     }
2769                 }
2770             }
2771         }
2772         // after
2773         loop {
2774             match iter.next() {
2775                 None => {
2776                     break;
2777                 }
2778                 Some(byte) => {
2779                     match *byte {
2780                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2781                             continue;
2782                         }
2783                         _ => {
2784                             // There's no label with space in the middle
2785                             return None;
2786                         }
2787                     }
2788                 }
2789             }
2790         }
2791         let candidate = &trimmed[..trimmed_pos];
2792         match LABELS_SORTED.binary_search_by(|probe| {
2793             let bytes = probe.as_bytes();
2794             let c = bytes.len().cmp(&candidate.len());
2795             if c != Ordering::Equal {
2796                 return c;
2797             }
2798             let probe_iter = bytes.iter().rev();
2799             let candidate_iter = candidate.iter().rev();
2800             probe_iter.cmp(candidate_iter)
2801         }) {
2802             Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2803             Err(_) => None,
2804         }
2805     }
2806 
2807     /// This method behaves the same as `for_label()`, except when `for_label()`
2808     /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2809     ///
2810     /// This method is useful in scenarios where a fatal error is required
2811     /// upon invalid label, because in those cases the caller typically wishes
2812     /// to treat the labels that map to the replacement encoding as fatal
2813     /// errors, too.
2814     ///
2815     /// It is not OK to use this method when the action upon the method returning
2816     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2817     /// case, the `for_label()` method should be used instead in order to avoid
2818     /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2819     ///
2820     /// Available via the C wrapper.
2821     #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2822     pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2823         match Encoding::for_label(label) {
2824             None => None,
2825             Some(encoding) => {
2826                 if encoding == REPLACEMENT {
2827                     None
2828                 } else {
2829                     Some(encoding)
2830                 }
2831             }
2832         }
2833     }
2834 
2835     /// Performs non-incremental BOM sniffing.
2836     ///
2837     /// The argument must either be a buffer representing the entire input
2838     /// stream (non-streaming case) or a buffer representing at least the first
2839     /// three bytes of the input stream (streaming case).
2840     ///
2841     /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2842     /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2843     /// or UTF-16BE BOM or `None` otherwise.
2844     ///
2845     /// Available via the C wrapper.
2846     #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2847     pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2848         if buffer.starts_with(b"\xEF\xBB\xBF") {
2849             Some((UTF_8, 3))
2850         } else if buffer.starts_with(b"\xFF\xFE") {
2851             Some((UTF_16LE, 2))
2852         } else if buffer.starts_with(b"\xFE\xFF") {
2853             Some((UTF_16BE, 2))
2854         } else {
2855             None
2856         }
2857     }
2858 
    /// Returns the name of this encoding, i.e. the `name` field as-is.
    ///
    /// This name is appropriate to return as-is from the DOM
    /// `document.characterSet` property.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn name(&'static self) -> &'static str {
        self.name
    }
2869 
2870     /// Checks whether the _output encoding_ of this encoding can encode every
2871     /// `char`. (Only true if the output encoding is UTF-8.)
2872     ///
2873     /// Available via the C wrapper.
2874     #[inline]
can_encode_everything(&'static self) -> bool2875     pub fn can_encode_everything(&'static self) -> bool {
2876         self.output_encoding() == UTF_8
2877     }
2878 
2879     /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2880     /// U+0000...U+007F and vice versa.
2881     ///
2882     /// Available via the C wrapper.
2883     #[inline]
is_ascii_compatible(&'static self) -> bool2884     pub fn is_ascii_compatible(&'static self) -> bool {
2885         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2886     }
2887 
    /// Checks whether this encoding maps one byte to one Basic Multilingual
    /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
    /// vice versa (for mappable characters).
    ///
    /// `true` iff this encoding is on the list of [Legacy single-byte
    /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
    /// in the spec or x-user-defined.
    ///
    /// The answer is obtained by delegating to the encoding's variant.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn is_single_byte(&'static self) -> bool {
        self.variant.is_single_byte()
    }
2901 
2902     /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2903     /// U+0000...U+007F and vice versa.
2904     #[cfg(feature = "alloc")]
2905     #[inline]
is_potentially_borrowable(&'static self) -> bool2906     fn is_potentially_borrowable(&'static self) -> bool {
2907         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2908     }
2909 
2910     /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2911     /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2912     ///
2913     /// Available via the C wrapper.
2914     #[inline]
output_encoding(&'static self) -> &'static Encoding2915     pub fn output_encoding(&'static self) -> &'static Encoding {
2916         if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2917             UTF_8
2918         } else {
2919             self
2920         }
2921     }
2922 
2923     /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2924     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2925     /// entire input is available as a single buffer (i.e. the end of the
2926     /// buffer marks the end of the stream).
2927     ///
2928     /// This method implements the (non-streaming version of) the
2929     /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2930     ///
2931     /// The second item in the returned tuple is the encoding that was actually
2932     /// used (which may differ from this encoding thanks to BOM sniffing).
2933     ///
2934     /// The third item in the returned tuple indicates whether there were
2935     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2936     ///
2937     /// _Note:_ It is wrong to use this when the input buffer represents only
2938     /// a segment of the input instead of the whole input. Use `new_decoder()`
2939     /// when decoding segmented input.
2940     ///
2941     /// This method performs a one or two heap allocations for the backing
2942     /// buffer of the `String` when unable to borrow. (One allocation if not
2943     /// errors and potentially another one in the presence of errors.) The
2944     /// first allocation assumes jemalloc and may not be optimal with
2945     /// allocators that do not use power-of-two buckets. A borrow is performed
2946     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2947     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2948     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2949     /// transitions.
2950     ///
2951     /// # Panics
2952     ///
2953     /// If the size calculation for a heap-allocated backing buffer overflows
2954     /// `usize`.
2955     ///
2956     /// Available to Rust only and only with the `alloc` feature enabled (enabled
2957     /// by default).
2958     #[cfg(feature = "alloc")]
2959     #[inline]
decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool)2960     pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2961         let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2962             Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2963             None => (self, bytes),
2964         };
2965         let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2966         (cow, encoding, had_errors)
2967     }
2968 
2969     /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2970     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2971     /// entire input is available as a single buffer (i.e. the end of the
2972     /// buffer marks the end of the stream).
2973     ///
2974     /// When invoked on `UTF_8`, this method implements the (non-streaming
2975     /// version of) the
2976     /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2977     /// concept.
2978     ///
2979     /// The second item in the returned pair indicates whether there were
2980     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2981     ///
2982     /// _Note:_ It is wrong to use this when the input buffer represents only
2983     /// a segment of the input instead of the whole input. Use
2984     /// `new_decoder_with_bom_removal()` when decoding segmented input.
2985     ///
2986     /// This method performs a one or two heap allocations for the backing
2987     /// buffer of the `String` when unable to borrow. (One allocation if not
2988     /// errors and potentially another one in the presence of errors.) The
2989     /// first allocation assumes jemalloc and may not be optimal with
2990     /// allocators that do not use power-of-two buckets. A borrow is performed
2991     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2992     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2993     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2994     /// transitions.
2995     ///
2996     /// # Panics
2997     ///
2998     /// If the size calculation for a heap-allocated backing buffer overflows
2999     /// `usize`.
3000     ///
3001     /// Available to Rust only and only with the `alloc` feature enabled (enabled
3002     /// by default).
3003     #[cfg(feature = "alloc")]
3004     #[inline]
decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)3005     pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3006         let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
3007             &bytes[3..]
3008         } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
3009             || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
3010         {
3011             &bytes[2..]
3012         } else {
3013             bytes
3014         };
3015         self.decode_without_bom_handling(without_bom)
3016     }
3017 
3018     /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3019     /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
3020     /// the entire input is available as a single buffer (i.e. the end of the
3021     /// buffer marks the end of the stream).
3022     ///
3023     /// When invoked on `UTF_8`, this method implements the (non-streaming
3024     /// version of) the
3025     /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
3026     /// spec concept.
3027     ///
3028     /// The second item in the returned pair indicates whether there were
3029     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3030     ///
3031     /// _Note:_ It is wrong to use this when the input buffer represents only
3032     /// a segment of the input instead of the whole input. Use
3033     /// `new_decoder_without_bom_handling()` when decoding segmented input.
3034     ///
3035     /// This method performs a one or two heap allocations for the backing
3036     /// buffer of the `String` when unable to borrow. (One allocation if not
3037     /// errors and potentially another one in the presence of errors.) The
3038     /// first allocation assumes jemalloc and may not be optimal with
3039     /// allocators that do not use power-of-two buckets. A borrow is performed
3040     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3041     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3042     /// ISO-2022-JP and the input is entirely in the ASCII state without state
3043     /// transitions.
3044     ///
3045     /// # Panics
3046     ///
3047     /// If the size calculation for a heap-allocated backing buffer overflows
3048     /// `usize`.
3049     ///
3050     /// Available to Rust only and only with the `alloc` feature enabled (enabled
3051     /// by default).
3052     #[cfg(feature = "alloc")]
decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)3053     pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3054         let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3055             let valid_up_to = if self == UTF_8 {
3056                 utf8_valid_up_to(bytes)
3057             } else if self == ISO_2022_JP {
3058                 iso_2022_jp_ascii_valid_up_to(bytes)
3059             } else {
3060                 ascii_valid_up_to(bytes)
3061             };
3062             if valid_up_to == bytes.len() {
3063                 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3064                 return (Cow::Borrowed(str), false);
3065             }
3066             let decoder = self.new_decoder_without_bom_handling();
3067 
3068             let rounded_without_replacement = checked_next_power_of_two(checked_add(
3069                 valid_up_to,
3070                 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3071             ));
3072             let with_replacement = checked_add(
3073                 valid_up_to,
3074                 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3075             );
3076             let mut string = String::with_capacity(
3077                 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3078             );
3079             unsafe {
3080                 let vec = string.as_mut_vec();
3081                 vec.set_len(valid_up_to);
3082                 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3083             }
3084             (decoder, string, valid_up_to)
3085         } else {
3086             let decoder = self.new_decoder_without_bom_handling();
3087             let rounded_without_replacement = checked_next_power_of_two(
3088                 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3089             );
3090             let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3091             let string = String::with_capacity(
3092                 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3093             );
3094             (decoder, string, 0)
3095         };
3096 
3097         let mut total_had_errors = false;
3098         loop {
3099             let (result, read, had_errors) =
3100                 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3101             total_read += read;
3102             total_had_errors |= had_errors;
3103             match result {
3104                 CoderResult::InputEmpty => {
3105                     debug_assert_eq!(total_read, bytes.len());
3106                     return (Cow::Owned(string), total_had_errors);
3107                 }
3108                 CoderResult::OutputFull => {
3109                     // Allocate for the worst case. That is, we should come
3110                     // here at most once per invocation of this method.
3111                     let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3112                     string.reserve(needed.unwrap());
3113                 }
3114             }
3115         }
3116     }
3117 
3118     /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3119     /// _with malformed sequences treated as fatal_ when the entire input is
3120     /// available as a single buffer (i.e. the end of the buffer marks the end
3121     /// of the stream).
3122     ///
3123     /// When invoked on `UTF_8`, this method implements the (non-streaming
3124     /// version of) the
3125     /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3126     /// spec concept.
3127     ///
3128     /// Returns `None` if a malformed sequence was encountered and the result
3129     /// of the decode as `Some(String)` otherwise.
3130     ///
3131     /// _Note:_ It is wrong to use this when the input buffer represents only
3132     /// a segment of the input instead of the whole input. Use
3133     /// `new_decoder_without_bom_handling()` when decoding segmented input.
3134     ///
3135     /// This method performs a single heap allocation for the backing
3136     /// buffer of the `String` when unable to borrow. A borrow is performed if
3137     /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3138     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3139     /// ISO-2022-JP and the input is entirely in the ASCII state without state
3140     /// transitions.
3141     ///
3142     /// # Panics
3143     ///
3144     /// If the size calculation for a heap-allocated backing buffer overflows
3145     /// `usize`.
3146     ///
3147     /// Available to Rust only and only with the `alloc` feature enabled (enabled
3148     /// by default).
3149     #[cfg(feature = "alloc")]
decode_without_bom_handling_and_without_replacement<'a>( &'static self, bytes: &'a [u8], ) -> Option<Cow<'a, str>>3150     pub fn decode_without_bom_handling_and_without_replacement<'a>(
3151         &'static self,
3152         bytes: &'a [u8],
3153     ) -> Option<Cow<'a, str>> {
3154         if self == UTF_8 {
3155             let valid_up_to = utf8_valid_up_to(bytes);
3156             if valid_up_to == bytes.len() {
3157                 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3158                 return Some(Cow::Borrowed(str));
3159             }
3160             return None;
3161         }
3162         let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3163             let valid_up_to = if self == ISO_2022_JP {
3164                 iso_2022_jp_ascii_valid_up_to(bytes)
3165             } else {
3166                 ascii_valid_up_to(bytes)
3167             };
3168             if valid_up_to == bytes.len() {
3169                 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3170                 return Some(Cow::Borrowed(str));
3171             }
3172             let decoder = self.new_decoder_without_bom_handling();
3173             let mut string = String::with_capacity(
3174                 checked_add(
3175                     valid_up_to,
3176                     decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3177                 )
3178                 .unwrap(),
3179             );
3180             unsafe {
3181                 let vec = string.as_mut_vec();
3182                 vec.set_len(valid_up_to);
3183                 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3184             }
3185             (decoder, string, &bytes[valid_up_to..])
3186         } else {
3187             let decoder = self.new_decoder_without_bom_handling();
3188             let string = String::with_capacity(
3189                 decoder
3190                     .max_utf8_buffer_length_without_replacement(bytes.len())
3191                     .unwrap(),
3192             );
3193             (decoder, string, bytes)
3194         };
3195         let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3196         match result {
3197             DecoderResult::InputEmpty => {
3198                 debug_assert_eq!(read, input.len());
3199                 Some(Cow::Owned(string))
3200             }
3201             DecoderResult::Malformed(_, _) => None,
3202             DecoderResult::OutputFull => unreachable!(),
3203         }
3204     }
3205 
3206     /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3207     /// replaced with decimal numeric character references when the entire input
3208     /// is available as a single buffer (i.e. the end of the buffer marks the
3209     /// end of the stream).
3210     ///
3211     /// This method implements the (non-streaming version of) the
3212     /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3213     /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3214     /// spec concept, it is slightly more efficient to use
3215     /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3216     /// method on `UTF_8`.
3217     ///
3218     /// The second item in the returned tuple is the encoding that was actually
3219     /// used (which may differ from this encoding thanks to some encodings
3220     /// having UTF-8 as their output encoding).
3221     ///
3222     /// The third item in the returned tuple indicates whether there were
3223     /// unmappable characters (that were replaced with HTML numeric character
3224     /// references).
3225     ///
3226     /// _Note:_ It is wrong to use this when the input buffer represents only
3227     /// a segment of the input instead of the whole input. Use `new_encoder()`
3228     /// when encoding segmented output.
3229     ///
3230     /// When encoding to UTF-8 or when encoding an ASCII-only input to a
3231     /// ASCII-compatible encoding, this method returns a borrow of the input
3232     /// without a heap allocation. Otherwise, this method performs a single
3233     /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3234     /// unmappable characters and potentially multiple heap allocations if
3235     /// there are. These allocations are tuned for jemalloc and may not be
3236     /// optimal when using a different allocator that doesn't use power-of-two
3237     /// buckets.
3238     ///
3239     /// # Panics
3240     ///
3241     /// If the size calculation for a heap-allocated backing buffer overflows
3242     /// `usize`.
3243     ///
3244     /// Available to Rust only and only with the `alloc` feature enabled (enabled
3245     /// by default).
3246     #[cfg(feature = "alloc")]
encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool)3247     pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3248         let output_encoding = self.output_encoding();
3249         if output_encoding == UTF_8 {
3250             return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3251         }
3252         debug_assert!(output_encoding.is_potentially_borrowable());
3253         let bytes = string.as_bytes();
3254         let valid_up_to = if output_encoding == ISO_2022_JP {
3255             iso_2022_jp_ascii_valid_up_to(bytes)
3256         } else {
3257             ascii_valid_up_to(bytes)
3258         };
3259         if valid_up_to == bytes.len() {
3260             return (Cow::Borrowed(bytes), output_encoding, false);
3261         }
3262         let mut encoder = output_encoding.new_encoder();
3263         let mut vec: Vec<u8> = Vec::with_capacity(
3264             (checked_add(
3265                 valid_up_to,
3266                 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3267             ))
3268             .unwrap()
3269             .next_power_of_two(),
3270         );
3271         unsafe {
3272             vec.set_len(valid_up_to);
3273             core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3274         }
3275         let mut total_read = valid_up_to;
3276         let mut total_had_errors = false;
3277         loop {
3278             let (result, read, had_errors) =
3279                 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3280             total_read += read;
3281             total_had_errors |= had_errors;
3282             match result {
3283                 CoderResult::InputEmpty => {
3284                     debug_assert_eq!(total_read, string.len());
3285                     return (Cow::Owned(vec), output_encoding, total_had_errors);
3286                 }
3287                 CoderResult::OutputFull => {
3288                     // reserve_exact wants to know how much more on top of current
3289                     // length--not current capacity.
3290                     let needed = encoder
3291                         .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3292                     let rounded = (checked_add(vec.capacity(), needed))
3293                         .unwrap()
3294                         .next_power_of_two();
3295                     let additional = rounded - vec.len();
3296                     vec.reserve_exact(additional);
3297                 }
3298             }
3299         }
3300     }
3301 
    /// Creates the `VariantDecoder` for this encoding by delegating to the
    /// encoding's variant. The `new_decoder*()` methods wrap the result in a
    /// `Decoder`.
    fn new_variant_decoder(&'static self) -> VariantDecoder {
        self.variant.new_variant_decoder()
    }
3305 
3306     /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3307     ///
3308     /// BOM sniffing may cause the returned decoder to morph into a decoder
3309     /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3310     ///
3311     /// Available via the C wrapper.
3312     #[inline]
new_decoder(&'static self) -> Decoder3313     pub fn new_decoder(&'static self) -> Decoder {
3314         Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3315     }
3316 
3317     /// Instantiates a new decoder for this encoding with BOM removal.
3318     ///
3319     /// If the input starts with bytes that are the BOM for this encoding,
3320     /// those bytes are removed. However, the decoder never morphs into a
3321     /// decoder for another encoding: A BOM for another encoding is treated as
3322     /// (potentially malformed) input to the decoding algorithm for this
3323     /// encoding.
3324     ///
3325     /// Available via the C wrapper.
3326     #[inline]
new_decoder_with_bom_removal(&'static self) -> Decoder3327     pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3328         Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3329     }
3330 
3331     /// Instantiates a new decoder for this encoding with BOM handling disabled.
3332     ///
3333     /// If the input starts with bytes that look like a BOM, those bytes are
3334     /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3335     /// for another encoding.)
3336     ///
3337     /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3338     /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3339     /// instead of this method to cause the BOM to be removed.
3340     ///
3341     /// Available via the C wrapper.
3342     #[inline]
new_decoder_without_bom_handling(&'static self) -> Decoder3343     pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3344         Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3345     }
3346 
3347     /// Instantiates a new encoder for the output encoding of this encoding.
3348     ///
3349     /// Available via the C wrapper.
3350     #[inline]
new_encoder(&'static self) -> Encoder3351     pub fn new_encoder(&'static self) -> Encoder {
3352         let enc = self.output_encoding();
3353         enc.variant.new_encoder(enc)
3354     }
3355 
    /// Validates UTF-8.
    ///
    /// Returns the index of the first byte that makes the input malformed as
    /// UTF-8 or the length of the slice if the slice is entirely valid.
    ///
    /// This is currently faster than the corresponding standard library
    /// functionality. If this implementation gets upstreamed to the standard
    /// library, this method may be removed in the future.
    ///
    /// Available via the C wrapper.
    pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
        // Calls the free function of the same name, not this method.
        utf8_valid_up_to(bytes)
    }
3369 
    /// Validates ASCII.
    ///
    /// Returns the index of the first byte that makes the input malformed as
    /// ASCII or the length of the slice if the slice is entirely valid.
    ///
    /// This is an associated function (it takes no `self`), so it does not
    /// depend on any particular `Encoding` instance.
    ///
    /// Available via the C wrapper.
    pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
        // Not recursion: an unqualified call inside an `impl` does not resolve
        // to this associated function, so this forwards to the free function
        // of the same name that is in scope.
        ascii_valid_up_to(bytes)
    }
3379 
    /// Validates ISO-2022-JP ASCII-state data.
    ///
    /// Returns the index of the first byte that makes the input not
    /// representable in the ASCII state of ISO-2022-JP or the length of the
    /// slice if the slice is entirely representable in the ASCII state of
    /// ISO-2022-JP.
    ///
    /// This is an associated function (it takes no `self`), so it does not
    /// depend on any particular `Encoding` instance.
    ///
    /// Available via the C wrapper.
    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
        // Not recursion: an unqualified call inside an `impl` does not resolve
        // to this associated function, so this forwards to the free function
        // of the same name that is in scope.
        iso_2022_jp_ascii_valid_up_to(bytes)
    }
3391 }
3392 
3393 impl PartialEq for Encoding {
3394     #[inline]
eq(&self, other: &Encoding) -> bool3395     fn eq(&self, other: &Encoding) -> bool {
3396         (self as *const Encoding) == (other as *const Encoding)
3397     }
3398 }
3399 
// `Eq` is only a marker here. The `PartialEq` impl compares the addresses of
// the two values, and address comparison is reflexive, symmetric and
// transitive, so the stronger `Eq` contract holds.
impl Eq for Encoding {}
3401 
/// Test-only ordering by address. The order is always total, so this simply
/// wraps the result of `Ord::cmp` in `Some`.
#[cfg(test)]
impl PartialOrd for Encoding {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
3408 
/// Test-only total order: `Encoding` values are ordered by the numeric value
/// of their addresses, not by anything stored in them.
#[cfg(test)]
impl Ord for Encoding {
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs = self as *const Encoding as usize;
        let rhs = other as *const Encoding as usize;
        lhs.cmp(&rhs)
    }
}
3415 
3416 impl Hash for Encoding {
3417     #[inline]
hash<H: Hasher>(&self, state: &mut H)3418     fn hash<H: Hasher>(&self, state: &mut H) {
3419         (self as *const Encoding).hash(state);
3420     }
3421 }
3422 
3423 impl core::fmt::Debug for Encoding {
3424     #[inline]
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result3425     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
3426         write!(f, "Encoding {{ {} }}", self.name)
3427     }
3428 }
3429 
/// Serializes an `Encoding` as a string holding the value of its `name`
/// field. Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let name = self.name;
        serializer.serialize_str(name)
    }
}
3440 
/// Serde visitor that maps a string to a `&'static Encoding` by looking the
/// string up as an encoding label. Only compiled when the `serde` feature is
/// enabled.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3443 
/// Deserialization visitor logic: accepts a string and resolves it with
/// `Encoding::for_label`.
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Resolves `value` as an encoding label.
    ///
    /// Returns the matching encoding, or a custom serde error with the
    /// message `invalid encoding label: <value>` when `for_label` finds no
    /// match.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        // `E::custom` accepts any `Display` value, so `format_args!` is
        // sufficient. This avoids a heap allocation and keeps this impl,
        // which is gated only on `serde`, from depending on the separate
        // `alloc` feature.
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format_args!("invalid encoding label: {}", value)))
    }
}
3466 
/// Deserializes a `&'static Encoding` from a string by driving
/// `EncodingVisitor`. Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
        deserializer.deserialize_str(EncodingVisitor)
    }
}
3476 
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// `Decoder::new` chooses the initial state from the requested `BomHandling`
/// mode and, for BOM removal, from the encoding.
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet. This is the initial state when BOM
    /// sniffing is requested.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8. This is the
    /// initial state for BOM removal on a UTF-8 decoder.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE. This is the
    /// initial state for BOM removal on a UTF-16BE decoder.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE. This is the
    /// initial state for BOM removal on a UTF-16LE decoder.
    AtUtf16LeStart,
    /// The decoder has seen EF.
    SeenUtf8First,
    /// The decoder has seen EF, BB.
    SeenUtf8Second,
    /// The decoder has seen FE.
    SeenUtf16BeFirst,
    /// The decoder has seen FF.
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen. This is also the
    /// initial state when BOM handling is off, or when BOM removal is
    /// requested for an encoding other than UTF-8, UTF-16BE or UTF-16LE.
    Converting,
    /// EOF has been seen. The buffer-length queries on `Decoder` panic in
    /// this state.
    Finished,
}
3505 
/// Communicates the BOM handling mode to `Decoder::new`, which uses it to
/// pick the decoder's initial life cycle state.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM: the decoder starts out converting right away.
    Off,
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding. For encodings
    /// other than UTF-8, UTF-16BE and UTF-16LE the decoder starts out
    /// converting right away.
    Remove,
}
3516 
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// Unlike `DecoderResult`, there is no malformed-input variant.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3536 
/// Result of a (potentially partial) decode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3573 
3574 /// A converter that decodes a byte stream into Unicode according to a
3575 /// character encoding in a streaming (incremental) manner.
3576 ///
3577 /// The various `decode_*` methods take an input buffer (`src`) and an output
3578 /// buffer `dst` both of which are caller-allocated. There are variants for
3579 /// both UTF-8 and UTF-16 output buffers.
3580 ///
3581 /// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3582 /// into `dst` until one of the following three things happens:
3583 ///
3584 /// 1. A malformed byte sequence is encountered (`*_without_replacement`
3585 ///    variants only).
3586 ///
3587 /// 2. The output buffer has been filled so near capacity that the decoder
3588 ///    cannot be sure that processing an additional byte of input wouldn't
3589 ///    cause so much output that the output buffer would overflow.
3590 ///
3591 /// 3. All the input bytes have been processed.
3592 ///
/// The `decode_*` method then returns a tuple of a status indicating which one
3594 /// of the three reasons to return happened, how many input bytes were read,
3595 /// how many output code units (`u8` when decoding into UTF-8 and `u16`
3596 /// when decoding to UTF-16) were written (except when decoding into `String`,
3597 /// whose length change indicates this), and in the case of the
3598 /// variants performing replacement, a boolean indicating whether an error was
3599 /// replaced with the REPLACEMENT CHARACTER during the call.
3600 ///
3601 /// The number of bytes "written" is what's logically written. Garbage may be
3602 /// written in the output buffer beyond the point logically written to.
3603 /// Therefore, if you wish to decode into an `&mut str`, you should use the
3604 /// methods that take an `&mut str` argument instead of the ones that take an
3605 /// `&mut [u8]` argument. The former take care of overwriting the trailing
3606 /// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3607 /// latter don't.
3608 ///
3609 /// In the case of the `*_without_replacement` variants, the status is a
3610 /// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3611 /// `InputEmpty` corresponding to the three cases listed above).
3612 ///
3613 /// In the case of methods whose name does not end with
3614 /// `*_without_replacement`, malformed sequences are automatically replaced
3615 /// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3616 /// return early.
3617 ///
3618 /// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3619 /// space. When decoding to UTF-16, the output buffer must have at least two
3620 /// UTF-16 code units (`u16`) of space.
3621 ///
3622 /// When decoding to UTF-8 without replacement, the methods are guaranteed
3623 /// not to return indicating that more output space is needed if the length
3624 /// of the output buffer is at least the length returned by
3625 /// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3626 /// with replacement, the length of the output buffer that guarantees the
3627 /// methods not to return indicating that more output space is needed is given
3628 /// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3629 /// or without replacement, the length of the output buffer that guarantees
3630 /// the methods not to return indicating that more output space is needed is
3631 /// given by [`max_utf16_buffer_length()`][4].
3632 ///
3633 /// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3634 /// and the output after each `decode_*` call is guaranteed to consist of
3635 /// complete characters. (I.e. the code unit sequence for the last character is
3636 /// guaranteed not to be split across output buffers.)
3637 ///
3638 /// The boolean argument `last` indicates that the end of the stream is reached
3639 /// when all the bytes in `src` have been consumed.
3640 ///
3641 /// A `Decoder` object can be used to incrementally decode a byte stream.
3642 ///
3643 /// During the processing of a single stream, the caller must call `decode_*`
3644 /// zero or more times with `last` set to `false` and then call `decode_*` at
3645 /// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3646 /// the processing of the stream has ended. Otherwise, the caller must call
3647 /// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
/// a fatal error).
3649 ///
3650 /// Once the stream has ended, the `Decoder` object must not be used anymore.
3651 /// That is, you need to create another one to process another stream.
3652 ///
3653 /// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3654 /// the caller does not wish to treat it as a fatal error, the input buffer
3655 /// `src` may not have been completely consumed. In that case, the caller must
3656 /// pass the unconsumed contents of `src` to `decode_*` again upon the next
3657 /// call.
3658 ///
3659 /// [1]: enum.DecoderResult.html
3660 /// [2]: #method.max_utf8_buffer_length_without_replacement
3661 /// [3]: #method.max_utf8_buffer_length
3662 /// [4]: #method.max_utf16_buffer_length
3663 ///
3664 /// # Infinite loops
3665 ///
3666 /// When converting with a fixed-size output buffer whose size is too small to
3667 /// accommodate one character or (when applicable) one numeric character
3668 /// reference of output, an infinite loop ensues. When converting with a
3669 /// fixed-size output buffer, it generally makes sense to make the buffer
3670 /// fairly large (e.g. couple of kilobytes).
pub struct Decoder {
    /// The encoding this decoder is for; this is what `encoding()` returns.
    encoding: &'static Encoding,
    /// The encoding-specific decoder that the buffer-length queries delegate
    /// to.
    variant: VariantDecoder,
    /// Where the decoder currently is between BOM handling, conversion and
    /// end of stream.
    life_cycle: DecoderLifeCycle,
}
3676 
3677 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3678     fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3679         Decoder {
3680             encoding: enc,
3681             variant: decoder,
3682             life_cycle: match sniffing {
3683                 BomHandling::Off => DecoderLifeCycle::Converting,
3684                 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3685                 BomHandling::Remove => {
3686                     if enc == UTF_8 {
3687                         DecoderLifeCycle::AtUtf8Start
3688                     } else if enc == UTF_16BE {
3689                         DecoderLifeCycle::AtUtf16BeStart
3690                     } else if enc == UTF_16LE {
3691                         DecoderLifeCycle::AtUtf16LeStart
3692                     } else {
3693                         DecoderLifeCycle::Converting
3694                     }
3695                 }
3696             },
3697         }
3698     }
3699 
    /// The `Encoding` this `Decoder` is for.
    ///
    /// Returns the decoder's `encoding` field as it currently stands.
    ///
    /// BOM sniffing can change the return value of this method during the life
    /// of the decoder.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
3710 
3711     /// Query the worst-case UTF-8 output size _with replacement_.
3712     ///
3713     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3714     /// that will not overflow given the current state of the decoder and
3715     /// `byte_length` number of additional input bytes when decoding with
3716     /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3717     /// sequence or `None` if `usize` would overflow.
3718     ///
3719     /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3720     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3721         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3722         // BOM getting pushed to the underlying decoder.
3723         match self.life_cycle {
3724             DecoderLifeCycle::Converting
3725             | DecoderLifeCycle::AtUtf8Start
3726             | DecoderLifeCycle::AtUtf16LeStart
3727             | DecoderLifeCycle::AtUtf16BeStart => {
3728                 return self.variant.max_utf8_buffer_length(byte_length);
3729             }
3730             DecoderLifeCycle::AtStart => {
3731                 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3732                     if let Some(utf16_bom) = checked_add(
3733                         1,
3734                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3735                     ) {
3736                         let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3737                         let encoding = self.encoding();
3738                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3739                             // No need to consider the internal state of the underlying decoder,
3740                             // because it is at start, because no data has reached it yet.
3741                             return Some(utf_bom);
3742                         } else if let Some(non_bom) =
3743                             self.variant.max_utf8_buffer_length(byte_length)
3744                         {
3745                             return Some(core::cmp::max(utf_bom, non_bom));
3746                         }
3747                     }
3748                 }
3749             }
3750             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3751                 // Add two bytes even when only one byte has been seen,
3752                 // because the one byte can become a lead byte in multibyte
3753                 // decoders, but only after the decoder has been queried
3754                 // for max length, so the decoder's own logic for adding
3755                 // one for a pending lead cannot work.
3756                 if let Some(sum) = byte_length.checked_add(2) {
3757                     if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3758                         if self.encoding() == UTF_8 {
3759                             // No need to consider the internal state of the underlying decoder,
3760                             // because it is at start, because no data has reached it yet.
3761                             return Some(utf8_bom);
3762                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3763                             return Some(core::cmp::max(utf8_bom, non_bom));
3764                         }
3765                     }
3766                 }
3767             }
3768             DecoderLifeCycle::ConvertingWithPendingBB => {
3769                 if let Some(sum) = byte_length.checked_add(2) {
3770                     return self.variant.max_utf8_buffer_length(sum);
3771                 }
3772             }
3773             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3774                 // Add two bytes even when only one byte has been seen,
3775                 // because the one byte can become a lead byte in multibyte
3776                 // decoders, but only after the decoder has been queried
3777                 // for max length, so the decoder's own logic for adding
3778                 // one for a pending lead cannot work.
3779                 if let Some(sum) = byte_length.checked_add(2) {
3780                     if let Some(utf16_bom) =
3781                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3782                     {
3783                         let encoding = self.encoding();
3784                         if encoding == UTF_16LE || encoding == UTF_16BE {
3785                             // No need to consider the internal state of the underlying decoder,
3786                             // because it is at start, because no data has reached it yet.
3787                             return Some(utf16_bom);
3788                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3789                             return Some(core::cmp::max(utf16_bom, non_bom));
3790                         }
3791                     }
3792                 }
3793             }
3794             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3795         }
3796         None
3797     }
3798 
3799     /// Query the worst-case UTF-8 output size _without replacement_.
3800     ///
3801     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3802     /// that will not overflow given the current state of the decoder and
3803     /// `byte_length` number of additional input bytes when decoding without
3804     /// replacement error handling or `None` if `usize` would overflow.
3805     ///
3806     /// Note that this value may be too small for the `_with_replacement` case.
3807     /// Use `max_utf8_buffer_length()` for that case.
3808     ///
3809     /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3810     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3811         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3812         // BOM getting pushed to the underlying decoder.
3813         match self.life_cycle {
3814             DecoderLifeCycle::Converting
3815             | DecoderLifeCycle::AtUtf8Start
3816             | DecoderLifeCycle::AtUtf16LeStart
3817             | DecoderLifeCycle::AtUtf16BeStart => {
3818                 return self
3819                     .variant
3820                     .max_utf8_buffer_length_without_replacement(byte_length);
3821             }
3822             DecoderLifeCycle::AtStart => {
3823                 if let Some(utf8_bom) = byte_length.checked_add(3) {
3824                     if let Some(utf16_bom) = checked_add(
3825                         1,
3826                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3827                     ) {
3828                         let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3829                         let encoding = self.encoding();
3830                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3831                             // No need to consider the internal state of the underlying decoder,
3832                             // because it is at start, because no data has reached it yet.
3833                             return Some(utf_bom);
3834                         } else if let Some(non_bom) = self
3835                             .variant
3836                             .max_utf8_buffer_length_without_replacement(byte_length)
3837                         {
3838                             return Some(core::cmp::max(utf_bom, non_bom));
3839                         }
3840                     }
3841                 }
3842             }
3843             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3844                 // Add two bytes even when only one byte has been seen,
3845                 // because the one byte can become a lead byte in multibyte
3846                 // decoders, but only after the decoder has been queried
3847                 // for max length, so the decoder's own logic for adding
3848                 // one for a pending lead cannot work.
3849                 if let Some(sum) = byte_length.checked_add(2) {
3850                     if let Some(utf8_bom) = sum.checked_add(3) {
3851                         if self.encoding() == UTF_8 {
3852                             // No need to consider the internal state of the underlying decoder,
3853                             // because it is at start, because no data has reached it yet.
3854                             return Some(utf8_bom);
3855                         } else if let Some(non_bom) =
3856                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3857                         {
3858                             return Some(core::cmp::max(utf8_bom, non_bom));
3859                         }
3860                     }
3861                 }
3862             }
3863             DecoderLifeCycle::ConvertingWithPendingBB => {
3864                 if let Some(sum) = byte_length.checked_add(2) {
3865                     return self.variant.max_utf8_buffer_length_without_replacement(sum);
3866                 }
3867             }
3868             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3869                 // Add two bytes even when only one byte has been seen,
3870                 // because the one byte can become a lead byte in multibyte
3871                 // decoders, but only after the decoder has been queried
3872                 // for max length, so the decoder's own logic for adding
3873                 // one for a pending lead cannot work.
3874                 if let Some(sum) = byte_length.checked_add(2) {
3875                     if let Some(utf16_bom) =
3876                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3877                     {
3878                         let encoding = self.encoding();
3879                         if encoding == UTF_16LE || encoding == UTF_16BE {
3880                             // No need to consider the internal state of the underlying decoder,
3881                             // because it is at start, because no data has reached it yet.
3882                             return Some(utf16_bom);
3883                         } else if let Some(non_bom) =
3884                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3885                         {
3886                             return Some(core::cmp::max(utf16_bom, non_bom));
3887                         }
3888                     }
3889                 }
3890             }
3891             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3892         }
3893         None
3894     }
3895 
3896     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3897     /// replaced with the REPLACEMENT CHARACTER.
3898     ///
3899     /// See the documentation of the struct for documentation for `decode_*`
3900     /// methods collectively.
3901     ///
3902     /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3903     pub fn decode_to_utf8(
3904         &mut self,
3905         src: &[u8],
3906         dst: &mut [u8],
3907         last: bool,
3908     ) -> (CoderResult, usize, usize, bool) {
3909         let mut had_errors = false;
3910         let mut total_read = 0usize;
3911         let mut total_written = 0usize;
3912         loop {
3913             let (result, read, written) = self.decode_to_utf8_without_replacement(
3914                 &src[total_read..],
3915                 &mut dst[total_written..],
3916                 last,
3917             );
3918             total_read += read;
3919             total_written += written;
3920             match result {
3921                 DecoderResult::InputEmpty => {
3922                     return (
3923                         CoderResult::InputEmpty,
3924                         total_read,
3925                         total_written,
3926                         had_errors,
3927                     );
3928                 }
3929                 DecoderResult::OutputFull => {
3930                     return (
3931                         CoderResult::OutputFull,
3932                         total_read,
3933                         total_written,
3934                         had_errors,
3935                     );
3936                 }
3937                 DecoderResult::Malformed(_, _) => {
3938                     had_errors = true;
3939                     // There should always be space for the U+FFFD, because
3940                     // otherwise we'd have gotten OutputFull already.
3941                     // XXX: is the above comment actually true for UTF-8 itself?
3942                     // TODO: Consider having fewer bound checks here.
3943                     dst[total_written] = 0xEFu8;
3944                     total_written += 1;
3945                     dst[total_written] = 0xBFu8;
3946                     total_written += 1;
3947                     dst[total_written] = 0xBDu8;
3948                     total_written += 1;
3949                 }
3950             }
3951         }
3952     }
3953 
3954     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3955     /// replaced with the REPLACEMENT CHARACTER with type system signaling
3956     /// of UTF-8 validity.
3957     ///
3958     /// This methods calls `decode_to_utf8` and then zeroes
3959     /// out up to three bytes that aren't logically part of the write in order
3960     /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3961     ///
3962     /// See the documentation of the struct for documentation for `decode_*`
3963     /// methods collectively.
3964     ///
3965     /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3966     pub fn decode_to_str(
3967         &mut self,
3968         src: &[u8],
3969         dst: &mut str,
3970         last: bool,
3971     ) -> (CoderResult, usize, usize, bool) {
3972         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3973         let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3974         let len = bytes.len();
3975         let mut trail = written;
3976         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3977         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3978         // encodings to avoid overwriting here.
3979         if self.encoding != UTF_8 {
3980             let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3981             while trail < max {
3982                 bytes[trail] = 0;
3983                 trail += 1;
3984             }
3985         }
3986         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3987             bytes[trail] = 0;
3988             trail += 1;
3989         }
3990         (result, read, written, replaced)
3991     }
3992 
    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
    /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
    ///
    /// Like the others, this method follows the logic that the output buffer is
    /// caller-allocated. This method treats the capacity of the `String` as
    /// the output limit. That is, this method guarantees not to cause a
    /// reallocation of the backing buffer of `String`.
    ///
    /// The return value is a tuple that contains the `CoderResult`, the
    /// number of bytes read and a boolean indicating whether replacements
    /// were done. The number of bytes written is signaled via the length of
    /// the `String` changing.
    ///
    /// See the documentation of the struct for documentation for `decode_*`
    /// methods collectively.
    ///
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
    /// by default).
    #[cfg(feature = "alloc")]
    pub fn decode_to_string(
        &mut self,
        src: &[u8],
        dst: &mut String,
        last: bool,
    ) -> (CoderResult, usize, bool) {
        // The length is temporarily raised to the full capacity so that the
        // spare capacity can be passed to `decode_to_utf8` as a slice. It is
        // then set to the old length plus the number of bytes written, so the
        // `String` ends up holding only the old contents and the new output.
        // NOTE(review): if `decode_to_utf8` panics, the length stays at the
        // capacity; confirm that this is acceptable for callers that unwind.
        unsafe {
            let vec = dst.as_mut_vec();
            let old_len = vec.len();
            let capacity = vec.capacity();
            vec.set_len(capacity);
            let (result, read, written, replaced) =
                self.decode_to_utf8(src, &mut vec[old_len..], last);
            vec.set_len(old_len + written);
            (result, read, replaced)
        }
    }
4029 
    // Generates the public `decode_to_utf8_without_replacement` method via
    // `public_decode_function!`.
    // NOTE(review): the macro is defined outside this section; presumably the
    // identifiers after the method name are the helper methods used for the
    // different BOM-handling states and `u8` is the output code unit type --
    // confirm against the macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf8_without_replacement,
                            decode_to_utf8_raw,
                            decode_to_utf8_checking_end,
                            decode_to_utf8_after_one_potential_bom_byte,
                            decode_to_utf8_after_two_potential_bom_bytes,
                            decode_to_utf8_checking_end_with_offset,
                            u8);
4046 
4047     /// Incrementally decode a byte stream into UTF-8 with type system signaling
4048     /// of UTF-8 validity.
4049     ///
4050     /// This methods calls `decode_to_utf8` and then zeroes out up to three
4051     /// bytes that aren't logically part of the write in order to retain the
4052     /// UTF-8 validity even for the unwritten part of the buffer.
4053     ///
4054     /// See the documentation of the struct for documentation for `decode_*`
4055     /// methods collectively.
4056     ///
4057     /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)4058     pub fn decode_to_str_without_replacement(
4059         &mut self,
4060         src: &[u8],
4061         dst: &mut str,
4062         last: bool,
4063     ) -> (DecoderResult, usize, usize) {
4064         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4065         let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4066         let len = bytes.len();
4067         let mut trail = written;
4068         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4069         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4070         // encodings to avoid overwriting here.
4071         if self.encoding != UTF_8 {
4072             let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4073             while trail < max {
4074                 bytes[trail] = 0;
4075                 trail += 1;
4076             }
4077         }
4078         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4079             bytes[trail] = 0;
4080             trail += 1;
4081         }
4082         (result, read, written)
4083     }
4084 
4085     /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4086     ///
4087     /// Like the others, this method follows the logic that the output buffer is
4088     /// caller-allocated. This method treats the capacity of the `String` as
4089     /// the output limit. That is, this method guarantees not to cause a
4090     /// reallocation of the backing buffer of `String`.
4091     ///
4092     /// The return value is a pair that contains the `DecoderResult` and the
4093     /// number of bytes read. The number of bytes written is signaled via
4094     /// the length of the `String` changing.
4095     ///
4096     /// See the documentation of the struct for documentation for `decode_*`
4097     /// methods collectively.
4098     ///
4099     /// Available to Rust only and only with the `alloc` feature enabled (enabled
4100     /// by default).
4101     #[cfg(feature = "alloc")]
decode_to_string_without_replacement( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (DecoderResult, usize)4102     pub fn decode_to_string_without_replacement(
4103         &mut self,
4104         src: &[u8],
4105         dst: &mut String,
4106         last: bool,
4107     ) -> (DecoderResult, usize) {
4108         unsafe {
4109             let vec = dst.as_mut_vec();
4110             let old_len = vec.len();
4111             let capacity = vec.capacity();
4112             vec.set_len(capacity);
4113             let (result, read, written) =
4114                 self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4115             vec.set_len(old_len + written);
4116             (result, read)
4117         }
4118     }
4119 
4120     /// Query the worst-case UTF-16 output size (with or without replacement).
4121     ///
4122     /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4123     /// that will not overflow given the current state of the decoder and
4124     /// `byte_length` number of additional input bytes or `None` if `usize`
4125     /// would overflow.
4126     ///
4127     /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4128     /// return value of this method applies also in the
4129     /// `_without_replacement` case.
4130     ///
4131     /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4132     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4133         // Need to consider a) the decoder morphing due to the BOM and b) a partial
4134         // BOM getting pushed to the underlying decoder.
4135         match self.life_cycle {
4136             DecoderLifeCycle::Converting
4137             | DecoderLifeCycle::AtUtf8Start
4138             | DecoderLifeCycle::AtUtf16LeStart
4139             | DecoderLifeCycle::AtUtf16BeStart => {
4140                 return self.variant.max_utf16_buffer_length(byte_length);
4141             }
4142             DecoderLifeCycle::AtStart => {
4143                 if let Some(utf8_bom) = byte_length.checked_add(1) {
4144                     if let Some(utf16_bom) =
4145                         checked_add(1, checked_div(byte_length.checked_add(1), 2))
4146                     {
4147                         let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
4148                         let encoding = self.encoding();
4149                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4150                             // No need to consider the internal state of the underlying decoder,
4151                             // because it is at start, because no data has reached it yet.
4152                             return Some(utf_bom);
4153                         } else if let Some(non_bom) =
4154                             self.variant.max_utf16_buffer_length(byte_length)
4155                         {
4156                             return Some(core::cmp::max(utf_bom, non_bom));
4157                         }
4158                     }
4159                 }
4160             }
4161             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4162                 // Add two bytes even when only one byte has been seen,
4163                 // because the one byte can become a lead byte in multibyte
4164                 // decoders, but only after the decoder has been queried
4165                 // for max length, so the decoder's own logic for adding
4166                 // one for a pending lead cannot work.
4167                 if let Some(sum) = byte_length.checked_add(2) {
4168                     if let Some(utf8_bom) = sum.checked_add(1) {
4169                         if self.encoding() == UTF_8 {
4170                             // No need to consider the internal state of the underlying decoder,
4171                             // because it is at start, because no data has reached it yet.
4172                             return Some(utf8_bom);
4173                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4174                             return Some(core::cmp::max(utf8_bom, non_bom));
4175                         }
4176                     }
4177                 }
4178             }
4179             DecoderLifeCycle::ConvertingWithPendingBB => {
4180                 if let Some(sum) = byte_length.checked_add(2) {
4181                     return self.variant.max_utf16_buffer_length(sum);
4182                 }
4183             }
4184             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4185                 // Add two bytes even when only one byte has been seen,
4186                 // because the one byte can become a lead byte in multibyte
4187                 // decoders, but only after the decoder has been queried
4188                 // for max length, so the decoder's own logic for adding
4189                 // one for a pending lead cannot work.
4190                 if let Some(sum) = byte_length.checked_add(2) {
4191                     if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4192                         let encoding = self.encoding();
4193                         if encoding == UTF_16LE || encoding == UTF_16BE {
4194                             // No need to consider the internal state of the underlying decoder,
4195                             // because it is at start, because no data has reached it yet.
4196                             return Some(utf16_bom);
4197                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4198                             return Some(core::cmp::max(utf16_bom, non_bom));
4199                         }
4200                     }
4201                 }
4202             }
4203             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4204         }
4205         None
4206     }
4207 
4208     /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4209     /// replaced with the REPLACEMENT CHARACTER.
4210     ///
4211     /// See the documentation of the struct for documentation for `decode_*`
4212     /// methods collectively.
4213     ///
4214     /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4215     pub fn decode_to_utf16(
4216         &mut self,
4217         src: &[u8],
4218         dst: &mut [u16],
4219         last: bool,
4220     ) -> (CoderResult, usize, usize, bool) {
4221         let mut had_errors = false;
4222         let mut total_read = 0usize;
4223         let mut total_written = 0usize;
4224         loop {
4225             let (result, read, written) = self.decode_to_utf16_without_replacement(
4226                 &src[total_read..],
4227                 &mut dst[total_written..],
4228                 last,
4229             );
4230             total_read += read;
4231             total_written += written;
4232             match result {
4233                 DecoderResult::InputEmpty => {
4234                     return (
4235                         CoderResult::InputEmpty,
4236                         total_read,
4237                         total_written,
4238                         had_errors,
4239                     );
4240                 }
4241                 DecoderResult::OutputFull => {
4242                     return (
4243                         CoderResult::OutputFull,
4244                         total_read,
4245                         total_written,
4246                         had_errors,
4247                     );
4248                 }
4249                 DecoderResult::Malformed(_, _) => {
4250                     had_errors = true;
4251                     // There should always be space for the U+FFFD, because
4252                     // otherwise we'd have gotten OutputFull already.
4253                     dst[total_written] = 0xFFFD;
4254                     total_written += 1;
4255                 }
4256             }
4257         }
4258     }
4259 
    // Generates the public `decode_to_utf16_without_replacement` method via
    // `public_decode_function!`.
    // NOTE(review): the macro is defined outside this section; presumably the
    // identifiers after the method name are the helper methods used for the
    // different BOM-handling states and `u16` is the output code unit type --
    // confirm against the macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf16_without_replacement,
                            decode_to_utf16_raw,
                            decode_to_utf16_checking_end,
                            decode_to_utf16_after_one_potential_bom_byte,
                            decode_to_utf16_after_two_potential_bom_bytes,
                            decode_to_utf16_checking_end_with_offset,
                            u16);
4276 
4277     /// Checks for compatibility with storing Unicode scalar values as unsigned
4278     /// bytes taking into account the state of the decoder.
4279     ///
4280     /// Returns `None` if the decoder is not in a neutral state, including waiting
4281     /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4282     ///
4283     /// Otherwise returns the index of the first byte whose unsigned value doesn't
4284     /// directly correspond to the decoded Unicode scalar value, or the length
4285     /// of the input if all bytes in the input decode directly to scalar values
4286     /// corresponding to the unsigned byte values.
4287     ///
4288     /// Does not change the state of the decoder.
4289     ///
4290     /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4291     /// storage optimizations.
4292     ///
4293     /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4294     pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4295         match self.life_cycle {
4296             DecoderLifeCycle::Converting => {
4297                 return self.variant.latin1_byte_compatible_up_to(bytes);
4298             }
4299             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4300             _ => None,
4301         }
4302     }
4303 }
4304 
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}
4330 
4331 impl EncoderResult {
unmappable_from_bmp(bmp: u16) -> EncoderResult4332     fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
4333         EncoderResult::Unmappable(::core::char::from_u32(u32::from(bmp)).unwrap())
4334     }
4335 }
4336 
/// A converter that encodes a Unicode stream into bytes according to a
/// character encoding in a streaming (incremental) manner.
///
/// The various `encode_*` methods take an input buffer (`src`) and an output
/// buffer `dst` both of which are caller-allocated. There are variants for
/// both UTF-8 and UTF-16 input buffers.
///
/// An `encode_*` method encodes characters from `src` into bytes
/// stored into `dst` until one of the following three things happens:
///
/// 1. An unmappable character is encountered (`*_without_replacement` variants
///    only).
///
/// 2. The output buffer has been filled so near capacity that the encoder
///    cannot be sure that processing an additional character of input wouldn't
///    cause so much output that the output buffer would overflow.
///
/// 3. All the input characters have been processed.
///
/// The `encode_*` method then returns a tuple of a status indicating which one
/// of the three reasons to return happened, how many input code units (`u8`
/// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
/// how many output bytes were written (except when encoding into `Vec<u8>`,
/// whose length change indicates this), and in the case of the variants that
/// perform replacement, a boolean indicating whether an unmappable
/// character was replaced with a numeric character reference during the call.
///
/// The number of bytes "written" is what's logically written. Garbage may be
/// written in the output buffer beyond the point logically written to.
///
/// In the case of the methods whose name ends with
/// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
/// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
/// the three cases listed above).
///
/// In the case of methods whose name does not end with
/// `*_without_replacement`, unmappable characters are automatically replaced
/// with the corresponding numeric character references and unmappable
/// characters do not cause the methods to return early.
///
/// When encoding from UTF-8 without replacement, the methods are guaranteed
/// not to return indicating that more output space is needed if the length
/// of the output buffer is at least the length returned by
/// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
/// UTF-8 with replacement, the length of the output buffer that guarantees the
/// methods not to return indicating that more output space is needed in the
/// absence of unmappable characters is given by
/// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
/// UTF-16 without replacement, the methods are guaranteed not to return
/// indicating that more output space is needed if the length of the output
/// buffer is at least the length returned by
/// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
/// guarantees the methods not to return indicating that more output space is
/// needed in the absence of unmappable characters is given by
/// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
/// When encoding with replacement, applications are not expected to size the
/// buffer for the worst case ahead of time but to resize the buffer if there
/// are unmappable characters. This is why max length queries are only available
/// for the case where there are no unmappable characters.
///
/// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
/// calling from Rust, the type system takes care of this.) When encoding from
/// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
/// CHARACTERS. Therefore, in order for astral characters not to turn into a
/// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
/// are not split across input buffer boundaries.
///
/// After an `encode_*` call returns, the output produced so far, taken as a
/// whole from the start of the stream, is guaranteed to consist of a valid
/// byte sequence in the target encoding. (I.e. the code unit sequence for a
/// character is guaranteed not to be split across output buffers. However, due
/// to the stateful nature of ISO-2022-JP, the stream needs to be considered
/// from the start for it to be valid. For other encodings, the validity holds
/// on a per-output buffer basis.)
///
/// The boolean argument `last` indicates that the end of the stream is reached
/// when all the characters in `src` have been consumed. This argument is needed
/// for ISO-2022-JP and is ignored for other encodings.
///
/// An `Encoder` object can be used to incrementally encode a Unicode stream.
///
/// During the processing of a single stream, the caller must call `encode_*`
/// zero or more times with `last` set to `false` and then call `encode_*` at
/// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
/// the processing of the stream has ended. Otherwise, the caller must call
/// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
/// as a fatal error).
///
/// Once the stream has ended, the `Encoder` object must not be used anymore.
/// That is, you need to create another one to process another stream.
///
/// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
/// and the caller does not wish to treat it as a fatal error, the input buffer
/// `src` may not have been completely consumed. In that case, the caller must
/// pass the unconsumed contents of `src` to `encode_*` again upon the next
/// call.
///
/// [1]: enum.EncoderResult.html
/// [2]: #method.max_buffer_length_from_utf8_without_replacement
/// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
/// [4]: #method.max_buffer_length_from_utf16_without_replacement
/// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
///
/// # Infinite loops
///
/// When converting with a fixed-size output buffer whose size is too small to
/// accommodate one character of output, an infinite loop ensues. When
/// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    // The `Encoding` this encoder was created for; exposed via `encoding()`.
    encoding: &'static Encoding,
    // The encoding-specific encoder that the public methods delegate to.
    variant: VariantEncoder,
}
4451 
4452 impl Encoder {
    /// Creates an `Encoder` for the encoding `enc` that delegates the actual
    /// work to the encoding-specific `encoder`.
    fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
        Encoder {
            encoding: enc,
            variant: encoder,
        }
    }
4459 
    /// Returns the `Encoding` this `Encoder` is for.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
4465 
    /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
    /// ASCII state and `false` otherwise.
    #[inline]
    pub fn has_pending_state(&self) -> bool {
        // The encoding-specific encoder tracks the state.
        self.variant.has_pending_state()
    }
4472 
4473     /// Query the worst-case output size when encoding from UTF-8 with
4474     /// replacement.
4475     ///
4476     /// Returns the size of the output buffer in bytes that will not overflow
4477     /// given the current state of the encoder and `byte_length` number of
4478     /// additional input code units if there are no unmappable characters in
4479     /// the input or `None` if `usize` would overflow.
4480     ///
4481     /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4482     pub fn max_buffer_length_from_utf8_if_no_unmappables(
4483         &self,
4484         byte_length: usize,
4485     ) -> Option<usize> {
4486         checked_add(
4487             if self.encoding().can_encode_everything() {
4488                 0
4489             } else {
4490                 NCR_EXTRA
4491             },
4492             self.max_buffer_length_from_utf8_without_replacement(byte_length),
4493         )
4494     }
4495 
    /// Query the worst-case output size when encoding from UTF-8 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `byte_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        // The answer comes from the encoding-specific encoder.
        self.variant
            .max_buffer_length_from_utf8_without_replacement(byte_length)
    }
4511 
4512     /// Incrementally encode into byte stream from UTF-8 with unmappable
4513     /// characters replaced with HTML (decimal) numeric character references.
4514     ///
4515     /// See the documentation of the struct for documentation for `encode_*`
4516     /// methods collectively.
4517     ///
4518     /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4519     pub fn encode_from_utf8(
4520         &mut self,
4521         src: &str,
4522         dst: &mut [u8],
4523         last: bool,
4524     ) -> (CoderResult, usize, usize, bool) {
4525         let dst_len = dst.len();
4526         let effective_dst_len = if self.encoding().can_encode_everything() {
4527             dst_len
4528         } else {
4529             if dst_len < NCR_EXTRA {
4530                 if src.is_empty() && !(last && self.has_pending_state()) {
4531                     return (CoderResult::InputEmpty, 0, 0, false);
4532                 }
4533                 return (CoderResult::OutputFull, 0, 0, false);
4534             }
4535             dst_len - NCR_EXTRA
4536         };
4537         let mut had_unmappables = false;
4538         let mut total_read = 0usize;
4539         let mut total_written = 0usize;
4540         loop {
4541             let (result, read, written) = self.encode_from_utf8_without_replacement(
4542                 &src[total_read..],
4543                 &mut dst[total_written..effective_dst_len],
4544                 last,
4545             );
4546             total_read += read;
4547             total_written += written;
4548             match result {
4549                 EncoderResult::InputEmpty => {
4550                     return (
4551                         CoderResult::InputEmpty,
4552                         total_read,
4553                         total_written,
4554                         had_unmappables,
4555                     );
4556                 }
4557                 EncoderResult::OutputFull => {
4558                     return (
4559                         CoderResult::OutputFull,
4560                         total_read,
4561                         total_written,
4562                         had_unmappables,
4563                     );
4564                 }
4565                 EncoderResult::Unmappable(unmappable) => {
4566                     had_unmappables = true;
4567                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4568                     debug_assert_ne!(self.encoding(), UTF_16BE);
4569                     debug_assert_ne!(self.encoding(), UTF_16LE);
4570                     // Additionally, Iso2022JpEncoder is responsible for
4571                     // transitioning to ASCII when returning with Unmappable.
4572                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4573                     if total_written >= effective_dst_len {
4574                         if total_read == src.len() && !(last && self.has_pending_state()) {
4575                             return (
4576                                 CoderResult::InputEmpty,
4577                                 total_read,
4578                                 total_written,
4579                                 had_unmappables,
4580                             );
4581                         }
4582                         return (
4583                             CoderResult::OutputFull,
4584                             total_read,
4585                             total_written,
4586                             had_unmappables,
4587                         );
4588                     }
4589                 }
4590             }
4591         }
4592     }
4593 
4594     /// Incrementally encode into byte stream from UTF-8 with unmappable
4595     /// characters replaced with HTML (decimal) numeric character references.
4596     ///
4597     /// See the documentation of the struct for documentation for `encode_*`
4598     /// methods collectively.
4599     ///
4600     /// Available to Rust only and only with the `alloc` feature enabled (enabled
4601     /// by default).
4602     #[cfg(feature = "alloc")]
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4603     pub fn encode_from_utf8_to_vec(
4604         &mut self,
4605         src: &str,
4606         dst: &mut Vec<u8>,
4607         last: bool,
4608     ) -> (CoderResult, usize, bool) {
4609         unsafe {
4610             let old_len = dst.len();
4611             let capacity = dst.capacity();
4612             dst.set_len(capacity);
4613             let (result, read, written, replaced) =
4614                 self.encode_from_utf8(src, &mut dst[old_len..], last);
4615             dst.set_len(old_len + written);
4616             (result, read, replaced)
4617         }
4618     }
4619 
    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
    ///
    /// The return value is a tuple that contains the `EncoderResult`, the
    /// number of input bytes read and the number of output bytes written.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf8_without_replacement(
        &mut self,
        src: &str,
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        // The encoding-specific encoder does the actual work.
        self.variant.encode_from_utf8_raw(src, dst, last)
    }
4634 
4635     /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4636     ///
4637     /// See the documentation of the struct for documentation for `encode_*`
4638     /// methods collectively.
4639     ///
4640     /// Available to Rust only and only with the `alloc` feature enabled (enabled
4641     /// by default).
4642     #[cfg(feature = "alloc")]
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4643     pub fn encode_from_utf8_to_vec_without_replacement(
4644         &mut self,
4645         src: &str,
4646         dst: &mut Vec<u8>,
4647         last: bool,
4648     ) -> (EncoderResult, usize) {
4649         unsafe {
4650             let old_len = dst.len();
4651             let capacity = dst.capacity();
4652             dst.set_len(capacity);
4653             let (result, read, written) =
4654                 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4655             dst.set_len(old_len + written);
4656             (result, read)
4657         }
4658     }
4659 
4660     /// Query the worst-case output size when encoding from UTF-16 with
4661     /// replacement.
4662     ///
4663     /// Returns the size of the output buffer in bytes that will not overflow
4664     /// given the current state of the encoder and `u16_length` number of
4665     /// additional input code units if there are no unmappable characters in
4666     /// the input or `None` if `usize` would overflow.
4667     ///
4668     /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4669     pub fn max_buffer_length_from_utf16_if_no_unmappables(
4670         &self,
4671         u16_length: usize,
4672     ) -> Option<usize> {
4673         checked_add(
4674             if self.encoding().can_encode_everything() {
4675                 0
4676             } else {
4677                 NCR_EXTRA
4678             },
4679             self.max_buffer_length_from_utf16_without_replacement(u16_length),
4680         )
4681     }
4682 
4683     /// Query the worst-case output size when encoding from UTF-16 without
4684     /// replacement.
4685     ///
4686     /// Returns the size of the output buffer in bytes that will not overflow
4687     /// given the current state of the encoder and `u16_length` number of
4688     /// additional input code units or `None` if `usize` would overflow.
4689     ///
4690     /// Available via the C wrapper.
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>4691     pub fn max_buffer_length_from_utf16_without_replacement(
4692         &self,
4693         u16_length: usize,
4694     ) -> Option<usize> {
4695         self.variant
4696             .max_buffer_length_from_utf16_without_replacement(u16_length)
4697     }
4698 
4699     /// Incrementally encode into byte stream from UTF-16 with unmappable
4700     /// characters replaced with HTML (decimal) numeric character references.
4701     ///
4702     /// See the documentation of the struct for documentation for `encode_*`
4703     /// methods collectively.
4704     ///
4705     /// Available via the C wrapper.
encode_from_utf16( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4706     pub fn encode_from_utf16(
4707         &mut self,
4708         src: &[u16],
4709         dst: &mut [u8],
4710         last: bool,
4711     ) -> (CoderResult, usize, usize, bool) {
4712         let dst_len = dst.len();
4713         let effective_dst_len = if self.encoding().can_encode_everything() {
4714             dst_len
4715         } else {
4716             if dst_len < NCR_EXTRA {
4717                 if src.is_empty() && !(last && self.has_pending_state()) {
4718                     return (CoderResult::InputEmpty, 0, 0, false);
4719                 }
4720                 return (CoderResult::OutputFull, 0, 0, false);
4721             }
4722             dst_len - NCR_EXTRA
4723         };
4724         let mut had_unmappables = false;
4725         let mut total_read = 0usize;
4726         let mut total_written = 0usize;
4727         loop {
4728             let (result, read, written) = self.encode_from_utf16_without_replacement(
4729                 &src[total_read..],
4730                 &mut dst[total_written..effective_dst_len],
4731                 last,
4732             );
4733             total_read += read;
4734             total_written += written;
4735             match result {
4736                 EncoderResult::InputEmpty => {
4737                     return (
4738                         CoderResult::InputEmpty,
4739                         total_read,
4740                         total_written,
4741                         had_unmappables,
4742                     );
4743                 }
4744                 EncoderResult::OutputFull => {
4745                     return (
4746                         CoderResult::OutputFull,
4747                         total_read,
4748                         total_written,
4749                         had_unmappables,
4750                     );
4751                 }
4752                 EncoderResult::Unmappable(unmappable) => {
4753                     had_unmappables = true;
4754                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4755                     // There are no UTF-16 encoders and even if there were,
4756                     // they'd never have unmappables.
4757                     debug_assert_ne!(self.encoding(), UTF_16BE);
4758                     debug_assert_ne!(self.encoding(), UTF_16LE);
4759                     // Additionally, Iso2022JpEncoder is responsible for
4760                     // transitioning to ASCII when returning with Unmappable
4761                     // from the jis0208 state. That is, when we encode
4762                     // ISO-2022-JP and come here, the encoder is in either the
4763                     // ASCII or the Roman state. We are allowed to generate any
4764                     // printable ASCII excluding \ and ~.
4765                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4766                     if total_written >= effective_dst_len {
4767                         if total_read == src.len() && !(last && self.has_pending_state()) {
4768                             return (
4769                                 CoderResult::InputEmpty,
4770                                 total_read,
4771                                 total_written,
4772                                 had_unmappables,
4773                             );
4774                         }
4775                         return (
4776                             CoderResult::OutputFull,
4777                             total_read,
4778                             total_written,
4779                             had_unmappables,
4780                         );
4781                     }
4782                 }
4783             }
4784         }
4785     }
4786 
4787     /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4788     ///
4789     /// See the documentation of the struct for documentation for `encode_*`
4790     /// methods collectively.
4791     ///
4792     /// Available via the C wrapper.
encode_from_utf16_without_replacement( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4793     pub fn encode_from_utf16_without_replacement(
4794         &mut self,
4795         src: &[u16],
4796         dst: &mut [u8],
4797         last: bool,
4798     ) -> (EncoderResult, usize, usize) {
4799         self.variant.encode_from_utf16_raw(src, dst, last)
4800     }
4801 }
4802 
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#`, the decimal digits of the scalar value of `unmappable` and
/// `;` to the beginning of `dst` and returns the number of bytes written.
/// Every position of the returned length is written, whatever the
/// magnitude of the scalar value (a `char` has between 1 and 7 decimal
/// digits, so the result is between 4 and 10 bytes long).
///
/// # Panics
///
/// Panics if `dst` is too short to hold the NCR.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let mut number = unmappable as u32;
    let digits = if number >= 1_000_000u32 {
        7usize
    } else if number >= 100_000u32 {
        6usize
    } else if number >= 10_000u32 {
        5usize
    } else if number >= 1_000u32 {
        4usize
    } else if number >= 100u32 {
        3usize
    } else if number >= 10u32 {
        2usize
    } else {
        // Not expected from the encoders (see
        // https://github.com/whatwg/encoding/issues/15), but handled so
        // that no byte of the output is ever left unwritten.
        1usize
    };
    // Three more bytes for "&#" and ";".
    let len = digits + 3;
    debug_assert!(len <= dst.len());
    // Slicing once up front performs the bounds check for all writes below.
    let ncr = &mut dst[..len];
    ncr[0] = b'&';
    ncr[1] = b'#';
    ncr[len - 1] = b';';
    // Fill the digit positions from the least significant digit backwards.
    for slot in ncr[2..len - 1].iter_mut().rev() {
        *slot = b'0' + (number % 10) as u8;
        number /= 10;
    }
    len
}
4841 
/// Returns whether `i` is in the half-open range `start..end`.
///
/// A value below `start` wraps around to a large offset, so one unsigned
/// comparison covers both bounds. Expects `start <= end`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset < span
}
4846 
/// Returns whether `i` is in the half-open range `start..end`.
///
/// A value below `start` wraps around to a large offset, so one unsigned
/// comparison covers both bounds. Expects `start <= end`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset < span
}
4851 
/// Returns whether `i` is in the closed range `start..=end`.
///
/// A value below `start` wraps around to a large offset, so one unsigned
/// comparison covers both bounds. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4856 
/// Returns whether `i` is in the closed range `start..=end`.
///
/// A value below `start` wraps around to a large offset, so one unsigned
/// comparison covers both bounds. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4861 
/// Returns whether `i` is in the closed range `start..=end`.
///
/// A value below `start` wraps around to a large offset, so one unsigned
/// comparison covers both bounds. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4866 
/// Returns whether `i` is in the closed range `start..=end`.
///
/// A value below `start` wraps around to a large offset, so one unsigned
/// comparison covers both bounds. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4871 
/// Adds `num` to the value in `opt`.
///
/// Yields `None` when `opt` is `None` or when the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4880 
4881 #[inline(always)]
checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize>4882 fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4883     if let Some(n) = one {
4884         checked_add(n, other)
4885     } else {
4886         None
4887     }
4888 }
4889 
/// Multiplies the value in `opt` by `num`.
///
/// Yields `None` when `opt` is `None` or when the product overflows
/// `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4898 
/// Divides the value in `opt` by `num`.
///
/// Yields `None` when `opt` is `None` or when `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4907 
4908 #[cfg(feature = "alloc")]
4909 #[inline(always)]
/// Rounds the value in `opt` up to the next power of two.
///
/// Yields `None` when `opt` is `None` or when the next power of two does
/// not fit in `usize`. The overflow case must be reported as `None` here:
/// plain `usize::next_power_of_two` panics on overflow in debug builds and
/// returns 0 in release builds.
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(usize::checked_next_power_of_two)
}
4913 
4914 #[cfg(feature = "alloc")]
4915 #[inline(always)]
/// Returns the smaller of two optional values.
///
/// `None` does not win the comparison: when only one argument holds a
/// value, that value is returned. The result is `None` only when both
/// arguments are `None`.
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::core::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, remaining) => remaining,
    }
}
4927 
4928 // ############## TESTS ###############
4929 
// Test-only fixture: compiled only for test builds with the `serde` feature
// enabled. It derives `Serialize` and `Deserialize` with an
// `&'static Encoding` field next to plain data fields.
// NOTE(review): presumably used by serde round-trip tests further down in
// this file; confirm against the tests that construct it.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    // Plain numeric field.
    num: u32,
    // Plain owned-string field.
    name: String,
    // The encoding reference carried through serialization.
    enc: &'static Encoding,
}
4937 
4938 #[cfg(test)]
4939 mod test_labels_names;
4940 
4941 #[cfg(all(test, feature = "alloc"))]
4942 mod tests {
4943     use super::*;
4944     use alloc::borrow::Cow;
4945 
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4946     fn sniff_to_utf16(
4947         initial_encoding: &'static Encoding,
4948         expected_encoding: &'static Encoding,
4949         bytes: &[u8],
4950         expect: &[u16],
4951         breaks: &[usize],
4952     ) {
4953         let mut decoder = initial_encoding.new_decoder();
4954 
4955         let mut dest: Vec<u16> =
4956             Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4957         let capacity = dest.capacity();
4958         dest.resize(capacity, 0u16);
4959 
4960         let mut total_written = 0usize;
4961         let mut start = 0usize;
4962         for br in breaks {
4963             let (result, read, written, _) =
4964                 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4965             total_written += written;
4966             assert_eq!(read, *br - start);
4967             match result {
4968                 CoderResult::InputEmpty => {}
4969                 CoderResult::OutputFull => {
4970                     unreachable!();
4971                 }
4972             }
4973             start = *br;
4974         }
4975         let (result, read, written, _) =
4976             decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4977         total_written += written;
4978         match result {
4979             CoderResult::InputEmpty => {}
4980             CoderResult::OutputFull => {
4981                 unreachable!();
4982             }
4983         }
4984         assert_eq!(read, bytes.len() - start);
4985         assert_eq!(total_written, expect.len());
4986         assert_eq!(&dest[..total_written], expect);
4987         assert_eq!(decoder.encoding(), expected_encoding);
4988     }
4989 
4990     // Any copyright to the test code below this comment is dedicated to the
4991     // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4992 
4993     #[test]
test_bom_sniffing()4994     fn test_bom_sniffing() {
4995         // ASCII
4996         sniff_to_utf16(
4997             WINDOWS_1252,
4998             WINDOWS_1252,
4999             b"\x61\x62",
5000             &[0x0061u16, 0x0062u16],
5001             &[],
5002         );
5003         // UTF-8
5004         sniff_to_utf16(
5005             WINDOWS_1252,
5006             UTF_8,
5007             b"\xEF\xBB\xBF\x61\x62",
5008             &[0x0061u16, 0x0062u16],
5009             &[],
5010         );
5011         sniff_to_utf16(
5012             WINDOWS_1252,
5013             UTF_8,
5014             b"\xEF\xBB\xBF\x61\x62",
5015             &[0x0061u16, 0x0062u16],
5016             &[1],
5017         );
5018         sniff_to_utf16(
5019             WINDOWS_1252,
5020             UTF_8,
5021             b"\xEF\xBB\xBF\x61\x62",
5022             &[0x0061u16, 0x0062u16],
5023             &[2],
5024         );
5025         sniff_to_utf16(
5026             WINDOWS_1252,
5027             UTF_8,
5028             b"\xEF\xBB\xBF\x61\x62",
5029             &[0x0061u16, 0x0062u16],
5030             &[3],
5031         );
5032         sniff_to_utf16(
5033             WINDOWS_1252,
5034             UTF_8,
5035             b"\xEF\xBB\xBF\x61\x62",
5036             &[0x0061u16, 0x0062u16],
5037             &[4],
5038         );
5039         sniff_to_utf16(
5040             WINDOWS_1252,
5041             UTF_8,
5042             b"\xEF\xBB\xBF\x61\x62",
5043             &[0x0061u16, 0x0062u16],
5044             &[2, 3],
5045         );
5046         sniff_to_utf16(
5047             WINDOWS_1252,
5048             UTF_8,
5049             b"\xEF\xBB\xBF\x61\x62",
5050             &[0x0061u16, 0x0062u16],
5051             &[1, 2],
5052         );
5053         sniff_to_utf16(
5054             WINDOWS_1252,
5055             UTF_8,
5056             b"\xEF\xBB\xBF\x61\x62",
5057             &[0x0061u16, 0x0062u16],
5058             &[1, 3],
5059         );
5060         sniff_to_utf16(
5061             WINDOWS_1252,
5062             UTF_8,
5063             b"\xEF\xBB\xBF\x61\x62",
5064             &[0x0061u16, 0x0062u16],
5065             &[1, 2, 3, 4],
5066         );
5067         sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
5068         // Not UTF-8
5069         sniff_to_utf16(
5070             WINDOWS_1252,
5071             WINDOWS_1252,
5072             b"\xEF\xBB\x61\x62",
5073             &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5074             &[],
5075         );
5076         sniff_to_utf16(
5077             WINDOWS_1252,
5078             WINDOWS_1252,
5079             b"\xEF\xBB\x61\x62",
5080             &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5081             &[1],
5082         );
5083         sniff_to_utf16(
5084             WINDOWS_1252,
5085             WINDOWS_1252,
5086             b"\xEF\x61\x62",
5087             &[0x00EFu16, 0x0061u16, 0x0062u16],
5088             &[],
5089         );
5090         sniff_to_utf16(
5091             WINDOWS_1252,
5092             WINDOWS_1252,
5093             b"\xEF\x61\x62",
5094             &[0x00EFu16, 0x0061u16, 0x0062u16],
5095             &[1],
5096         );
5097         sniff_to_utf16(
5098             WINDOWS_1252,
5099             WINDOWS_1252,
5100             b"\xEF\xBB",
5101             &[0x00EFu16, 0x00BBu16],
5102             &[],
5103         );
5104         sniff_to_utf16(
5105             WINDOWS_1252,
5106             WINDOWS_1252,
5107             b"\xEF\xBB",
5108             &[0x00EFu16, 0x00BBu16],
5109             &[1],
5110         );
5111         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5112         // Not UTF-16
5113         sniff_to_utf16(
5114             WINDOWS_1252,
5115             WINDOWS_1252,
5116             b"\xFE\x61\x62",
5117             &[0x00FEu16, 0x0061u16, 0x0062u16],
5118             &[],
5119         );
5120         sniff_to_utf16(
5121             WINDOWS_1252,
5122             WINDOWS_1252,
5123             b"\xFE\x61\x62",
5124             &[0x00FEu16, 0x0061u16, 0x0062u16],
5125             &[1],
5126         );
5127         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5128         sniff_to_utf16(
5129             WINDOWS_1252,
5130             WINDOWS_1252,
5131             b"\xFF\x61\x62",
5132             &[0x00FFu16, 0x0061u16, 0x0062u16],
5133             &[],
5134         );
5135         sniff_to_utf16(
5136             WINDOWS_1252,
5137             WINDOWS_1252,
5138             b"\xFF\x61\x62",
5139             &[0x00FFu16, 0x0061u16, 0x0062u16],
5140             &[1],
5141         );
5142         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5143         // UTF-16
5144         sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5145         sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5146         sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5147         sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5148     }
5149 
5150     #[test]
test_output_encoding()5151     fn test_output_encoding() {
5152         assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5153         assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5154         assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5155         assert_eq!(UTF_8.output_encoding(), UTF_8);
5156         assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5157         assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5158         assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5159         assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5160         assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5161         assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5162     }
5163 
5164     #[test]
test_label_resolution()5165     fn test_label_resolution() {
5166         assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5167         assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5168         assert_eq!(
5169             Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5170             Some(UTF_8)
5171         );
5172         assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5173         assert_eq!(Encoding::for_label(b"bogus"), None);
5174         assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5175     }
5176 
5177     #[test]
test_decode_valid_windows_1257_to_cow()5178     fn test_decode_valid_windows_1257_to_cow() {
5179         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5180         match cow {
5181             Cow::Borrowed(_) => unreachable!(),
5182             Cow::Owned(s) => {
5183                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5184             }
5185         }
5186         assert_eq!(encoding, WINDOWS_1257);
5187         assert!(!had_errors);
5188     }
5189 
5190     #[test]
test_decode_invalid_windows_1257_to_cow()5191     fn test_decode_invalid_windows_1257_to_cow() {
5192         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5193         match cow {
5194             Cow::Borrowed(_) => unreachable!(),
5195             Cow::Owned(s) => {
5196                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5197             }
5198         }
5199         assert_eq!(encoding, WINDOWS_1257);
5200         assert!(had_errors);
5201     }
5202 
5203     #[test]
test_decode_ascii_only_windows_1257_to_cow()5204     fn test_decode_ascii_only_windows_1257_to_cow() {
5205         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5206         match cow {
5207             Cow::Borrowed(s) => {
5208                 assert_eq!(s, "abc");
5209             }
5210             Cow::Owned(_) => unreachable!(),
5211         }
5212         assert_eq!(encoding, WINDOWS_1257);
5213         assert!(!had_errors);
5214     }
5215 
5216     #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow()5217     fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5218         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5219         match cow {
5220             Cow::Borrowed(s) => {
5221                 assert_eq!(s, "\u{20AC}\u{00E4}");
5222             }
5223             Cow::Owned(_) => unreachable!(),
5224         }
5225         assert_eq!(encoding, UTF_8);
5226         assert!(!had_errors);
5227     }
5228 
5229     #[test]
test_decode_bomful_invalid_utf8_as_windows_1257_to_cow()5230     fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5231         let (cow, encoding, had_errors) =
5232             WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5233         match cow {
5234             Cow::Borrowed(_) => unreachable!(),
5235             Cow::Owned(s) => {
5236                 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5237             }
5238         }
5239         assert_eq!(encoding, UTF_8);
5240         assert!(had_errors);
5241     }
5242 
5243     #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow()5244     fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5245         let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5246         match cow {
5247             Cow::Borrowed(s) => {
5248                 assert_eq!(s, "\u{20AC}\u{00E4}");
5249             }
5250             Cow::Owned(_) => unreachable!(),
5251         }
5252         assert_eq!(encoding, UTF_8);
5253         assert!(!had_errors);
5254     }
5255 
5256     #[test]
test_decode_bomful_invalid_utf8_as_utf_8_to_cow()5257     fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5258         let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5259         match cow {
5260             Cow::Borrowed(_) => unreachable!(),
5261             Cow::Owned(s) => {
5262                 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5263             }
5264         }
5265         assert_eq!(encoding, UTF_8);
5266         assert!(had_errors);
5267     }
5268 
5269     #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal()5270     fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5271         let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5272         match cow {
5273             Cow::Borrowed(s) => {
5274                 assert_eq!(s, "\u{20AC}\u{00E4}");
5275             }
5276             Cow::Owned(_) => unreachable!(),
5277         }
5278         assert!(!had_errors);
5279     }
5280 
5281     #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal()5282     fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5283         let (cow, had_errors) =
5284             WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5285         match cow {
5286             Cow::Borrowed(_) => unreachable!(),
5287             Cow::Owned(s) => {
5288                 assert_eq!(
5289                     s,
5290                     "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5291                 );
5292             }
5293         }
5294         assert!(!had_errors);
5295     }
5296 
5297     #[test]
test_decode_valid_windows_1257_to_cow_with_bom_removal()5298     fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
5299         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
5300         match cow {
5301             Cow::Borrowed(_) => unreachable!(),
5302             Cow::Owned(s) => {
5303                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5304             }
5305         }
5306         assert!(!had_errors);
5307     }
5308 
5309     #[test]
test_decode_invalid_windows_1257_to_cow_with_bom_removal()5310     fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
5311         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
5312         match cow {
5313             Cow::Borrowed(_) => unreachable!(),
5314             Cow::Owned(s) => {
5315                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5316             }
5317         }
5318         assert!(had_errors);
5319     }
5320 
5321     #[test]
test_decode_ascii_only_windows_1257_to_cow_with_bom_removal()5322     fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
5323         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
5324         match cow {
5325             Cow::Borrowed(s) => {
5326                 assert_eq!(s, "abc");
5327             }
5328             Cow::Owned(_) => unreachable!(),
5329         }
5330         assert!(!had_errors);
5331     }
5332 
5333     #[test]
test_decode_bomful_valid_utf8_to_cow_without_bom_handling()5334     fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
5335         let (cow, had_errors) =
5336             UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5337         match cow {
5338             Cow::Borrowed(s) => {
5339                 assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5340             }
5341             Cow::Owned(_) => unreachable!(),
5342         }
5343         assert!(!had_errors);
5344     }
5345 
5346     #[test]
test_decode_bomful_invalid_utf8_to_cow_without_bom_handling()5347     fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
5348         let (cow, had_errors) =
5349             UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5350         match cow {
5351             Cow::Borrowed(_) => unreachable!(),
5352             Cow::Owned(s) => {
5353                 assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
5354             }
5355         }
5356         assert!(had_errors);
5357     }
5358 
5359     #[test]
test_decode_valid_windows_1257_to_cow_without_bom_handling()5360     fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
5361         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
5362         match cow {
5363             Cow::Borrowed(_) => unreachable!(),
5364             Cow::Owned(s) => {
5365                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5366             }
5367         }
5368         assert!(!had_errors);
5369     }
5370 
5371     #[test]
test_decode_invalid_windows_1257_to_cow_without_bom_handling()5372     fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
5373         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
5374         match cow {
5375             Cow::Borrowed(_) => unreachable!(),
5376             Cow::Owned(s) => {
5377                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5378             }
5379         }
5380         assert!(had_errors);
5381     }
5382 
5383     #[test]
test_decode_ascii_only_windows_1257_to_cow_without_bom_handling()5384     fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
5385         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
5386         match cow {
5387             Cow::Borrowed(s) => {
5388                 assert_eq!(s, "abc");
5389             }
5390             Cow::Owned(_) => unreachable!(),
5391         }
5392         assert!(!had_errors);
5393     }
5394 
5395     #[test]
test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement()5396     fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5397         match UTF_8.decode_without_bom_handling_and_without_replacement(
5398             b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
5399         ) {
5400             Some(cow) => match cow {
5401                 Cow::Borrowed(s) => {
5402                     assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5403                 }
5404                 Cow::Owned(_) => unreachable!(),
5405             },
5406             None => unreachable!(),
5407         }
5408     }
5409 
5410     #[test]
test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement()5411     fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5412         assert!(UTF_8
5413             .decode_without_bom_handling_and_without_replacement(
5414                 b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4"
5415             )
5416             .is_none());
5417     }
5418 
5419     #[test]
test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement()5420     fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5421         match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
5422             Some(cow) => match cow {
5423                 Cow::Borrowed(_) => unreachable!(),
5424                 Cow::Owned(s) => {
5425                     assert_eq!(s, "abc\u{20AC}\u{00E4}");
5426                 }
5427             },
5428             None => unreachable!(),
5429         }
5430     }
5431 
5432     #[test]
test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement()5433     fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5434         assert!(WINDOWS_1257
5435             .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4")
5436             .is_none());
5437     }
5438 
5439     #[test]
test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement()5440     fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5441         match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
5442             Some(cow) => match cow {
5443                 Cow::Borrowed(s) => {
5444                     assert_eq!(s, "abc");
5445                 }
5446                 Cow::Owned(_) => unreachable!(),
5447             },
5448             None => unreachable!(),
5449         }
5450     }
5451 
5452     #[test]
test_encode_ascii_only_windows_1257_to_cow()5453     fn test_encode_ascii_only_windows_1257_to_cow() {
5454         let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
5455         match cow {
5456             Cow::Borrowed(s) => {
5457                 assert_eq!(s, b"abc");
5458             }
5459             Cow::Owned(_) => unreachable!(),
5460         }
5461         assert_eq!(encoding, WINDOWS_1257);
5462         assert!(!had_errors);
5463     }
5464 
5465     #[test]
test_encode_valid_windows_1257_to_cow()5466     fn test_encode_valid_windows_1257_to_cow() {
5467         let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
5468         match cow {
5469             Cow::Borrowed(_) => unreachable!(),
5470             Cow::Owned(s) => {
5471                 assert_eq!(s, b"abc\x80\xE4");
5472             }
5473         }
5474         assert_eq!(encoding, WINDOWS_1257);
5475         assert!(!had_errors);
5476     }
5477 
5478     #[test]
test_utf16_space_with_one_bom_byte()5479     fn test_utf16_space_with_one_bom_byte() {
5480         let mut decoder = UTF_16LE.new_decoder();
5481         let mut dst = [0u16; 12];
5482         {
5483             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5484             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5485             assert_eq!(result, CoderResult::InputEmpty);
5486         }
5487         {
5488             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5489             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5490             assert_eq!(result, CoderResult::InputEmpty);
5491         }
5492     }
5493 
5494     #[test]
test_utf8_space_with_one_bom_byte()5495     fn test_utf8_space_with_one_bom_byte() {
5496         let mut decoder = UTF_8.new_decoder();
5497         let mut dst = [0u16; 12];
5498         {
5499             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5500             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5501             assert_eq!(result, CoderResult::InputEmpty);
5502         }
5503         {
5504             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5505             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5506             assert_eq!(result, CoderResult::InputEmpty);
5507         }
5508     }
5509 
5510     #[test]
test_utf16_space_with_two_bom_bytes()5511     fn test_utf16_space_with_two_bom_bytes() {
5512         let mut decoder = UTF_16LE.new_decoder();
5513         let mut dst = [0u16; 12];
5514         {
5515             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5516             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5517             assert_eq!(result, CoderResult::InputEmpty);
5518         }
5519         {
5520             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5521             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5522             assert_eq!(result, CoderResult::InputEmpty);
5523         }
5524         {
5525             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5526             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5527             assert_eq!(result, CoderResult::InputEmpty);
5528         }
5529     }
5530 
5531     #[test]
test_utf8_space_with_two_bom_bytes()5532     fn test_utf8_space_with_two_bom_bytes() {
5533         let mut decoder = UTF_8.new_decoder();
5534         let mut dst = [0u16; 12];
5535         {
5536             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5537             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5538             assert_eq!(result, CoderResult::InputEmpty);
5539         }
5540         {
5541             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5542             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5543             assert_eq!(result, CoderResult::InputEmpty);
5544         }
5545         {
5546             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5547             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5548             assert_eq!(result, CoderResult::InputEmpty);
5549         }
5550     }
5551 
5552     #[test]
test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call()5553     fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
5554         let mut decoder = UTF_16LE.new_decoder();
5555         let mut dst = [0u16; 12];
5556         {
5557             let needed = decoder.max_utf16_buffer_length(2).unwrap();
5558             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
5559             assert_eq!(result, CoderResult::InputEmpty);
5560         }
5561     }
5562 
5563     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8()5564     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
5565         let mut dst = [0u8; 8];
5566         let mut encoder = ISO_2022_JP.new_encoder();
5567         {
5568             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], false);
5569             assert_eq!(result, CoderResult::InputEmpty);
5570         }
5571         {
5572             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], true);
5573             assert_eq!(result, CoderResult::InputEmpty);
5574         }
5575     }
5576 
5577     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf8()5578     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
5579         let mut dst = [0u8; 16];
5580         let mut encoder = ISO_2022_JP.new_encoder();
5581         {
5582             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
5583             assert_eq!(result, CoderResult::InputEmpty);
5584         }
5585         {
5586             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
5587             assert_eq!(result, CoderResult::InputEmpty);
5588         }
5589         {
5590             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
5591             assert_eq!(result, CoderResult::OutputFull);
5592         }
5593     }
5594 
5595     #[test]
test_buffer_end_iso_2022_jp_from_utf8()5596     fn test_buffer_end_iso_2022_jp_from_utf8() {
5597         let mut dst = [0u8; 18];
5598         {
5599             let mut encoder = ISO_2022_JP.new_encoder();
5600             let (result, _, _, _) =
5601                 encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], false);
5602             assert_eq!(result, CoderResult::InputEmpty);
5603         }
5604         {
5605             let mut encoder = ISO_2022_JP.new_encoder();
5606             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], true);
5607             assert_eq!(result, CoderResult::OutputFull);
5608         }
5609         {
5610             let mut encoder = ISO_2022_JP.new_encoder();
5611             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], false);
5612             assert_eq!(result, CoderResult::InputEmpty);
5613         }
5614         {
5615             let mut encoder = ISO_2022_JP.new_encoder();
5616             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], true);
5617             assert_eq!(result, CoderResult::InputEmpty);
5618         }
5619     }
5620 
5621     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16()5622     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
5623         let mut dst = [0u8; 8];
5624         let mut encoder = ISO_2022_JP.new_encoder();
5625         {
5626             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], false);
5627             assert_eq!(result, CoderResult::InputEmpty);
5628         }
5629         {
5630             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], true);
5631             assert_eq!(result, CoderResult::InputEmpty);
5632         }
5633     }
5634 
5635     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf16()5636     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
5637         let mut dst = [0u8; 16];
5638         let mut encoder = ISO_2022_JP.new_encoder();
5639         {
5640             let (result, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
5641             assert_eq!(result, CoderResult::InputEmpty);
5642         }
5643         {
5644             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], false);
5645             assert_eq!(result, CoderResult::InputEmpty);
5646         }
5647         {
5648             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], true);
5649             assert_eq!(result, CoderResult::OutputFull);
5650         }
5651     }
5652 
5653     #[test]
test_buffer_end_iso_2022_jp_from_utf16()5654     fn test_buffer_end_iso_2022_jp_from_utf16() {
5655         let mut dst = [0u8; 18];
5656         {
5657             let mut encoder = ISO_2022_JP.new_encoder();
5658             let (result, _, _, _) =
5659                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], false);
5660             assert_eq!(result, CoderResult::InputEmpty);
5661         }
5662         {
5663             let mut encoder = ISO_2022_JP.new_encoder();
5664             let (result, _, _, _) =
5665                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], true);
5666             assert_eq!(result, CoderResult::OutputFull);
5667         }
5668         {
5669             let mut encoder = ISO_2022_JP.new_encoder();
5670             let (result, _, _, _) =
5671                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], false);
5672             assert_eq!(result, CoderResult::InputEmpty);
5673         }
5674         {
5675             let mut encoder = ISO_2022_JP.new_encoder();
5676             let (result, _, _, _) =
5677                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], true);
5678             assert_eq!(result, CoderResult::InputEmpty);
5679         }
5680     }
5681 
5682     #[test]
test_buffer_end_utf16be()5683     fn test_buffer_end_utf16be() {
5684         let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
5685         let mut dest = [0u8; 4];
5686 
5687         assert_eq!(
5688             decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, false),
5689             (CoderResult::InputEmpty, 2, 0, false)
5690         );
5691 
5692         let _ = decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, true);
5693     }
5694 
5695     #[test]
test_hash()5696     fn test_hash() {
5697         let mut encodings = ::alloc::collections::btree_set::BTreeSet::new();
5698         encodings.insert(UTF_8);
5699         encodings.insert(ISO_2022_JP);
5700         assert!(encodings.contains(UTF_8));
5701         assert!(encodings.contains(ISO_2022_JP));
5702         assert!(!encodings.contains(WINDOWS_1252));
5703         encodings.remove(ISO_2022_JP);
5704         assert!(!encodings.contains(ISO_2022_JP));
5705     }
5706 
5707     #[test]
test_iso_2022_jp_ncr_extra_from_utf16()5708     fn test_iso_2022_jp_ncr_extra_from_utf16() {
5709         let mut dst = [0u8; 17];
5710         {
5711             let mut encoder = ISO_2022_JP.new_encoder();
5712             let (result, _, _, _) =
5713                 encoder.encode_from_utf16(&[0x3041u16, 0xFFFFu16], &mut dst[..], true);
5714             assert_eq!(result, CoderResult::OutputFull);
5715         }
5716     }
5717 
5718     #[test]
test_iso_2022_jp_ncr_extra_from_utf8()5719     fn test_iso_2022_jp_ncr_extra_from_utf8() {
5720         let mut dst = [0u8; 17];
5721         {
5722             let mut encoder = ISO_2022_JP.new_encoder();
5723             let (result, _, _, _) =
5724                 encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
5725             assert_eq!(result, CoderResult::OutputFull);
5726         }
5727     }
5728 
5729     #[test]
test_max_length_with_bom_to_utf8()5730     fn test_max_length_with_bom_to_utf8() {
5731         let mut output = [0u8; 20];
5732         let mut decoder = REPLACEMENT.new_decoder();
5733         let input = b"\xEF\xBB\xBFA";
5734         {
5735             let needed = decoder
5736                 .max_utf8_buffer_length_without_replacement(input.len())
5737                 .unwrap();
5738             let (result, read, written) =
5739                 decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);
5740             assert_eq!(result, DecoderResult::InputEmpty);
5741             assert_eq!(read, input.len());
5742             assert_eq!(written, 1);
5743             assert_eq!(output[0], 0x41);
5744         }
5745     }
5746 
5747     #[cfg(feature = "serde")]
5748     #[test]
test_serde()5749     fn test_serde() {
5750         let demo = Demo {
5751             num: 42,
5752             name: "foo".into(),
5753             enc: UTF_8,
5754         };
5755 
5756         let serialized = serde_json::to_string(&demo).unwrap();
5757 
5758         let deserialized: Demo = serde_json::from_str(&serialized).unwrap();
5759         assert_eq!(deserialized, demo);
5760 
5761         let bincoded = bincode::serialize(&demo).unwrap();
5762         let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap();
5763         assert_eq!(debincoded, demo);
5764     }
5765 
5766     #[test]
test_is_single_byte()5767     fn test_is_single_byte() {
5768         assert!(!BIG5.is_single_byte());
5769         assert!(!EUC_JP.is_single_byte());
5770         assert!(!EUC_KR.is_single_byte());
5771         assert!(!GB18030.is_single_byte());
5772         assert!(!GBK.is_single_byte());
5773         assert!(!REPLACEMENT.is_single_byte());
5774         assert!(!SHIFT_JIS.is_single_byte());
5775         assert!(!UTF_8.is_single_byte());
5776         assert!(!UTF_16BE.is_single_byte());
5777         assert!(!UTF_16LE.is_single_byte());
5778         assert!(!ISO_2022_JP.is_single_byte());
5779 
5780         assert!(IBM866.is_single_byte());
5781         assert!(ISO_8859_2.is_single_byte());
5782         assert!(ISO_8859_3.is_single_byte());
5783         assert!(ISO_8859_4.is_single_byte());
5784         assert!(ISO_8859_5.is_single_byte());
5785         assert!(ISO_8859_6.is_single_byte());
5786         assert!(ISO_8859_7.is_single_byte());
5787         assert!(ISO_8859_8.is_single_byte());
5788         assert!(ISO_8859_10.is_single_byte());
5789         assert!(ISO_8859_13.is_single_byte());
5790         assert!(ISO_8859_14.is_single_byte());
5791         assert!(ISO_8859_15.is_single_byte());
5792         assert!(ISO_8859_16.is_single_byte());
5793         assert!(ISO_8859_8_I.is_single_byte());
5794         assert!(KOI8_R.is_single_byte());
5795         assert!(KOI8_U.is_single_byte());
5796         assert!(MACINTOSH.is_single_byte());
5797         assert!(WINDOWS_874.is_single_byte());
5798         assert!(WINDOWS_1250.is_single_byte());
5799         assert!(WINDOWS_1251.is_single_byte());
5800         assert!(WINDOWS_1252.is_single_byte());
5801         assert!(WINDOWS_1253.is_single_byte());
5802         assert!(WINDOWS_1254.is_single_byte());
5803         assert!(WINDOWS_1255.is_single_byte());
5804         assert!(WINDOWS_1256.is_single_byte());
5805         assert!(WINDOWS_1257.is_single_byte());
5806         assert!(WINDOWS_1258.is_single_byte());
5807         assert!(X_MAC_CYRILLIC.is_single_byte());
5808         assert!(X_USER_DEFINED.is_single_byte());
5809     }
5810 
5811     #[test]
test_latin1_byte_compatible_up_to()5812     fn test_latin1_byte_compatible_up_to() {
5813         let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";
5814         assert_eq!(
5815             BIG5.new_decoder_without_bom_handling()
5816                 .latin1_byte_compatible_up_to(buffer)
5817                 .unwrap(),
5818             1
5819         );
5820         assert_eq!(
5821             EUC_JP
5822                 .new_decoder_without_bom_handling()
5823                 .latin1_byte_compatible_up_to(buffer)
5824                 .unwrap(),
5825             1
5826         );
5827         assert_eq!(
5828             EUC_KR
5829                 .new_decoder_without_bom_handling()
5830                 .latin1_byte_compatible_up_to(buffer)
5831                 .unwrap(),
5832             1
5833         );
5834         assert_eq!(
5835             GB18030
5836                 .new_decoder_without_bom_handling()
5837                 .latin1_byte_compatible_up_to(buffer)
5838                 .unwrap(),
5839             1
5840         );
5841         assert_eq!(
5842             GBK.new_decoder_without_bom_handling()
5843                 .latin1_byte_compatible_up_to(buffer)
5844                 .unwrap(),
5845             1
5846         );
5847         assert!(REPLACEMENT
5848             .new_decoder_without_bom_handling()
5849             .latin1_byte_compatible_up_to(buffer)
5850             .is_none());
5851         assert_eq!(
5852             SHIFT_JIS
5853                 .new_decoder_without_bom_handling()
5854                 .latin1_byte_compatible_up_to(buffer)
5855                 .unwrap(),
5856             1
5857         );
5858         assert_eq!(
5859             UTF_8
5860                 .new_decoder_without_bom_handling()
5861                 .latin1_byte_compatible_up_to(buffer)
5862                 .unwrap(),
5863             1
5864         );
5865         assert!(UTF_16BE
5866             .new_decoder_without_bom_handling()
5867             .latin1_byte_compatible_up_to(buffer)
5868             .is_none());
5869         assert!(UTF_16LE
5870             .new_decoder_without_bom_handling()
5871             .latin1_byte_compatible_up_to(buffer)
5872             .is_none());
5873         assert_eq!(
5874             ISO_2022_JP
5875                 .new_decoder_without_bom_handling()
5876                 .latin1_byte_compatible_up_to(buffer)
5877                 .unwrap(),
5878             1
5879         );
5880 
5881         assert_eq!(
5882             IBM866
5883                 .new_decoder_without_bom_handling()
5884                 .latin1_byte_compatible_up_to(buffer)
5885                 .unwrap(),
5886             1
5887         );
5888         assert_eq!(
5889             ISO_8859_2
5890                 .new_decoder_without_bom_handling()
5891                 .latin1_byte_compatible_up_to(buffer)
5892                 .unwrap(),
5893             2
5894         );
5895         assert_eq!(
5896             ISO_8859_3
5897                 .new_decoder_without_bom_handling()
5898                 .latin1_byte_compatible_up_to(buffer)
5899                 .unwrap(),
5900             2
5901         );
5902         assert_eq!(
5903             ISO_8859_4
5904                 .new_decoder_without_bom_handling()
5905                 .latin1_byte_compatible_up_to(buffer)
5906                 .unwrap(),
5907             2
5908         );
5909         assert_eq!(
5910             ISO_8859_5
5911                 .new_decoder_without_bom_handling()
5912                 .latin1_byte_compatible_up_to(buffer)
5913                 .unwrap(),
5914             2
5915         );
5916         assert_eq!(
5917             ISO_8859_6
5918                 .new_decoder_without_bom_handling()
5919                 .latin1_byte_compatible_up_to(buffer)
5920                 .unwrap(),
5921             2
5922         );
5923         assert_eq!(
5924             ISO_8859_7
5925                 .new_decoder_without_bom_handling()
5926                 .latin1_byte_compatible_up_to(buffer)
5927                 .unwrap(),
5928             2
5929         );
5930         assert_eq!(
5931             ISO_8859_8
5932                 .new_decoder_without_bom_handling()
5933                 .latin1_byte_compatible_up_to(buffer)
5934                 .unwrap(),
5935             3
5936         );
5937         assert_eq!(
5938             ISO_8859_10
5939                 .new_decoder_without_bom_handling()
5940                 .latin1_byte_compatible_up_to(buffer)
5941                 .unwrap(),
5942             2
5943         );
5944         assert_eq!(
5945             ISO_8859_13
5946                 .new_decoder_without_bom_handling()
5947                 .latin1_byte_compatible_up_to(buffer)
5948                 .unwrap(),
5949             4
5950         );
5951         assert_eq!(
5952             ISO_8859_14
5953                 .new_decoder_without_bom_handling()
5954                 .latin1_byte_compatible_up_to(buffer)
5955                 .unwrap(),
5956             4
5957         );
5958         assert_eq!(
5959             ISO_8859_15
5960                 .new_decoder_without_bom_handling()
5961                 .latin1_byte_compatible_up_to(buffer)
5962                 .unwrap(),
5963             6
5964         );
5965         assert_eq!(
5966             ISO_8859_16
5967                 .new_decoder_without_bom_handling()
5968                 .latin1_byte_compatible_up_to(buffer)
5969                 .unwrap(),
5970             4
5971         );
5972         assert_eq!(
5973             ISO_8859_8_I
5974                 .new_decoder_without_bom_handling()
5975                 .latin1_byte_compatible_up_to(buffer)
5976                 .unwrap(),
5977             3
5978         );
5979         assert_eq!(
5980             KOI8_R
5981                 .new_decoder_without_bom_handling()
5982                 .latin1_byte_compatible_up_to(buffer)
5983                 .unwrap(),
5984             1
5985         );
5986         assert_eq!(
5987             KOI8_U
5988                 .new_decoder_without_bom_handling()
5989                 .latin1_byte_compatible_up_to(buffer)
5990                 .unwrap(),
5991             1
5992         );
5993         assert_eq!(
5994             MACINTOSH
5995                 .new_decoder_without_bom_handling()
5996                 .latin1_byte_compatible_up_to(buffer)
5997                 .unwrap(),
5998             1
5999         );
6000         assert_eq!(
6001             WINDOWS_874
6002                 .new_decoder_without_bom_handling()
6003                 .latin1_byte_compatible_up_to(buffer)
6004                 .unwrap(),
6005             2
6006         );
6007         assert_eq!(
6008             WINDOWS_1250
6009                 .new_decoder_without_bom_handling()
6010                 .latin1_byte_compatible_up_to(buffer)
6011                 .unwrap(),
6012             4
6013         );
6014         assert_eq!(
6015             WINDOWS_1251
6016                 .new_decoder_without_bom_handling()
6017                 .latin1_byte_compatible_up_to(buffer)
6018                 .unwrap(),
6019             1
6020         );
6021         assert_eq!(
6022             WINDOWS_1252
6023                 .new_decoder_without_bom_handling()
6024                 .latin1_byte_compatible_up_to(buffer)
6025                 .unwrap(),
6026             5
6027         );
6028         assert_eq!(
6029             WINDOWS_1253
6030                 .new_decoder_without_bom_handling()
6031                 .latin1_byte_compatible_up_to(buffer)
6032                 .unwrap(),
6033             3
6034         );
6035         assert_eq!(
6036             WINDOWS_1254
6037                 .new_decoder_without_bom_handling()
6038                 .latin1_byte_compatible_up_to(buffer)
6039                 .unwrap(),
6040             4
6041         );
6042         assert_eq!(
6043             WINDOWS_1255
6044                 .new_decoder_without_bom_handling()
6045                 .latin1_byte_compatible_up_to(buffer)
6046                 .unwrap(),
6047             3
6048         );
6049         assert_eq!(
6050             WINDOWS_1256
6051                 .new_decoder_without_bom_handling()
6052                 .latin1_byte_compatible_up_to(buffer)
6053                 .unwrap(),
6054             1
6055         );
6056         assert_eq!(
6057             WINDOWS_1257
6058                 .new_decoder_without_bom_handling()
6059                 .latin1_byte_compatible_up_to(buffer)
6060                 .unwrap(),
6061             4
6062         );
6063         assert_eq!(
6064             WINDOWS_1258
6065                 .new_decoder_without_bom_handling()
6066                 .latin1_byte_compatible_up_to(buffer)
6067                 .unwrap(),
6068             4
6069         );
6070         assert_eq!(
6071             X_MAC_CYRILLIC
6072                 .new_decoder_without_bom_handling()
6073                 .latin1_byte_compatible_up_to(buffer)
6074                 .unwrap(),
6075             1
6076         );
6077         assert_eq!(
6078             X_USER_DEFINED
6079                 .new_decoder_without_bom_handling()
6080                 .latin1_byte_compatible_up_to(buffer)
6081                 .unwrap(),
6082             1
6083         );
6084 
6085         assert!(UTF_8
6086             .new_decoder()
6087             .latin1_byte_compatible_up_to(buffer)
6088             .is_none());
6089 
6090         let mut decoder = UTF_8.new_decoder();
6091         let mut output = [0u16; 4];
6092         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6093         assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());
6094         let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
6095         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));
6096         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6097         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
6098     }
6099 }
6100