1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 #![cfg_attr(
11     feature = "cargo-clippy",
12     allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14 
15 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17 //! Gecko-oriented means that converting to and from UTF-16 is supported in
18 //! addition to converting to and from UTF-8, that the performance and
19 //! streamability goals are browser-oriented, and that FFI-friendliness is a
20 //! goal.
21 //!
22 //! Additionally, the `mem` module provides functions that are useful for
23 //! applications that need to be able to deal with legacy in-memory
24 //! representations of Unicode.
25 //!
26 //! For expectation setting, please be sure to read the sections
27 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29 //!
30 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31 //! design and internals of the crate.
32 //!
33 //! # Availability
34 //!
35 //! The code is available under the
36 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38 //! See the
39 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40 //! file for details.
41 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43 //!
44 //! # Integration with `std::io`
45 //!
46 //! This crate doesn't implement traits from `std::io`. However, the case of
47 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50 //!
51 //! # Examples
52 //!
53 //! Example programs:
54 //!
55 //! * [Rust](https://github.com/hsivonen/recode_rs)
56 //! * [C](https://github.com/hsivonen/recode_c)
57 //! * [C++](https://github.com/hsivonen/recode_cpp)
58 //!
59 //! Decode using the non-streaming API:
60 //!
61 //! ```
62 //! use encoding_rs::*;
63 //!
64 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
65 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
66 //!
67 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
68 //! assert_eq!(&cow[..], expectation);
69 //! assert_eq!(encoding_used, SHIFT_JIS);
70 //! assert!(!had_errors);
71 //! ```
72 //!
73 //! Decode using the streaming API with minimal `unsafe`:
74 //!
75 //! ```
76 //! use encoding_rs::*;
77 //!
78 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
79 //!
80 //! // Use an array of byte slices to demonstrate content arriving piece by
81 //! // piece from the network.
82 //! let bytes: [&'static [u8]; 4] = [b"\x83",
83 //!                                  b"n\x83\x8D\x81",
84 //!                                  b"[\x81E\x83\x8F\x81[\x83",
85 //!                                  b"\x8B\x83h"];
86 //!
87 //! // Very short output buffer to demonstrate the output buffer getting full.
88 //! // Normally, you'd use something like `[0u8; 2048]`.
89 //! let mut buffer_bytes = [0u8; 8];
90 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
91 //!
92 //! // How many bytes in the buffer currently hold significant data.
93 //! let mut bytes_in_buffer = 0usize;
94 //!
95 //! // Collect the output to a string for demonstration purposes.
96 //! let mut output = String::new();
97 //!
98 //! // The `Decoder`
99 //! let mut decoder = SHIFT_JIS.new_decoder();
100 //!
101 //! // Track whether we see errors.
102 //! let mut total_had_errors = false;
103 //!
104 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
105 //! // use of a fixed-size buffer; normally when the output of an incremental
106 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
107 //! // avoid the intermediate buffer).
108 //! for input in &bytes[..] {
109 //!     // The number of bytes already read from current `input` in total.
110 //!     let mut total_read_from_current_input = 0usize;
111 //!
112 //!     loop {
113 //!         let (result, read, written, had_errors) =
114 //!             decoder.decode_to_str(&input[total_read_from_current_input..],
115 //!                                   &mut buffer[bytes_in_buffer..],
116 //!                                   false);
117 //!         total_read_from_current_input += read;
118 //!         bytes_in_buffer += written;
119 //!         total_had_errors |= had_errors;
120 //!         match result {
121 //!             CoderResult::InputEmpty => {
122 //!                 // We have consumed the current input buffer. Break out of
123 //!                 // the inner loop to get the next input buffer from the
124 //!                 // outer loop.
125 //!                 break;
126 //!             },
127 //!             CoderResult::OutputFull => {
128 //!                 // Write the current buffer out and consider the buffer
129 //!                 // empty.
130 //!                 output.push_str(&buffer[..bytes_in_buffer]);
131 //!                 bytes_in_buffer = 0usize;
132 //!                 continue;
133 //!             }
134 //!         }
135 //!     }
136 //! }
137 //!
138 //! // Process EOF
139 //! loop {
140 //!     let (result, _, written, had_errors) =
141 //!         decoder.decode_to_str(b"",
142 //!                               &mut buffer[bytes_in_buffer..],
143 //!                               true);
144 //!     bytes_in_buffer += written;
145 //!     total_had_errors |= had_errors;
146 //!     // Write the current buffer out and consider the buffer empty.
147 //!     // Need to do this here for both `match` arms, because we exit the
148 //!     // loop on `CoderResult::InputEmpty`.
149 //!     output.push_str(&buffer[..bytes_in_buffer]);
150 //!     bytes_in_buffer = 0usize;
151 //!     match result {
152 //!         CoderResult::InputEmpty => {
153 //!             // Done!
154 //!             break;
155 //!         },
156 //!         CoderResult::OutputFull => {
157 //!             continue;
158 //!         }
159 //!     }
160 //! }
161 //!
162 //! assert_eq!(&output[..], expectation);
163 //! assert!(!total_had_errors);
164 //! ```
165 //!
166 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
167 //!
168 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
169 //! __so this crate does not provide encoders for those encodings__!
170 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
171 //! so you get an UTF-8 encoder if you request an encoder for them.
172 //!
173 //! Additionally, the Encoding Standard factors BOM handling into wrapper
174 //! algorithms so that BOM handling isn't part of the definition of the
175 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
176 //! Standard define BOM handling or lack thereof as part of the encoding
177 //! scheme.
178 //!
179 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
180 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
181 //! the Unicode Standard.
182 //!
183 //! When used with the `_with_bom_removal` entry points, the UTF-8
184 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
185 //! Standard.
186 //!
187 //! This crate does not provide a mode that matches the UTF-16 _encoding
188 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
189 //! the entry points without `_bom_` qualifiers is the closest match,
190 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
191 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
192 //! Unicode Standard.
193 //!
194 //! The UTF-32 family of Unicode encoding schemes is not supported
195 //! by this crate. The Encoding Standard doesn't define any UTF-32
196 //! family encodings, since they aren't necessary for consuming Web
197 //! content.
198 //!
199 //! ## ISO-8859-1
200 //!
201 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
202 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
203 //! byte value to the same Unicode scalar value is not available via
204 //! `Encoding` in this crate.
205 //!
206 //! However, the functions whose name starts with `convert` and contains
207 //! `latin1` in the `mem` module support such conversions, which are known as
208 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
209 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
210 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
211 //!
212 //! ## Web / Browser Focus
213 //!
214 //! Both in terms of scope and performance, the focus is on the Web. For scope,
215 //! this means that encoding_rs implements the Encoding Standard fully and
216 //! doesn't implement encodings that are not specified in the Encoding
217 //! Standard. For performance, this means that decoding performance is
218 //! important as well as performance for encoding into UTF-8 or encoding the
219 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
220 //! be encoded into legacy encodings in only two places in the Web platform: in
221 //! the query part of URLs, in which case it's a matter of relatively rare
222 //! error handling, and in form submission, in which case the user action and
223 //! networking tend to hide the performance of the encoder.
224 //!
225 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
226 //! encodings enables smaller code size thanks to the encoder side using the
227 //! decode-optimized data tables without having encode-optimized data tables at
228 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
229 //! multiplication operations.
230 //!
231 //! Additionally, performance is a non-goal for the ASCII-incompatible
232 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
233 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
234 //! of implementation.
235 //!
236 //! Despite the browser focus, the hope is that non-browser applications
237 //! that wish to consume Web content or submit Web forms in a Web-compatible
238 //! way will find encoding_rs useful. While encoding_rs does not try to match
239 //! Windows behavior, many of the encodings are close enough to legacy
240 //! encodings implemented by Windows that applications that need to consume
241 //! data in legacy Windows encodings may find encoding_rs useful. The
242 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
243 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
244 //!
245 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
246 //! to the encodings defined in the Encoding Standard. The
247 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
248 //! UTF-7 decoding for email purposes.
249 //!
250 //! For single-byte DOS encodings beyond the ones supported by the Encoding
251 //! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
252 //!
253 //! # Preparing Text for the Encoders
254 //!
255 //! Normalizing text into Unicode Normalization Form C prior to encoding text
256 //! into a legacy encoding minimizes unmappable characters. Text can be
257 //! normalized to Unicode Normalization Form C using the
258 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
259 //!
260 //! The exception is windows-1258, which after normalizing to Unicode
261 //! Normalization Form C requires tone marks to be decomposed in order to
262 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
263 //! using the [`detone`](https://crates.io/crates/detone) crate.
264 //!
265 //! # Streaming & Non-Streaming; Rust & C/C++
266 //!
267 //! The API in Rust has two modes of operation: streaming and non-streaming.
268 //! The streaming API is the foundation of the implementation and should be
269 //! used when processing data that arrives piecemeal from an i/o stream. The
270 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
271 //! to C callers. The non-streaming part of the API is for Rust callers only and
272 //! is smart about borrowing instead of copying when possible. When
273 //! streamability is not needed, the non-streaming API should be preferred in
274 //! order to avoid copying data when a borrow suffices.
275 //!
276 //! There is no analogous C API exposed via FFI, mainly because C doesn't have
277 //! standard types for growable byte buffers and Unicode strings that know
278 //! their length.
279 //!
280 //! The C API (header file generated at `target/include/encoding_rs.h` when
281 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
282 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
283 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
284 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
285 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
286 //! as part of Mozilla [bug 1261841][4].
287 //!
288 //! The `Encoding` type is common to both the streaming and non-streaming
289 //! modes. In the streaming mode, decoding operations are performed with a
290 //! `Decoder` and encoding operations with an `Encoder` object obtained via
291 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
292 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
293 //! and `Encoder` objects are not used at all.
294 //!
295 //! [1]: https://github.com/hsivonen/encoding_c
296 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
297 //! [3]: https://github.com/Microsoft/GSL/
298 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
299 //!
300 //! # Memory management
301 //!
302 //! The streaming mode never performs heap allocations (even the methods
303 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
304 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
305 //! is, the streaming mode uses caller-allocated buffers exclusively.
306 //!
307 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
308 //! perform heap allocations but only to allocate the backing buffer of the
309 //! `Vec<u8>` or the `String`.
310 //!
311 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
312 //! `Drop` cleanup.
313 //!
314 //! # Buffer reading and writing behavior
315 //!
316 //! Based on experience gained with the `java.nio.charset` encoding converter
317 //! API and with the Gecko uconv encoding converter API, the buffer reading
318 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
319 //! fully drained but output buffers are not always fully filled.
320 //!
321 //! When reading from an input buffer, encoding_rs always consumes all input
322 //! up to the next error or to the end of the buffer. In particular, when
323 //! decoding, even if the input buffer ends in the middle of a byte sequence
324 //! for a character, the decoder consumes all input. This has the benefit that
325 //! the caller of the API can always fill the next buffer from the start from
326 //! whatever source the bytes come from and never has to first copy the last
327 //! bytes of the previous buffer to the start of the next buffer. However, when
328 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
329 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
330 //! boundaries falling in the middle of a surrogate pair result in both
331 //! surrogates being treated individually as unpaired surrogates.
332 //!
333 //! Additionally, decoders guarantee that they can be fed even one byte at a
334 //! time and encoders guarantee that they can be fed even one code point at a
335 //! time. This has the benefit of not placing restrictions on the size of
336 //! the chunks in which the content arrives, e.g. from the network.
337 //!
338 //! When writing into an output buffer, encoding_rs makes sure that the code
339 //! unit sequence for a character is never split across output buffer
340 //! boundaries. This may result in wasted space at the end of an output buffer,
341 //! but the advantages are that the output side of both decoders and encoders
342 //! is greatly simplified compared to designs that attempt to fill output
343 //! buffers exactly even when that entails splitting a code unit sequence and
344 //! when encoding_rs methods return to the caller, the output produced thus
345 //! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
346 //! the output needs to be considered as a whole, because the latest output
347 //! buffer taken alone might not be valid if the transition away
348 //! from the ASCII state occurred in an earlier output buffer. However, since
349 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
350 //! state as being in error despite the encoder generating a transition to the
351 //! ASCII state at the end, the claim about the partial output taken as a whole
352 //! being valid is true even for ISO-2022-JP.)
353 //!
354 //! # Error Reporting
355 //!
356 //! Based on experience gained with the `java.nio.charset` encoding converter
357 //! API and with the Gecko uconv encoding converter API, the error reporting
358 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
359 //! that leave it up to the caller to extract the erroneous bytes from the
360 //! input stream if the caller wishes to do so but encoder errors provide the
361 //! code point associated with the error without requiring the caller to
362 //! extract it from the input on its own.
363 //!
364 //! On the encoder side, an error is always triggered by the most recently
365 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
366 //! caller. Also, it's very typical for the caller to wish to do something with
367 //! this data: generate a numeric escape for the character. Additionally, the
368 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
369 //! certain cases, so requiring the caller to extract the character from the
370 //! input buffer would require the caller to handle ISO-2022-JP details.
371 //! Furthermore, requiring the caller to extract the character from the input
372 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
373 //! the job of an encoding conversion library.
374 //!
375 //! On the decoder side, errors are triggered in more complex ways. For
376 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
377 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
378 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
379 //! be the ones most recently pushed to the decoder and the error might not even
380 //! be in the current buffer.
381 //!
382 //! Some encoding conversion APIs address the problem by not acknowledging
383 //! trailing bytes of an input buffer as consumed if it's still possible for
384 //! future bytes to cause the trailing bytes to be in error. This way, error
385 //! reporting can always refer to the most recently pushed buffer. This has the
386 //! problem that the caller of the API has to copy the unconsumed trailing
387 //! bytes to the start of the next buffer before being able to fill the rest
388 //! of the next buffer. This is annoying, error-prone and inefficient.
389 //!
390 //! A possible solution would be making the decoder remember recently consumed
391 //! bytes in order to be able to include a copy of the erroneous bytes when
392 //! reporting an error. This has two problems: First, callers are rarely
393 //! interested in the erroneous bytes, so attempts to identify them are most
394 //! often just overhead anyway. Second, the rare applications that are
395 //! interested typically care about the location of the error in the input
396 //! stream.
397 //!
398 //! To keep the API convenient for common uses and the overhead low while making
399 //! it possible to develop applications, such as HTML validators, that care
400 //! about which bytes were in error, encoding_rs reports the length of the
401 //! erroneous sequence and the number of bytes consumed after the erroneous
402 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
403 //! this makes it possible for callers that care about the erroneous bytes to
404 //! locate them.
405 //!
406 //! # No Convenience API for Custom Replacements
407 //!
408 //! The Web Platform and, therefore, the Encoding Standard supports only one
409 //! error recovery mode for decoders and only one error recovery mode for
410 //! encoders. The supported error recovery mode for decoders is emitting the
411 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
412 //! encoders is emitting an HTML decimal numeric character reference for
413 //! unmappable characters.
414 //!
415 //! Since encoding_rs is Web-focused, these are the only error recovery modes
416 //! for which convenient support is provided. Moreover, on the decoder side,
417 //! there aren't really good alternatives to emitting the REPLACEMENT CHARACTER
418 //! on error (other than treating errors as fatal). In particular, simply
419 //! ignoring errors is a
420 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
421 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
422 //! callers to ignore errors.
423 //!
424 //! On the encoder side, there are plausible alternatives to HTML decimal
425 //! numeric character references. For example, when outputting CSS, CSS-style
426 //! escapes would seem to make sense. However, instead of facilitating the
427 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
428 //! position that you shouldn't generate output in encodings other than UTF-8,
429 //! except where backward compatibility with interacting with the legacy Web
430 //! requires it. The legacy Web requires it only when parsing the query strings
431 //! of URLs and when submitting forms, and those two both use HTML decimal
432 //! numeric character references.
433 //!
434 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
435 //! numeric character references easy, it does make them _possible_.
436 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
437 //! for unmappable characters, is implemented on top of
438 //! `encode_from_utf8_without_replacement()`. Applications that really, really
439 //! want other replacement schemes for unmappable characters can likewise
440 //! implement them on top of `encode_from_utf8_without_replacement()`.
441 //!
442 //! # No Extensibility by Design
443 //!
444 //! The set of encodings supported by encoding_rs is not extensible by design.
445 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
446 //! rather than `trait`s. encoding_rs takes the design position that all future
447 //! text interchange should be done using UTF-8, which can represent all of
448 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
449 //! Standard and encoding_rs that can represent all of Unicode and that has
450 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
451 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
452 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
453 //! other than being able to consume legacy content.
454 //!
455 //! Considering that UTF-8 can represent all of Unicode and is already supported
456 //! by all Web browsers, introducing a new encoding wouldn't add to the
457 //! expressiveness but would add to compatibility problems. In that sense,
458 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
459 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
460 //! the Web Platform. On the other hand, the set of legacy encodings that must
461 //! be supported for a Web browser to be able to be successful is not going to
462 //! expand. Empirically, the set of encodings specified in the Encoding Standard
463 //! is already sufficient and the set of legacy encodings won't grow
464 //! retroactively.
465 //!
466 //! Since extensibility doesn't make sense considering the Web focus of
467 //! encoding_rs and adding encodings to Web clients would be actively harmful,
468 //! it makes sense to make the set of encodings that encoding_rs supports
469 //! non-extensible and to take the (admittedly small) benefits arising from
470 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
471 //! of time, which enables stack allocation thereof.
472 //!
473 //! This does have downsides for applications that might want to put encoding_rs
474 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
475 //! needed for Web uses. The needs of such applications should not complicate
476 //! encoding_rs itself, though. It is up to those applications to provide a
477 //! framework that delegates the operations with encodings that encoding_rs
478 //! supports to encoding_rs and operations with other encodings to something
479 //! else (as opposed to encoding_rs itself providing an extensibility
480 //! framework).
481 //!
482 //! # Panics
483 //!
484 //! Methods in encoding_rs can panic if the API is used against the requirements
485 //! stated in the documentation, if a state that's supposed to be impossible
486 //! is reached due to an internal bug or on integer overflow. When used
487 //! according to documentation with buffer sizes that stay below integer
488 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
489 //!
490 //! Panics arising from API misuse aren't documented beyond this on individual
491 //! methods.
492 //!
493 //! # At-Risk Parts of the API
494 //!
495 //! The foreseeable source of partially backward-incompatible API change is the
496 //! way the instances of `Encoding` are made available.
497 //!
498 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
499 //! initialized with `static`s of type `&'static Encoding`, the non-reference
500 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
501 //!
502 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
503 //! unique when the constant is used in different crates, the reference-typed
504 //! `static`s for the encoding instances will be changed from `static` to
505 //! `const` and the non-reference-typed `_INIT` instances will be removed.
506 //!
507 //! # Mapping Spec Concepts onto the API
508 //!
509 //! <table>
510 //! <thead>
511 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
512 //! </thead>
513 //! <tbody>
514 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
515 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
522 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
523 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
524 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
525 //! </tbody>
526 //! </table>
527 //!
528 //! # Compatibility with the rust-encoding API
529 //!
530 //! The crate
531 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
532 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
533 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
534 //!
535 //! # Mapping rust-encoding concepts to encoding_rs concepts
536 //!
537 //! The following table provides a mapping from rust-encoding constructs to
538 //! encoding_rs ones.
539 //!
540 //! <table>
541 //! <thead>
542 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
543 //! </thead>
544 //! <tbody>
545 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
546 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
547 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
548 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
549 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
550 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
551 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
552 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
553 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
554 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
555 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
556 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
557 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
558 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
559 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
560 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
561 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
562 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
566 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
567 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
568 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
570 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
571 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
572 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
573 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
574 //! </tbody>
575 //! </table>
576 //!
577 //! # Relationship with Windows Code Pages
578 //!
579 //! Despite the Web and browser focus, the encodings defined by the Encoding
580 //! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the encodings
//! that have a closely related Windows code page, the number of the closest
584 //! code page, a column indicating whether Windows maps unassigned code points
585 //! to the Unicode Private Use Area instead of U+FFFD and a remark number
586 //! indicating remarks in the list after the table.
587 //!
588 //! <table>
589 //! <thead>
590 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
591 //! </thead>
592 //! <tbody>
593 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
594 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
595 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
596 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
597 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
598 //! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
599 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
600 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
601 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
602 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
603 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
604 //! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
605 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
606 //! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
607 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
608 //! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
609 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
610 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
611 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
612 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
613 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
614 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
615 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
616 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
617 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
618 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
619 //! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
620 //! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
621 //! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
622 //! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
623 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
624 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
625 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
626 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
627 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
628 //! </tbody>
629 //! </table>
630 //!
631 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
632 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
633 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
634 //!    which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
635 //!    decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
636 //!    LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
637 //!    instead of U+2019 RIGHT SINGLE QUOTATION MARK.
638 //! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to PUA instead
639 //!    of LRM and RLM.
640 //! 5. Remarks from the previous item apply.
641 //!
642 //! The differences between this crate and Windows in the case of multibyte encodings
643 //! are not yet fully documented here. The lack of remarks above should not be taken
644 //! as indication of lack of differences.
645 //!
646 //! # Notable Differences from IANA Naming
647 //!
648 //! In some cases, the Encoding Standard specifies the popular unextended encoding
649 //! name where in IANA terms one of the other labels would be more precise considering
650 //! the extensions that the Encoding Standard has unified into the encoding.
651 //!
652 //! <table>
653 //! <thead>
654 //! <tr><th>Encoding</th><th>IANA</th></tr>
655 //! </thead>
656 //! <tbody>
657 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
658 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
659 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
660 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
661 //! </tbody>
662 //! </table>
663 //!
664 //! In other cases where the Encoding Standard unifies unextended and extended
665 //! variants of an encoding, the encoding gets the name of the extended
666 //! variant.
667 //!
668 //! <table>
669 //! <thead>
670 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
671 //! </thead>
672 //! <tbody>
673 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
674 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
675 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
676 //! </tbody>
677 //! </table>
678 //!
679 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
680 //! for discussion about the UTF-16 family.
681 
682 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
683 
684 #[macro_use]
685 extern crate cfg_if;
686 
687 #[cfg(all(
688     feature = "simd-accel",
689     any(
690         target_feature = "sse2",
691         all(target_endian = "little", target_arch = "aarch64"),
692         all(target_endian = "little", target_feature = "neon")
693     )
694 ))]
695 #[macro_use(shuffle)]
696 extern crate packed_simd;
697 
698 #[cfg(feature = "serde")]
699 extern crate serde;
700 
701 #[cfg(all(test, feature = "serde"))]
702 extern crate bincode;
703 #[cfg(all(test, feature = "serde"))]
704 #[macro_use]
705 extern crate serde_derive;
706 #[cfg(all(test, feature = "serde"))]
707 extern crate serde_json;
708 
709 #[macro_use]
710 mod macros;
711 
712 #[cfg(all(
713     feature = "simd-accel",
714     any(
715         target_feature = "sse2",
716         all(target_endian = "little", target_arch = "aarch64"),
717         all(target_endian = "little", target_feature = "neon")
718     )
719 ))]
720 mod simd_funcs;
721 
722 #[cfg(test)]
723 mod testing;
724 
725 mod big5;
726 mod euc_jp;
727 mod euc_kr;
728 mod gb18030;
729 mod iso_2022_jp;
730 mod replacement;
731 mod shift_jis;
732 mod single_byte;
733 mod utf_16;
734 mod utf_8;
735 mod x_user_defined;
736 
737 mod ascii;
738 mod data;
739 mod handles;
740 mod variant;
741 
742 pub mod mem;
743 
744 use crate::ascii::ascii_valid_up_to;
745 use crate::ascii::iso_2022_jp_ascii_valid_up_to;
746 use crate::utf_8::utf8_valid_up_to;
747 use crate::variant::*;
748 
749 use std::borrow::Cow;
750 use std::cmp::Ordering;
751 use std::hash::Hash;
752 use std::hash::Hasher;
753 
754 #[cfg(feature = "serde")]
755 use serde::de::Visitor;
756 #[cfg(feature = "serde")]
757 use serde::{Deserialize, Deserializer, Serialize, Serializer};
758 
/// Maximum length of a numeric character reference (NCR).
///
/// The full maximum length is used rather than the maximum minus one:
/// the minus one cannot be taken from the space reserved for the
/// current unmappable, since the ISO-2022-JP encoder can fill up that
/// space with a state transition escape.
const NCR_EXTRA: usize = 10; // Longest NCR: &#1114111;
765 
766 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
767 // Instead, please regenerate using generate-encoding-data.py
768 
/// Length of the longest label, which is `cseucpkdfmtjapanese`.
const LONGEST_LABEL_LENGTH: usize = 19;
770 
771 /// The initializer for the [Big5](static.BIG5.html) encoding.
772 ///
773 /// For use only for taking the address of this form when
774 /// Rust prohibits the use of the non-`_INIT` form directly,
775 /// such as in initializers of other `static`s. If in doubt,
776 /// use the corresponding non-`_INIT` reference-typed `static`.
777 ///
778 /// This part of the public API will go away if Rust changes
779 /// to make the referent of `pub const FOO: &'static Encoding`
780 /// unique cross-crate or if Rust starts allowing static arrays
781 /// to be initialized with `pub static FOO: &'static Encoding`
782 /// items.
783 pub static BIG5_INIT: Encoding = Encoding {
784     name: "Big5",
785     variant: VariantEncoding::Big5,
786 };
787 
788 /// The Big5 encoding.
789 ///
790 /// This is Big5 with HKSCS with mappings to more recent Unicode assignments
791 /// instead of the Private Use Area code points that have been used historically.
792 /// It is believed to be able to decode existing Web content in a way that makes
793 /// sense.
794 ///
795 /// To avoid form submissions generating data that Web servers don't understand,
796 /// the encoder doesn't use the HKSCS byte sequences that precede the unextended
797 /// Big5 in the lexical order.
798 ///
799 /// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
800 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
801 ///
802 /// This encoding is designed to be suited for decoding the Windows code page 950
803 /// and its HKSCS patched "951" variant such that the text makes sense, given
804 /// assignments that Unicode has made after those encodings used Private Use
805 /// Area characters.
806 ///
807 /// This will change from `static` to `const` if Rust changes
808 /// to make the referent of `pub const FOO: &'static Encoding`
809 /// unique cross-crate, so don't take the address of this
810 /// `static`.
811 pub static BIG5: &'static Encoding = &BIG5_INIT;
812 
813 /// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
814 ///
815 /// For use only for taking the address of this form when
816 /// Rust prohibits the use of the non-`_INIT` form directly,
817 /// such as in initializers of other `static`s. If in doubt,
818 /// use the corresponding non-`_INIT` reference-typed `static`.
819 ///
820 /// This part of the public API will go away if Rust changes
821 /// to make the referent of `pub const FOO: &'static Encoding`
822 /// unique cross-crate or if Rust starts allowing static arrays
823 /// to be initialized with `pub static FOO: &'static Encoding`
824 /// items.
825 pub static EUC_JP_INIT: Encoding = Encoding {
826     name: "EUC-JP",
827     variant: VariantEncoding::EucJp,
828 };
829 
830 /// The EUC-JP encoding.
831 ///
832 /// This is the legacy Unix encoding for Japanese.
833 ///
834 /// For compatibility with Web servers that don't expect three-byte sequences
835 /// in form submissions, the encoder doesn't generate three-byte sequences.
836 /// That is, the JIS X 0212 support is decode-only.
837 ///
838 /// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
839 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
840 ///
841 /// This encoding roughly matches the Windows code page 20932. There are error
842 /// handling differences and a handful of 2-byte sequences that decode differently.
843 /// Additionall, Windows doesn't support 3-byte sequences.
844 ///
845 /// This will change from `static` to `const` if Rust changes
846 /// to make the referent of `pub const FOO: &'static Encoding`
847 /// unique cross-crate, so don't take the address of this
848 /// `static`.
849 pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
850 
851 /// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
852 ///
853 /// For use only for taking the address of this form when
854 /// Rust prohibits the use of the non-`_INIT` form directly,
855 /// such as in initializers of other `static`s. If in doubt,
856 /// use the corresponding non-`_INIT` reference-typed `static`.
857 ///
858 /// This part of the public API will go away if Rust changes
859 /// to make the referent of `pub const FOO: &'static Encoding`
860 /// unique cross-crate or if Rust starts allowing static arrays
861 /// to be initialized with `pub static FOO: &'static Encoding`
862 /// items.
863 pub static EUC_KR_INIT: Encoding = Encoding {
864     name: "EUC-KR",
865     variant: VariantEncoding::EucKr,
866 };
867 
868 /// The EUC-KR encoding.
869 ///
870 /// This is the Korean encoding for Windows. It extends the Unix legacy encoding
871 /// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
872 /// Classic), with all the characters from the Hangul Syllables block of Unicode.
873 ///
874 /// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
875 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
876 ///
877 /// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
878 /// to U+0080 and some byte sequences that are error per the Encoding Standard to
879 /// the question mark or the Private Use Area.
880 ///
881 /// This will change from `static` to `const` if Rust changes
882 /// to make the referent of `pub const FOO: &'static Encoding`
883 /// unique cross-crate, so don't take the address of this
884 /// `static`.
885 pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
886 
887 /// The initializer for the [GBK](static.GBK.html) encoding.
888 ///
889 /// For use only for taking the address of this form when
890 /// Rust prohibits the use of the non-`_INIT` form directly,
891 /// such as in initializers of other `static`s. If in doubt,
892 /// use the corresponding non-`_INIT` reference-typed `static`.
893 ///
894 /// This part of the public API will go away if Rust changes
895 /// to make the referent of `pub const FOO: &'static Encoding`
896 /// unique cross-crate or if Rust starts allowing static arrays
897 /// to be initialized with `pub static FOO: &'static Encoding`
898 /// items.
899 pub static GBK_INIT: Encoding = Encoding {
900     name: "GBK",
901     variant: VariantEncoding::Gbk,
902 };
903 
904 /// The GBK encoding.
905 ///
906 /// The decoder for this encoding is the same as the decoder for gb18030.
907 /// The encoder side of this encoding is GBK with Windows code page 936 euro
908 /// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
909 /// Unicode block as well as a handful of ideographs from the CJK Unified
910 /// Ideographs Extension A and CJK Compatibility Ideographs blocks.
911 ///
912 /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
913 /// unified with the gb18030 encoder in the Encoding Standard out of concern
914 /// that servers that expect GBK form submissions might not be able to handle
915 /// the four-byte sequences.
916 ///
917 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
918 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
919 ///
920 /// The encoder of this encoding roughly matches the Windows code page 936.
921 /// The decoder side is a superset.
922 ///
923 /// This will change from `static` to `const` if Rust changes
924 /// to make the referent of `pub const FOO: &'static Encoding`
925 /// unique cross-crate, so don't take the address of this
926 /// `static`.
927 pub static GBK: &'static Encoding = &GBK_INIT;
928 
929 /// The initializer for the [IBM866](static.IBM866.html) encoding.
930 ///
931 /// For use only for taking the address of this form when
932 /// Rust prohibits the use of the non-`_INIT` form directly,
933 /// such as in initializers of other `static`s. If in doubt,
934 /// use the corresponding non-`_INIT` reference-typed `static`.
935 ///
936 /// This part of the public API will go away if Rust changes
937 /// to make the referent of `pub const FOO: &'static Encoding`
938 /// unique cross-crate or if Rust starts allowing static arrays
939 /// to be initialized with `pub static FOO: &'static Encoding`
940 /// items.
941 pub static IBM866_INIT: Encoding = Encoding {
942     name: "IBM866",
943     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
944 };
945 
946 /// The IBM866 encoding.
947 ///
948 /// This the most notable one of the DOS Cyrillic code pages. It has the same
949 /// box drawing characters as code page 437, so it can be used for decoding
950 /// DOS-era ASCII + box drawing data.
951 ///
952 /// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
953 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
954 ///
955 /// This encoding matches the Windows code page 866.
956 ///
957 /// This will change from `static` to `const` if Rust changes
958 /// to make the referent of `pub const FOO: &'static Encoding`
959 /// unique cross-crate, so don't take the address of this
960 /// `static`.
961 pub static IBM866: &'static Encoding = &IBM866_INIT;
962 
963 /// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
964 ///
965 /// For use only for taking the address of this form when
966 /// Rust prohibits the use of the non-`_INIT` form directly,
967 /// such as in initializers of other `static`s. If in doubt,
968 /// use the corresponding non-`_INIT` reference-typed `static`.
969 ///
970 /// This part of the public API will go away if Rust changes
971 /// to make the referent of `pub const FOO: &'static Encoding`
972 /// unique cross-crate or if Rust starts allowing static arrays
973 /// to be initialized with `pub static FOO: &'static Encoding`
974 /// items.
975 pub static ISO_2022_JP_INIT: Encoding = Encoding {
976     name: "ISO-2022-JP",
977     variant: VariantEncoding::Iso2022Jp,
978 };
979 
980 /// The ISO-2022-JP encoding.
981 ///
982 /// This the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
983 /// byte range to encode non-Basic Latin characters. It's the only encoding
984 /// supported by this crate whose encoder is stateful.
985 ///
986 /// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
987 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
988 ///
989 /// This encoding roughly matches the Windows code page 50220. Notably, Windows
990 /// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
991 /// error handling.
992 ///
993 /// This will change from `static` to `const` if Rust changes
994 /// to make the referent of `pub const FOO: &'static Encoding`
995 /// unique cross-crate, so don't take the address of this
996 /// `static`.
997 pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
998 
999 /// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1000 ///
1001 /// For use only for taking the address of this form when
1002 /// Rust prohibits the use of the non-`_INIT` form directly,
1003 /// such as in initializers of other `static`s. If in doubt,
1004 /// use the corresponding non-`_INIT` reference-typed `static`.
1005 ///
1006 /// This part of the public API will go away if Rust changes
1007 /// to make the referent of `pub const FOO: &'static Encoding`
1008 /// unique cross-crate or if Rust starts allowing static arrays
1009 /// to be initialized with `pub static FOO: &'static Encoding`
1010 /// items.
1011 pub static ISO_8859_10_INIT: Encoding = Encoding {
1012     name: "ISO-8859-10",
1013     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1014 };
1015 
1016 /// The ISO-8859-10 encoding.
1017 ///
1018 /// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1019 /// is also known as Latin 6.
1020 ///
1021 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1022 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1023 ///
1024 /// The Windows code page number for this encoding is 28600, but kernel32.dll
1025 /// does not support this encoding.
1026 ///
1027 /// This will change from `static` to `const` if Rust changes
1028 /// to make the referent of `pub const FOO: &'static Encoding`
1029 /// unique cross-crate, so don't take the address of this
1030 /// `static`.
1031 pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1032 
1033 /// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1034 ///
1035 /// For use only for taking the address of this form when
1036 /// Rust prohibits the use of the non-`_INIT` form directly,
1037 /// such as in initializers of other `static`s. If in doubt,
1038 /// use the corresponding non-`_INIT` reference-typed `static`.
1039 ///
1040 /// This part of the public API will go away if Rust changes
1041 /// to make the referent of `pub const FOO: &'static Encoding`
1042 /// unique cross-crate or if Rust starts allowing static arrays
1043 /// to be initialized with `pub static FOO: &'static Encoding`
1044 /// items.
1045 pub static ISO_8859_13_INIT: Encoding = Encoding {
1046     name: "ISO-8859-13",
1047     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1048 };
1049 
1050 /// The ISO-8859-13 encoding.
1051 ///
1052 /// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1053 /// is also known as Latin 7.
1054 ///
1055 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1056 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1057 ///
1058 /// This encoding matches the Windows code page 28603, except Windows decodes
1059 /// unassigned code points to the Private Use Area of Unicode.
1060 ///
1061 /// This will change from `static` to `const` if Rust changes
1062 /// to make the referent of `pub const FOO: &'static Encoding`
1063 /// unique cross-crate, so don't take the address of this
1064 /// `static`.
1065 pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1066 
1067 /// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1068 ///
1069 /// For use only for taking the address of this form when
1070 /// Rust prohibits the use of the non-`_INIT` form directly,
1071 /// such as in initializers of other `static`s. If in doubt,
1072 /// use the corresponding non-`_INIT` reference-typed `static`.
1073 ///
1074 /// This part of the public API will go away if Rust changes
1075 /// to make the referent of `pub const FOO: &'static Encoding`
1076 /// unique cross-crate or if Rust starts allowing static arrays
1077 /// to be initialized with `pub static FOO: &'static Encoding`
1078 /// items.
1079 pub static ISO_8859_14_INIT: Encoding = Encoding {
1080     name: "ISO-8859-14",
1081     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1082 };
1083 
1084 /// The ISO-8859-14 encoding.
1085 ///
1086 /// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1087 /// is also known as Latin 8.
1088 ///
1089 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1090 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1091 ///
1092 /// The Windows code page number for this encoding is 28604, but kernel32.dll
1093 /// does not support this encoding.
1094 ///
1095 /// This will change from `static` to `const` if Rust changes
1096 /// to make the referent of `pub const FOO: &'static Encoding`
1097 /// unique cross-crate, so don't take the address of this
1098 /// `static`.
1099 pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1100 
1101 /// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1102 ///
1103 /// For use only for taking the address of this form when
1104 /// Rust prohibits the use of the non-`_INIT` form directly,
1105 /// such as in initializers of other `static`s. If in doubt,
1106 /// use the corresponding non-`_INIT` reference-typed `static`.
1107 ///
1108 /// This part of the public API will go away if Rust changes
1109 /// to make the referent of `pub const FOO: &'static Encoding`
1110 /// unique cross-crate or if Rust starts allowing static arrays
1111 /// to be initialized with `pub static FOO: &'static Encoding`
1112 /// items.
1113 pub static ISO_8859_15_INIT: Encoding = Encoding {
1114     name: "ISO-8859-15",
1115     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1116 };
1117 
1118 /// The ISO-8859-15 encoding.
1119 ///
1120 /// This is the revised Western European part of the ISO/IEC 8859 encoding
1121 /// family. This encoding is also known as Latin 9.
1122 ///
1123 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1124 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1125 ///
1126 /// This encoding matches the Windows code page 28605.
1127 ///
1128 /// This will change from `static` to `const` if Rust changes
1129 /// to make the referent of `pub const FOO: &'static Encoding`
1130 /// unique cross-crate, so don't take the address of this
1131 /// `static`.
1132 pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1133 
/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_16_INIT: Encoding = Encoding {
    name: "ISO-8859-16",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
};
1150 
/// The ISO-8859-16 encoding.
///
/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 10.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
///
/// The Windows code page number for this encoding is 28606, but kernel32.dll
/// does not support this encoding.
///
/// This `static` is a reference to
/// [`ISO_8859_16_INIT`](static.ISO_8859_16_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1167 
/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_2_INIT: Encoding = Encoding {
    name: "ISO-8859-2",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
};
1184 
/// The ISO-8859-2 encoding.
///
/// This is the Central European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 2.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
///
/// This encoding matches the Windows code page 28592.
///
/// This `static` is a reference to
/// [`ISO_8859_2_INIT`](static.ISO_8859_2_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1199 
/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_3_INIT: Encoding = Encoding {
    name: "ISO-8859-3",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
};
1216 
/// The ISO-8859-3 encoding.
///
/// This is the South European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 3.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
///
/// This encoding matches the Windows code page 28593.
///
/// This `static` is a reference to
/// [`ISO_8859_3_INIT`](static.ISO_8859_3_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1231 
/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_4_INIT: Encoding = Encoding {
    name: "ISO-8859-4",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
};
1248 
/// The ISO-8859-4 encoding.
///
/// This is the North European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 4.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
///
/// This encoding matches the Windows code page 28594.
///
/// This `static` is a reference to
/// [`ISO_8859_4_INIT`](static.ISO_8859_4_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1263 
/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_5_INIT: Encoding = Encoding {
    name: "ISO-8859-5",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
};
1280 
/// The ISO-8859-5 encoding.
///
/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
///
/// This encoding matches the Windows code page 28595.
///
/// This `static` is a reference to
/// [`ISO_8859_5_INIT`](static.ISO_8859_5_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1295 
/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_6_INIT: Encoding = Encoding {
    name: "ISO-8859-6",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
};
1312 
/// The ISO-8859-6 encoding.
///
/// This is the Arabic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
///
/// This encoding matches the Windows code page 28596, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`ISO_8859_6_INIT`](static.ISO_8859_6_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1328 
/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_7_INIT: Encoding = Encoding {
    name: "ISO-8859-7",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
};
1345 
/// The ISO-8859-7 encoding.
///
/// This is the Greek part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
///
/// This encoding roughly matches the Windows code page 28597. Windows decodes
/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
///
/// This `static` is a reference to
/// [`ISO_8859_7_INIT`](static.ISO_8859_7_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1365 
/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_INIT: Encoding = Encoding {
    name: "ISO-8859-8",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1382 
/// The ISO-8859-8 encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 28598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the Private Use Area.
///
/// This `static` is a reference to
/// [`ISO_8859_8_INIT`](static.ISO_8859_8_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1400 
/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
    name: "ISO-8859-8-I",
    // Reuses the `iso_8859_8` table; the arguments are identical to those of
    // `ISO_8859_8_INIT`, so only the name differs.
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1417 
/// The ISO-8859-8-I encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 38598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the Private Use Area.
///
/// This `static` is a reference to
/// [`ISO_8859_8_I_INIT`](static.ISO_8859_8_I_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1435 
/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_R_INIT: Encoding = Encoding {
    name: "KOI8-R",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
};
1452 
/// The KOI8-R encoding.
///
/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
///
/// This encoding matches the Windows code page 20866.
///
/// This `static` is a reference to
/// [`KOI8_R_INIT`](static.KOI8_R_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1467 
/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_U_INIT: Encoding = Encoding {
    name: "KOI8-U",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
};
1484 
/// The KOI8-U encoding.
///
/// This is an encoding for Ukrainian adapted from KOI8-R.
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
///
/// This encoding matches the Windows code page 21866.
///
/// This `static` is a reference to
/// [`KOI8_U_INIT`](static.KOI8_U_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1499 
/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
///
/// The value pairs the name `Shift_JIS` with `VariantEncoding::ShiftJis`.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static SHIFT_JIS_INIT: Encoding = Encoding {
    name: "Shift_JIS",
    variant: VariantEncoding::ShiftJis,
};
1516 
/// The Shift_JIS encoding.
///
/// This is the Japanese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
///
/// This encoding matches the Windows code page 932, except Windows decodes some byte
/// sequences that are errors per the Encoding Standard to the question mark or the
/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
///
/// This `static` is a reference to
/// [`SHIFT_JIS_INIT`](static.SHIFT_JIS_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1533 
/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
///
/// The value pairs the name `UTF-16BE` with `VariantEncoding::Utf16Be`.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16BE_INIT: Encoding = Encoding {
    name: "UTF-16BE",
    variant: VariantEncoding::Utf16Be,
};
1550 
/// The UTF-16BE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the big endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1201.
///
/// This `static` is a reference to
/// [`UTF_16BE_INIT`](static.UTF_16BE_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1567 
/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
///
/// The value pairs the name `UTF-16LE` with `VariantEncoding::Utf16Le`.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16LE_INIT: Encoding = Encoding {
    name: "UTF-16LE",
    variant: VariantEncoding::Utf16Le,
};
1584 
/// The UTF-16LE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the little endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1200.
///
/// This `static` is a reference to
/// [`UTF_16LE_INIT`](static.UTF_16LE_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1601 
/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
///
/// The value pairs the name `UTF-8` with `VariantEncoding::Utf8`.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_8_INIT: Encoding = Encoding {
    name: "UTF-8",
    variant: VariantEncoding::Utf8,
};
1618 
/// The UTF-8 encoding.
///
/// This is the encoding that should be used for all new development. It can
/// represent all of Unicode.
///
/// This encoding matches the Windows code page 65001, except Windows differs
/// in the number of errors generated for some erroneous byte sequences.
///
/// This `static` is a reference to
/// [`UTF_8_INIT`](static.UTF_8_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1632 
/// The initializer for the [gb18030](static.GB18030.html) encoding.
///
/// The value pairs the name `gb18030` with `VariantEncoding::Gb18030`.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GB18030_INIT: Encoding = Encoding {
    name: "gb18030",
    variant: VariantEncoding::Gb18030,
};
1649 
/// The gb18030 encoding.
///
/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
/// maps to U+3000 for compatibility with existing Web content. As a result,
/// this encoding can represent all of Unicode except for the private-use
/// character U+E5E5.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// This encoding matches the Windows code page 54936.
///
/// This `static` is a reference to
/// [`GB18030_INIT`](static.GB18030_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static GB18030: &'static Encoding = &GB18030_INIT;
1667 
/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static MACINTOSH_INIT: Encoding = Encoding {
    name: "macintosh",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
};
1684 
/// The macintosh encoding.
///
/// This is the MacRoman encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
///
/// This encoding matches the Windows code page 10000, except Windows decodes
/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
///
/// This `static` is a reference to
/// [`MACINTOSH_INIT`](static.MACINTOSH_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1700 
/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
///
/// The value pairs the name `replacement` with `VariantEncoding::Replacement`.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static REPLACEMENT_INIT: Encoding = Encoding {
    name: "replacement",
    variant: VariantEncoding::Replacement,
};
1717 
/// The replacement encoding.
///
/// This decode-only encoding decodes all non-zero-length streams to a single
/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
/// ASCII-compatible fallback encoding (typically windows-1252) for some
/// encodings that are no longer supported by the Web Platform and that
/// would be dangerous to treat as ASCII-compatible.
///
/// There is no corresponding encoder. The output encoding of this encoding
/// is UTF-8.
///
/// This encoding does not have a Windows code page number.
///
/// This `static` is a reference to
/// [`REPLACEMENT_INIT`](static.REPLACEMENT_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1736 
/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1250_INIT: Encoding = Encoding {
    name: "windows-1250",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
};
1753 
/// The windows-1250 encoding.
///
/// This is the Central European encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
///
/// This encoding matches the Windows code page 1250.
///
/// This `static` is a reference to
/// [`WINDOWS_1250_INIT`](static.WINDOWS_1250_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1768 
/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1251_INIT: Encoding = Encoding {
    name: "windows-1251",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
};
1785 
/// The windows-1251 encoding.
///
/// This is the Cyrillic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
///
/// This encoding matches the Windows code page 1251.
///
/// This `static` is a reference to
/// [`WINDOWS_1251_INIT`](static.WINDOWS_1251_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1800 
/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1252_INIT: Encoding = Encoding {
    name: "windows-1252",
    // NOTE(review): the three trailing arguments look like the first code
    // point, table offset and length of a contiguous run in the table —
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
};
1817 
1818 /// The windows-1252 encoding.
1819 ///
1820 /// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1821 /// which is known as Latin 1.
1822 ///
1823 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1824 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1825 ///
1826 /// This encoding matches the Windows code page 1252.
1827 ///
1828 /// This will change from `static` to `const` if Rust changes
1829 /// to make the referent of `pub const FOO: &'static Encoding`
1830 /// unique cross-crate, so don't take the address of this
1831 /// `static`.
1832 pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1833 
1834 /// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1835 ///
1836 /// For use only for taking the address of this form when
1837 /// Rust prohibits the use of the non-`_INIT` form directly,
1838 /// such as in initializers of other `static`s. If in doubt,
1839 /// use the corresponding non-`_INIT` reference-typed `static`.
1840 ///
1841 /// This part of the public API will go away if Rust changes
1842 /// to make the referent of `pub const FOO: &'static Encoding`
1843 /// unique cross-crate or if Rust starts allowing static arrays
1844 /// to be initialized with `pub static FOO: &'static Encoding`
1845 /// items.
1846 pub static WINDOWS_1253_INIT: Encoding = Encoding {
1847     name: "windows-1253",
1848     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1849 };
1850 
1851 /// The windows-1253 encoding.
1852 ///
1853 /// This is the Greek encoding for Windows. It is mostly an extension of
1854 /// ISO-8859-7, but U+0386 is mapped to a different byte.
1855 ///
1856 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
1857 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
1858 ///
1859 /// This encoding matches the Windows code page 1253, except Windows decodes
1860 /// unassigned code points to the Private Use Area of Unicode.
1861 ///
1862 /// This will change from `static` to `const` if Rust changes
1863 /// to make the referent of `pub const FOO: &'static Encoding`
1864 /// unique cross-crate, so don't take the address of this
1865 /// `static`.
1866 pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1867 
1868 /// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
1869 ///
1870 /// For use only for taking the address of this form when
1871 /// Rust prohibits the use of the non-`_INIT` form directly,
1872 /// such as in initializers of other `static`s. If in doubt,
1873 /// use the corresponding non-`_INIT` reference-typed `static`.
1874 ///
1875 /// This part of the public API will go away if Rust changes
1876 /// to make the referent of `pub const FOO: &'static Encoding`
1877 /// unique cross-crate or if Rust starts allowing static arrays
1878 /// to be initialized with `pub static FOO: &'static Encoding`
1879 /// items.
1880 pub static WINDOWS_1254_INIT: Encoding = Encoding {
1881     name: "windows-1254",
1882     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
1883 };
1884 
1885 /// The windows-1254 encoding.
1886 ///
1887 /// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
1888 /// which is known as Latin 5.
1889 ///
1890 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
1891 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
1892 ///
1893 /// This encoding matches the Windows code page 1254.
1894 ///
1895 /// This will change from `static` to `const` if Rust changes
1896 /// to make the referent of `pub const FOO: &'static Encoding`
1897 /// unique cross-crate, so don't take the address of this
1898 /// `static`.
1899 pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1900 
1901 /// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
1902 ///
1903 /// For use only for taking the address of this form when
1904 /// Rust prohibits the use of the non-`_INIT` form directly,
1905 /// such as in initializers of other `static`s. If in doubt,
1906 /// use the corresponding non-`_INIT` reference-typed `static`.
1907 ///
1908 /// This part of the public API will go away if Rust changes
1909 /// to make the referent of `pub const FOO: &'static Encoding`
1910 /// unique cross-crate or if Rust starts allowing static arrays
1911 /// to be initialized with `pub static FOO: &'static Encoding`
1912 /// items.
1913 pub static WINDOWS_1255_INIT: Encoding = Encoding {
1914     name: "windows-1255",
1915     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
1916 };
1917 
1918 /// The windows-1255 encoding.
1919 ///
1920 /// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
1921 /// except for a currency sign swap.
1922 ///
1923 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
1924 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
1925 ///
1926 /// This encoding matches the Windows code page 1255, except Windows decodes
1927 /// unassigned code points to the Private Use Area of Unicode.
1928 ///
1929 /// This will change from `static` to `const` if Rust changes
1930 /// to make the referent of `pub const FOO: &'static Encoding`
1931 /// unique cross-crate, so don't take the address of this
1932 /// `static`.
1933 pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1934 
1935 /// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
1936 ///
1937 /// For use only for taking the address of this form when
1938 /// Rust prohibits the use of the non-`_INIT` form directly,
1939 /// such as in initializers of other `static`s. If in doubt,
1940 /// use the corresponding non-`_INIT` reference-typed `static`.
1941 ///
1942 /// This part of the public API will go away if Rust changes
1943 /// to make the referent of `pub const FOO: &'static Encoding`
1944 /// unique cross-crate or if Rust starts allowing static arrays
1945 /// to be initialized with `pub static FOO: &'static Encoding`
1946 /// items.
1947 pub static WINDOWS_1256_INIT: Encoding = Encoding {
1948     name: "windows-1256",
1949     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
1950 };
1951 
1952 /// The windows-1256 encoding.
1953 ///
1954 /// This is the Arabic encoding for Windows.
1955 ///
1956 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
1957 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
1958 ///
1959 /// This encoding matches the Windows code page 1256.
1960 ///
1961 /// This will change from `static` to `const` if Rust changes
1962 /// to make the referent of `pub const FOO: &'static Encoding`
1963 /// unique cross-crate, so don't take the address of this
1964 /// `static`.
1965 pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1966 
1967 /// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
1968 ///
1969 /// For use only for taking the address of this form when
1970 /// Rust prohibits the use of the non-`_INIT` form directly,
1971 /// such as in initializers of other `static`s. If in doubt,
1972 /// use the corresponding non-`_INIT` reference-typed `static`.
1973 ///
1974 /// This part of the public API will go away if Rust changes
1975 /// to make the referent of `pub const FOO: &'static Encoding`
1976 /// unique cross-crate or if Rust starts allowing static arrays
1977 /// to be initialized with `pub static FOO: &'static Encoding`
1978 /// items.
1979 pub static WINDOWS_1257_INIT: Encoding = Encoding {
1980     name: "windows-1257",
1981     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
1982 };
1983 
1984 /// The windows-1257 encoding.
1985 ///
1986 /// This is the Baltic encoding for Windows.
1987 ///
1988 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
1989 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
1990 ///
1991 /// This encoding matches the Windows code page 1257, except Windows decodes
1992 /// unassigned code points to the Private Use Area of Unicode.
1993 ///
1994 /// This will change from `static` to `const` if Rust changes
1995 /// to make the referent of `pub const FOO: &'static Encoding`
1996 /// unique cross-crate, so don't take the address of this
1997 /// `static`.
1998 pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
1999 
2000 /// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
2001 ///
2002 /// For use only for taking the address of this form when
2003 /// Rust prohibits the use of the non-`_INIT` form directly,
2004 /// such as in initializers of other `static`s. If in doubt,
2005 /// use the corresponding non-`_INIT` reference-typed `static`.
2006 ///
2007 /// This part of the public API will go away if Rust changes
2008 /// to make the referent of `pub const FOO: &'static Encoding`
2009 /// unique cross-crate or if Rust starts allowing static arrays
2010 /// to be initialized with `pub static FOO: &'static Encoding`
2011 /// items.
2012 pub static WINDOWS_1258_INIT: Encoding = Encoding {
2013     name: "windows-1258",
2014     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
2015 };
2016 
2017 /// The windows-1258 encoding.
2018 ///
2019 /// This is the Vietnamese encoding for Windows.
2020 ///
2021 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
2022 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
2023 ///
2024 /// This encoding matches the Windows code page 1258 when used in the
2025 /// non-normalizing mode. Unlike with the other single-byte encodings, the
2026 /// result of decoding is not necessarily in Normalization Form C. On the
2027 /// other hand, input in the Normalization Form C is not encoded without
2028 /// replacement. In general, it's a bad idea to encode to encodings other
2029 /// than UTF-8, but this encoding is especially hazardous to encode to.
2030 ///
2031 /// This will change from `static` to `const` if Rust changes
2032 /// to make the referent of `pub const FOO: &'static Encoding`
2033 /// unique cross-crate, so don't take the address of this
2034 /// `static`.
2035 pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2036 
2037 /// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
2038 ///
2039 /// For use only for taking the address of this form when
2040 /// Rust prohibits the use of the non-`_INIT` form directly,
2041 /// such as in initializers of other `static`s. If in doubt,
2042 /// use the corresponding non-`_INIT` reference-typed `static`.
2043 ///
2044 /// This part of the public API will go away if Rust changes
2045 /// to make the referent of `pub const FOO: &'static Encoding`
2046 /// unique cross-crate or if Rust starts allowing static arrays
2047 /// to be initialized with `pub static FOO: &'static Encoding`
2048 /// items.
2049 pub static WINDOWS_874_INIT: Encoding = Encoding {
2050     name: "windows-874",
2051     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
2052 };
2053 
2054 /// The windows-874 encoding.
2055 ///
2056 /// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
2057 ///
2058 /// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
2059 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
2060 ///
2061 /// This encoding matches the Windows code page 874, except Windows decodes
2062 /// unassigned code points to the Private Use Area of Unicode.
2063 ///
2064 /// This will change from `static` to `const` if Rust changes
2065 /// to make the referent of `pub const FOO: &'static Encoding`
2066 /// unique cross-crate, so don't take the address of this
2067 /// `static`.
2068 pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2069 
2070 /// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
2071 ///
2072 /// For use only for taking the address of this form when
2073 /// Rust prohibits the use of the non-`_INIT` form directly,
2074 /// such as in initializers of other `static`s. If in doubt,
2075 /// use the corresponding non-`_INIT` reference-typed `static`.
2076 ///
2077 /// This part of the public API will go away if Rust changes
2078 /// to make the referent of `pub const FOO: &'static Encoding`
2079 /// unique cross-crate or if Rust starts allowing static arrays
2080 /// to be initialized with `pub static FOO: &'static Encoding`
2081 /// items.
2082 pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
2083     name: "x-mac-cyrillic",
2084     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
2085 };
2086 
2087 /// The x-mac-cyrillic encoding.
2088 ///
2089 /// This is the MacUkrainian encoding from Mac OS Classic.
2090 ///
2091 /// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
2092 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
2093 ///
2094 /// This encoding matches the Windows code page 10017.
2095 ///
2096 /// This will change from `static` to `const` if Rust changes
2097 /// to make the referent of `pub const FOO: &'static Encoding`
2098 /// unique cross-crate, so don't take the address of this
2099 /// `static`.
2100 pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2101 
2102 /// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
2103 ///
2104 /// For use only for taking the address of this form when
2105 /// Rust prohibits the use of the non-`_INIT` form directly,
2106 /// such as in initializers of other `static`s. If in doubt,
2107 /// use the corresponding non-`_INIT` reference-typed `static`.
2108 ///
2109 /// This part of the public API will go away if Rust changes
2110 /// to make the referent of `pub const FOO: &'static Encoding`
2111 /// unique cross-crate or if Rust starts allowing static arrays
2112 /// to be initialized with `pub static FOO: &'static Encoding`
2113 /// items.
2114 pub static X_USER_DEFINED_INIT: Encoding = Encoding {
2115     name: "x-user-defined",
2116     variant: VariantEncoding::UserDefined,
2117 };
2118 
2119 /// The x-user-defined encoding.
2120 ///
2121 /// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
2122 /// them to the Private Use Area of Unicode. It was used for loading binary
2123 /// data into a JavaScript string using `XMLHttpRequest` before XHR supported
2124 /// the `"arraybuffer"` response type.
2125 ///
2126 /// This encoding does not have a Windows code page number.
2127 ///
2128 /// This will change from `static` to `const` if Rust changes
2129 /// to make the referent of `pub const FOO: &'static Encoding`
2130 /// unique cross-crate, so don't take the address of this
2131 /// `static`.
2132 pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2133 
/// All encoding labels in lower case.
///
/// The entries are ordered by length first; labels of equal length are
/// ordered by comparing their bytes starting from the last byte and
/// moving towards the first (e.g. `"866"` precedes `"mac"` because
/// `'6' < 'c'`). Keep that order when editing the table.
static LABELS_SORTED: [&str; 219] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2355 
2356 static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
2357     &WINDOWS_1252_INIT,
2358     &ISO_8859_2_INIT,
2359     &ISO_8859_3_INIT,
2360     &ISO_8859_4_INIT,
2361     &WINDOWS_1254_INIT,
2362     &ISO_8859_10_INIT,
2363     &ISO_8859_15_INIT,
2364     &IBM866_INIT,
2365     &MACINTOSH_INIT,
2366     &KOI8_R_INIT,
2367     &GBK_INIT,
2368     &BIG5_INIT,
2369     &UTF_8_INIT,
2370     &KOI8_R_INIT,
2371     &SHIFT_JIS_INIT,
2372     &SHIFT_JIS_INIT,
2373     &IBM866_INIT,
2374     &UTF_8_INIT,
2375     &WINDOWS_1252_INIT,
2376     &WINDOWS_1252_INIT,
2377     &GBK_INIT,
2378     &ISO_8859_7_INIT,
2379     &WINDOWS_1250_INIT,
2380     &WINDOWS_1251_INIT,
2381     &WINDOWS_1252_INIT,
2382     &GBK_INIT,
2383     &WINDOWS_1252_INIT,
2384     &ISO_8859_2_INIT,
2385     &WINDOWS_1253_INIT,
2386     &ISO_8859_3_INIT,
2387     &WINDOWS_1254_INIT,
2388     &ISO_8859_4_INIT,
2389     &WINDOWS_1255_INIT,
2390     &BIG5_INIT,
2391     &WINDOWS_1254_INIT,
2392     &UTF_16LE_INIT,
2393     &WINDOWS_1256_INIT,
2394     &IBM866_INIT,
2395     &ISO_8859_10_INIT,
2396     &WINDOWS_1257_INIT,
2397     &WINDOWS_1258_INIT,
2398     &ISO_8859_7_INIT,
2399     &WINDOWS_1252_INIT,
2400     &ISO_8859_6_INIT,
2401     &ISO_8859_8_INIT,
2402     &EUC_KR_INIT,
2403     &EUC_JP_INIT,
2404     &KOI8_R_INIT,
2405     &KOI8_R_INIT,
2406     &EUC_KR_INIT,
2407     &SHIFT_JIS_INIT,
2408     &KOI8_U_INIT,
2409     &ISO_8859_8_INIT,
2410     &WINDOWS_874_INIT,
2411     &GB18030_INIT,
2412     &EUC_KR_INIT,
2413     &GBK_INIT,
2414     &WINDOWS_874_INIT,
2415     &BIG5_INIT,
2416     &GBK_INIT,
2417     &ISO_8859_8_I_INIT,
2418     &KOI8_R_INIT,
2419     &EUC_KR_INIT,
2420     &KOI8_U_INIT,
2421     &WINDOWS_1250_INIT,
2422     &EUC_KR_INIT,
2423     &WINDOWS_1251_INIT,
2424     &WINDOWS_1252_INIT,
2425     &GBK_INIT,
2426     &WINDOWS_1252_INIT,
2427     &ISO_8859_2_INIT,
2428     &WINDOWS_1253_INIT,
2429     &ISO_8859_3_INIT,
2430     &ISO_8859_6_INIT,
2431     &WINDOWS_1254_INIT,
2432     &ISO_8859_4_INIT,
2433     &WINDOWS_1255_INIT,
2434     &ISO_8859_5_INIT,
2435     &BIG5_INIT,
2436     &WINDOWS_1256_INIT,
2437     &IBM866_INIT,
2438     &ISO_8859_6_INIT,
2439     &WINDOWS_1257_INIT,
2440     &ISO_8859_7_INIT,
2441     &ISO_8859_6_INIT,
2442     &ISO_8859_7_INIT,
2443     &ISO_8859_7_INIT,
2444     &WINDOWS_1258_INIT,
2445     &ISO_8859_8_INIT,
2446     &WINDOWS_1254_INIT,
2447     &ISO_8859_5_INIT,
2448     &UTF_16BE_INIT,
2449     &UTF_16LE_INIT,
2450     &WINDOWS_1252_INIT,
2451     &SHIFT_JIS_INIT,
2452     &EUC_JP_INIT,
2453     &ISO_8859_10_INIT,
2454     &WINDOWS_1252_INIT,
2455     &WINDOWS_874_INIT,
2456     &ISO_8859_2_INIT,
2457     &ISO_8859_3_INIT,
2458     &ISO_8859_13_INIT,
2459     &ISO_8859_4_INIT,
2460     &ISO_8859_14_INIT,
2461     &ISO_8859_5_INIT,
2462     &ISO_8859_15_INIT,
2463     &ISO_8859_6_INIT,
2464     &ISO_8859_7_INIT,
2465     &ISO_8859_8_INIT,
2466     &GBK_INIT,
2467     &WINDOWS_1254_INIT,
2468     &MACINTOSH_INIT,
2469     &SHIFT_JIS_INIT,
2470     &SHIFT_JIS_INIT,
2471     &WINDOWS_1252_INIT,
2472     &ISO_8859_10_INIT,
2473     &ISO_8859_4_INIT,
2474     &GBK_INIT,
2475     &WINDOWS_1252_INIT,
2476     &WINDOWS_1252_INIT,
2477     &ISO_8859_2_INIT,
2478     &WINDOWS_874_INIT,
2479     &ISO_8859_2_INIT,
2480     &ISO_8859_2_INIT,
2481     &REPLACEMENT_INIT,
2482     &ISO_8859_3_INIT,
2483     &ISO_8859_3_INIT,
2484     &ISO_8859_13_INIT,
2485     &ISO_8859_4_INIT,
2486     &ISO_8859_4_INIT,
2487     &ISO_8859_14_INIT,
2488     &ISO_8859_5_INIT,
2489     &ISO_8859_5_INIT,
2490     &ISO_8859_5_INIT,
2491     &ISO_8859_15_INIT,
2492     &ISO_8859_6_INIT,
2493     &ISO_8859_6_INIT,
2494     &ISO_8859_7_INIT,
2495     &ISO_8859_7_INIT,
2496     &ISO_8859_7_INIT,
2497     &ISO_8859_6_INIT,
2498     &ISO_8859_10_INIT,
2499     &ISO_8859_8_INIT,
2500     &ISO_8859_8_INIT,
2501     &ISO_8859_8_INIT,
2502     &WINDOWS_1254_INIT,
2503     &WINDOWS_1254_INIT,
2504     &WINDOWS_1254_INIT,
2505     &ISO_8859_3_INIT,
2506     &EUC_KR_INIT,
2507     &BIG5_INIT,
2508     &SHIFT_JIS_INIT,
2509     &ISO_8859_10_INIT,
2510     &WINDOWS_874_INIT,
2511     &WINDOWS_1252_INIT,
2512     &ISO_8859_2_INIT,
2513     &ISO_8859_13_INIT,
2514     &ISO_8859_3_INIT,
2515     &ISO_8859_14_INIT,
2516     &WINDOWS_874_INIT,
2517     &ISO_8859_4_INIT,
2518     &ISO_8859_15_INIT,
2519     &ISO_8859_15_INIT,
2520     &WINDOWS_1254_INIT,
2521     &ISO_8859_16_INIT,
2522     &ISO_8859_10_INIT,
2523     &EUC_KR_INIT,
2524     &ISO_8859_15_INIT,
2525     &ISO_8859_6_INIT,
2526     &ISO_8859_8_INIT,
2527     &MACINTOSH_INIT,
2528     &ISO_8859_6_INIT,
2529     &ISO_8859_8_I_INIT,
2530     &SHIFT_JIS_INIT,
2531     &MACINTOSH_INIT,
2532     &REPLACEMENT_INIT,
2533     &ISO_2022_JP_INIT,
2534     &ISO_2022_JP_INIT,
2535     &REPLACEMENT_INIT,
2536     &REPLACEMENT_INIT,
2537     &REPLACEMENT_INIT,
2538     &WINDOWS_1250_INIT,
2539     &WINDOWS_1251_INIT,
2540     &WINDOWS_1252_INIT,
2541     &WINDOWS_1253_INIT,
2542     &WINDOWS_1254_INIT,
2543     &WINDOWS_1255_INIT,
2544     &WINDOWS_1256_INIT,
2545     &WINDOWS_1257_INIT,
2546     &WINDOWS_1258_INIT,
2547     &ISO_8859_6_INIT,
2548     &ISO_8859_8_INIT,
2549     &ISO_8859_6_INIT,
2550     &ISO_8859_8_I_INIT,
2551     &ISO_8859_7_INIT,
2552     &EUC_KR_INIT,
2553     &EUC_KR_INIT,
2554     &WINDOWS_1252_INIT,
2555     &EUC_KR_INIT,
2556     &X_MAC_CYRILLIC_INIT,
2557     &X_USER_DEFINED_INIT,
2558     &GBK_INIT,
2559     &WINDOWS_1252_INIT,
2560     &ISO_8859_2_INIT,
2561     &ISO_8859_6_INIT,
2562     &ISO_8859_7_INIT,
2563     &ISO_8859_3_INIT,
2564     &ISO_8859_4_INIT,
2565     &ISO_8859_5_INIT,
2566     &ISO_8859_8_INIT,
2567     &WINDOWS_1254_INIT,
2568     &ISO_8859_7_INIT,
2569     &X_MAC_CYRILLIC_INIT,
2570     &REPLACEMENT_INIT,
2571     &ISO_8859_6_INIT,
2572     &ISO_8859_8_INIT,
2573     &UTF_8_INIT,
2574     &ISO_8859_5_INIT,
2575     &EUC_JP_INIT,
2576 ];
2577 
2578 // END GENERATED CODE
2579 
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a `static`
/// for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    // The name of the encoding, e.g. "windows-1252" in the `_INIT` statics.
    name: &'static str,
    // Selects the implementation and carries its data; the `_INIT` statics
    // use e.g. `VariantEncoding::SingleByte(..)` or `VariantEncoding::UserDefined`.
    variant: VariantEncoding,
}
2646 
2647 impl Encoding {
2648     /// Implements the
2649     /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2650     /// algorithm.
2651     ///
2652     /// If, after ASCII-lowercasing and removing leading and trailing
2653     /// whitespace, the argument matches a label defined in the Encoding
2654     /// Standard, `Some(&'static Encoding)` representing the corresponding
2655     /// encoding is returned. If there is no match, `None` is returned.
2656     ///
2657     /// This is the right method to use if the action upon the method returning
2658     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2659     /// When the action upon the method returning `None` is not to proceed with
2660     /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2661     /// appropriate.
2662     ///
2663     /// The argument is of type `&[u8]` instead of `&str` to save callers
2664     /// that are extracting the label from a non-UTF-8 protocol the trouble
2665     /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2666     /// on it.)
2667     ///
2668     /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2669     pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2670         let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2671         let mut trimmed_pos = 0usize;
2672         let mut iter = label.into_iter();
2673         // before
2674         loop {
2675             match iter.next() {
2676                 None => {
2677                     return None;
2678                 }
2679                 Some(byte) => {
2680                     // The characters used in labels are:
2681                     // a-z (except q, but excluding it below seems excessive)
2682                     // 0-9
2683                     // . _ - :
2684                     match *byte {
2685                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2686                             continue;
2687                         }
2688                         b'A'..=b'Z' => {
2689                             trimmed[trimmed_pos] = *byte + 0x20u8;
2690                             trimmed_pos = 1usize;
2691                             break;
2692                         }
2693                         b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2694                             trimmed[trimmed_pos] = *byte;
2695                             trimmed_pos = 1usize;
2696                             break;
2697                         }
2698                         _ => {
2699                             return None;
2700                         }
2701                     }
2702                 }
2703             }
2704         }
2705         // inside
2706         loop {
2707             match iter.next() {
2708                 None => {
2709                     break;
2710                 }
2711                 Some(byte) => {
2712                     match *byte {
2713                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2714                             break;
2715                         }
2716                         b'A'..=b'Z' => {
2717                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2718                                 // There's no encoding with a label this long
2719                                 return None;
2720                             }
2721                             trimmed[trimmed_pos] = *byte + 0x20u8;
2722                             trimmed_pos += 1usize;
2723                             continue;
2724                         }
2725                         b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2726                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2727                                 // There's no encoding with a label this long
2728                                 return None;
2729                             }
2730                             trimmed[trimmed_pos] = *byte;
2731                             trimmed_pos += 1usize;
2732                             continue;
2733                         }
2734                         _ => {
2735                             return None;
2736                         }
2737                     }
2738                 }
2739             }
2740         }
2741         // after
2742         loop {
2743             match iter.next() {
2744                 None => {
2745                     break;
2746                 }
2747                 Some(byte) => {
2748                     match *byte {
2749                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2750                             continue;
2751                         }
2752                         _ => {
2753                             // There's no label with space in the middle
2754                             return None;
2755                         }
2756                     }
2757                 }
2758             }
2759         }
2760         let candidate = &trimmed[..trimmed_pos];
2761         match LABELS_SORTED.binary_search_by(|probe| {
2762             let bytes = probe.as_bytes();
2763             let c = bytes.len().cmp(&candidate.len());
2764             if c != Ordering::Equal {
2765                 return c;
2766             }
2767             let probe_iter = bytes.iter().rev();
2768             let candidate_iter = candidate.iter().rev();
2769             probe_iter.cmp(candidate_iter)
2770         }) {
2771             Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2772             Err(_) => None,
2773         }
2774     }
2775 
2776     /// This method behaves the same as `for_label()`, except when `for_label()`
2777     /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2778     ///
2779     /// This method is useful in scenarios where a fatal error is required
2780     /// upon invalid label, because in those cases the caller typically wishes
2781     /// to treat the labels that map to the replacement encoding as fatal
2782     /// errors, too.
2783     ///
2784     /// It is not OK to use this method when the action upon the method returning
2785     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2786     /// case, the `for_label()` method should be used instead in order to avoid
2787     /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2788     ///
2789     /// Available via the C wrapper.
2790     #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2791     pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2792         match Encoding::for_label(label) {
2793             None => None,
2794             Some(encoding) => {
2795                 if encoding == REPLACEMENT {
2796                     None
2797                 } else {
2798                     Some(encoding)
2799                 }
2800             }
2801         }
2802     }
2803 
2804     /// Performs non-incremental BOM sniffing.
2805     ///
2806     /// The argument must either be a buffer representing the entire input
2807     /// stream (non-streaming case) or a buffer representing at least the first
2808     /// three bytes of the input stream (streaming case).
2809     ///
2810     /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2811     /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2812     /// or UTF-16BE BOM or `None` otherwise.
2813     ///
2814     /// Available via the C wrapper.
2815     #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2816     pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2817         if buffer.starts_with(b"\xEF\xBB\xBF") {
2818             Some((UTF_8, 3))
2819         } else if buffer.starts_with(b"\xFF\xFE") {
2820             Some((UTF_16LE, 2))
2821         } else if buffer.starts_with(b"\xFE\xFF") {
2822             Some((UTF_16BE, 2))
2823         } else {
2824             None
2825         }
2826     }
2827 
2828     /// Returns the name of this encoding.
2829     ///
2830     /// This name is appropriate to return as-is from the DOM
2831     /// `document.characterSet` property.
2832     ///
2833     /// Available via the C wrapper.
2834     #[inline]
    pub fn name(&'static self) -> &'static str {
        // Plain accessor for the `name` field; nothing is allocated or
        // converted.
        self.name
    }
2838 
2839     /// Checks whether the _output encoding_ of this encoding can encode every
2840     /// `char`. (Only true if the output encoding is UTF-8.)
2841     ///
2842     /// Available via the C wrapper.
2843     #[inline]
    pub fn can_encode_everything(&'static self) -> bool {
        // The comparison is made on the output encoding rather than on
        // `self`, so encodings whose output encoding is UTF-8 (see
        // `output_encoding()`) also yield `true`.
        self.output_encoding() == UTF_8
    }
2847 
2848     /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2849     /// U+0000...U+007F and vice versa.
2850     ///
2851     /// Available via the C wrapper.
2852     #[inline]
is_ascii_compatible(&'static self) -> bool2853     pub fn is_ascii_compatible(&'static self) -> bool {
2854         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2855     }
2856 
2857     /// Checks whether this encoding maps one byte to one Basic Multilingual
2858     /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2859     /// vice versa (for mappable characters).
2860     ///
2861     /// `true` iff this encoding is on the list of [Legacy single-byte
2862     /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2863     /// in the spec or x-user-defined.
2864     ///
2865     /// Available via the C wrapper.
2866     #[inline]
    pub fn is_single_byte(&'static self) -> bool {
        // The answer comes from the variant backing this encoding.
        self.variant.is_single_byte()
    }
2870 
2871     /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2872     /// U+0000...U+007F and vice versa.
2873     #[inline]
is_potentially_borrowable(&'static self) -> bool2874     fn is_potentially_borrowable(&'static self) -> bool {
2875         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2876     }
2877 
2878     /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2879     /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2880     ///
2881     /// Available via the C wrapper.
2882     #[inline]
output_encoding(&'static self) -> &'static Encoding2883     pub fn output_encoding(&'static self) -> &'static Encoding {
2884         if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2885             UTF_8
2886         } else {
2887             self
2888         }
2889     }
2890 
2891     /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2892     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2893     /// entire input is available as a single buffer (i.e. the end of the
2894     /// buffer marks the end of the stream).
2895     ///
2896     /// This method implements the (non-streaming version of) the
2897     /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2898     ///
2899     /// The second item in the returned tuple is the encoding that was actually
2900     /// used (which may differ from this encoding thanks to BOM sniffing).
2901     ///
2902     /// The third item in the returned tuple indicates whether there were
2903     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2904     ///
2905     /// _Note:_ It is wrong to use this when the input buffer represents only
2906     /// a segment of the input instead of the whole input. Use `new_decoder()`
2907     /// when decoding segmented input.
2908     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if no
    /// errors and potentially another one in the presence of errors.) The
2912     /// first allocation assumes jemalloc and may not be optimal with
2913     /// allocators that do not use power-of-two buckets. A borrow is performed
2914     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2915     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2916     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2917     /// transitions.
2918     ///
2919     /// # Panics
2920     ///
2921     /// If the size calculation for a heap-allocated backing buffer overflows
2922     /// `usize`.
2923     ///
2924     /// Available to Rust only.
2925     #[inline]
decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool)2926     pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2927         let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2928             Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2929             None => (self, bytes),
2930         };
2931         let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2932         (cow, encoding, had_errors)
2933     }
2934 
2935     /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2936     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2937     /// entire input is available as a single buffer (i.e. the end of the
2938     /// buffer marks the end of the stream).
2939     ///
2940     /// When invoked on `UTF_8`, this method implements the (non-streaming
2941     /// version of) the
2942     /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2943     /// concept.
2944     ///
2945     /// The second item in the returned pair indicates whether there were
2946     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2947     ///
2948     /// _Note:_ It is wrong to use this when the input buffer represents only
2949     /// a segment of the input instead of the whole input. Use
2950     /// `new_decoder_with_bom_removal()` when decoding segmented input.
2951     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if no
    /// errors and potentially another one in the presence of errors.) The
2955     /// first allocation assumes jemalloc and may not be optimal with
2956     /// allocators that do not use power-of-two buckets. A borrow is performed
2957     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2958     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2959     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2960     /// transitions.
2961     ///
2962     /// # Panics
2963     ///
2964     /// If the size calculation for a heap-allocated backing buffer overflows
2965     /// `usize`.
2966     ///
2967     /// Available to Rust only.
2968     #[inline]
decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)2969     pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2970         let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2971             &bytes[3..]
2972         } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2973             || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2974         {
2975             &bytes[2..]
2976         } else {
2977             bytes
2978         };
2979         self.decode_without_bom_handling(without_bom)
2980     }
2981 
2982     /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
2983     /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
2984     /// the entire input is available as a single buffer (i.e. the end of the
2985     /// buffer marks the end of the stream).
2986     ///
2987     /// When invoked on `UTF_8`, this method implements the (non-streaming
2988     /// version of) the
2989     /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
2990     /// spec concept.
2991     ///
2992     /// The second item in the returned pair indicates whether there were
2993     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2994     ///
2995     /// _Note:_ It is wrong to use this when the input buffer represents only
2996     /// a segment of the input instead of the whole input. Use
2997     /// `new_decoder_without_bom_handling()` when decoding segmented input.
2998     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if no
    /// errors and potentially another one in the presence of errors.) The
3002     /// first allocation assumes jemalloc and may not be optimal with
3003     /// allocators that do not use power-of-two buckets. A borrow is performed
3004     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3005     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3006     /// ISO-2022-JP and the input is entirely in the ASCII state without state
3007     /// transitions.
3008     ///
3009     /// # Panics
3010     ///
3011     /// If the size calculation for a heap-allocated backing buffer overflows
3012     /// `usize`.
3013     ///
3014     /// Available to Rust only.
    pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
        // Set up the decoder, the output `String` and the count of input
        // bytes already accounted for. For potentially borrowable encodings
        // the longest prefix that can be taken over as-is is measured first.
        let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
            let valid_up_to = if self == UTF_8 {
                utf8_valid_up_to(bytes)
            } else if self == ISO_2022_JP {
                iso_2022_jp_ascii_valid_up_to(bytes)
            } else {
                ascii_valid_up_to(bytes)
            };
            if valid_up_to == bytes.len() {
                // Fast path: the validator chosen above accepted the whole
                // input, so it is borrowed without allocating.
                // SAFETY: relies on that validator only accepting bytes that
                // form valid UTF-8 (UTF-8 itself, ASCII, or ISO-2022-JP
                // ASCII-state data).
                let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
                return (Cow::Borrowed(str), false);
            }
            let decoder = self.new_decoder_without_bom_handling();

            // Two capacity candidates, both covering the accepted prefix plus
            // the decoder's bound for the remaining input: the
            // without-replacement bound rounded to a power of two, and the
            // with-replacement bound.
            let rounded_without_replacement = checked_next_power_of_two(checked_add(
                valid_up_to,
                decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
            ));
            let with_replacement = checked_add(
                valid_up_to,
                decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
            );
            // `unwrap()` is where the documented overflow panic happens.
            // NOTE(review): presumably `checked_min` picks the smaller
            // candidate, and `checked_next_power_of_two` (defined outside
            // this view) yields `None` rather than wrapping on overflow --
            // confirm, since the `set_len` below needs capacity of at least
            // `valid_up_to`.
            let mut string = String::with_capacity(
                checked_min(rounded_without_replacement, with_replacement).unwrap(),
            );
            // SAFETY: both capacity candidates are `valid_up_to` plus a
            // non-negative bound, so the first `valid_up_to` bytes are meant
            // to fit in the fresh allocation (see the review note above).
            // `bytes` and the new buffer are distinct allocations, and the
            // copied prefix is the part of the input the validator accepted,
            // so the `String` holds valid UTF-8 afterwards.
            unsafe {
                let vec = string.as_mut_vec();
                vec.set_len(valid_up_to);
                std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
            }
            (decoder, string, valid_up_to)
        } else {
            // replacement, UTF-16BE and UTF-16LE (see
            // `is_potentially_borrowable()`): no prefix can be taken over, so
            // decoding starts at offset 0 with an empty `String`.
            let decoder = self.new_decoder_without_bom_handling();
            let rounded_without_replacement = checked_next_power_of_two(
                decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
            );
            let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
            let string = String::with_capacity(
                checked_min(rounded_without_replacement, with_replacement).unwrap(),
            );
            (decoder, string, 0)
        };

        // Decode the rest of the input, growing the `String` if the first
        // allocation turns out to be too small.
        // NOTE(review): the trailing `true` presumably marks this as the last
        // chunk of the stream -- confirm against `Decoder::decode_to_string`.
        let mut total_had_errors = false;
        loop {
            let (result, read, had_errors) =
                decoder.decode_to_string(&bytes[total_read..], &mut string, true);
            total_read += read;
            total_had_errors |= had_errors;
            match result {
                CoderResult::InputEmpty => {
                    debug_assert_eq!(total_read, bytes.len());
                    return (Cow::Owned(string), total_had_errors);
                }
                CoderResult::OutputFull => {
                    // Allocate for the worst case. That is, we should come
                    // here at most once per invocation of this method.
                    let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
                    string.reserve(needed.unwrap());
                }
            }
        }
    }
3079 
3080     /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3081     /// _with malformed sequences treated as fatal_ when the entire input is
3082     /// available as a single buffer (i.e. the end of the buffer marks the end
3083     /// of the stream).
3084     ///
3085     /// When invoked on `UTF_8`, this method implements the (non-streaming
3086     /// version of) the
3087     /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3088     /// spec concept.
3089     ///
3090     /// Returns `None` if a malformed sequence was encountered and the result
3091     /// of the decode as `Some(String)` otherwise.
3092     ///
3093     /// _Note:_ It is wrong to use this when the input buffer represents only
3094     /// a segment of the input instead of the whole input. Use
3095     /// `new_decoder_without_bom_handling()` when decoding segmented input.
3096     ///
3097     /// This method performs a single heap allocation for the backing
3098     /// buffer of the `String` when unable to borrow. A borrow is performed if
3099     /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3100     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3101     /// ISO-2022-JP and the input is entirely in the ASCII state without state
3102     /// transitions.
3103     ///
3104     /// # Panics
3105     ///
3106     /// If the size calculation for a heap-allocated backing buffer overflows
3107     /// `usize`.
3108     ///
3109     /// Available to Rust only.
    pub fn decode_without_bom_handling_and_without_replacement<'a>(
        &'static self,
        bytes: &'a [u8],
    ) -> Option<Cow<'a, str>> {
        // For UTF-8, validating is all that is needed: fully valid input is
        // borrowed and anything else is a failure, so no decoder is created.
        if self == UTF_8 {
            let valid_up_to = utf8_valid_up_to(bytes);
            if valid_up_to == bytes.len() {
                // SAFETY: the whole slice passed `utf8_valid_up_to`.
                let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
                return Some(Cow::Borrowed(str));
            }
            return None;
        }
        // Set up the decoder, the output `String` and the part of the input
        // that still has to go through the decoder.
        let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
            let valid_up_to = if self == ISO_2022_JP {
                iso_2022_jp_ascii_valid_up_to(bytes)
            } else {
                ascii_valid_up_to(bytes)
            };
            if valid_up_to == bytes.len() {
                // SAFETY: relies on the validator chosen above only accepting
                // bytes that are valid UTF-8 (ASCII or ISO-2022-JP
                // ASCII-state data).
                let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
                return Some(Cow::Borrowed(str));
            }
            let decoder = self.new_decoder_without_bom_handling();
            // Room for the accepted prefix plus the decoder's bound for the
            // remaining input; `unwrap()` is where the documented overflow
            // panic happens.
            let mut string = String::with_capacity(
                checked_add(
                    valid_up_to,
                    decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
                )
                .unwrap(),
            );
            // SAFETY: the capacity requested above is at least `valid_up_to`,
            // `bytes` and the new buffer are distinct allocations, and the
            // copied prefix is the part of the input the validator accepted,
            // so the `String` holds valid UTF-8 afterwards.
            unsafe {
                let vec = string.as_mut_vec();
                vec.set_len(valid_up_to);
                std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
            }
            (decoder, string, &bytes[valid_up_to..])
        } else {
            // replacement, UTF-16BE and UTF-16LE (see
            // `is_potentially_borrowable()`): the whole input goes through
            // the decoder.
            let decoder = self.new_decoder_without_bom_handling();
            let string = String::with_capacity(
                decoder
                    .max_utf8_buffer_length_without_replacement(bytes.len())
                    .unwrap(),
            );
            (decoder, string, bytes)
        };
        // A single decoder call suffices because the buffer was sized up
        // front for all of `input`.
        let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
        match result {
            DecoderResult::InputEmpty => {
                debug_assert_eq!(read, input.len());
                Some(Cow::Owned(string))
            }
            // Any malformed sequence makes the whole decode fail.
            DecoderResult::Malformed(_, _) => None,
            // NOTE(review): treated as impossible on the assumption that
            // `max_utf8_buffer_length_without_replacement` is a true upper
            // bound for the output -- confirm against the decoder.
            DecoderResult::OutputFull => unreachable!(),
        }
    }
3165 
3166     /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3167     /// replaced with decimal numeric character references when the entire input
3168     /// is available as a single buffer (i.e. the end of the buffer marks the
3169     /// end of the stream).
3170     ///
3171     /// This method implements the (non-streaming version of) the
3172     /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3173     /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3174     /// spec concept, it is slightly more efficient to use
3175     /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3176     /// method on `UTF_8`.
3177     ///
3178     /// The second item in the returned tuple is the encoding that was actually
3179     /// used (which may differ from this encoding thanks to some encodings
3180     /// having UTF-8 as their output encoding).
3181     ///
3182     /// The third item in the returned tuple indicates whether there were
3183     /// unmappable characters (that were replaced with HTML numeric character
3184     /// references).
3185     ///
3186     /// _Note:_ It is wrong to use this when the input buffer represents only
3187     /// a segment of the input instead of the whole input. Use `new_encoder()`
3188     /// when encoding segmented output.
3189     ///
    /// When encoding to UTF-8 or when encoding an ASCII-only input to an
    /// ASCII-compatible encoding, this method returns a borrow of the input
3192     /// without a heap allocation. Otherwise, this method performs a single
3193     /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3194     /// unmappable characters and potentially multiple heap allocations if
3195     /// there are. These allocations are tuned for jemalloc and may not be
3196     /// optimal when using a different allocator that doesn't use power-of-two
3197     /// buckets.
3198     ///
3199     /// # Panics
3200     ///
3201     /// If the size calculation for a heap-allocated backing buffer overflows
3202     /// `usize`.
3203     ///
3204     /// Available to Rust only.
encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool)3205     pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3206         let output_encoding = self.output_encoding();
3207         if output_encoding == UTF_8 {
3208             return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3209         }
3210         debug_assert!(output_encoding.is_potentially_borrowable());
3211         let bytes = string.as_bytes();
3212         let valid_up_to = if output_encoding == ISO_2022_JP {
3213             iso_2022_jp_ascii_valid_up_to(bytes)
3214         } else {
3215             ascii_valid_up_to(bytes)
3216         };
3217         if valid_up_to == bytes.len() {
3218             return (Cow::Borrowed(bytes), output_encoding, false);
3219         }
3220         let mut encoder = output_encoding.new_encoder();
3221         let mut vec: Vec<u8> = Vec::with_capacity(
3222             (checked_add(
3223                 valid_up_to,
3224                 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3225             ))
3226             .unwrap()
3227             .next_power_of_two(),
3228         );
3229         unsafe {
3230             vec.set_len(valid_up_to);
3231             std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3232         }
3233         let mut total_read = valid_up_to;
3234         let mut total_had_errors = false;
3235         loop {
3236             let (result, read, had_errors) =
3237                 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3238             total_read += read;
3239             total_had_errors |= had_errors;
3240             match result {
3241                 CoderResult::InputEmpty => {
3242                     debug_assert_eq!(total_read, string.len());
3243                     return (Cow::Owned(vec), output_encoding, total_had_errors);
3244                 }
3245                 CoderResult::OutputFull => {
3246                     // reserve_exact wants to know how much more on top of current
3247                     // length--not current capacity.
3248                     let needed = encoder
3249                         .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3250                     let rounded = (checked_add(vec.capacity(), needed))
3251                         .unwrap()
3252                         .next_power_of_two();
3253                     let additional = rounded - vec.len();
3254                     vec.reserve_exact(additional);
3255                 }
3256             }
3257         }
3258     }
3259 
    // Private helper shared by the `new_decoder*()` constructors: asks this
    // encoding's variant for a `VariantDecoder`.
    fn new_variant_decoder(&'static self) -> VariantDecoder {
        self.variant.new_variant_decoder()
    }
3263 
3264     /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3265     ///
3266     /// BOM sniffing may cause the returned decoder to morph into a decoder
3267     /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3268     ///
3269     /// Available via the C wrapper.
3270     #[inline]
    pub fn new_decoder(&'static self) -> Decoder {
        // Identical to the other `new_decoder*()` constructors except for
        // the BOM handling mode, which is `Sniff` here.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
    }
3274 
3275     /// Instantiates a new decoder for this encoding with BOM removal.
3276     ///
3277     /// If the input starts with bytes that are the BOM for this encoding,
3278     /// those bytes are removed. However, the decoder never morphs into a
3279     /// decoder for another encoding: A BOM for another encoding is treated as
3280     /// (potentially malformed) input to the decoding algorithm for this
3281     /// encoding.
3282     ///
3283     /// Available via the C wrapper.
3284     #[inline]
    pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
        // Identical to the other `new_decoder*()` constructors except for
        // the BOM handling mode, which is `Remove` here.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
    }
3288 
3289     /// Instantiates a new decoder for this encoding with BOM handling disabled.
3290     ///
3291     /// If the input starts with bytes that look like a BOM, those bytes are
3292     /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3293     /// for another encoding.)
3294     ///
3295     /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3296     /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3297     /// instead of this method to cause the BOM to be removed.
3298     ///
3299     /// Available via the C wrapper.
3300     #[inline]
    pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
        // Identical to the other `new_decoder*()` constructors except for
        // the BOM handling mode, which is `Off` here.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
    }
3304 
3305     /// Instantiates a new encoder for the output encoding of this encoding.
3306     ///
3307     /// Available via the C wrapper.
3308     #[inline]
    pub fn new_encoder(&'static self) -> Encoder {
        // Both the variant used to build the encoder and the encoding handed
        // to it are those of the output encoding, not of `self`.
        let enc = self.output_encoding();
        enc.variant.new_encoder(enc)
    }
3313 
3314     /// Validates UTF-8.
3315     ///
3316     /// Returns the index of the first byte that makes the input malformed as
3317     /// UTF-8 or the length of the slice if the slice is entirely valid.
3318     ///
3319     /// This is currently faster than the corresponding standard library
3320     /// functionality. If this implementation gets upstreamed to the standard
3321     /// library, this method may be removed in the future.
3322     ///
3323     /// Available via the C wrapper.
    pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
        // Not recursion: an unqualified path inside an `impl` does not
        // resolve to an associated function, so this calls the free
        // function of the same name that is in scope in this module.
        utf8_valid_up_to(bytes)
    }
3327 
3328     /// Validates ASCII.
3329     ///
3330     /// Returns the index of the first byte that makes the input malformed as
3331     /// ASCII or the length of the slice if the slice is entirely valid.
3332     ///
3333     /// Available via the C wrapper.
    pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
        // Not recursion: an unqualified path inside an `impl` does not
        // resolve to an associated function, so this calls the free
        // function of the same name that is in scope in this module.
        ascii_valid_up_to(bytes)
    }
3337 
    /// Validates ISO-2022-JP ASCII-state data.
    ///
    /// Returns the index of the first byte that makes the input not
    /// representable in the ASCII state of ISO-2022-JP or the length of the
    /// slice if the slice is entirely representable in the ASCII state of
    /// ISO-2022-JP.
    ///
    /// This is an associated function (it takes no `self`); it forwards
    /// `bytes` unchanged to the free function of the same name that is in
    /// scope in this module.
    ///
    /// Available via the C wrapper.
    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
        iso_2022_jp_ascii_valid_up_to(bytes)
    }
3349 }
3350 
3351 impl PartialEq for Encoding {
3352     #[inline]
eq(&self, other: &Encoding) -> bool3353     fn eq(&self, other: &Encoding) -> bool {
3354         (self as *const Encoding) == (other as *const Encoding)
3355     }
3356 }
3357 
3358 impl Eq for Encoding {}
3359 
3360 impl Hash for Encoding {
3361     #[inline]
hash<H: Hasher>(&self, state: &mut H)3362     fn hash<H: Hasher>(&self, state: &mut H) {
3363         (self as *const Encoding).hash(state);
3364     }
3365 }
3366 
3367 impl std::fmt::Debug for Encoding {
3368     #[inline]
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result3369     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
3370         write!(f, "Encoding {{ {} }}", self.name)
3371     }
3372 }
3373 
/// Serializes an `Encoding` as a single string holding its `name` field.
///
/// Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.serialize_str(self.name)
    }
}
3384 
/// Stateless serde visitor used when deserializing a `&'static Encoding`
/// from a string.
///
/// Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3387 
/// Turns a string handed over by a deserializer into a `&'static Encoding`
/// by looking the string up with `Encoding::for_label`.
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Resolves `value` as an encoding label; a string for which
    /// `Encoding::for_label` finds nothing becomes a custom serde error.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format!("invalid encoding label: {}", value)))
    }
}
3407 
/// Deserializes a `&'static Encoding` by requesting a string from the
/// deserializer and handing it to `EncodingVisitor`.
///
/// Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_str(EncodingVisitor)
    }
}
3417 
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// `Decoder::new` picks the initial state from the requested `BomHandling`
/// mode and, for `BomHandling::Remove`, from the encoding.
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet. This is the initial state for
    /// `BomHandling::Sniff`.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8. This is the
    /// initial state for `BomHandling::Remove` when the encoding is UTF-8.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE. This is the
    /// initial state for `BomHandling::Remove` when the encoding is UTF-16BE.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE. This is the
    /// initial state for `BomHandling::Remove` when the encoding is UTF-16LE.
    AtUtf16LeStart,
    /// The decoder has seen EF (the first byte of the UTF-8 BOM).
    SeenUtf8First,
    /// The decoder has seen EF, BB (the first two bytes of the UTF-8 BOM).
    SeenUtf8Second,
    /// The decoder has seen FE (the first byte of the UTF-16BE BOM).
    SeenUtf16BeFirst,
    /// The decoder has seen FF (the first byte of the UTF-16LE BOM).
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen. This is also the
    /// initial state for `BomHandling::Off` and for `BomHandling::Remove`
    /// with an encoding other than UTF-8, UTF-16BE or UTF-16LE.
    Converting,
    /// EOF has been seen. The buffer length query methods of `Decoder` panic
    /// in this state.
    Finished,
}
3446 
/// Communicates the BOM handling mode to `Decoder::new`, which uses it to
/// choose the initial `DecoderLifeCycle` state of the decoder.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM. The decoder starts out in the `Converting`
    /// state.
    Off,
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM. The decoder starts out in
    /// the `AtStart` state.
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding. The decoder
    /// starts out in the encoding-specific start state for UTF-8, UTF-16BE
    /// and UTF-16LE and in the `Converting` state for every other encoding.
    Remove,
}
3457 
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// Unlike `DecoderResult`, this type has no variant for malformed input,
/// because the methods returning it replace malformed sequences instead of
/// reporting them.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3477 
/// Result of a (potentially partial) decode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3514 
3515 /// A converter that decodes a byte stream into Unicode according to a
3516 /// character encoding in a streaming (incremental) manner.
3517 ///
3518 /// The various `decode_*` methods take an input buffer (`src`) and an output
3519 /// buffer `dst` both of which are caller-allocated. There are variants for
3520 /// both UTF-8 and UTF-16 output buffers.
3521 ///
3522 /// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3523 /// into `dst` until one of the following three things happens:
3524 ///
3525 /// 1. A malformed byte sequence is encountered (`*_without_replacement`
3526 ///    variants only).
3527 ///
3528 /// 2. The output buffer has been filled so near capacity that the decoder
3529 ///    cannot be sure that processing an additional byte of input wouldn't
3530 ///    cause so much output that the output buffer would overflow.
3531 ///
3532 /// 3. All the input bytes have been processed.
3533 ///
/// The `decode_*` method then returns a tuple of a status indicating which one
3535 /// of the three reasons to return happened, how many input bytes were read,
3536 /// how many output code units (`u8` when decoding into UTF-8 and `u16`
3537 /// when decoding to UTF-16) were written (except when decoding into `String`,
3538 /// whose length change indicates this), and in the case of the
3539 /// variants performing replacement, a boolean indicating whether an error was
3540 /// replaced with the REPLACEMENT CHARACTER during the call.
3541 ///
3542 /// The number of bytes "written" is what's logically written. Garbage may be
3543 /// written in the output buffer beyond the point logically written to.
3544 /// Therefore, if you wish to decode into an `&mut str`, you should use the
3545 /// methods that take an `&mut str` argument instead of the ones that take an
3546 /// `&mut [u8]` argument. The former take care of overwriting the trailing
3547 /// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3548 /// latter don't.
3549 ///
3550 /// In the case of the `*_without_replacement` variants, the status is a
3551 /// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3552 /// `InputEmpty` corresponding to the three cases listed above).
3553 ///
3554 /// In the case of methods whose name does not end with
3555 /// `*_without_replacement`, malformed sequences are automatically replaced
3556 /// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3557 /// return early.
3558 ///
3559 /// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3560 /// space. When decoding to UTF-16, the output buffer must have at least two
3561 /// UTF-16 code units (`u16`) of space.
3562 ///
3563 /// When decoding to UTF-8 without replacement, the methods are guaranteed
3564 /// not to return indicating that more output space is needed if the length
3565 /// of the output buffer is at least the length returned by
3566 /// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3567 /// with replacement, the length of the output buffer that guarantees the
3568 /// methods not to return indicating that more output space is needed is given
3569 /// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3570 /// or without replacement, the length of the output buffer that guarantees
3571 /// the methods not to return indicating that more output space is needed is
3572 /// given by [`max_utf16_buffer_length()`][4].
3573 ///
3574 /// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3575 /// and the output after each `decode_*` call is guaranteed to consist of
3576 /// complete characters. (I.e. the code unit sequence for the last character is
3577 /// guaranteed not to be split across output buffers.)
3578 ///
3579 /// The boolean argument `last` indicates that the end of the stream is reached
3580 /// when all the bytes in `src` have been consumed.
3581 ///
3582 /// A `Decoder` object can be used to incrementally decode a byte stream.
3583 ///
3584 /// During the processing of a single stream, the caller must call `decode_*`
3585 /// zero or more times with `last` set to `false` and then call `decode_*` at
3586 /// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3587 /// the processing of the stream has ended. Otherwise, the caller must call
/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
/// a fatal error).
3590 ///
3591 /// Once the stream has ended, the `Decoder` object must not be used anymore.
3592 /// That is, you need to create another one to process another stream.
3593 ///
3594 /// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3595 /// the caller does not wish to treat it as a fatal error, the input buffer
3596 /// `src` may not have been completely consumed. In that case, the caller must
3597 /// pass the unconsumed contents of `src` to `decode_*` again upon the next
3598 /// call.
3599 ///
3600 /// [1]: enum.DecoderResult.html
3601 /// [2]: #method.max_utf8_buffer_length_without_replacement
3602 /// [3]: #method.max_utf8_buffer_length
3603 /// [4]: #method.max_utf16_buffer_length
3604 ///
3605 /// # Infinite loops
3606 ///
3607 /// When converting with a fixed-size output buffer whose size is too small to
3608 /// accommodate one character or (when applicable) one numeric character
3609 /// reference of output, an infinite loop ensues. When converting with a
3610 /// fixed-size output buffer, it generally makes sense to make the buffer
3611 /// fairly large (e.g. couple of kilobytes).
pub struct Decoder {
    /// The encoding reported by `encoding()`. BOM sniffing can change it
    /// during the life of the decoder.
    encoding: &'static Encoding,
    /// The encoding-specific decoder that the buffer length queries consult
    /// for the worst-case output size.
    variant: VariantDecoder,
    /// Where the decoder is between BOM handling, conversion and the end of
    /// the stream.
    life_cycle: DecoderLifeCycle,
}
3617 
3618 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3619     fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3620         Decoder {
3621             encoding: enc,
3622             variant: decoder,
3623             life_cycle: match sniffing {
3624                 BomHandling::Off => DecoderLifeCycle::Converting,
3625                 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3626                 BomHandling::Remove => {
3627                     if enc == UTF_8 {
3628                         DecoderLifeCycle::AtUtf8Start
3629                     } else if enc == UTF_16BE {
3630                         DecoderLifeCycle::AtUtf16BeStart
3631                     } else if enc == UTF_16LE {
3632                         DecoderLifeCycle::AtUtf16LeStart
3633                     } else {
3634                         DecoderLifeCycle::Converting
3635                     }
3636                 }
3637             },
3638         }
3639     }
3640 
    /// The `Encoding` this `Decoder` is for.
    ///
    /// Returns the current value of the decoder's `encoding` field.
    ///
    /// BOM sniffing can change the return value of this method during the life
    /// of the decoder.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
3651 
3652     /// Query the worst-case UTF-8 output size _with replacement_.
3653     ///
3654     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3655     /// that will not overflow given the current state of the decoder and
3656     /// `byte_length` number of additional input bytes when decoding with
3657     /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3658     /// sequence or `None` if `usize` would overflow.
3659     ///
3660     /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3661     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3662         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3663         // BOM getting pushed to the underlying decoder.
3664         match self.life_cycle {
3665             DecoderLifeCycle::Converting
3666             | DecoderLifeCycle::AtUtf8Start
3667             | DecoderLifeCycle::AtUtf16LeStart
3668             | DecoderLifeCycle::AtUtf16BeStart => {
3669                 return self.variant.max_utf8_buffer_length(byte_length);
3670             }
3671             DecoderLifeCycle::AtStart => {
3672                 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3673                     if let Some(utf16_bom) = checked_add(
3674                         1,
3675                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3676                     ) {
3677                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3678                         let encoding = self.encoding();
3679                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3680                             // No need to consider the internal state of the underlying decoder,
3681                             // because it is at start, because no data has reached it yet.
3682                             return Some(utf_bom);
3683                         } else if let Some(non_bom) =
3684                             self.variant.max_utf8_buffer_length(byte_length)
3685                         {
3686                             return Some(std::cmp::max(utf_bom, non_bom));
3687                         }
3688                     }
3689                 }
3690             }
3691             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3692                 // Add two bytes even when only one byte has been seen,
3693                 // because the one byte can become a lead byte in multibyte
3694                 // decoders, but only after the decoder has been queried
3695                 // for max length, so the decoder's own logic for adding
3696                 // one for a pending lead cannot work.
3697                 if let Some(sum) = byte_length.checked_add(2) {
3698                     if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3699                         if self.encoding() == UTF_8 {
3700                             // No need to consider the internal state of the underlying decoder,
3701                             // because it is at start, because no data has reached it yet.
3702                             return Some(utf8_bom);
3703                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3704                             return Some(std::cmp::max(utf8_bom, non_bom));
3705                         }
3706                     }
3707                 }
3708             }
3709             DecoderLifeCycle::ConvertingWithPendingBB => {
3710                 if let Some(sum) = byte_length.checked_add(2) {
3711                     return self.variant.max_utf8_buffer_length(sum);
3712                 }
3713             }
3714             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3715                 // Add two bytes even when only one byte has been seen,
3716                 // because the one byte can become a lead byte in multibyte
3717                 // decoders, but only after the decoder has been queried
3718                 // for max length, so the decoder's own logic for adding
3719                 // one for a pending lead cannot work.
3720                 if let Some(sum) = byte_length.checked_add(2) {
3721                     if let Some(utf16_bom) =
3722                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3723                     {
3724                         let encoding = self.encoding();
3725                         if encoding == UTF_16LE || encoding == UTF_16BE {
3726                             // No need to consider the internal state of the underlying decoder,
3727                             // because it is at start, because no data has reached it yet.
3728                             return Some(utf16_bom);
3729                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3730                             return Some(std::cmp::max(utf16_bom, non_bom));
3731                         }
3732                     }
3733                 }
3734             }
3735             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3736         }
3737         None
3738     }
3739 
3740     /// Query the worst-case UTF-8 output size _without replacement_.
3741     ///
3742     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3743     /// that will not overflow given the current state of the decoder and
3744     /// `byte_length` number of additional input bytes when decoding without
3745     /// replacement error handling or `None` if `usize` would overflow.
3746     ///
3747     /// Note that this value may be too small for the `_with_replacement` case.
3748     /// Use `max_utf8_buffer_length()` for that case.
3749     ///
3750     /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3751     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3752         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3753         // BOM getting pushed to the underlying decoder.
3754         match self.life_cycle {
3755             DecoderLifeCycle::Converting
3756             | DecoderLifeCycle::AtUtf8Start
3757             | DecoderLifeCycle::AtUtf16LeStart
3758             | DecoderLifeCycle::AtUtf16BeStart => {
3759                 return self
3760                     .variant
3761                     .max_utf8_buffer_length_without_replacement(byte_length);
3762             }
3763             DecoderLifeCycle::AtStart => {
3764                 if let Some(utf8_bom) = byte_length.checked_add(3) {
3765                     if let Some(utf16_bom) = checked_add(
3766                         1,
3767                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3768                     ) {
3769                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3770                         let encoding = self.encoding();
3771                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3772                             // No need to consider the internal state of the underlying decoder,
3773                             // because it is at start, because no data has reached it yet.
3774                             return Some(utf_bom);
3775                         } else if let Some(non_bom) = self
3776                             .variant
3777                             .max_utf8_buffer_length_without_replacement(byte_length)
3778                         {
3779                             return Some(std::cmp::max(utf_bom, non_bom));
3780                         }
3781                     }
3782                 }
3783             }
3784             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3785                 // Add two bytes even when only one byte has been seen,
3786                 // because the one byte can become a lead byte in multibyte
3787                 // decoders, but only after the decoder has been queried
3788                 // for max length, so the decoder's own logic for adding
3789                 // one for a pending lead cannot work.
3790                 if let Some(sum) = byte_length.checked_add(2) {
3791                     if let Some(utf8_bom) = sum.checked_add(3) {
3792                         if self.encoding() == UTF_8 {
3793                             // No need to consider the internal state of the underlying decoder,
3794                             // because it is at start, because no data has reached it yet.
3795                             return Some(utf8_bom);
3796                         } else if let Some(non_bom) =
3797                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3798                         {
3799                             return Some(std::cmp::max(utf8_bom, non_bom));
3800                         }
3801                     }
3802                 }
3803             }
3804             DecoderLifeCycle::ConvertingWithPendingBB => {
3805                 if let Some(sum) = byte_length.checked_add(2) {
3806                     return self.variant.max_utf8_buffer_length_without_replacement(sum);
3807                 }
3808             }
3809             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3810                 // Add two bytes even when only one byte has been seen,
3811                 // because the one byte can become a lead byte in multibyte
3812                 // decoders, but only after the decoder has been queried
3813                 // for max length, so the decoder's own logic for adding
3814                 // one for a pending lead cannot work.
3815                 if let Some(sum) = byte_length.checked_add(2) {
3816                     if let Some(utf16_bom) =
3817                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3818                     {
3819                         let encoding = self.encoding();
3820                         if encoding == UTF_16LE || encoding == UTF_16BE {
3821                             // No need to consider the internal state of the underlying decoder,
3822                             // because it is at start, because no data has reached it yet.
3823                             return Some(utf16_bom);
3824                         } else if let Some(non_bom) =
3825                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3826                         {
3827                             return Some(std::cmp::max(utf16_bom, non_bom));
3828                         }
3829                     }
3830                 }
3831             }
3832             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3833         }
3834         None
3835     }
3836 
3837     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3838     /// replaced with the REPLACEMENT CHARACTER.
3839     ///
3840     /// See the documentation of the struct for documentation for `decode_*`
3841     /// methods collectively.
3842     ///
3843     /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3844     pub fn decode_to_utf8(
3845         &mut self,
3846         src: &[u8],
3847         dst: &mut [u8],
3848         last: bool,
3849     ) -> (CoderResult, usize, usize, bool) {
3850         let mut had_errors = false;
3851         let mut total_read = 0usize;
3852         let mut total_written = 0usize;
3853         loop {
3854             let (result, read, written) = self.decode_to_utf8_without_replacement(
3855                 &src[total_read..],
3856                 &mut dst[total_written..],
3857                 last,
3858             );
3859             total_read += read;
3860             total_written += written;
3861             match result {
3862                 DecoderResult::InputEmpty => {
3863                     return (
3864                         CoderResult::InputEmpty,
3865                         total_read,
3866                         total_written,
3867                         had_errors,
3868                     );
3869                 }
3870                 DecoderResult::OutputFull => {
3871                     return (
3872                         CoderResult::OutputFull,
3873                         total_read,
3874                         total_written,
3875                         had_errors,
3876                     );
3877                 }
3878                 DecoderResult::Malformed(_, _) => {
3879                     had_errors = true;
3880                     // There should always be space for the U+FFFD, because
3881                     // otherwise we'd have gotten OutputFull already.
3882                     // XXX: is the above comment actually true for UTF-8 itself?
3883                     // TODO: Consider having fewer bound checks here.
3884                     dst[total_written] = 0xEFu8;
3885                     total_written += 1;
3886                     dst[total_written] = 0xBFu8;
3887                     total_written += 1;
3888                     dst[total_written] = 0xBDu8;
3889                     total_written += 1;
3890                 }
3891             }
3892         }
3893     }
3894 
3895     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3896     /// replaced with the REPLACEMENT CHARACTER with type system signaling
3897     /// of UTF-8 validity.
3898     ///
3899     /// This methods calls `decode_to_utf8` and then zeroes
3900     /// out up to three bytes that aren't logically part of the write in order
3901     /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3902     ///
3903     /// See the documentation of the struct for documentation for `decode_*`
3904     /// methods collectively.
3905     ///
3906     /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3907     pub fn decode_to_str(
3908         &mut self,
3909         src: &[u8],
3910         dst: &mut str,
3911         last: bool,
3912     ) -> (CoderResult, usize, usize, bool) {
3913         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3914         let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3915         let len = bytes.len();
3916         let mut trail = written;
3917         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3918         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3919         // encodings to avoid overwriting here.
3920         if self.encoding != UTF_8 {
3921             let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3922             while trail < max {
3923                 bytes[trail] = 0;
3924                 trail += 1;
3925             }
3926         }
3927         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3928             bytes[trail] = 0;
3929             trail += 1;
3930         }
3931         (result, read, written, replaced)
3932     }
3933 
    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
    /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
    ///
    /// Like the others, this method follows the logic that the output buffer is
    /// caller-allocated. This method treats the capacity of the `String` as
    /// the output limit. That is, this method guarantees not to cause a
    /// reallocation of the backing buffer of `String`.
    ///
    /// The return value is a tuple that contains the `CoderResult`, the
    /// number of bytes read and a boolean indicating whether replacements
    /// were done. The number of bytes written is signaled via the length of
    /// the `String` changing.
    ///
    /// See the documentation of the struct for documentation for `decode_*`
    /// methods collectively.
    ///
    /// Available to Rust only.
    pub fn decode_to_string(
        &mut self,
        src: &[u8],
        dst: &mut String,
        last: bool,
    ) -> (CoderResult, usize, bool) {
        // The `unsafe` block covers both bypassing `String`'s UTF-8 invariant
        // via `as_mut_vec()` and the two `set_len()` calls.
        // NOTE(review): between the two `set_len()` calls the vector's length
        // covers spare capacity that this method has not written, and a panic
        // inside `decode_to_utf8` would leave `dst` in that state -- confirm
        // that this is acceptable.
        unsafe {
            let vec = dst.as_mut_vec();
            let old_len = vec.len();
            let capacity = vec.capacity();
            // Temporarily extend the length to the full capacity so that the
            // spare capacity can be passed as the output slice.
            vec.set_len(capacity);
            let (result, read, written, replaced) =
                self.decode_to_utf8(src, &mut vec[old_len..], last);
            // Shrink back so that only the old contents plus the bytes that
            // were logically written remain part of the `String`.
            vec.set_len(old_len + written);
            (result, read, replaced)
        }
    }
3968 
    // Invokes `public_decode_function!` with the doc comment below, a list of
    // method names and the output code unit type `u8`. Elsewhere in this impl,
    // `decode_to_utf8_without_replacement` is called as
    // `(src, dst, last) -> (DecoderResult, usize, usize)`.
    // NOTE(review): the macro definition is not visible here; presumably the
    // first name is the public method being defined and the remaining names
    // are helper methods it dispatches to -- confirm against the macro.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf8_without_replacement,
                            decode_to_utf8_raw,
                            decode_to_utf8_checking_end,
                            decode_to_utf8_after_one_potential_bom_byte,
                            decode_to_utf8_after_two_potential_bom_bytes,
                            decode_to_utf8_checking_end_with_offset,
                            u8);
3985 
3986     /// Incrementally decode a byte stream into UTF-8 with type system signaling
3987     /// of UTF-8 validity.
3988     ///
3989     /// This methods calls `decode_to_utf8` and then zeroes out up to three
3990     /// bytes that aren't logically part of the write in order to retain the
3991     /// UTF-8 validity even for the unwritten part of the buffer.
3992     ///
3993     /// See the documentation of the struct for documentation for `decode_*`
3994     /// methods collectively.
3995     ///
3996     /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)3997     pub fn decode_to_str_without_replacement(
3998         &mut self,
3999         src: &[u8],
4000         dst: &mut str,
4001         last: bool,
4002     ) -> (DecoderResult, usize, usize) {
4003         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4004         let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4005         let len = bytes.len();
4006         let mut trail = written;
4007         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4008         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4009         // encodings to avoid overwriting here.
4010         if self.encoding != UTF_8 {
4011             let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4012             while trail < max {
4013                 bytes[trail] = 0;
4014                 trail += 1;
4015             }
4016         }
4017         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4018             bytes[trail] = 0;
4019             trail += 1;
4020         }
4021         (result, read, written)
4022     }
4023 
4024     /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4025     ///
4026     /// Like the others, this method follows the logic that the output buffer is
4027     /// caller-allocated. This method treats the capacity of the `String` as
4028     /// the output limit. That is, this method guarantees not to cause a
4029     /// reallocation of the backing buffer of `String`.
4030     ///
4031     /// The return value is a pair that contains the `DecoderResult` and the
4032     /// number of bytes read. The number of bytes written is signaled via
4033     /// the length of the `String` changing.
4034     ///
4035     /// See the documentation of the struct for documentation for `decode_*`
4036     /// methods collectively.
4037     ///
4038     /// Available to Rust only.
decode_to_string_without_replacement( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (DecoderResult, usize)4039     pub fn decode_to_string_without_replacement(
4040         &mut self,
4041         src: &[u8],
4042         dst: &mut String,
4043         last: bool,
4044     ) -> (DecoderResult, usize) {
4045         unsafe {
4046             let vec = dst.as_mut_vec();
4047             let old_len = vec.len();
4048             let capacity = vec.capacity();
4049             vec.set_len(capacity);
4050             let (result, read, written) =
4051                 self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4052             vec.set_len(old_len + written);
4053             (result, read)
4054         }
4055     }
4056 
4057     /// Query the worst-case UTF-16 output size (with or without replacement).
4058     ///
4059     /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4060     /// that will not overflow given the current state of the decoder and
4061     /// `byte_length` number of additional input bytes or `None` if `usize`
4062     /// would overflow.
4063     ///
4064     /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4065     /// return value of this method applies also in the
4066     /// `_without_replacement` case.
4067     ///
4068     /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4069     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4070         // Need to consider a) the decoder morphing due to the BOM and b) a partial
4071         // BOM getting pushed to the underlying decoder.
4072         match self.life_cycle {
4073             DecoderLifeCycle::Converting
4074             | DecoderLifeCycle::AtUtf8Start
4075             | DecoderLifeCycle::AtUtf16LeStart
4076             | DecoderLifeCycle::AtUtf16BeStart => {
4077                 return self.variant.max_utf16_buffer_length(byte_length);
4078             }
4079             DecoderLifeCycle::AtStart => {
4080                 if let Some(utf8_bom) = byte_length.checked_add(1) {
4081                     if let Some(utf16_bom) =
4082                         checked_add(1, checked_div(byte_length.checked_add(1), 2))
4083                     {
4084                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
4085                         let encoding = self.encoding();
4086                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4087                             // No need to consider the internal state of the underlying decoder,
4088                             // because it is at start, because no data has reached it yet.
4089                             return Some(utf_bom);
4090                         } else if let Some(non_bom) =
4091                             self.variant.max_utf16_buffer_length(byte_length)
4092                         {
4093                             return Some(std::cmp::max(utf_bom, non_bom));
4094                         }
4095                     }
4096                 }
4097             }
4098             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4099                 // Add two bytes even when only one byte has been seen,
4100                 // because the one byte can become a lead byte in multibyte
4101                 // decoders, but only after the decoder has been queried
4102                 // for max length, so the decoder's own logic for adding
4103                 // one for a pending lead cannot work.
4104                 if let Some(sum) = byte_length.checked_add(2) {
4105                     if let Some(utf8_bom) = sum.checked_add(1) {
4106                         if self.encoding() == UTF_8 {
4107                             // No need to consider the internal state of the underlying decoder,
4108                             // because it is at start, because no data has reached it yet.
4109                             return Some(utf8_bom);
4110                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4111                             return Some(std::cmp::max(utf8_bom, non_bom));
4112                         }
4113                     }
4114                 }
4115             }
4116             DecoderLifeCycle::ConvertingWithPendingBB => {
4117                 if let Some(sum) = byte_length.checked_add(2) {
4118                     return self.variant.max_utf16_buffer_length(sum);
4119                 }
4120             }
4121             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4122                 // Add two bytes even when only one byte has been seen,
4123                 // because the one byte can become a lead byte in multibyte
4124                 // decoders, but only after the decoder has been queried
4125                 // for max length, so the decoder's own logic for adding
4126                 // one for a pending lead cannot work.
4127                 if let Some(sum) = byte_length.checked_add(2) {
4128                     if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4129                         let encoding = self.encoding();
4130                         if encoding == UTF_16LE || encoding == UTF_16BE {
4131                             // No need to consider the internal state of the underlying decoder,
4132                             // because it is at start, because no data has reached it yet.
4133                             return Some(utf16_bom);
4134                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4135                             return Some(std::cmp::max(utf16_bom, non_bom));
4136                         }
4137                     }
4138                 }
4139             }
4140             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4141         }
4142         None
4143     }
4144 
4145     /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4146     /// replaced with the REPLACEMENT CHARACTER.
4147     ///
4148     /// See the documentation of the struct for documentation for `decode_*`
4149     /// methods collectively.
4150     ///
4151     /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4152     pub fn decode_to_utf16(
4153         &mut self,
4154         src: &[u8],
4155         dst: &mut [u16],
4156         last: bool,
4157     ) -> (CoderResult, usize, usize, bool) {
4158         let mut had_errors = false;
4159         let mut total_read = 0usize;
4160         let mut total_written = 0usize;
4161         loop {
4162             let (result, read, written) = self.decode_to_utf16_without_replacement(
4163                 &src[total_read..],
4164                 &mut dst[total_written..],
4165                 last,
4166             );
4167             total_read += read;
4168             total_written += written;
4169             match result {
4170                 DecoderResult::InputEmpty => {
4171                     return (
4172                         CoderResult::InputEmpty,
4173                         total_read,
4174                         total_written,
4175                         had_errors,
4176                     );
4177                 }
4178                 DecoderResult::OutputFull => {
4179                     return (
4180                         CoderResult::OutputFull,
4181                         total_read,
4182                         total_written,
4183                         had_errors,
4184                     );
4185                 }
4186                 DecoderResult::Malformed(_, _) => {
4187                     had_errors = true;
4188                     // There should always be space for the U+FFFD, because
4189                     // otherwise we'd have gotten OutputFull already.
4190                     dst[total_written] = 0xFFFD;
4191                     total_written += 1;
4192                 }
4193             }
4194         }
4195     }
4196 
    // Generates the public `decode_to_utf16_without_replacement` method by
    // invoking `public_decode_function!` (defined outside this section).
    // The doc comment passed as the first argument documents the generated
    // method. NOTE(review): the remaining identifiers presumably name the
    // helper methods the macro wires together for the `u16` output unit --
    // confirm against the macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf16_without_replacement,
                            decode_to_utf16_raw,
                            decode_to_utf16_checking_end,
                            decode_to_utf16_after_one_potential_bom_byte,
                            decode_to_utf16_after_two_potential_bom_bytes,
                            decode_to_utf16_checking_end_with_offset,
                            u16);
4213 
4214     /// Checks for compatibility with storing Unicode scalar values as unsigned
4215     /// bytes taking into account the state of the decoder.
4216     ///
4217     /// Returns `None` if the decoder is not in a neutral state, including waiting
4218     /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4219     ///
4220     /// Otherwise returns the index of the first byte whose unsigned value doesn't
4221     /// directly correspond to the decoded Unicode scalar value, or the length
4222     /// of the input if all bytes in the input decode directly to scalar values
4223     /// corresponding to the unsigned byte values.
4224     ///
4225     /// Does not change the state of the decoder.
4226     ///
4227     /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4228     /// storage optimizations.
4229     ///
4230     /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4231     pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4232         match self.life_cycle {
4233             DecoderLifeCycle::Converting => {
4234                 return self.variant.latin1_byte_compatible_up_to(bytes);
4235             }
4236             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4237             _ => None,
4238         }
4239     }
4240 }
4241 
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Builds an `Unmappable` result from a single UTF-16 code unit.
    ///
    /// Panics if `bmp` is not a Unicode scalar value, i.e. if it is a
    /// surrogate.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let scalar = ::std::char::from_u32(u32::from(bmp));
        EncoderResult::Unmappable(scalar.unwrap())
    }
}
4273 
4274 /// A converter that encodes a Unicode stream into bytes according to a
4275 /// character encoding in a streaming (incremental) manner.
4276 ///
4277 /// The various `encode_*` methods take an input buffer (`src`) and an output
4278 /// buffer `dst` both of which are caller-allocated. There are variants for
4279 /// both UTF-8 and UTF-16 input buffers.
4280 ///
/// An `encode_*` method encodes characters from `src` into bytes stored
/// into `dst` until one of the following three things happens:
4283 ///
4284 /// 1. An unmappable character is encountered (`*_without_replacement` variants
4285 ///    only).
4286 ///
/// 2. The output buffer has been filled so near capacity that the encoder
///    cannot be sure that processing an additional character of input wouldn't
///    cause so much output that the output buffer would overflow.
4290 ///
4291 /// 3. All the input characters have been processed.
4292 ///
4293 /// The `encode_*` method then returns tuple of a status indicating which one
4294 /// of the three reasons to return happened, how many input code units (`u8`
4295 /// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4296 /// how many output bytes were written (except when encoding into `Vec<u8>`,
4297 /// whose length change indicates this), and in the case of the variants that
4298 /// perform replacement, a boolean indicating whether an unmappable
4299 /// character was replaced with a numeric character reference during the call.
4300 ///
4301 /// The number of bytes "written" is what's logically written. Garbage may be
4302 /// written in the output buffer beyond the point logically written to.
4303 ///
4304 /// In the case of the methods whose name ends with
4305 /// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4306 /// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4307 /// the three cases listed above).
4308 ///
4309 /// In the case of methods whose name does not end with
4310 /// `*_without_replacement`, unmappable characters are automatically replaced
4311 /// with the corresponding numeric character references and unmappable
4312 /// characters do not cause the methods to return early.
4313 ///
4314 /// When encoding from UTF-8 without replacement, the methods are guaranteed
4315 /// not to return indicating that more output space is needed if the length
4316 /// of the output buffer is at least the length returned by
4317 /// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4318 /// UTF-8 with replacement, the length of the output buffer that guarantees the
4319 /// methods not to return indicating that more output space is needed in the
4320 /// absence of unmappable characters is given by
4321 /// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4322 /// UTF-16 without replacement, the methods are guaranteed not to return
4323 /// indicating that more output space is needed if the length of the output
4324 /// buffer is at least the length returned by
4325 /// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4327 /// guarantees the methods not to return indicating that more output space is
4328 /// needed in the absence of unmappable characters is given by
4329 /// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4330 /// When encoding with replacement, applications are not expected to size the
4331 /// buffer for the worst case ahead of time but to resize the buffer if there
4332 /// are unmappable characters. This is why max length queries are only available
4333 /// for the case where there are no unmappable characters.
4334 ///
4335 /// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4336 /// calling from Rust, the type system takes care of this.) When encoding from
4337 /// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4338 /// CHARACTERS. Therefore, in order for astral characters not to turn into a
4339 /// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4340 /// are not split across input buffer boundaries.
4341 ///
4342 /// After an `encode_*` call returns, the output produced so far, taken as a
4343 /// whole from the start of the stream, is guaranteed to consist of a valid
4344 /// byte sequence in the target encoding. (I.e. the code unit sequence for a
4345 /// character is guaranteed not to be split across output buffers. However, due
4346 /// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4347 /// from the start for it to be valid. For other encodings, the validity holds
4348 /// on a per-output buffer basis.)
4349 ///
4350 /// The boolean argument `last` indicates that the end of the stream is reached
4351 /// when all the characters in `src` have been consumed. This argument is needed
4352 /// for ISO-2022-JP and is ignored for other encodings.
4353 ///
/// An `Encoder` object can be used to incrementally encode a Unicode stream
/// into a byte stream.
4355 ///
4356 /// During the processing of a single stream, the caller must call `encode_*`
4357 /// zero or more times with `last` set to `false` and then call `encode_*` at
4358 /// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4359 /// the processing of the stream has ended. Otherwise, the caller must call
4360 /// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4361 /// as a fatal error).
4362 ///
4363 /// Once the stream has ended, the `Encoder` object must not be used anymore.
4364 /// That is, you need to create another one to process another stream.
4365 ///
4366 /// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4367 /// and the caller does not wish to treat it as a fatal error, the input buffer
4368 /// `src` may not have been completely consumed. In that case, the caller must
4369 /// pass the unconsumed contents of `src` to `encode_*` again upon the next
4370 /// call.
4371 ///
4372 /// [1]: enum.EncoderResult.html
4373 /// [2]: #method.max_buffer_length_from_utf8_without_replacement
4374 /// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4375 /// [4]: #method.max_buffer_length_from_utf16_without_replacement
4376 /// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4377 ///
4378 /// # Infinite loops
4379 ///
4380 /// When converting with a fixed-size output buffer whose size is too small to
4381 /// accommodate one character of output, an infinite loop ensues. When
4382 /// converting with a fixed-size output buffer, it generally makes sense to
4383 /// make the buffer fairly large (e.g. couple of kilobytes).
pub struct Encoder {
    /// The `Encoding` this `Encoder` is for; returned by `encoding()`.
    encoding: &'static Encoding,
    /// The encoding-specific encoder that the `Encoder` methods delegate
    /// the actual work to.
    variant: VariantEncoder,
}
4388 
4389 impl Encoder {
new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder4390     fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4391         Encoder {
4392             encoding: enc,
4393             variant: encoder,
4394         }
4395     }
4396 
    /// The `Encoding` this `Encoder` is for.
    ///
    /// Returns the `&'static Encoding` that was stored when the encoder was
    /// created.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
4402 
    /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
    /// ASCII state and `false` otherwise.
    ///
    /// The answer is obtained from the underlying encoding-specific encoder.
    #[inline]
    pub fn has_pending_state(&self) -> bool {
        self.variant.has_pending_state()
    }
4409 
4410     /// Query the worst-case output size when encoding from UTF-8 with
4411     /// replacement.
4412     ///
4413     /// Returns the size of the output buffer in bytes that will not overflow
4414     /// given the current state of the encoder and `byte_length` number of
4415     /// additional input code units if there are no unmappable characters in
4416     /// the input or `None` if `usize` would overflow.
4417     ///
4418     /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4419     pub fn max_buffer_length_from_utf8_if_no_unmappables(
4420         &self,
4421         byte_length: usize,
4422     ) -> Option<usize> {
4423         checked_add(
4424             if self.encoding().can_encode_everything() {
4425                 0
4426             } else {
4427                 NCR_EXTRA
4428             },
4429             self.max_buffer_length_from_utf8_without_replacement(byte_length),
4430         )
4431     }
4432 
    /// Query the worst-case output size when encoding from UTF-8 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `byte_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The query is answered by the underlying encoding-specific encoder.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf8_without_replacement(byte_length)
    }
4448 
4449     /// Incrementally encode into byte stream from UTF-8 with unmappable
4450     /// characters replaced with HTML (decimal) numeric character references.
4451     ///
4452     /// See the documentation of the struct for documentation for `encode_*`
4453     /// methods collectively.
4454     ///
4455     /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4456     pub fn encode_from_utf8(
4457         &mut self,
4458         src: &str,
4459         dst: &mut [u8],
4460         last: bool,
4461     ) -> (CoderResult, usize, usize, bool) {
4462         let dst_len = dst.len();
4463         let effective_dst_len = if self.encoding().can_encode_everything() {
4464             dst_len
4465         } else {
4466             if dst_len < NCR_EXTRA {
4467                 if src.is_empty() && !(last && self.has_pending_state()) {
4468                     return (CoderResult::InputEmpty, 0, 0, false);
4469                 }
4470                 return (CoderResult::OutputFull, 0, 0, false);
4471             }
4472             dst_len - NCR_EXTRA
4473         };
4474         let mut had_unmappables = false;
4475         let mut total_read = 0usize;
4476         let mut total_written = 0usize;
4477         loop {
4478             let (result, read, written) = self.encode_from_utf8_without_replacement(
4479                 &src[total_read..],
4480                 &mut dst[total_written..effective_dst_len],
4481                 last,
4482             );
4483             total_read += read;
4484             total_written += written;
4485             match result {
4486                 EncoderResult::InputEmpty => {
4487                     return (
4488                         CoderResult::InputEmpty,
4489                         total_read,
4490                         total_written,
4491                         had_unmappables,
4492                     );
4493                 }
4494                 EncoderResult::OutputFull => {
4495                     return (
4496                         CoderResult::OutputFull,
4497                         total_read,
4498                         total_written,
4499                         had_unmappables,
4500                     );
4501                 }
4502                 EncoderResult::Unmappable(unmappable) => {
4503                     had_unmappables = true;
4504                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4505                     debug_assert_ne!(self.encoding(), UTF_16BE);
4506                     debug_assert_ne!(self.encoding(), UTF_16LE);
4507                     // Additionally, Iso2022JpEncoder is responsible for
4508                     // transitioning to ASCII when returning with Unmappable.
4509                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4510                     if total_written >= effective_dst_len {
4511                         if total_read == src.len() && !(last && self.has_pending_state()) {
4512                             return (
4513                                 CoderResult::InputEmpty,
4514                                 total_read,
4515                                 total_written,
4516                                 had_unmappables,
4517                             );
4518                         }
4519                         return (
4520                             CoderResult::OutputFull,
4521                             total_read,
4522                             total_written,
4523                             had_unmappables,
4524                         );
4525                     }
4526                 }
4527             }
4528         }
4529     }
4530 
4531     /// Incrementally encode into byte stream from UTF-8 with unmappable
4532     /// characters replaced with HTML (decimal) numeric character references.
4533     ///
4534     /// See the documentation of the struct for documentation for `encode_*`
4535     /// methods collectively.
4536     ///
4537     /// Available to Rust only.
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4538     pub fn encode_from_utf8_to_vec(
4539         &mut self,
4540         src: &str,
4541         dst: &mut Vec<u8>,
4542         last: bool,
4543     ) -> (CoderResult, usize, bool) {
4544         unsafe {
4545             let old_len = dst.len();
4546             let capacity = dst.capacity();
4547             dst.set_len(capacity);
4548             let (result, read, written, replaced) =
4549                 self.encode_from_utf8(src, &mut dst[old_len..], last);
4550             dst.set_len(old_len + written);
4551             (result, read, replaced)
4552         }
4553     }
4554 
    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
    ///
    /// The return value is a tuple that contains the `EncoderResult`, the
    /// number of bytes read from `src` and the number of bytes written to
    /// `dst`. The work is done by the underlying encoding-specific encoder.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf8_without_replacement(
        &mut self,
        src: &str,
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        self.variant.encode_from_utf8_raw(src, dst, last)
    }
4569 
4570     /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4571     ///
4572     /// See the documentation of the struct for documentation for `encode_*`
4573     /// methods collectively.
4574     ///
4575     /// Available to Rust only.
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4576     pub fn encode_from_utf8_to_vec_without_replacement(
4577         &mut self,
4578         src: &str,
4579         dst: &mut Vec<u8>,
4580         last: bool,
4581     ) -> (EncoderResult, usize) {
4582         unsafe {
4583             let old_len = dst.len();
4584             let capacity = dst.capacity();
4585             dst.set_len(capacity);
4586             let (result, read, written) =
4587                 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4588             dst.set_len(old_len + written);
4589             (result, read)
4590         }
4591     }
4592 
4593     /// Query the worst-case output size when encoding from UTF-16 with
4594     /// replacement.
4595     ///
4596     /// Returns the size of the output buffer in bytes that will not overflow
4597     /// given the current state of the encoder and `u16_length` number of
4598     /// additional input code units if there are no unmappable characters in
4599     /// the input or `None` if `usize` would overflow.
4600     ///
4601     /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4602     pub fn max_buffer_length_from_utf16_if_no_unmappables(
4603         &self,
4604         u16_length: usize,
4605     ) -> Option<usize> {
4606         checked_add(
4607             if self.encoding().can_encode_everything() {
4608                 0
4609             } else {
4610                 NCR_EXTRA
4611             },
4612             self.max_buffer_length_from_utf16_without_replacement(u16_length),
4613         )
4614     }
4615 
    /// Query the worst-case output size when encoding from UTF-16 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `u16_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The query is answered by the underlying encoding-specific encoder.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf16_without_replacement(u16_length)
    }
4631 
4632     /// Incrementally encode into byte stream from UTF-16 with unmappable
4633     /// characters replaced with HTML (decimal) numeric character references.
4634     ///
4635     /// See the documentation of the struct for documentation for `encode_*`
4636     /// methods collectively.
4637     ///
4638     /// Available via the C wrapper.
encode_from_utf16( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4639     pub fn encode_from_utf16(
4640         &mut self,
4641         src: &[u16],
4642         dst: &mut [u8],
4643         last: bool,
4644     ) -> (CoderResult, usize, usize, bool) {
4645         let dst_len = dst.len();
4646         let effective_dst_len = if self.encoding().can_encode_everything() {
4647             dst_len
4648         } else {
4649             if dst_len < NCR_EXTRA {
4650                 if src.is_empty() && !(last && self.has_pending_state()) {
4651                     return (CoderResult::InputEmpty, 0, 0, false);
4652                 }
4653                 return (CoderResult::OutputFull, 0, 0, false);
4654             }
4655             dst_len - NCR_EXTRA
4656         };
4657         let mut had_unmappables = false;
4658         let mut total_read = 0usize;
4659         let mut total_written = 0usize;
4660         loop {
4661             let (result, read, written) = self.encode_from_utf16_without_replacement(
4662                 &src[total_read..],
4663                 &mut dst[total_written..effective_dst_len],
4664                 last,
4665             );
4666             total_read += read;
4667             total_written += written;
4668             match result {
4669                 EncoderResult::InputEmpty => {
4670                     return (
4671                         CoderResult::InputEmpty,
4672                         total_read,
4673                         total_written,
4674                         had_unmappables,
4675                     );
4676                 }
4677                 EncoderResult::OutputFull => {
4678                     return (
4679                         CoderResult::OutputFull,
4680                         total_read,
4681                         total_written,
4682                         had_unmappables,
4683                     );
4684                 }
4685                 EncoderResult::Unmappable(unmappable) => {
4686                     had_unmappables = true;
4687                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4688                     // There are no UTF-16 encoders and even if there were,
4689                     // they'd never have unmappables.
4690                     debug_assert_ne!(self.encoding(), UTF_16BE);
4691                     debug_assert_ne!(self.encoding(), UTF_16LE);
4692                     // Additionally, Iso2022JpEncoder is responsible for
4693                     // transitioning to ASCII when returning with Unmappable
4694                     // from the jis0208 state. That is, when we encode
4695                     // ISO-2022-JP and come here, the encoder is in either the
4696                     // ASCII or the Roman state. We are allowed to generate any
4697                     // printable ASCII excluding \ and ~.
4698                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4699                     if total_written >= effective_dst_len {
4700                         if total_read == src.len() && !(last && self.has_pending_state()) {
4701                             return (
4702                                 CoderResult::InputEmpty,
4703                                 total_read,
4704                                 total_written,
4705                                 had_unmappables,
4706                             );
4707                         }
4708                         return (
4709                             CoderResult::OutputFull,
4710                             total_read,
4711                             total_written,
4712                             had_unmappables,
4713                         );
4714                     }
4715                 }
4716             }
4717         }
4718     }
4719 
4720     /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4721     ///
4722     /// See the documentation of the struct for documentation for `encode_*`
4723     /// methods collectively.
4724     ///
4725     /// Available via the C wrapper.
encode_from_utf16_without_replacement( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4726     pub fn encode_from_utf16_without_replacement(
4727         &mut self,
4728         src: &[u16],
4729         dst: &mut [u8],
4730         last: bool,
4731     ) -> (EncoderResult, usize, usize) {
4732         self.variant.encode_from_utf16_raw(src, dst, last)
4733     }
4734 }
4735 
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#` followed by the decimal scalar value of `unmappable` and `;`
/// to the start of `dst` and returns the number of bytes written (between 4
/// and 10).
///
/// # Panics
///
/// Panics if `dst` is shorter than the reference being written.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    // len is the number of decimal digits needed to represent unmappable plus
    // 3 (the length of "&#" and ";").
    let mut number = unmappable as u32;
    let len = if number >= 1_000_000u32 {
        10usize
    } else if number >= 100_000u32 {
        9usize
    } else if number >= 10_000u32 {
        8usize
    } else if number >= 1_000u32 {
        7usize
    } else if number >= 100u32 {
        6usize
    } else if number >= 10u32 {
        // Review the outcome of https://github.com/whatwg/encoding/issues/15
        // to see if this case is possible
        5usize
    } else {
        // Single-digit values are handled too, so that the output never
        // contains a byte that was left unwritten.
        4usize
    };
    debug_assert!(len <= dst.len());
    let mut pos = len - 1;
    dst[pos] = b';';
    // Fill the digits right to left; index 2 holds the most significant one.
    while pos > 2 {
        pos -= 1;
        dst[pos] = (number % 10) as u8 + b'0';
        number /= 10;
    }
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4774 
/// Returns `true` if `start <= i < end`, using a single comparison.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    // Values below `start` wrap around to large offsets and fail the test.
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4779 
/// Returns `true` if `start <= i < end`, using a single comparison.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    // Values below `start` wrap around to large offsets and fail the test.
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4784 
/// Returns `true` if `start <= i <= end`, using a single comparison.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    // Values below `start` wrap around to large offsets and fail the test.
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4789 
/// Returns `true` if `start <= i <= end`, using a single comparison.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    // Values below `start` wrap around to large offsets and fail the test.
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4794 
/// Returns `true` if `start <= i <= end`, using a single comparison.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    // Values below `start` wrap around to large offsets and fail the test.
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4799 
/// Returns `true` if `start <= i <= end`, using a single comparison.
///
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    // Values below `start` wrap around to large offsets and fail the test.
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4804 
/// Adds `num` to the value in `opt`.
///
/// Returns `None` if `opt` is `None` or if the sum would overflow `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4813 
/// Adds two optional values.
///
/// Returns `None` if either operand is `None` or if the sum would overflow
/// `usize`.
#[inline(always)]
fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => a.checked_add(b),
        _ => None,
    }
}
4822 
/// Multiplies the value in `opt` by `num`.
///
/// Returns `None` if `opt` is `None` or if the product would overflow
/// `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4831 
/// Divides the value in `opt` by `num`.
///
/// Returns `None` if `opt` is `None` or if `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4840 
/// Rounds the value in `opt` up to the next power of two.
///
/// Returns `None` if `opt` is `None` or if the next power of two does not
/// fit in `usize` (instead of panicking in debug builds or wrapping to zero
/// in release builds).
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4845 
/// Returns the smaller of two optional values.
///
/// A `None` operand is ignored rather than propagated: if only one side is
/// `Some`, that side is returned; the result is `None` only when both are.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::std::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4858 
4859 // ############## TESTS ###############
4860 
/// Test-only record that embeds a `&'static Encoding` next to ordinary
/// fields; compiled only for tests with the `serde` feature enabled.
// NOTE(review): presumably used by serde round-trip tests further down the
// file -- confirm.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    /// Plain numeric payload.
    num: u32,
    /// Plain string payload.
    name: String,
    /// The encoding reference whose (de)serialization is of interest.
    enc: &'static Encoding,
}
4868 
4869 #[cfg(test)]
4870 mod test_labels_names;
4871 
4872 #[cfg(test)]
4873 mod tests {
4874     use super::*;
4875     use std::borrow::Cow;
4876 
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4877     fn sniff_to_utf16(
4878         initial_encoding: &'static Encoding,
4879         expected_encoding: &'static Encoding,
4880         bytes: &[u8],
4881         expect: &[u16],
4882         breaks: &[usize],
4883     ) {
4884         let mut decoder = initial_encoding.new_decoder();
4885 
4886         let mut dest: Vec<u16> =
4887             Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4888         let capacity = dest.capacity();
4889         dest.resize(capacity, 0u16);
4890 
4891         let mut total_written = 0usize;
4892         let mut start = 0usize;
4893         for br in breaks {
4894             let (result, read, written, _) =
4895                 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4896             total_written += written;
4897             assert_eq!(read, *br - start);
4898             match result {
4899                 CoderResult::InputEmpty => {}
4900                 CoderResult::OutputFull => {
4901                     unreachable!();
4902                 }
4903             }
4904             start = *br;
4905         }
4906         let (result, read, written, _) =
4907             decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4908         total_written += written;
4909         match result {
4910             CoderResult::InputEmpty => {}
4911             CoderResult::OutputFull => {
4912                 unreachable!();
4913             }
4914         }
4915         assert_eq!(read, bytes.len() - start);
4916         assert_eq!(total_written, expect.len());
4917         assert_eq!(&dest[..total_written], expect);
4918         assert_eq!(decoder.encoding(), expected_encoding);
4919     }
4920 
4921     // Any copyright to the test code below this comment is dedicated to the
4922     // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4923 
/// Checks BOM sniffing of a windows-1252 decoder for inputs with and without
/// UTF-8 / UTF-16 BOMs, fed whole and split at various offsets.
#[test]
fn test_bom_sniffing() {
    // Input without a BOM: the decoder must keep reporting windows-1252.
    fn stays_windows_1252(bytes: &[u8], expect: &[u16], breaks: &[usize]) {
        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, bytes, expect, breaks);
    }
    // Input with a BOM: the decoder must end up reporting `sniffed`.
    fn switches_to(
        sniffed: &'static Encoding,
        bytes: &[u8],
        expect: &[u16],
        breaks: &[usize],
    ) {
        sniff_to_utf16(WINDOWS_1252, sniffed, bytes, expect, breaks);
    }

    let ab = [0x0061u16, 0x0062u16];

    // ASCII
    stays_windows_1252(b"\x61\x62", &ab, &[]);

    // UTF-8
    let split_points: [&[usize]; 9] = [
        &[],
        &[1],
        &[2],
        &[3],
        &[4],
        &[2, 3],
        &[1, 2],
        &[1, 3],
        &[1, 2, 3, 4],
    ];
    for &breaks in split_points.iter() {
        switches_to(UTF_8, b"\xEF\xBB\xBF\x61\x62", &ab, breaks);
    }
    switches_to(UTF_8, b"\xEF\xBB\xBF", &[], &[]);

    // Not UTF-8
    stays_windows_1252(
        b"\xEF\xBB\x61\x62",
        &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
        &[],
    );
    stays_windows_1252(
        b"\xEF\xBB\x61\x62",
        &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
        &[1],
    );
    stays_windows_1252(b"\xEF\x61\x62", &[0x00EFu16, 0x0061u16, 0x0062u16], &[]);
    stays_windows_1252(b"\xEF\x61\x62", &[0x00EFu16, 0x0061u16, 0x0062u16], &[1]);
    stays_windows_1252(b"\xEF\xBB", &[0x00EFu16, 0x00BBu16], &[]);
    stays_windows_1252(b"\xEF\xBB", &[0x00EFu16, 0x00BBu16], &[1]);
    stays_windows_1252(b"\xEF", &[0x00EFu16], &[]);

    // Not UTF-16
    stays_windows_1252(b"\xFE\x61\x62", &[0x00FEu16, 0x0061u16, 0x0062u16], &[]);
    stays_windows_1252(b"\xFE\x61\x62", &[0x00FEu16, 0x0061u16, 0x0062u16], &[1]);
    stays_windows_1252(b"\xFE", &[0x00FEu16], &[]);
    stays_windows_1252(b"\xFF\x61\x62", &[0x00FFu16, 0x0061u16, 0x0062u16], &[]);
    stays_windows_1252(b"\xFF\x61\x62", &[0x00FFu16, 0x0061u16, 0x0062u16], &[1]);
    stays_windows_1252(b"\xFF", &[0x00FFu16], &[]);

    // UTF-16
    switches_to(UTF_16BE, b"\xFE\xFF", &[], &[]);
    switches_to(UTF_16BE, b"\xFE\xFF", &[], &[1]);
    switches_to(UTF_16LE, b"\xFF\xFE", &[], &[]);
    switches_to(UTF_16LE, b"\xFF\xFE", &[], &[1]);
}
5080 
/// Checks which encoding is reported as the output encoding, both directly
/// and through a freshly created encoder.
#[test]
fn test_output_encoding() {
    // (encoding, expected output encoding)
    let cases = [
        (REPLACEMENT, UTF_8),
        (UTF_16BE, UTF_8),
        (UTF_16LE, UTF_8),
        (UTF_8, UTF_8),
        (WINDOWS_1252, WINDOWS_1252),
    ];
    for &(encoding, output) in cases.iter() {
        assert_eq!(encoding.output_encoding(), output);
    }
    for &(encoding, output) in cases.iter() {
        assert_eq!(encoding.new_encoder().encoding(), output);
    }
}
5094 
/// Checks label lookup: case differences and surrounding whitespace are
/// accepted, while trailing junk and unknown labels are rejected.
#[test]
fn test_label_resolution() {
    let utf8_labels = [
        &b"utf-8"[..],
        &b"UTF-8"[..],
        &b" \t \n \x0C \n utf-8 \r \n \t \x0C "[..],
    ];
    for &label in utf8_labels.iter() {
        assert_eq!(Encoding::for_label(label), Some(UTF_8));
    }

    let rejected_labels = [&b"utf-8 _"[..], &b"bogus"[..], &b"bogusbogusbogusbogus"[..]];
    for &label in rejected_labels.iter() {
        assert_eq!(Encoding::for_label(label), None);
    }
}
5107 
/// Valid non-ASCII windows-1257 input decodes to an owned string without
/// errors and without changing the encoding.
#[test]
fn test_decode_valid_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5120 
/// An unmapped windows-1257 byte (0xA1) decodes to U+FFFD in an owned string
/// and is reported as an error.
#[test]
fn test_decode_invalid_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(had_errors);
}
5133 
/// ASCII-only input decoded as windows-1257 is returned borrowed.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
    if let Cow::Borrowed(s) = cow {
        assert_eq!(s, "abc");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5146 
/// A UTF-8 BOM overrides windows-1257: valid UTF-8 after the BOM is returned
/// borrowed, with the BOM removed and UTF-8 reported.
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    if let Cow::Borrowed(s) = cow {
        assert_eq!(s, "\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, UTF_8);
    assert!(!had_errors);
}
5159 
/// A UTF-8 BOM overrides windows-1257: an invalid byte after the BOM yields
/// U+FFFD in an owned string, with UTF-8 reported and errors flagged.
#[test]
fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(input);
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, UTF_8);
    assert!(had_errors);
}
5173 
/// Valid UTF-8 with a BOM decoded as UTF-8 is returned borrowed with the BOM
/// removed.
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
    let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    if let Cow::Borrowed(s) = cow {
        assert_eq!(s, "\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, UTF_8);
    assert!(!had_errors);
}
5186 
/// Invalid UTF-8 with a BOM decoded as UTF-8 yields an owned string with
/// U+FFFD, the BOM removed and errors flagged.
#[test]
fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
    let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, UTF_8);
    assert!(had_errors);
}
5199 
/// `decode_with_bom_removal` on UTF-8 strips the UTF-8 BOM and returns valid
/// input borrowed.
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
    let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    if let Cow::Borrowed(s) = cow {
        assert_eq!(s, "\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5211 
/// `decode_with_bom_removal` on windows-1257 does not treat a UTF-8 BOM
/// specially: every byte, including the BOM bytes, is decoded as
/// windows-1257.
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(input);
    if let Cow::Owned(s) = cow {
        assert_eq!(
            s,
            "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
        );
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5227 
/// Valid non-ASCII windows-1257 input decodes to an owned string without
/// errors via `decode_with_bom_removal`.
#[test]
fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
    let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5239 
/// An unmapped windows-1257 byte becomes U+FFFD and flags an error via
/// `decode_with_bom_removal`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
    let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(had_errors);
}
5251 
/// ASCII-only input is returned borrowed by `decode_with_bom_removal`.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
    let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
    if let Cow::Borrowed(s) = cow {
        assert_eq!(s, "abc");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5263 
/// Without BOM handling, a leading UTF-8 BOM is kept as U+FEFF and valid
/// UTF-8 is returned borrowed.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    let (cow, had_errors) = UTF_8.decode_without_bom_handling(input);
    if let Cow::Borrowed(s) = cow {
        assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5276 
/// Without BOM handling, a leading UTF-8 BOM is kept as U+FEFF; an invalid
/// byte yields U+FFFD in an owned string and flags an error.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let (cow, had_errors) = UTF_8.decode_without_bom_handling(input);
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(had_errors);
}
5289 
/// Valid non-ASCII windows-1257 input decodes to an owned string without
/// errors via `decode_without_bom_handling`.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
    let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5301 
/// An unmapped windows-1257 byte becomes U+FFFD and flags an error via
/// `decode_without_bom_handling`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
    let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
    if let Cow::Owned(s) = cow {
        assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(had_errors);
}
5313 
/// ASCII-only input is returned borrowed by `decode_without_bom_handling`.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
    let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
    if let Cow::Borrowed(s) = cow {
        assert_eq!(s, "abc");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5325 
/// Without BOM handling and without replacement, valid UTF-8 with a BOM is
/// returned borrowed with the BOM kept as U+FEFF.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(
        b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
    );
    match decoded {
        Some(Cow::Borrowed(s)) => {
            assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
        }
        Some(Cow::Owned(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5340 
/// Without replacement, invalid UTF-8 makes the whole decode return `None`.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(
        b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4",
    );
    assert!(decoded.is_none());
}
5349 
/// Without replacement, valid non-ASCII windows-1257 input still decodes to
/// an owned string.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4");
    match decoded {
        Some(Cow::Owned(s)) => {
            assert_eq!(s, "abc\u{20AC}\u{00E4}");
        }
        Some(Cow::Borrowed(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5362 
/// Without replacement, an unmapped windows-1257 byte makes the whole decode
/// return `None`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded =
        WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4");
    assert!(decoded.is_none());
}
5369 
/// Without replacement, ASCII-only input is returned borrowed.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc");
    match decoded {
        Some(Cow::Borrowed(s)) => {
            assert_eq!(s, "abc");
        }
        Some(Cow::Owned(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5382 
/// Encoding an ASCII-only string to windows-1257 returns the input bytes
/// borrowed.
#[test]
fn test_encode_ascii_only_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
    if let Cow::Borrowed(s) = cow {
        assert_eq!(s, b"abc");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5395 
/// Encoding mappable non-ASCII characters to windows-1257 produces an owned
/// byte vector without errors.
#[test]
fn test_encode_valid_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
    if let Cow::Owned(s) = cow {
        assert_eq!(s, b"abc\x80\xE4");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5408 
/// Feeds a UTF-16LE decoder the byte 0xFF twice (the second time as the last
/// input) and checks that a destination sized by `max_utf16_buffer_length(1)`
/// is enough each time.
#[test]
fn test_utf16_space_with_one_bom_byte() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    // (input chunk, whether it is the last chunk)
    let steps = [(b"\xFF", false), (b"\xFF", true)];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5424 
/// Feeds a UTF-8 decoder the byte 0xFF twice (the second time as the last
/// input) and checks that a destination sized by `max_utf16_buffer_length(1)`
/// is enough each time.
#[test]
fn test_utf8_space_with_one_bom_byte() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    // (input chunk, whether it is the last chunk)
    let steps = [(b"\xFF", false), (b"\xFF", true)];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5440 
5441     #[test]
test_utf16_space_with_two_bom_bytes()5442     fn test_utf16_space_with_two_bom_bytes() {
5443         let mut decoder = UTF_16LE.new_decoder();
5444         let mut dst = [0u16; 12];
5445         {
5446             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5447             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5448             assert_eq!(result, CoderResult::InputEmpty);
5449         }
5450         {
5451             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5452             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5453             assert_eq!(result, CoderResult::InputEmpty);
5454         }
5455         {
5456             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5457             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5458             assert_eq!(result, CoderResult::InputEmpty);
5459         }
5460     }
5461 
5462     #[test]
test_utf8_space_with_two_bom_bytes()5463     fn test_utf8_space_with_two_bom_bytes() {
5464         let mut decoder = UTF_8.new_decoder();
5465         let mut dst = [0u16; 12];
5466         {
5467             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5468             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5469             assert_eq!(result, CoderResult::InputEmpty);
5470         }
5471         {
5472             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5473             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5474             assert_eq!(result, CoderResult::InputEmpty);
5475         }
5476         {
5477             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5478             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5479             assert_eq!(result, CoderResult::InputEmpty);
5480         }
5481     }
5482 
5483     #[test]
test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call()5484     fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
5485         let mut decoder = UTF_16LE.new_decoder();
5486         let mut dst = [0u16; 12];
5487         {
5488             let needed = decoder.max_utf16_buffer_length(2).unwrap();
5489             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
5490             assert_eq!(result, CoderResult::InputEmpty);
5491         }
5492     }
5493 
5494     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8()5495     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
5496         let mut dst = [0u8; 8];
5497         let mut encoder = ISO_2022_JP.new_encoder();
5498         {
5499             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], false);
5500             assert_eq!(result, CoderResult::InputEmpty);
5501         }
5502         {
5503             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], true);
5504             assert_eq!(result, CoderResult::InputEmpty);
5505         }
5506     }
5507 
5508     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf8()5509     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
5510         let mut dst = [0u8; 16];
5511         let mut encoder = ISO_2022_JP.new_encoder();
5512         {
5513             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
5514             assert_eq!(result, CoderResult::InputEmpty);
5515         }
5516         {
5517             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
5518             assert_eq!(result, CoderResult::InputEmpty);
5519         }
5520         {
5521             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
5522             assert_eq!(result, CoderResult::OutputFull);
5523         }
5524     }
5525 
5526     #[test]
test_buffer_end_iso_2022_jp_from_utf8()5527     fn test_buffer_end_iso_2022_jp_from_utf8() {
5528         let mut dst = [0u8; 18];
5529         {
5530             let mut encoder = ISO_2022_JP.new_encoder();
5531             let (result, _, _, _) =
5532                 encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], false);
5533             assert_eq!(result, CoderResult::InputEmpty);
5534         }
5535         {
5536             let mut encoder = ISO_2022_JP.new_encoder();
5537             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], true);
5538             assert_eq!(result, CoderResult::OutputFull);
5539         }
5540         {
5541             let mut encoder = ISO_2022_JP.new_encoder();
5542             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], false);
5543             assert_eq!(result, CoderResult::InputEmpty);
5544         }
5545         {
5546             let mut encoder = ISO_2022_JP.new_encoder();
5547             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], true);
5548             assert_eq!(result, CoderResult::InputEmpty);
5549         }
5550     }
5551 
5552     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16()5553     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
5554         let mut dst = [0u8; 8];
5555         let mut encoder = ISO_2022_JP.new_encoder();
5556         {
5557             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], false);
5558             assert_eq!(result, CoderResult::InputEmpty);
5559         }
5560         {
5561             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], true);
5562             assert_eq!(result, CoderResult::InputEmpty);
5563         }
5564     }
5565 
5566     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf16()5567     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
5568         let mut dst = [0u8; 16];
5569         let mut encoder = ISO_2022_JP.new_encoder();
5570         {
5571             let (result, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
5572             assert_eq!(result, CoderResult::InputEmpty);
5573         }
5574         {
5575             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], false);
5576             assert_eq!(result, CoderResult::InputEmpty);
5577         }
5578         {
5579             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], true);
5580             assert_eq!(result, CoderResult::OutputFull);
5581         }
5582     }
5583 
5584     #[test]
test_buffer_end_iso_2022_jp_from_utf16()5585     fn test_buffer_end_iso_2022_jp_from_utf16() {
5586         let mut dst = [0u8; 18];
5587         {
5588             let mut encoder = ISO_2022_JP.new_encoder();
5589             let (result, _, _, _) =
5590                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], false);
5591             assert_eq!(result, CoderResult::InputEmpty);
5592         }
5593         {
5594             let mut encoder = ISO_2022_JP.new_encoder();
5595             let (result, _, _, _) =
5596                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], true);
5597             assert_eq!(result, CoderResult::OutputFull);
5598         }
5599         {
5600             let mut encoder = ISO_2022_JP.new_encoder();
5601             let (result, _, _, _) =
5602                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], false);
5603             assert_eq!(result, CoderResult::InputEmpty);
5604         }
5605         {
5606             let mut encoder = ISO_2022_JP.new_encoder();
5607             let (result, _, _, _) =
5608                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], true);
5609             assert_eq!(result, CoderResult::InputEmpty);
5610         }
5611     }
5612 
5613     #[test]
test_buffer_end_utf16be()5614     fn test_buffer_end_utf16be() {
5615         let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
5616         let mut dest = [0u8; 4];
5617 
5618         assert_eq!(
5619             decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, false),
5620             (CoderResult::InputEmpty, 2, 0, false)
5621         );
5622 
5623         let _ = decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, true);
5624     }
5625 
5626     #[test]
test_hash()5627     fn test_hash() {
5628         let mut encodings = ::std::collections::HashSet::new();
5629         encodings.insert(UTF_8);
5630         encodings.insert(ISO_2022_JP);
5631         assert!(encodings.contains(UTF_8));
5632         assert!(encodings.contains(ISO_2022_JP));
5633         assert!(!encodings.contains(WINDOWS_1252));
5634         encodings.remove(ISO_2022_JP);
5635         assert!(!encodings.contains(ISO_2022_JP));
5636     }
5637 
5638     #[test]
test_iso_2022_jp_ncr_extra_from_utf16()5639     fn test_iso_2022_jp_ncr_extra_from_utf16() {
5640         let mut dst = [0u8; 17];
5641         {
5642             let mut encoder = ISO_2022_JP.new_encoder();
5643             let (result, _, _, _) =
5644                 encoder.encode_from_utf16(&[0x3041u16, 0xFFFFu16], &mut dst[..], true);
5645             assert_eq!(result, CoderResult::OutputFull);
5646         }
5647     }
5648 
5649     #[test]
test_iso_2022_jp_ncr_extra_from_utf8()5650     fn test_iso_2022_jp_ncr_extra_from_utf8() {
5651         let mut dst = [0u8; 17];
5652         {
5653             let mut encoder = ISO_2022_JP.new_encoder();
5654             let (result, _, _, _) =
5655                 encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
5656             assert_eq!(result, CoderResult::OutputFull);
5657         }
5658     }
5659 
5660     #[test]
test_max_length_with_bom_to_utf8()5661     fn test_max_length_with_bom_to_utf8() {
5662         let mut output = [0u8; 20];
5663         let mut decoder = REPLACEMENT.new_decoder();
5664         let input = b"\xEF\xBB\xBFA";
5665         {
5666             let needed = decoder
5667                 .max_utf8_buffer_length_without_replacement(input.len())
5668                 .unwrap();
5669             let (result, read, written) =
5670                 decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);
5671             assert_eq!(result, DecoderResult::InputEmpty);
5672             assert_eq!(read, input.len());
5673             assert_eq!(written, 1);
5674             assert_eq!(output[0], 0x41);
5675         }
5676     }
5677 
5678     #[cfg(feature = "serde")]
5679     #[test]
test_serde()5680     fn test_serde() {
5681         let demo = Demo {
5682             num: 42,
5683             name: "foo".into(),
5684             enc: UTF_8,
5685         };
5686 
5687         let serialized = serde_json::to_string(&demo).unwrap();
5688 
5689         let deserialized: Demo = serde_json::from_str(&serialized).unwrap();
5690         assert_eq!(deserialized, demo);
5691 
5692         let bincoded = bincode::serialize(&demo).unwrap();
5693         let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap();
5694         assert_eq!(debincoded, demo);
5695     }
5696 
5697     #[test]
test_is_single_byte()5698     fn test_is_single_byte() {
5699         assert!(!BIG5.is_single_byte());
5700         assert!(!EUC_JP.is_single_byte());
5701         assert!(!EUC_KR.is_single_byte());
5702         assert!(!GB18030.is_single_byte());
5703         assert!(!GBK.is_single_byte());
5704         assert!(!REPLACEMENT.is_single_byte());
5705         assert!(!SHIFT_JIS.is_single_byte());
5706         assert!(!UTF_8.is_single_byte());
5707         assert!(!UTF_16BE.is_single_byte());
5708         assert!(!UTF_16LE.is_single_byte());
5709         assert!(!ISO_2022_JP.is_single_byte());
5710 
5711         assert!(IBM866.is_single_byte());
5712         assert!(ISO_8859_2.is_single_byte());
5713         assert!(ISO_8859_3.is_single_byte());
5714         assert!(ISO_8859_4.is_single_byte());
5715         assert!(ISO_8859_5.is_single_byte());
5716         assert!(ISO_8859_6.is_single_byte());
5717         assert!(ISO_8859_7.is_single_byte());
5718         assert!(ISO_8859_8.is_single_byte());
5719         assert!(ISO_8859_10.is_single_byte());
5720         assert!(ISO_8859_13.is_single_byte());
5721         assert!(ISO_8859_14.is_single_byte());
5722         assert!(ISO_8859_15.is_single_byte());
5723         assert!(ISO_8859_16.is_single_byte());
5724         assert!(ISO_8859_8_I.is_single_byte());
5725         assert!(KOI8_R.is_single_byte());
5726         assert!(KOI8_U.is_single_byte());
5727         assert!(MACINTOSH.is_single_byte());
5728         assert!(WINDOWS_874.is_single_byte());
5729         assert!(WINDOWS_1250.is_single_byte());
5730         assert!(WINDOWS_1251.is_single_byte());
5731         assert!(WINDOWS_1252.is_single_byte());
5732         assert!(WINDOWS_1253.is_single_byte());
5733         assert!(WINDOWS_1254.is_single_byte());
5734         assert!(WINDOWS_1255.is_single_byte());
5735         assert!(WINDOWS_1256.is_single_byte());
5736         assert!(WINDOWS_1257.is_single_byte());
5737         assert!(WINDOWS_1258.is_single_byte());
5738         assert!(X_MAC_CYRILLIC.is_single_byte());
5739         assert!(X_USER_DEFINED.is_single_byte());
5740     }
5741 
5742     #[test]
test_latin1_byte_compatible_up_to()5743     fn test_latin1_byte_compatible_up_to() {
5744         let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";
5745         assert_eq!(
5746             BIG5.new_decoder_without_bom_handling()
5747                 .latin1_byte_compatible_up_to(buffer)
5748                 .unwrap(),
5749             1
5750         );
5751         assert_eq!(
5752             EUC_JP
5753                 .new_decoder_without_bom_handling()
5754                 .latin1_byte_compatible_up_to(buffer)
5755                 .unwrap(),
5756             1
5757         );
5758         assert_eq!(
5759             EUC_KR
5760                 .new_decoder_without_bom_handling()
5761                 .latin1_byte_compatible_up_to(buffer)
5762                 .unwrap(),
5763             1
5764         );
5765         assert_eq!(
5766             GB18030
5767                 .new_decoder_without_bom_handling()
5768                 .latin1_byte_compatible_up_to(buffer)
5769                 .unwrap(),
5770             1
5771         );
5772         assert_eq!(
5773             GBK.new_decoder_without_bom_handling()
5774                 .latin1_byte_compatible_up_to(buffer)
5775                 .unwrap(),
5776             1
5777         );
5778         assert!(REPLACEMENT
5779             .new_decoder_without_bom_handling()
5780             .latin1_byte_compatible_up_to(buffer)
5781             .is_none());
5782         assert_eq!(
5783             SHIFT_JIS
5784                 .new_decoder_without_bom_handling()
5785                 .latin1_byte_compatible_up_to(buffer)
5786                 .unwrap(),
5787             1
5788         );
5789         assert_eq!(
5790             UTF_8
5791                 .new_decoder_without_bom_handling()
5792                 .latin1_byte_compatible_up_to(buffer)
5793                 .unwrap(),
5794             1
5795         );
5796         assert!(UTF_16BE
5797             .new_decoder_without_bom_handling()
5798             .latin1_byte_compatible_up_to(buffer)
5799             .is_none());
5800         assert!(UTF_16LE
5801             .new_decoder_without_bom_handling()
5802             .latin1_byte_compatible_up_to(buffer)
5803             .is_none());
5804         assert_eq!(
5805             ISO_2022_JP
5806                 .new_decoder_without_bom_handling()
5807                 .latin1_byte_compatible_up_to(buffer)
5808                 .unwrap(),
5809             1
5810         );
5811 
5812         assert_eq!(
5813             IBM866
5814                 .new_decoder_without_bom_handling()
5815                 .latin1_byte_compatible_up_to(buffer)
5816                 .unwrap(),
5817             1
5818         );
5819         assert_eq!(
5820             ISO_8859_2
5821                 .new_decoder_without_bom_handling()
5822                 .latin1_byte_compatible_up_to(buffer)
5823                 .unwrap(),
5824             2
5825         );
5826         assert_eq!(
5827             ISO_8859_3
5828                 .new_decoder_without_bom_handling()
5829                 .latin1_byte_compatible_up_to(buffer)
5830                 .unwrap(),
5831             2
5832         );
5833         assert_eq!(
5834             ISO_8859_4
5835                 .new_decoder_without_bom_handling()
5836                 .latin1_byte_compatible_up_to(buffer)
5837                 .unwrap(),
5838             2
5839         );
5840         assert_eq!(
5841             ISO_8859_5
5842                 .new_decoder_without_bom_handling()
5843                 .latin1_byte_compatible_up_to(buffer)
5844                 .unwrap(),
5845             2
5846         );
5847         assert_eq!(
5848             ISO_8859_6
5849                 .new_decoder_without_bom_handling()
5850                 .latin1_byte_compatible_up_to(buffer)
5851                 .unwrap(),
5852             2
5853         );
5854         assert_eq!(
5855             ISO_8859_7
5856                 .new_decoder_without_bom_handling()
5857                 .latin1_byte_compatible_up_to(buffer)
5858                 .unwrap(),
5859             2
5860         );
5861         assert_eq!(
5862             ISO_8859_8
5863                 .new_decoder_without_bom_handling()
5864                 .latin1_byte_compatible_up_to(buffer)
5865                 .unwrap(),
5866             3
5867         );
5868         assert_eq!(
5869             ISO_8859_10
5870                 .new_decoder_without_bom_handling()
5871                 .latin1_byte_compatible_up_to(buffer)
5872                 .unwrap(),
5873             2
5874         );
5875         assert_eq!(
5876             ISO_8859_13
5877                 .new_decoder_without_bom_handling()
5878                 .latin1_byte_compatible_up_to(buffer)
5879                 .unwrap(),
5880             4
5881         );
5882         assert_eq!(
5883             ISO_8859_14
5884                 .new_decoder_without_bom_handling()
5885                 .latin1_byte_compatible_up_to(buffer)
5886                 .unwrap(),
5887             4
5888         );
5889         assert_eq!(
5890             ISO_8859_15
5891                 .new_decoder_without_bom_handling()
5892                 .latin1_byte_compatible_up_to(buffer)
5893                 .unwrap(),
5894             6
5895         );
5896         assert_eq!(
5897             ISO_8859_16
5898                 .new_decoder_without_bom_handling()
5899                 .latin1_byte_compatible_up_to(buffer)
5900                 .unwrap(),
5901             4
5902         );
5903         assert_eq!(
5904             ISO_8859_8_I
5905                 .new_decoder_without_bom_handling()
5906                 .latin1_byte_compatible_up_to(buffer)
5907                 .unwrap(),
5908             3
5909         );
5910         assert_eq!(
5911             KOI8_R
5912                 .new_decoder_without_bom_handling()
5913                 .latin1_byte_compatible_up_to(buffer)
5914                 .unwrap(),
5915             1
5916         );
5917         assert_eq!(
5918             KOI8_U
5919                 .new_decoder_without_bom_handling()
5920                 .latin1_byte_compatible_up_to(buffer)
5921                 .unwrap(),
5922             1
5923         );
5924         assert_eq!(
5925             MACINTOSH
5926                 .new_decoder_without_bom_handling()
5927                 .latin1_byte_compatible_up_to(buffer)
5928                 .unwrap(),
5929             1
5930         );
5931         assert_eq!(
5932             WINDOWS_874
5933                 .new_decoder_without_bom_handling()
5934                 .latin1_byte_compatible_up_to(buffer)
5935                 .unwrap(),
5936             2
5937         );
5938         assert_eq!(
5939             WINDOWS_1250
5940                 .new_decoder_without_bom_handling()
5941                 .latin1_byte_compatible_up_to(buffer)
5942                 .unwrap(),
5943             4
5944         );
5945         assert_eq!(
5946             WINDOWS_1251
5947                 .new_decoder_without_bom_handling()
5948                 .latin1_byte_compatible_up_to(buffer)
5949                 .unwrap(),
5950             1
5951         );
5952         assert_eq!(
5953             WINDOWS_1252
5954                 .new_decoder_without_bom_handling()
5955                 .latin1_byte_compatible_up_to(buffer)
5956                 .unwrap(),
5957             5
5958         );
5959         assert_eq!(
5960             WINDOWS_1253
5961                 .new_decoder_without_bom_handling()
5962                 .latin1_byte_compatible_up_to(buffer)
5963                 .unwrap(),
5964             3
5965         );
5966         assert_eq!(
5967             WINDOWS_1254
5968                 .new_decoder_without_bom_handling()
5969                 .latin1_byte_compatible_up_to(buffer)
5970                 .unwrap(),
5971             4
5972         );
5973         assert_eq!(
5974             WINDOWS_1255
5975                 .new_decoder_without_bom_handling()
5976                 .latin1_byte_compatible_up_to(buffer)
5977                 .unwrap(),
5978             3
5979         );
5980         assert_eq!(
5981             WINDOWS_1256
5982                 .new_decoder_without_bom_handling()
5983                 .latin1_byte_compatible_up_to(buffer)
5984                 .unwrap(),
5985             1
5986         );
5987         assert_eq!(
5988             WINDOWS_1257
5989                 .new_decoder_without_bom_handling()
5990                 .latin1_byte_compatible_up_to(buffer)
5991                 .unwrap(),
5992             4
5993         );
5994         assert_eq!(
5995             WINDOWS_1258
5996                 .new_decoder_without_bom_handling()
5997                 .latin1_byte_compatible_up_to(buffer)
5998                 .unwrap(),
5999             4
6000         );
6001         assert_eq!(
6002             X_MAC_CYRILLIC
6003                 .new_decoder_without_bom_handling()
6004                 .latin1_byte_compatible_up_to(buffer)
6005                 .unwrap(),
6006             1
6007         );
6008         assert_eq!(
6009             X_USER_DEFINED
6010                 .new_decoder_without_bom_handling()
6011                 .latin1_byte_compatible_up_to(buffer)
6012                 .unwrap(),
6013             1
6014         );
6015 
6016         assert!(UTF_8
6017             .new_decoder()
6018             .latin1_byte_compatible_up_to(buffer)
6019             .is_none());
6020 
6021         let mut decoder = UTF_8.new_decoder();
6022         let mut output = [0u16; 4];
6023         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6024         assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());
6025         let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
6026         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));
6027         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6028         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
6029     }
6030 }
6031