1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 #![cfg_attr(
11     feature = "cargo-clippy",
12     allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14 #![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.20")]
15 
16 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
17 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
18 //! Gecko-oriented means that converting to and from UTF-16 is supported in
19 //! addition to converting to and from UTF-8, that the performance and
20 //! streamability goals are browser-oriented, and that FFI-friendliness is a
21 //! goal.
22 //!
23 //! Additionally, the `mem` module provides functions that are useful for
24 //! applications that need to be able to deal with legacy in-memory
25 //! representations of Unicode.
26 //!
27 //! For expectation setting, please be sure to read the sections
28 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
29 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
30 //!
31 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
32 //! design and internals of the crate.
33 //!
34 //! # Availability
35 //!
36 //! The code is available under the
37 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
38 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
39 //! See the
40 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
41 //! file for details.
42 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
43 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
44 //!
45 //! # Integration with `std::io`
46 //!
47 //! This crate doesn't implement traits from `std::io`. However, the case of
48 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
49 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
50 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
51 //!
52 //! # Examples
53 //!
54 //! Example programs:
55 //!
56 //! * [Rust](https://github.com/hsivonen/recode_rs)
57 //! * [C](https://github.com/hsivonen/recode_c)
58 //! * [C++](https://github.com/hsivonen/recode_cpp)
59 //!
60 //! Decode using the non-streaming API:
61 //!
62 //! ```
63 //! use encoding_rs::*;
64 //!
65 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
66 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
67 //!
68 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
69 //! assert_eq!(&cow[..], expectation);
70 //! assert_eq!(encoding_used, SHIFT_JIS);
71 //! assert!(!had_errors);
72 //! ```
73 //!
74 //! Decode using the streaming API with minimal `unsafe`:
75 //!
76 //! ```
77 //! use encoding_rs::*;
78 //!
79 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
80 //!
81 //! // Use an array of byte slices to demonstrate content arriving piece by
82 //! // piece from the network.
83 //! let bytes: [&'static [u8]; 4] = [b"\x83",
84 //!                                  b"n\x83\x8D\x81",
85 //!                                  b"[\x81E\x83\x8F\x81[\x83",
86 //!                                  b"\x8B\x83h"];
87 //!
88 //! // Very short output buffer to demonstrate the output buffer getting full.
89 //! // Normally, you'd use something like `[0u8; 2048]`.
90 //! let mut buffer_bytes = [0u8; 8];
91 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
92 //!
93 //! // How many bytes in the buffer currently hold significant data.
94 //! let mut bytes_in_buffer = 0usize;
95 //!
96 //! // Collect the output to a string for demonstration purposes.
97 //! let mut output = String::new();
98 //!
99 //! // The `Decoder`
100 //! let mut decoder = SHIFT_JIS.new_decoder();
101 //!
102 //! // Track whether we see errors.
103 //! let mut total_had_errors = false;
104 //!
105 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
106 //! // use of a fixed-size buffer; normally when the output of an incremental
107 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
108 //! // avoid the intermediate buffer).
109 //! for input in &bytes[..] {
110 //!     // The number of bytes already read from current `input` in total.
111 //!     let mut total_read_from_current_input = 0usize;
112 //!
113 //!     loop {
114 //!         let (result, read, written, had_errors) =
115 //!             decoder.decode_to_str(&input[total_read_from_current_input..],
116 //!                                   &mut buffer[bytes_in_buffer..],
117 //!                                   false);
118 //!         total_read_from_current_input += read;
119 //!         bytes_in_buffer += written;
120 //!         total_had_errors |= had_errors;
121 //!         match result {
122 //!             CoderResult::InputEmpty => {
123 //!                 // We have consumed the current input buffer. Break out of
124 //!                 // the inner loop to get the next input buffer from the
125 //!                 // outer loop.
126 //!                 break;
127 //!             },
128 //!             CoderResult::OutputFull => {
129 //!                 // Write the current buffer out and consider the buffer
130 //!                 // empty.
131 //!                 output.push_str(&buffer[..bytes_in_buffer]);
132 //!                 bytes_in_buffer = 0usize;
133 //!                 continue;
134 //!             }
135 //!         }
136 //!     }
137 //! }
138 //!
139 //! // Process EOF
140 //! loop {
141 //!     let (result, _, written, had_errors) =
142 //!         decoder.decode_to_str(b"",
143 //!                               &mut buffer[bytes_in_buffer..],
144 //!                               true);
145 //!     bytes_in_buffer += written;
146 //!     total_had_errors |= had_errors;
147 //!     // Write the current buffer out and consider the buffer empty.
148 //!     // Need to do this here for both `match` arms, because we exit the
149 //!     // loop on `CoderResult::InputEmpty`.
150 //!     output.push_str(&buffer[..bytes_in_buffer]);
151 //!     bytes_in_buffer = 0usize;
152 //!     match result {
153 //!         CoderResult::InputEmpty => {
154 //!             // Done!
155 //!             break;
156 //!         },
157 //!         CoderResult::OutputFull => {
158 //!             continue;
159 //!         }
160 //!     }
161 //! }
162 //!
163 //! assert_eq!(&output[..], expectation);
164 //! assert!(!total_had_errors);
165 //! ```
166 //!
167 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
168 //!
169 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
170 //! __so this crate does not provide encoders for those encodings__!
171 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
172 //! so you get a UTF-8 encoder if you request an encoder for them.
173 //!
174 //! Additionally, the Encoding Standard factors BOM handling into wrapper
175 //! algorithms so that BOM handling isn't part of the definition of the
176 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
177 //! Standard define BOM handling or lack thereof as part of the encoding
178 //! scheme.
179 //!
180 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
181 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
182 //! the Unicode Standard.
183 //!
184 //! When used with the `_with_bom_removal` entry points, the UTF-8
185 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
186 //! Standard.
187 //!
188 //! This crate does not provide a mode that matches the UTF-16 _encoding
189 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
190 //! the entry points without `_bom_` qualifiers is the closest match,
191 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
192 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
193 //! Unicode Standard.
194 //!
195 //! The UTF-32 family of Unicode encoding schemes is not supported
196 //! by this crate. The Encoding Standard doesn't define any UTF-32
197 //! family encodings, since they aren't necessary for consuming Web
198 //! content.
199 //!
200 //! ## ISO-8859-1
201 //!
202 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
203 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
204 //! byte value to the same Unicode scalar value is not available via
205 //! `Encoding` in this crate.
206 //!
207 //! However, the functions whose name starts with `convert` and contains
208 //! `latin1` in the `mem` module support such conversions, which are known as
209 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
210 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
211 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
212 //!
213 //! ## Web / Browser Focus
214 //!
215 //! Both in terms of scope and performance, the focus is on the Web. For scope,
216 //! this means that encoding_rs implements the Encoding Standard fully and
217 //! doesn't implement encodings that are not specified in the Encoding
218 //! Standard. For performance, this means that decoding performance is
219 //! important as well as performance for encoding into UTF-8 or encoding the
220 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
221 //! be encoded into legacy encodings in only two places in the Web platform: in
222 //! the query part of URLs, in which case it's a matter of relatively rare
223 //! error handling, and in form submission, in which case the user action and
224 //! networking tend to hide the performance of the encoder.
225 //!
226 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
227 //! encodings enables smaller code size thanks to the encoder side using the
228 //! decode-optimized data tables without having encode-optimized data tables at
229 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
230 //! multiplication operations.
231 //!
232 //! Additionally, performance is a non-goal for the ASCII-incompatible
233 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
234 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
235 //! of implementation.
236 //!
237 //! Despite the browser focus, the hope is that non-browser applications
238 //! that wish to consume Web content or submit Web forms in a Web-compatible
239 //! way will find encoding_rs useful. While encoding_rs does not try to match
240 //! Windows behavior, many of the encodings are close enough to legacy
241 //! encodings implemented by Windows that applications that need to consume
242 //! data in legacy Windows encodings may find encoding_rs useful. The
243 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
244 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
245 //!
246 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
247 //! to the encodings defined in the Encoding Standard. The
248 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
249 //! UTF-7 decoding for email purposes.
250 //!
251 //! # Preparing Text for the Encoders
252 //!
253 //! Normalizing text into Unicode Normalization Form C prior to encoding text
254 //! into a legacy encoding minimizes unmappable characters. Text can be
255 //! normalized to Unicode Normalization Form C using the
256 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
257 //!
258 //! The exception is windows-1258, which after normalizing to Unicode
259 //! Normalization Form C requires tone marks to be decomposed in order to
260 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
261 //! using the [`detone`](https://crates.io/crates/detone) crate.
262 //!
263 //! # Streaming & Non-Streaming; Rust & C/C++
264 //!
265 //! The API in Rust has two modes of operation: streaming and non-streaming.
266 //! The streaming API is the foundation of the implementation and should be
267 //! used when processing data that arrives piecemeal from an i/o stream. The
268 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
269 //! to C callers. The non-streaming part of the API is for Rust callers only and
270 //! is smart about borrowing instead of copying when possible. When
271 //! streamability is not needed, the non-streaming API should be preferred in
272 //! order to avoid copying data when a borrow suffices.
273 //!
274 //! There is no analogous C API exposed via FFI, mainly because C doesn't have
275 //! standard types for growable byte buffers and Unicode strings that know
276 //! their length.
277 //!
278 //! The C API (header file generated at `target/include/encoding_rs.h` when
279 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
280 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
281 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
282 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
283 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
284 //! as part of Mozilla [bug 1261841][4].
285 //!
286 //! The `Encoding` type is common to both the streaming and non-streaming
287 //! modes. In the streaming mode, decoding operations are performed with a
288 //! `Decoder` and encoding operations with an `Encoder` object obtained via
289 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
290 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
291 //! and `Encoder` objects are not used at all.
292 //!
293 //! [1]: https://github.com/hsivonen/encoding_c
294 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
295 //! [3]: https://github.com/Microsoft/GSL/
296 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
297 //!
298 //! # Memory management
299 //!
300 //! The streaming mode never performs heap allocations (even the methods
301 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
302 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
303 //! is, the streaming mode uses caller-allocated buffers exclusively.
304 //!
305 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
306 //! perform heap allocations but only to allocate the backing buffer of the
307 //! `Vec<u8>` or the `String`.
308 //!
309 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
310 //! `Drop` cleanup.
311 //!
312 //! # Buffer reading and writing behavior
313 //!
314 //! Based on experience gained with the `java.nio.charset` encoding converter
315 //! API and with the Gecko uconv encoding converter API, the buffer reading
316 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
317 //! fully drained but output buffers are not always fully filled.
318 //!
319 //! When reading from an input buffer, encoding_rs always consumes all input
320 //! up to the next error or to the end of the buffer. In particular, when
321 //! decoding, even if the input buffer ends in the middle of a byte sequence
322 //! for a character, the decoder consumes all input. This has the benefit that
323 //! the caller of the API can always fill the next buffer from the start from
324 //! whatever source the bytes come from and never has to first copy the last
325 //! bytes of the previous buffer to the start of the next buffer. However, when
326 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
327 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
328 //! boundaries falling in the middle of a surrogate pair result in both
329 //! surrogates being treated individually as unpaired surrogates.
330 //!
331 //! Additionally, decoders guarantee that they can be fed even one byte at a
332 //! time and encoders guarantee that they can be fed even one code point at a
333 //! time. This has the benefit of not placing restrictions on the size of
334 //! chunks in which the content arrives, e.g. from the network.
335 //!
336 //! When writing into an output buffer, encoding_rs makes sure that the code
337 //! unit sequence for a character is never split across output buffer
338 //! boundaries. This may result in wasted space at the end of an output buffer,
339 //! but the advantages are that the output side of both decoders and encoders
340 //! is greatly simplified compared to designs that attempt to fill output
341 //! buffers exactly even when that entails splitting a code unit sequence and
342 //! when encoding_rs methods return to the caller, the output produced thus
343 //! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
344 //! the output needs to be considered as a whole, because the latest output
345 //! buffer taken alone might not be valid if the transition away
346 //! from the ASCII state occurred in an earlier output buffer. However, since
347 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
348 //! state as being in error despite the encoder generating a transition to the
349 //! ASCII state at the end, the claim about the partial output taken as a whole
350 //! being valid is true even for ISO-2022-JP.)
351 //!
352 //! # Error Reporting
353 //!
354 //! Based on experience gained with the `java.nio.charset` encoding converter
355 //! API and with the Gecko uconv encoding converter API, the error reporting
356 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
357 //! that leave it up to the caller to extract the erroneous bytes from the
358 //! input stream if the caller wishes to do so but encoder errors provide the
359 //! code point associated with the error without requiring the caller to
360 //! extract it from the input on its own.
361 //!
362 //! On the encoder side, an error is always triggered by the most recently
363 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
364 //! caller. Also, it's very typical for the caller to wish to do something with
365 //! this data: generate a numeric escape for the character. Additionally, the
366 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
367 //! certain cases, so requiring the caller to extract the character from the
368 //! input buffer would require the caller to handle ISO-2022-JP details.
369 //! Furthermore, requiring the caller to extract the character from the input
370 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
371 //! the job of an encoding conversion library.
372 //!
373 //! On the decoder side, errors are triggered in more complex ways. For
374 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
375 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
376 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
377 //! be the ones most recently pushed to the decoder and the error might not even
378 //! be in the current buffer.
379 //!
380 //! Some encoding conversion APIs address the problem by not acknowledging
381 //! trailing bytes of an input buffer as consumed if it's still possible for
382 //! future bytes to cause the trailing bytes to be in error. This way, error
383 //! reporting can always refer to the most recently pushed buffer. This has the
384 //! problem that the caller of the API has to copy the unconsumed trailing
385 //! bytes to the start of the next buffer before being able to fill the rest
386 //! of the next buffer. This is annoying, error-prone and inefficient.
387 //!
388 //! A possible solution would be making the decoder remember recently consumed
389 //! bytes in order to be able to include a copy of the erroneous bytes when
390 //! reporting an error. This has two problems: First, callers are rarely
391 //! interested in the erroneous bytes, so attempts to identify them are most
392 //! often just overhead anyway. Second, the rare applications that are
393 //! interested typically care about the location of the error in the input
394 //! stream.
395 //!
396 //! To keep the API convenient for common uses and the overhead low while making
397 //! it possible to develop applications, such as HTML validators, that care
398 //! about which bytes were in error, encoding_rs reports the length of the
399 //! erroneous sequence and the number of bytes consumed after the erroneous
400 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
401 //! this makes it possible for callers that care about the erroneous bytes to
402 //! locate them.
403 //!
404 //! # No Convenience API for Custom Replacements
405 //!
406 //! The Web Platform and, therefore, the Encoding Standard supports only one
407 //! error recovery mode for decoders and only one error recovery mode for
408 //! encoders. The supported error recovery mode for decoders is emitting the
409 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
410 //! encoders is emitting an HTML decimal numeric character reference for
411 //! unmappable characters.
412 //!
413 //! Since encoding_rs is Web-focused, these are the only error recovery modes
414 //! for which convenient support is provided. Moreover, on the decoder side,
415 //! there aren't really good alternatives to emitting the REPLACEMENT CHARACTER
416 //! on error (other than treating errors as fatal). In particular, simply
417 //! ignoring errors is a
418 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
419 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
420 //! callers to ignore errors.
421 //!
422 //! On the encoder side, there are plausible alternatives for HTML decimal
423 //! numeric character references. For example, when outputting CSS, CSS-style
424 //! escapes would seem to make sense. However, instead of facilitating the
425 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
426 //! position that you shouldn't generate output in encodings other than UTF-8,
427 //! except where backward compatibility with interacting with the legacy Web
428 //! requires it. The legacy Web requires it only when parsing the query strings
429 //! of URLs and when submitting forms, and those two both use HTML decimal
430 //! numeric character references.
431 //!
432 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
433 //! numeric character references easy, it does make them _possible_.
434 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
435 //! for unmappable characters, is implemented on top of
436 //! `encode_from_utf8_without_replacement()`. Applications that really, really
437 //! want other replacement schemes for unmappable characters can likewise
438 //! implement them on top of `encode_from_utf8_without_replacement()`.
439 //!
440 //! # No Extensibility by Design
441 //!
442 //! The set of encodings supported by encoding_rs is not extensible by design.
443 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
444 //! rather than `trait`s. encoding_rs takes the design position that all future
445 //! text interchange should be done using UTF-8, which can represent all of
446 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
447 //! Standard and encoding_rs that can represent all of Unicode and that has
448 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
449 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
450 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
451 //! other than being able to consume legacy content.
452 //!
453 //! Considering that UTF-8 can represent all of Unicode and is already supported
454 //! by all Web browsers, introducing a new encoding wouldn't add to the
455 //! expressiveness but would add to compatibility problems. In that sense,
456 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
457 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
458 //! the Web Platform. On the other hand, the set of legacy encodings that must
459 //! be supported for a Web browser to be able to be successful is not going to
460 //! expand. Empirically, the set of encodings specified in the Encoding Standard
461 //! is already sufficient and the set of legacy encodings won't grow
462 //! retroactively.
463 //!
464 //! Since extensibility doesn't make sense considering the Web focus of
465 //! encoding_rs and adding encodings to Web clients would be actively harmful,
466 //! it makes sense to make the set of encodings that encoding_rs supports
467 //! non-extensible and to take the (admittedly small) benefits arising from
468 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
469 //! of time, which enables stack allocation thereof.
470 //!
471 //! This does have downsides for applications that might want to put encoding_rs
472 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
473 //! needed for Web uses. The needs of such applications should not complicate
474 //! encoding_rs itself, though. It is up to those applications to provide a
475 //! framework that delegates the operations with encodings that encoding_rs
476 //! supports to encoding_rs and operations with other encodings to something
477 //! else (as opposed to encoding_rs itself providing an extensibility
478 //! framework).
479 //!
480 //! # Panics
481 //!
482 //! Methods in encoding_rs can panic if the API is used against the requirements
483 //! stated in the documentation, if a state that's supposed to be impossible
484 //! is reached due to an internal bug or on integer overflow. When used
485 //! according to documentation with buffer sizes that stay below integer
486 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
487 //!
488 //! Panics arising from API misuse aren't documented beyond this on individual
489 //! methods.
490 //!
491 //! # At-Risk Parts of the API
492 //!
493 //! The foreseeable source of partially backward-incompatible API change is the
494 //! way the instances of `Encoding` are made available.
495 //!
496 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
497 //! initialized with `static`s of type `&'static Encoding`, the non-reference
498 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
499 //!
500 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
501 //! unique when the constant is used in different crates, the reference-typed
502 //! `static`s for the encoding instances will be changed from `static` to
503 //! `const` and the non-reference-typed `_INIT` instances will be removed.
504 //!
505 //! # Mapping Spec Concepts onto the API
506 //!
507 //! <table>
508 //! <thead>
509 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
510 //! </thead>
511 //! <tbody>
512 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
513 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
514 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
515 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
522 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
523 //! </tbody>
524 //! </table>
525 //!
526 //! # Compatibility with the rust-encoding API
527 //!
528 //! The crate
529 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
530 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
531 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
532 //!
533 //! # Mapping rust-encoding concepts to encoding_rs concepts
534 //!
535 //! The following table provides a mapping from rust-encoding constructs to
536 //! encoding_rs ones.
537 //!
538 //! <table>
539 //! <thead>
540 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
541 //! </thead>
542 //! <tbody>
543 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
544 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
545 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
546 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
547 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
548 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
549 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
550 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
551 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
552 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
553 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
554 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
555 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
556 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
557 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
558 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
559 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
560 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
564 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
565 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
566 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
568 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
569 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
570 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
571 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
572 //! </tbody>
573 //! </table>
574 //!
575 //! # Relationship with Windows Code Pages
576 //!
//! Despite the Web and browser focus, the encodings defined by the Encoding
//! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the
//! encodings that have a closely related Windows code page, the number of
//! the closest code page, a column indicating whether Windows maps
//! unassigned code points to the Unicode Private Use Area instead of U+FFFD
//! and a remark number indicating remarks in the list after the table.
585 //!
586 //! <table>
587 //! <thead>
588 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
589 //! </thead>
590 //! <tbody>
591 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
592 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
593 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
594 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
595 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
596 //! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
597 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
598 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
599 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
600 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
601 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
602 //! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
603 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
604 //! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
605 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
606 //! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
607 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
608 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
609 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
610 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
611 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
612 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
613 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
614 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
615 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
616 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
617 //! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
618 //! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
619 //! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
620 //! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
621 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
622 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
623 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
624 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
625 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
626 //! </tbody>
627 //! </table>
628 //!
629 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
630 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
631 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
632 //!    which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
633 //!    decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
634 //!    LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
635 //!    instead of U+2019 RIGHT SINGLE QUOTATION MARK.
636 //! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to PUA instead
637 //!    of LRM and RLM.
638 //! 5. Remarks from the previous item apply.
639 //!
640 //! The differences between this crate and Windows in the case of multibyte encodings
//! are not yet fully documented here. The lack of remarks above should not be taken
//! as an indication of a lack of differences.
643 //!
644 //! # Notable Differences from IANA Naming
645 //!
646 //! In some cases, the Encoding Standard specifies the popular unextended encoding
647 //! name where in IANA terms one of the other labels would be more precise considering
648 //! the extensions that the Encoding Standard has unified into the encoding.
649 //!
650 //! <table>
651 //! <thead>
652 //! <tr><th>Encoding</th><th>IANA</th></tr>
653 //! </thead>
654 //! <tbody>
655 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
656 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
657 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
658 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
659 //! </tbody>
660 //! </table>
661 //!
662 //! In other cases where the Encoding Standard unifies unextended and extended
663 //! variants of an encoding, the encoding gets the name of the extended
664 //! variant.
665 //!
666 //! <table>
667 //! <thead>
668 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
669 //! </thead>
670 //! <tbody>
671 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
672 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
673 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
674 //! </tbody>
675 //! </table>
676 //!
677 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
678 //! for discussion about the UTF-16 family.
679 
680 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
681 
682 #[macro_use]
683 extern crate cfg_if;
684 
685 #[cfg(all(
686     feature = "simd-accel",
687     any(
688         target_feature = "sse2",
689         all(target_endian = "little", target_arch = "aarch64"),
690         all(target_endian = "little", target_feature = "neon")
691     )
692 ))]
693 #[macro_use(shuffle)]
694 extern crate packed_simd;
695 
696 #[cfg(feature = "serde")]
697 extern crate serde;
698 
699 #[cfg(all(test, feature = "serde"))]
700 extern crate bincode;
701 #[cfg(all(test, feature = "serde"))]
702 #[macro_use]
703 extern crate serde_derive;
704 #[cfg(all(test, feature = "serde"))]
705 extern crate serde_json;
706 
707 #[macro_use]
708 mod macros;
709 
710 #[cfg(all(
711     feature = "simd-accel",
712     any(
713         target_feature = "sse2",
714         all(target_endian = "little", target_arch = "aarch64"),
715         all(target_endian = "little", target_feature = "neon")
716     )
717 ))]
718 mod simd_funcs;
719 
720 #[cfg(test)]
721 mod testing;
722 
723 mod big5;
724 mod euc_jp;
725 mod euc_kr;
726 mod gb18030;
727 mod iso_2022_jp;
728 mod replacement;
729 mod shift_jis;
730 mod single_byte;
731 mod utf_16;
732 mod utf_8;
733 mod x_user_defined;
734 
735 mod ascii;
736 mod data;
737 mod handles;
738 mod variant;
739 
740 pub mod mem;
741 
742 use ascii::ascii_valid_up_to;
743 use ascii::iso_2022_jp_ascii_valid_up_to;
744 use utf_8::utf8_valid_up_to;
745 use variant::*;
746 
747 use std::borrow::Cow;
748 use std::cmp::Ordering;
749 use std::hash::Hash;
750 use std::hash::Hasher;
751 
752 #[cfg(feature = "serde")]
753 use serde::de::Visitor;
754 #[cfg(feature = "serde")]
755 use serde::{Deserialize, Deserializer, Serialize, Serializer};
756 
/// Maximum length of a numeric character reference (NCR), `&#1114111;`.
///
/// The full maximum is used rather than the maximum minus one: the
/// "minus one" would have to come out of the space reserved for the
/// current unmappable, and that space cannot be relied upon because
/// the ISO-2022-JP encoder can fill it up with a state transition
/// escape.
const NCR_EXTRA: usize = 2 + 7 + 1; // "&#" + "1114111" + ";"
763 
764 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
765 // Instead, please regenerate using generate-encoding-data.py
766 
/// Length of the longest label, `cseucpkdfmtjapanese` (19 ASCII characters).
const LONGEST_LABEL_LENGTH: usize = 19;
768 
769 /// The initializer for the [Big5](static.BIG5.html) encoding.
770 ///
771 /// For use only for taking the address of this form when
772 /// Rust prohibits the use of the non-`_INIT` form directly,
773 /// such as in initializers of other `static`s. If in doubt,
774 /// use the corresponding non-`_INIT` reference-typed `static`.
775 ///
776 /// This part of the public API will go away if Rust changes
777 /// to make the referent of `pub const FOO: &'static Encoding`
778 /// unique cross-crate or if Rust starts allowing static arrays
779 /// to be initialized with `pub static FOO: &'static Encoding`
780 /// items.
781 pub static BIG5_INIT: Encoding = Encoding {
782     name: "Big5",
783     variant: VariantEncoding::Big5,
784 };
785 
786 /// The Big5 encoding.
787 ///
788 /// This is Big5 with HKSCS with mappings to more recent Unicode assignments
789 /// instead of the Private Use Area code points that have been used historically.
790 /// It is believed to be able to decode existing Web content in a way that makes
791 /// sense.
792 ///
793 /// To avoid form submissions generating data that Web servers don't understand,
794 /// the encoder doesn't use the HKSCS byte sequences that precede the unextended
795 /// Big5 in the lexical order.
796 ///
797 /// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
798 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
799 ///
800 /// This encoding is designed to be suited for decoding the Windows code page 950
801 /// and its HKSCS patched "951" variant such that the text makes sense, given
802 /// assignments that Unicode has made after those encodings used Private Use
803 /// Area characters.
804 ///
805 /// This will change from `static` to `const` if Rust changes
806 /// to make the referent of `pub const FOO: &'static Encoding`
807 /// unique cross-crate, so don't take the address of this
808 /// `static`.
809 pub static BIG5: &'static Encoding = &BIG5_INIT;
810 
811 /// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
812 ///
813 /// For use only for taking the address of this form when
814 /// Rust prohibits the use of the non-`_INIT` form directly,
815 /// such as in initializers of other `static`s. If in doubt,
816 /// use the corresponding non-`_INIT` reference-typed `static`.
817 ///
818 /// This part of the public API will go away if Rust changes
819 /// to make the referent of `pub const FOO: &'static Encoding`
820 /// unique cross-crate or if Rust starts allowing static arrays
821 /// to be initialized with `pub static FOO: &'static Encoding`
822 /// items.
823 pub static EUC_JP_INIT: Encoding = Encoding {
824     name: "EUC-JP",
825     variant: VariantEncoding::EucJp,
826 };
827 
828 /// The EUC-JP encoding.
829 ///
830 /// This is the legacy Unix encoding for Japanese.
831 ///
832 /// For compatibility with Web servers that don't expect three-byte sequences
833 /// in form submissions, the encoder doesn't generate three-byte sequences.
834 /// That is, the JIS X 0212 support is decode-only.
835 ///
836 /// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
837 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
838 ///
839 /// This encoding roughly matches the Windows code page 20932. There are error
840 /// handling differences and a handful of 2-byte sequences that decode differently.
841 /// Additionall, Windows doesn't support 3-byte sequences.
842 ///
843 /// This will change from `static` to `const` if Rust changes
844 /// to make the referent of `pub const FOO: &'static Encoding`
845 /// unique cross-crate, so don't take the address of this
846 /// `static`.
847 pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
848 
849 /// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
850 ///
851 /// For use only for taking the address of this form when
852 /// Rust prohibits the use of the non-`_INIT` form directly,
853 /// such as in initializers of other `static`s. If in doubt,
854 /// use the corresponding non-`_INIT` reference-typed `static`.
855 ///
856 /// This part of the public API will go away if Rust changes
857 /// to make the referent of `pub const FOO: &'static Encoding`
858 /// unique cross-crate or if Rust starts allowing static arrays
859 /// to be initialized with `pub static FOO: &'static Encoding`
860 /// items.
861 pub static EUC_KR_INIT: Encoding = Encoding {
862     name: "EUC-KR",
863     variant: VariantEncoding::EucKr,
864 };
865 
866 /// The EUC-KR encoding.
867 ///
868 /// This is the Korean encoding for Windows. It extends the Unix legacy encoding
869 /// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
870 /// Classic), with all the characters from the Hangul Syllables block of Unicode.
871 ///
872 /// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
873 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
874 ///
875 /// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
876 /// to U+0080 and some byte sequences that are error per the Encoding Standard to
877 /// the question mark or the Private Use Area.
878 ///
879 /// This will change from `static` to `const` if Rust changes
880 /// to make the referent of `pub const FOO: &'static Encoding`
881 /// unique cross-crate, so don't take the address of this
882 /// `static`.
883 pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
884 
885 /// The initializer for the [GBK](static.GBK.html) encoding.
886 ///
887 /// For use only for taking the address of this form when
888 /// Rust prohibits the use of the non-`_INIT` form directly,
889 /// such as in initializers of other `static`s. If in doubt,
890 /// use the corresponding non-`_INIT` reference-typed `static`.
891 ///
892 /// This part of the public API will go away if Rust changes
893 /// to make the referent of `pub const FOO: &'static Encoding`
894 /// unique cross-crate or if Rust starts allowing static arrays
895 /// to be initialized with `pub static FOO: &'static Encoding`
896 /// items.
897 pub static GBK_INIT: Encoding = Encoding {
898     name: "GBK",
899     variant: VariantEncoding::Gbk,
900 };
901 
902 /// The GBK encoding.
903 ///
904 /// The decoder for this encoding is the same as the decoder for gb18030.
905 /// The encoder side of this encoding is GBK with Windows code page 936 euro
906 /// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
907 /// Unicode block as well as a handful of ideographs from the CJK Unified
908 /// Ideographs Extension A and CJK Compatibility Ideographs blocks.
909 ///
910 /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
911 /// unified with the gb18030 encoder in the Encoding Standard out of concern
912 /// that servers that expect GBK form submissions might not be able to handle
913 /// the four-byte sequences.
914 ///
915 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
916 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
917 ///
918 /// The encoder of this encoding roughly matches the Windows code page 936.
919 /// The decoder side is a superset.
920 ///
921 /// This will change from `static` to `const` if Rust changes
922 /// to make the referent of `pub const FOO: &'static Encoding`
923 /// unique cross-crate, so don't take the address of this
924 /// `static`.
925 pub static GBK: &'static Encoding = &GBK_INIT;
926 
927 /// The initializer for the [IBM866](static.IBM866.html) encoding.
928 ///
929 /// For use only for taking the address of this form when
930 /// Rust prohibits the use of the non-`_INIT` form directly,
931 /// such as in initializers of other `static`s. If in doubt,
932 /// use the corresponding non-`_INIT` reference-typed `static`.
933 ///
934 /// This part of the public API will go away if Rust changes
935 /// to make the referent of `pub const FOO: &'static Encoding`
936 /// unique cross-crate or if Rust starts allowing static arrays
937 /// to be initialized with `pub static FOO: &'static Encoding`
938 /// items.
939 pub static IBM866_INIT: Encoding = Encoding {
940     name: "IBM866",
941     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
942 };
943 
944 /// The IBM866 encoding.
945 ///
946 /// This the most notable one of the DOS Cyrillic code pages. It has the same
947 /// box drawing characters as code page 437, so it can be used for decoding
948 /// DOS-era ASCII + box drawing data.
949 ///
950 /// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
951 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
952 ///
953 /// This encoding matches the Windows code page 866.
954 ///
955 /// This will change from `static` to `const` if Rust changes
956 /// to make the referent of `pub const FOO: &'static Encoding`
957 /// unique cross-crate, so don't take the address of this
958 /// `static`.
959 pub static IBM866: &'static Encoding = &IBM866_INIT;
960 
961 /// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
962 ///
963 /// For use only for taking the address of this form when
964 /// Rust prohibits the use of the non-`_INIT` form directly,
965 /// such as in initializers of other `static`s. If in doubt,
966 /// use the corresponding non-`_INIT` reference-typed `static`.
967 ///
968 /// This part of the public API will go away if Rust changes
969 /// to make the referent of `pub const FOO: &'static Encoding`
970 /// unique cross-crate or if Rust starts allowing static arrays
971 /// to be initialized with `pub static FOO: &'static Encoding`
972 /// items.
973 pub static ISO_2022_JP_INIT: Encoding = Encoding {
974     name: "ISO-2022-JP",
975     variant: VariantEncoding::Iso2022Jp,
976 };
977 
978 /// The ISO-2022-JP encoding.
979 ///
980 /// This the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
981 /// byte range to encode non-Basic Latin characters. It's the only encoding
982 /// supported by this crate whose encoder is stateful.
983 ///
984 /// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
985 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
986 ///
987 /// This encoding roughly matches the Windows code page 50220. Notably, Windows
988 /// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
989 /// error handling.
990 ///
991 /// This will change from `static` to `const` if Rust changes
992 /// to make the referent of `pub const FOO: &'static Encoding`
993 /// unique cross-crate, so don't take the address of this
994 /// `static`.
995 pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
996 
997 /// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
998 ///
999 /// For use only for taking the address of this form when
1000 /// Rust prohibits the use of the non-`_INIT` form directly,
1001 /// such as in initializers of other `static`s. If in doubt,
1002 /// use the corresponding non-`_INIT` reference-typed `static`.
1003 ///
1004 /// This part of the public API will go away if Rust changes
1005 /// to make the referent of `pub const FOO: &'static Encoding`
1006 /// unique cross-crate or if Rust starts allowing static arrays
1007 /// to be initialized with `pub static FOO: &'static Encoding`
1008 /// items.
1009 pub static ISO_8859_10_INIT: Encoding = Encoding {
1010     name: "ISO-8859-10",
1011     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1012 };
1013 
1014 /// The ISO-8859-10 encoding.
1015 ///
1016 /// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1017 /// is also known as Latin 6.
1018 ///
1019 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1020 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1021 ///
1022 /// The Windows code page number for this encoding is 28600, but kernel32.dll
1023 /// does not support this encoding.
1024 ///
1025 /// This will change from `static` to `const` if Rust changes
1026 /// to make the referent of `pub const FOO: &'static Encoding`
1027 /// unique cross-crate, so don't take the address of this
1028 /// `static`.
1029 pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1030 
1031 /// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1032 ///
1033 /// For use only for taking the address of this form when
1034 /// Rust prohibits the use of the non-`_INIT` form directly,
1035 /// such as in initializers of other `static`s. If in doubt,
1036 /// use the corresponding non-`_INIT` reference-typed `static`.
1037 ///
1038 /// This part of the public API will go away if Rust changes
1039 /// to make the referent of `pub const FOO: &'static Encoding`
1040 /// unique cross-crate or if Rust starts allowing static arrays
1041 /// to be initialized with `pub static FOO: &'static Encoding`
1042 /// items.
1043 pub static ISO_8859_13_INIT: Encoding = Encoding {
1044     name: "ISO-8859-13",
1045     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1046 };
1047 
1048 /// The ISO-8859-13 encoding.
1049 ///
1050 /// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1051 /// is also known as Latin 7.
1052 ///
1053 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1054 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1055 ///
1056 /// This encoding matches the Windows code page 28603, except Windows decodes
1057 /// unassigned code points to the Private Use Area of Unicode.
1058 ///
1059 /// This will change from `static` to `const` if Rust changes
1060 /// to make the referent of `pub const FOO: &'static Encoding`
1061 /// unique cross-crate, so don't take the address of this
1062 /// `static`.
1063 pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1064 
1065 /// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1066 ///
1067 /// For use only for taking the address of this form when
1068 /// Rust prohibits the use of the non-`_INIT` form directly,
1069 /// such as in initializers of other `static`s. If in doubt,
1070 /// use the corresponding non-`_INIT` reference-typed `static`.
1071 ///
1072 /// This part of the public API will go away if Rust changes
1073 /// to make the referent of `pub const FOO: &'static Encoding`
1074 /// unique cross-crate or if Rust starts allowing static arrays
1075 /// to be initialized with `pub static FOO: &'static Encoding`
1076 /// items.
1077 pub static ISO_8859_14_INIT: Encoding = Encoding {
1078     name: "ISO-8859-14",
1079     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1080 };
1081 
1082 /// The ISO-8859-14 encoding.
1083 ///
1084 /// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1085 /// is also known as Latin 8.
1086 ///
1087 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1088 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1089 ///
1090 /// The Windows code page number for this encoding is 28604, but kernel32.dll
1091 /// does not support this encoding.
1092 ///
1093 /// This will change from `static` to `const` if Rust changes
1094 /// to make the referent of `pub const FOO: &'static Encoding`
1095 /// unique cross-crate, so don't take the address of this
1096 /// `static`.
1097 pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1098 
1099 /// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1100 ///
1101 /// For use only for taking the address of this form when
1102 /// Rust prohibits the use of the non-`_INIT` form directly,
1103 /// such as in initializers of other `static`s. If in doubt,
1104 /// use the corresponding non-`_INIT` reference-typed `static`.
1105 ///
1106 /// This part of the public API will go away if Rust changes
1107 /// to make the referent of `pub const FOO: &'static Encoding`
1108 /// unique cross-crate or if Rust starts allowing static arrays
1109 /// to be initialized with `pub static FOO: &'static Encoding`
1110 /// items.
1111 pub static ISO_8859_15_INIT: Encoding = Encoding {
1112     name: "ISO-8859-15",
1113     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1114 };
1115 
1116 /// The ISO-8859-15 encoding.
1117 ///
1118 /// This is the revised Western European part of the ISO/IEC 8859 encoding
1119 /// family. This encoding is also known as Latin 9.
1120 ///
1121 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1122 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1123 ///
1124 /// This encoding matches the Windows code page 28605.
1125 ///
1126 /// This will change from `static` to `const` if Rust changes
1127 /// to make the referent of `pub const FOO: &'static Encoding`
1128 /// unique cross-crate, so don't take the address of this
1129 /// `static`.
1130 pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1131 
/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_16_INIT: Encoding = Encoding {
    name: "ISO-8859-16",
    // NOTE(review): the trailing `0x00DF, 95, 4` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
};
1148 
/// The ISO-8859-16 encoding.
///
/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 10.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
///
/// The Windows code page number for this encoding is 28606, but kernel32.dll
/// does not support this encoding.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_16`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1165 
/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_2_INIT: Encoding = Encoding {
    name: "ISO-8859-2",
    // NOTE(review): the trailing `0x00DF, 95, 1` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
};
1182 
/// The ISO-8859-2 encoding.
///
/// This is the Central European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 2.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
///
/// This encoding matches the Windows code page 28592.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_2`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1197 
/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_3_INIT: Encoding = Encoding {
    name: "ISO-8859-3",
    // NOTE(review): the trailing `0x00DF, 95, 4` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
};
1214 
/// The ISO-8859-3 encoding.
///
/// This is the South European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 3.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
///
/// This encoding matches the Windows code page 28593.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_3`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1229 
/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_4_INIT: Encoding = Encoding {
    name: "ISO-8859-4",
    // NOTE(review): the trailing `0x00DF, 95, 1` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
};
1246 
/// The ISO-8859-4 encoding.
///
/// This is the North European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 4.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
///
/// This encoding matches the Windows code page 28594.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_4`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1261 
/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_5_INIT: Encoding = Encoding {
    name: "ISO-8859-5",
    // NOTE(review): the trailing `0x040E, 46, 66` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
};
1278 
/// The ISO-8859-5 encoding.
///
/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
///
/// This encoding matches the Windows code page 28595.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_5`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1293 
/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_6_INIT: Encoding = Encoding {
    name: "ISO-8859-6",
    // NOTE(review): the trailing `0x0621, 65, 26` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
};
1310 
/// The ISO-8859-6 encoding.
///
/// This is the Arabic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
///
/// This encoding matches the Windows code page 28596, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_6`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1326 
/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_7_INIT: Encoding = Encoding {
    name: "ISO-8859-7",
    // NOTE(review): the trailing `0x03A3, 83, 44` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
};
1343 
/// The ISO-8859-7 encoding.
///
/// This is the Greek part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
///
/// This encoding roughly matches the Windows code page 28597. Windows decodes
/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_7`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1363 
/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_INIT: Encoding = Encoding {
    name: "ISO-8859-8",
    // NOTE(review): the trailing `0x05D0, 96, 27` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1380 
/// The ISO-8859-8 encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 28598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the Private Use Area.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_8`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1398 
/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
    name: "ISO-8859-8-I",
    // Uses the same `iso_8859_8` table and numeric arguments as
    // `ISO_8859_8_INIT`; only `name` differs between the two.
    // NOTE(review): the trailing `0x05D0, 96, 27` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1415 
/// The ISO-8859-8-I encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 38598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the Private Use Area.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&ISO_8859_8_I`); use the
/// `&'static Encoding` value it holds instead.
pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1433 
/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_R_INIT: Encoding = Encoding {
    name: "KOI8-R",
    // NOTE(review): the trailing `0x044E, 64, 1` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
};
1450 
/// The KOI8-R encoding.
///
/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
///
/// This encoding matches the Windows code page 20866.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&KOI8_R`); use the
/// `&'static Encoding` value it holds instead.
pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1465 
/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_U_INIT: Encoding = Encoding {
    name: "KOI8-U",
    // NOTE(review): the trailing `0x044E, 64, 1` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
};
1482 
/// The KOI8-U encoding.
///
/// This is an encoding for Ukrainian adapted from KOI8-R.
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
///
/// This encoding matches the Windows code page 21866.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&KOI8_U`); use the
/// `&'static Encoding` value it holds instead.
pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1497 
/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static SHIFT_JIS_INIT: Encoding = Encoding {
    name: "Shift_JIS",
    variant: VariantEncoding::ShiftJis,
};
1514 
/// The Shift_JIS encoding.
///
/// This is the Japanese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
///
/// This encoding matches the Windows code page 932, except Windows decodes some byte
/// sequences that are errors per the Encoding Standard to the question mark or the
/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&SHIFT_JIS`); use the
/// `&'static Encoding` value it holds instead.
pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1531 
/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16BE_INIT: Encoding = Encoding {
    name: "UTF-16BE",
    variant: VariantEncoding::Utf16Be,
};
1548 
/// The UTF-16BE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the big endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1201.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&UTF_16BE`); use the
/// `&'static Encoding` value it holds instead.
pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1565 
/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16LE_INIT: Encoding = Encoding {
    name: "UTF-16LE",
    variant: VariantEncoding::Utf16Le,
};
1582 
/// The UTF-16LE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the little endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1200.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&UTF_16LE`); use the
/// `&'static Encoding` value it holds instead.
pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1599 
/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_8_INIT: Encoding = Encoding {
    name: "UTF-8",
    variant: VariantEncoding::Utf8,
};
1616 
/// The UTF-8 encoding.
///
/// This is the encoding that should be used for all new development, as it
/// can represent all of Unicode.
///
/// This encoding matches the Windows code page 65001, except Windows differs
/// in the number of errors generated for some erroneous byte sequences.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&UTF_8`); use the
/// `&'static Encoding` value it holds instead.
pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1630 
/// The initializer for the [gb18030](static.GB18030.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GB18030_INIT: Encoding = Encoding {
    name: "gb18030",
    variant: VariantEncoding::Gb18030,
};
1647 
/// The gb18030 encoding.
///
/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
/// maps to U+3000 for compatibility with existing Web content. As a result,
/// this encoding can represent all of Unicode except for the private-use
/// character U+E5E5.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// This encoding matches the Windows code page 54936.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&GB18030`); use the
/// `&'static Encoding` value it holds instead.
pub static GB18030: &'static Encoding = &GB18030_INIT;
1665 
/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static MACINTOSH_INIT: Encoding = Encoding {
    name: "macintosh",
    // NOTE(review): the trailing `0x00CD, 106, 3` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
};
1682 
/// The macintosh encoding.
///
/// This is the MacRoman encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
///
/// This encoding matches the Windows code page 10000, except Windows decodes
/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&MACINTOSH`); use the
/// `&'static Encoding` value it holds instead.
pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1698 
/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static REPLACEMENT_INIT: Encoding = Encoding {
    name: "replacement",
    variant: VariantEncoding::Replacement,
};
1715 
/// The replacement encoding.
///
/// This decode-only encoding decodes all non-zero-length streams to a single
/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
/// ASCII-compatible fallback encoding (typically windows-1252) for some
/// encodings that are no longer supported by the Web Platform and that
/// would be dangerous to treat as ASCII-compatible.
///
/// There is no corresponding encoder. The output encoding of this encoding
/// is UTF-8.
///
/// This encoding does not have a Windows code page number.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&REPLACEMENT`); use the
/// `&'static Encoding` value it holds instead.
pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1734 
/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1250_INIT: Encoding = Encoding {
    name: "windows-1250",
    // NOTE(review): the trailing `0x00DC, 92, 2` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
};
1751 
/// The windows-1250 encoding.
///
/// This is the Central European encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
///
/// This encoding matches the Windows code page 1250.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&WINDOWS_1250`); use the
/// `&'static Encoding` value it holds instead.
pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1766 
/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1251_INIT: Encoding = Encoding {
    name: "windows-1251",
    // NOTE(review): the trailing `0x0410, 64, 64` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
};
1783 
/// The windows-1251 encoding.
///
/// This is the Cyrillic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
///
/// This encoding matches the Windows code page 1251.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static` itself (i.e. don't write `&WINDOWS_1251`); use the
/// `&'static Encoding` value it holds instead.
pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1798 
/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
///
/// Use this item only to take its address in contexts where
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1252_INIT: Encoding = Encoding {
    name: "windows-1252",
    // NOTE(review): the trailing `0x00A0, 32, 96` presumably describe a run of
    // consecutive code points in the table (first code point, offset of the
    // run in the table, run length); confirm against the definition of
    // `VariantEncoding::SingleByte` before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
};
1815 
/// The windows-1252 encoding.
///
/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
/// which is known as Latin 1.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
///
/// This encoding matches the Windows code page 1252.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`WINDOWS_1252_INIT`](static.WINDOWS_1252_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1831 
/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1253_INIT: Encoding = Encoding {
    name: "windows-1253",
    // NOTE(review): the numeric arguments after the table reference look like a
    // contiguous-run hint (first code point, offset into the table, run length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
};
1848 
/// The windows-1253 encoding.
///
/// This is the Greek encoding for Windows. It is mostly an extension of
/// ISO-8859-7, but U+0386 is mapped to a different byte.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
///
/// This encoding matches the Windows code page 1253, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1865 
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    name: "windows-1254",
    // NOTE(review): the numeric arguments after the table reference look like a
    // contiguous-run hint (first code point, offset into the table, run length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1882 
/// The windows-1254 encoding.
///
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1898 
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    name: "windows-1255",
    // NOTE(review): the numeric arguments after the table reference look like a
    // contiguous-run hint (first code point, offset into the table, run length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1915 
/// The windows-1255 encoding.
///
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1932 
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    name: "windows-1256",
    // NOTE(review): the numeric arguments after the table reference look like a
    // contiguous-run hint (first code point, offset into the table, run length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1949 
/// The windows-1256 encoding.
///
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1964 
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    name: "windows-1257",
    // NOTE(review): the numeric arguments after the table reference look like a
    // contiguous-run hint (first code point, offset into the table, run length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
1981 
/// The windows-1257 encoding.
///
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
1997 
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    name: "windows-1258",
    // NOTE(review): the numeric arguments after the table reference look like a
    // contiguous-run hint (first code point, offset into the table, run length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2014 
/// The windows-1258 encoding.
///
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in the Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2034 
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    name: "windows-874",
    // NOTE(review): the numeric arguments after the table reference look like a
    // contiguous-run hint (first code point, offset into the table, run length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2051 
/// The windows-874 encoding.
///
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2067 
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    name: "x-mac-cyrillic",
    // NOTE(review): the numeric arguments after the table reference look like a
    // contiguous-run hint (first code point, offset into the table, run length);
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2084 
/// The x-mac-cyrillic encoding.
///
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2099 
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    name: "x-user-defined",
    // Unlike the `SingleByte` initializers, this variant carries no table
    // reference or numeric arguments.
    variant: VariantEncoding::UserDefined,
};
2116 
/// The x-user-defined encoding.
///
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
///
/// In `static` initializers where Rust does not accept this reference-typed
/// form, use [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html) instead.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2131 
// Encoding labels, all lowercase, 219 entries.
//
// Entries are grouped by ascending length. Within one length they appear to
// be ordered by comparing bytes from the end of the label towards its start
// (e.g. "866" < "mac" < "koi" < "gbk").
// NOTE(review): the lookup code presumably depends on this exact order, so
// do not re-sort or edit by hand — confirm against the generator.
//
// The table appears to be index-aligned with `ENCODINGS_IN_LABEL_SORT`,
// which has the same number of entries.
static LABELS_SORTED: [&'static str; 219] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2353 
// Encodings listed in the order of `LABELS_SORTED`, 219 entries.
//
// Entry `i` appears to be the encoding that `LABELS_SORTED[i]` maps to
// (e.g. index 0 pairs "l1" with windows-1252 and the last index pairs
// "cseucpkdfmtjapanese" with EUC-JP).
// NOTE(review): the lookup code is not visible from here — confirm that it
// indexes both tables with the same position, and keep them in lockstep.
//
// The `_INIT` statics are referenced because, per their docs, the
// reference-typed statics cannot be used in initializers of other `static`s.
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2575 
2576 // END GENERATED CODE
2577 
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a `static`
/// for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    /// The name of the encoding, e.g. `"windows-1252"`.
    name: &'static str,
    /// Selects the kind of encoding together with any data it needs, e.g.
    /// `VariantEncoding::SingleByte` with its table or
    /// `VariantEncoding::UserDefined`.
    variant: VariantEncoding,
}
2644 
2645 impl Encoding {
2646     /// Implements the
2647     /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2648     /// algorithm.
2649     ///
2650     /// If, after ASCII-lowercasing and removing leading and trailing
2651     /// whitespace, the argument matches a label defined in the Encoding
2652     /// Standard, `Some(&'static Encoding)` representing the corresponding
2653     /// encoding is returned. If there is no match, `None` is returned.
2654     ///
2655     /// This is the right method to use if the action upon the method returning
2656     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2657     /// When the action upon the method returning `None` is not to proceed with
2658     /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2659     /// appropriate.
2660     ///
2661     /// The argument is of type `&[u8]` instead of `&str` to save callers
2662     /// that are extracting the label from a non-UTF-8 protocol the trouble
2663     /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2664     /// on it.)
2665     ///
2666     /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2667     pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2668         let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2669         let mut trimmed_pos = 0usize;
2670         let mut iter = label.into_iter();
2671         // before
2672         loop {
2673             match iter.next() {
2674                 None => {
2675                     return None;
2676                 }
2677                 Some(byte) => {
2678                     // The characters used in labels are:
2679                     // a-z (except q, but excluding it below seems excessive)
2680                     // 0-9
2681                     // . _ - :
2682                     match *byte {
2683                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2684                             continue;
2685                         }
2686                         b'A'...b'Z' => {
2687                             trimmed[trimmed_pos] = *byte + 0x20u8;
2688                             trimmed_pos = 1usize;
2689                             break;
2690                         }
2691                         b'a'...b'z' | b'0'...b'9' | b'-' | b'_' | b':' | b'.' => {
2692                             trimmed[trimmed_pos] = *byte;
2693                             trimmed_pos = 1usize;
2694                             break;
2695                         }
2696                         _ => {
2697                             return None;
2698                         }
2699                     }
2700                 }
2701             }
2702         }
2703         // inside
2704         loop {
2705             match iter.next() {
2706                 None => {
2707                     break;
2708                 }
2709                 Some(byte) => {
2710                     match *byte {
2711                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2712                             break;
2713                         }
2714                         b'A'...b'Z' => {
2715                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2716                                 // There's no encoding with a label this long
2717                                 return None;
2718                             }
2719                             trimmed[trimmed_pos] = *byte + 0x20u8;
2720                             trimmed_pos += 1usize;
2721                             continue;
2722                         }
2723                         b'a'...b'z' | b'0'...b'9' | b'-' | b'_' | b':' | b'.' => {
2724                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2725                                 // There's no encoding with a label this long
2726                                 return None;
2727                             }
2728                             trimmed[trimmed_pos] = *byte;
2729                             trimmed_pos += 1usize;
2730                             continue;
2731                         }
2732                         _ => {
2733                             return None;
2734                         }
2735                     }
2736                 }
2737             }
2738         }
2739         // after
2740         loop {
2741             match iter.next() {
2742                 None => {
2743                     break;
2744                 }
2745                 Some(byte) => {
2746                     match *byte {
2747                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2748                             continue;
2749                         }
2750                         _ => {
2751                             // There's no label with space in the middle
2752                             return None;
2753                         }
2754                     }
2755                 }
2756             }
2757         }
2758         let candidate = &trimmed[..trimmed_pos];
2759         match LABELS_SORTED.binary_search_by(|probe| {
2760             let bytes = probe.as_bytes();
2761             let c = bytes.len().cmp(&candidate.len());
2762             if c != Ordering::Equal {
2763                 return c;
2764             }
2765             let probe_iter = bytes.iter().rev();
2766             let candidate_iter = candidate.iter().rev();
2767             probe_iter.cmp(candidate_iter)
2768         }) {
2769             Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2770             Err(_) => None,
2771         }
2772     }
2773 
2774     /// This method behaves the same as `for_label()`, except when `for_label()`
2775     /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2776     ///
2777     /// This method is useful in scenarios where a fatal error is required
2778     /// upon invalid label, because in those cases the caller typically wishes
2779     /// to treat the labels that map to the replacement encoding as fatal
2780     /// errors, too.
2781     ///
2782     /// It is not OK to use this method when the action upon the method returning
2783     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2784     /// case, the `for_label()` method should be used instead in order to avoid
2785     /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2786     ///
2787     /// Available via the C wrapper.
2788     #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2789     pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2790         match Encoding::for_label(label) {
2791             None => None,
2792             Some(encoding) => {
2793                 if encoding == REPLACEMENT {
2794                     None
2795                 } else {
2796                     Some(encoding)
2797                 }
2798             }
2799         }
2800     }
2801 
2802     /// Performs non-incremental BOM sniffing.
2803     ///
2804     /// The argument must either be a buffer representing the entire input
2805     /// stream (non-streaming case) or a buffer representing at least the first
2806     /// three bytes of the input stream (streaming case).
2807     ///
2808     /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2809     /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2810     /// or UTF-16BE BOM or `None` otherwise.
2811     ///
2812     /// Available via the C wrapper.
2813     #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2814     pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2815         if buffer.starts_with(b"\xEF\xBB\xBF") {
2816             Some((UTF_8, 3))
2817         } else if buffer.starts_with(b"\xFF\xFE") {
2818             Some((UTF_16LE, 2))
2819         } else if buffer.starts_with(b"\xFE\xFF") {
2820             Some((UTF_16BE, 2))
2821         } else {
2822             None
2823         }
2824     }
2825 
2826     /// Returns the name of this encoding.
2827     ///
2828     /// This name is appropriate to return as-is from the DOM
2829     /// `document.characterSet` property.
2830     ///
2831     /// Available via the C wrapper.
2832     #[inline]
    pub fn name(&'static self) -> &'static str {
        // Plain read of the `name` field; nothing is computed or allocated.
        self.name
    }
2836 
2837     /// Checks whether the _output encoding_ of this encoding can encode every
2838     /// `char`. (Only true if the output encoding is UTF-8.)
2839     ///
2840     /// Available via the C wrapper.
2841     #[inline]
can_encode_everything(&'static self) -> bool2842     pub fn can_encode_everything(&'static self) -> bool {
2843         self.output_encoding() == UTF_8
2844     }
2845 
2846     /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2847     /// U+0000...U+007F and vice versa.
2848     ///
2849     /// Available via the C wrapper.
2850     #[inline]
is_ascii_compatible(&'static self) -> bool2851     pub fn is_ascii_compatible(&'static self) -> bool {
2852         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2853     }
2854 
2855     /// Checks whether this encoding maps one byte to one Basic Multilingual
2856     /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2857     /// vice versa (for mappable characters).
2858     ///
2859     /// `true` iff this encoding is on the list of [Legacy single-byte
2860     /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2861     /// in the spec or x-user-defined.
2862     ///
2863     /// Available via the C wrapper.
2864     #[inline]
    pub fn is_single_byte(&'static self) -> bool {
        // The answer is a property of the encoding's variant, so defer to it.
        self.variant.is_single_byte()
    }
2868 
2869     /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2870     /// U+0000...U+007F and vice versa.
2871     #[inline]
is_potentially_borrowable(&'static self) -> bool2872     fn is_potentially_borrowable(&'static self) -> bool {
2873         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2874     }
2875 
2876     /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2877     /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2878     ///
2879     /// Available via the C wrapper.
2880     #[inline]
output_encoding(&'static self) -> &'static Encoding2881     pub fn output_encoding(&'static self) -> &'static Encoding {
2882         if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2883             UTF_8
2884         } else {
2885             self
2886         }
2887     }
2888 
2889     /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2890     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2891     /// entire input is available as a single buffer (i.e. the end of the
2892     /// buffer marks the end of the stream).
2893     ///
2894     /// This method implements the (non-streaming version of) the
2895     /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2896     ///
2897     /// The second item in the returned tuple is the encoding that was actually
2898     /// used (which may differ from this encoding thanks to BOM sniffing).
2899     ///
2900     /// The third item in the returned tuple indicates whether there were
2901     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2902     ///
2903     /// _Note:_ It is wrong to use this when the input buffer represents only
2904     /// a segment of the input instead of the whole input. Use `new_decoder()`
2905     /// when decoding segmented input.
2906     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if there
    /// are no errors and potentially another one in the presence of errors.) The
2910     /// first allocation assumes jemalloc and may not be optimal with
2911     /// allocators that do not use power-of-two buckets. A borrow is performed
2912     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2913     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2914     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2915     /// transitions.
2916     ///
2917     /// # Panics
2918     ///
2919     /// If the size calculation for a heap-allocated backing buffer overflows
2920     /// `usize`.
2921     ///
2922     /// Available to Rust only.
2923     #[inline]
decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool)2924     pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2925         let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2926             Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2927             None => (self, bytes),
2928         };
2929         let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2930         (cow, encoding, had_errors)
2931     }
2932 
2933     /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2934     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2935     /// entire input is available as a single buffer (i.e. the end of the
2936     /// buffer marks the end of the stream).
2937     ///
2938     /// When invoked on `UTF_8`, this method implements the (non-streaming
2939     /// version of) the
2940     /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2941     /// concept.
2942     ///
2943     /// The second item in the returned pair indicates whether there were
2944     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2945     ///
2946     /// _Note:_ It is wrong to use this when the input buffer represents only
2947     /// a segment of the input instead of the whole input. Use
2948     /// `new_decoder_with_bom_removal()` when decoding segmented input.
2949     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if there
    /// are no errors and potentially another one in the presence of errors.) The
2953     /// first allocation assumes jemalloc and may not be optimal with
2954     /// allocators that do not use power-of-two buckets. A borrow is performed
2955     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2956     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2957     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2958     /// transitions.
2959     ///
2960     /// # Panics
2961     ///
2962     /// If the size calculation for a heap-allocated backing buffer overflows
2963     /// `usize`.
2964     ///
2965     /// Available to Rust only.
2966     #[inline]
decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)2967     pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2968         let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2969             &bytes[3..]
2970         } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2971             || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2972         {
2973             &bytes[2..]
2974         } else {
2975             bytes
2976         };
2977         self.decode_without_bom_handling(without_bom)
2978     }
2979 
2980     /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
2981     /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
2982     /// the entire input is available as a single buffer (i.e. the end of the
2983     /// buffer marks the end of the stream).
2984     ///
2985     /// When invoked on `UTF_8`, this method implements the (non-streaming
2986     /// version of) the
2987     /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
2988     /// spec concept.
2989     ///
2990     /// The second item in the returned pair indicates whether there were
2991     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2992     ///
2993     /// _Note:_ It is wrong to use this when the input buffer represents only
2994     /// a segment of the input instead of the whole input. Use
2995     /// `new_decoder_without_bom_handling()` when decoding segmented input.
2996     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if there
    /// are no errors and potentially another one in the presence of errors.) The
3000     /// first allocation assumes jemalloc and may not be optimal with
3001     /// allocators that do not use power-of-two buckets. A borrow is performed
3002     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3003     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3004     /// ISO-2022-JP and the input is entirely in the ASCII state without state
3005     /// transitions.
3006     ///
3007     /// # Panics
3008     ///
3009     /// If the size calculation for a heap-allocated backing buffer overflows
3010     /// `usize`.
3011     ///
3012     /// Available to Rust only.
decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)3013     pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3014         let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3015             let valid_up_to = if self == UTF_8 {
3016                 utf8_valid_up_to(bytes)
3017             } else if self == ISO_2022_JP {
3018                 iso_2022_jp_ascii_valid_up_to(bytes)
3019             } else {
3020                 ascii_valid_up_to(bytes)
3021             };
3022             if valid_up_to == bytes.len() {
3023                 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3024                 return (Cow::Borrowed(str), false);
3025             }
3026             let decoder = self.new_decoder_without_bom_handling();
3027 
3028             let rounded_without_replacement = checked_next_power_of_two(checked_add(
3029                 valid_up_to,
3030                 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3031             ));
3032             let with_replacement = checked_add(
3033                 valid_up_to,
3034                 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3035             );
3036             let mut string = String::with_capacity(
3037                 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3038             );
3039             unsafe {
3040                 let vec = string.as_mut_vec();
3041                 vec.set_len(valid_up_to);
3042                 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3043             }
3044             (decoder, string, valid_up_to)
3045         } else {
3046             let decoder = self.new_decoder_without_bom_handling();
3047             let rounded_without_replacement = checked_next_power_of_two(
3048                 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3049             );
3050             let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3051             let string = String::with_capacity(
3052                 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3053             );
3054             (decoder, string, 0)
3055         };
3056 
3057         let mut total_had_errors = false;
3058         loop {
3059             let (result, read, had_errors) =
3060                 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3061             total_read += read;
3062             total_had_errors |= had_errors;
3063             match result {
3064                 CoderResult::InputEmpty => {
3065                     debug_assert_eq!(total_read, bytes.len());
3066                     return (Cow::Owned(string), total_had_errors);
3067                 }
3068                 CoderResult::OutputFull => {
3069                     // Allocate for the worst case. That is, we should come
3070                     // here at most once per invocation of this method.
3071                     let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3072                     string.reserve(needed.unwrap());
3073                 }
3074             }
3075         }
3076     }
3077 
3078     /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3079     /// _with malformed sequences treated as fatal_ when the entire input is
3080     /// available as a single buffer (i.e. the end of the buffer marks the end
3081     /// of the stream).
3082     ///
3083     /// When invoked on `UTF_8`, this method implements the (non-streaming
3084     /// version of) the
3085     /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3086     /// spec concept.
3087     ///
3088     /// Returns `None` if a malformed sequence was encountered and the result
3089     /// of the decode as `Some(String)` otherwise.
3090     ///
3091     /// _Note:_ It is wrong to use this when the input buffer represents only
3092     /// a segment of the input instead of the whole input. Use
3093     /// `new_decoder_without_bom_handling()` when decoding segmented input.
3094     ///
3095     /// This method performs a single heap allocation for the backing
3096     /// buffer of the `String` when unable to borrow. A borrow is performed if
3097     /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3098     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3099     /// ISO-2022-JP and the input is entirely in the ASCII state without state
3100     /// transitions.
3101     ///
3102     /// # Panics
3103     ///
3104     /// If the size calculation for a heap-allocated backing buffer overflows
3105     /// `usize`.
3106     ///
3107     /// Available to Rust only.
decode_without_bom_handling_and_without_replacement<'a>( &'static self, bytes: &'a [u8], ) -> Option<Cow<'a, str>>3108     pub fn decode_without_bom_handling_and_without_replacement<'a>(
3109         &'static self,
3110         bytes: &'a [u8],
3111     ) -> Option<Cow<'a, str>> {
3112         if self == UTF_8 {
3113             let valid_up_to = utf8_valid_up_to(bytes);
3114             if valid_up_to == bytes.len() {
3115                 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3116                 return Some(Cow::Borrowed(str));
3117             }
3118             return None;
3119         }
3120         let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3121             let valid_up_to = if self == ISO_2022_JP {
3122                 iso_2022_jp_ascii_valid_up_to(bytes)
3123             } else {
3124                 ascii_valid_up_to(bytes)
3125             };
3126             if valid_up_to == bytes.len() {
3127                 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3128                 return Some(Cow::Borrowed(str));
3129             }
3130             let decoder = self.new_decoder_without_bom_handling();
3131             let mut string = String::with_capacity(
3132                 checked_add(
3133                     valid_up_to,
3134                     decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3135                 )
3136                 .unwrap(),
3137             );
3138             unsafe {
3139                 let vec = string.as_mut_vec();
3140                 vec.set_len(valid_up_to);
3141                 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3142             }
3143             (decoder, string, &bytes[valid_up_to..])
3144         } else {
3145             let decoder = self.new_decoder_without_bom_handling();
3146             let string = String::with_capacity(
3147                 decoder
3148                     .max_utf8_buffer_length_without_replacement(bytes.len())
3149                     .unwrap(),
3150             );
3151             (decoder, string, bytes)
3152         };
3153         let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3154         match result {
3155             DecoderResult::InputEmpty => {
3156                 debug_assert_eq!(read, input.len());
3157                 Some(Cow::Owned(string))
3158             }
3159             DecoderResult::Malformed(_, _) => None,
3160             DecoderResult::OutputFull => unreachable!(),
3161         }
3162     }
3163 
3164     /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3165     /// replaced with decimal numeric character references when the entire input
3166     /// is available as a single buffer (i.e. the end of the buffer marks the
3167     /// end of the stream).
3168     ///
3169     /// This method implements the (non-streaming version of) the
3170     /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3171     /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3172     /// spec concept, it is slightly more efficient to use
3173     /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3174     /// method on `UTF_8`.
3175     ///
3176     /// The second item in the returned tuple is the encoding that was actually
3177     /// used (which may differ from this encoding thanks to some encodings
3178     /// having UTF-8 as their output encoding).
3179     ///
3180     /// The third item in the returned tuple indicates whether there were
3181     /// unmappable characters (that were replaced with HTML numeric character
3182     /// references).
3183     ///
3184     /// _Note:_ It is wrong to use this when the input buffer represents only
3185     /// a segment of the input instead of the whole input. Use `new_encoder()`
3186     /// when encoding segmented output.
3187     ///
    /// When encoding to UTF-8 or when encoding an ASCII-only input to an
    /// ASCII-compatible encoding, this method returns a borrow of the input
3190     /// without a heap allocation. Otherwise, this method performs a single
3191     /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3192     /// unmappable characters and potentially multiple heap allocations if
3193     /// there are. These allocations are tuned for jemalloc and may not be
3194     /// optimal when using a different allocator that doesn't use power-of-two
3195     /// buckets.
3196     ///
3197     /// # Panics
3198     ///
3199     /// If the size calculation for a heap-allocated backing buffer overflows
3200     /// `usize`.
3201     ///
3202     /// Available to Rust only.
encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool)3203     pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3204         let output_encoding = self.output_encoding();
3205         if output_encoding == UTF_8 {
3206             return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3207         }
3208         debug_assert!(output_encoding.is_potentially_borrowable());
3209         let bytes = string.as_bytes();
3210         let valid_up_to = if output_encoding == ISO_2022_JP {
3211             iso_2022_jp_ascii_valid_up_to(bytes)
3212         } else {
3213             ascii_valid_up_to(bytes)
3214         };
3215         if valid_up_to == bytes.len() {
3216             return (Cow::Borrowed(bytes), output_encoding, false);
3217         }
3218         let mut encoder = output_encoding.new_encoder();
3219         let mut vec: Vec<u8> = Vec::with_capacity(
3220             (checked_add(
3221                 valid_up_to,
3222                 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3223             ))
3224             .unwrap()
3225             .next_power_of_two(),
3226         );
3227         unsafe {
3228             vec.set_len(valid_up_to);
3229             std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3230         }
3231         let mut total_read = valid_up_to;
3232         let mut total_had_errors = false;
3233         loop {
3234             let (result, read, had_errors) =
3235                 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3236             total_read += read;
3237             total_had_errors |= had_errors;
3238             match result {
3239                 CoderResult::InputEmpty => {
3240                     debug_assert_eq!(total_read, string.len());
3241                     return (Cow::Owned(vec), output_encoding, total_had_errors);
3242                 }
3243                 CoderResult::OutputFull => {
3244                     // reserve_exact wants to know how much more on top of current
3245                     // length--not current capacity.
3246                     let needed = encoder
3247                         .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3248                     let rounded = (checked_add(vec.capacity(), needed))
3249                         .unwrap()
3250                         .next_power_of_two();
3251                     let additional = rounded - vec.len();
3252                     vec.reserve_exact(additional);
3253                 }
3254             }
3255         }
3256     }
3257 
    /// Creates the variant-specific decoder for this encoding by delegating
    /// to its `variant`. The `new_decoder*()` constructors wrap the returned
    /// value in a `Decoder`.
    fn new_variant_decoder(&'static self) -> VariantDecoder {
        self.variant.new_variant_decoder()
    }
3261 
3262     /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3263     ///
3264     /// BOM sniffing may cause the returned decoder to morph into a decoder
3265     /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3266     ///
3267     /// Available via the C wrapper.
3268     #[inline]
    pub fn new_decoder(&'static self) -> Decoder {
        // `BomHandling::Sniff` is what distinguishes this constructor from
        // its two siblings.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
    }
3272 
3273     /// Instantiates a new decoder for this encoding with BOM removal.
3274     ///
3275     /// If the input starts with bytes that are the BOM for this encoding,
3276     /// those bytes are removed. However, the decoder never morphs into a
3277     /// decoder for another encoding: A BOM for another encoding is treated as
3278     /// (potentially malformed) input to the decoding algorithm for this
3279     /// encoding.
3280     ///
3281     /// Available via the C wrapper.
3282     #[inline]
    pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
        // `BomHandling::Remove` is what distinguishes this constructor from
        // its two siblings.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
    }
3286 
3287     /// Instantiates a new decoder for this encoding with BOM handling disabled.
3288     ///
3289     /// If the input starts with bytes that look like a BOM, those bytes are
3290     /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3291     /// for another encoding.)
3292     ///
3293     /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3294     /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3295     /// instead of this method to cause the BOM to be removed.
3296     ///
3297     /// Available via the C wrapper.
3298     #[inline]
    pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
        // `BomHandling::Off` is what distinguishes this constructor from
        // its two siblings.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
    }
3302 
3303     /// Instantiates a new encoder for the output encoding of this encoding.
3304     ///
3305     /// Available via the C wrapper.
3306     #[inline]
new_encoder(&'static self) -> Encoder3307     pub fn new_encoder(&'static self) -> Encoder {
3308         let enc = self.output_encoding();
3309         enc.variant.new_encoder(enc)
3310     }
3311 
3312     /// Validates UTF-8.
3313     ///
3314     /// Returns the index of the first byte that makes the input malformed as
3315     /// UTF-8 or the length of the slice if the slice is entirely valid.
3316     ///
3317     /// This is currently faster than the corresponding standard library
3318     /// functionality. If this implementation gets upstreamed to the standard
3319     /// library, this method may be removed in the future.
3320     ///
3321     /// Available via the C wrapper.
    pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
        // Not recursion: an unqualified call does not resolve to this
        // associated function but to the free function of the same name
        // that is in scope in this module.
        utf8_valid_up_to(bytes)
    }
3325 
3326     /// Validates ASCII.
3327     ///
3328     /// Returns the index of the first byte that makes the input malformed as
3329     /// ASCII or the length of the slice if the slice is entirely valid.
3330     ///
3331     /// Available via the C wrapper.
    pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
        // Not recursion: an unqualified call does not resolve to this
        // associated function but to the free function of the same name
        // that is in scope in this module.
        ascii_valid_up_to(bytes)
    }
3335 
3336     /// Validates ISO-2022-JP ASCII-state data.
3337     ///
3338     /// Returns the index of the first byte that makes the input not
3339     /// representable in the ASCII state of ISO-2022-JP or the length of the
3340     /// slice if the slice is entirely representable in the ASCII state of
3341     /// ISO-2022-JP.
3342     ///
3343     /// Available via the C wrapper.
    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
        // Not recursion: an unqualified call does not resolve to this
        // associated function but to the free function of the same name
        // that is in scope in this module.
        iso_2022_jp_ascii_valid_up_to(bytes)
    }
3347 }
3348 
3349 impl PartialEq for Encoding {
3350     #[inline]
eq(&self, other: &Encoding) -> bool3351     fn eq(&self, other: &Encoding) -> bool {
3352         (self as *const Encoding) == (other as *const Encoding)
3353     }
3354 }
3355 
// Equality on `Encoding` is address identity (see the `PartialEq` impl), which
// is reflexive, symmetric and transitive, so the `Eq` marker trait applies.
impl Eq for Encoding {}
3357 
3358 impl Hash for Encoding {
3359     #[inline]
hash<H: Hasher>(&self, state: &mut H)3360     fn hash<H: Hasher>(&self, state: &mut H) {
3361         (self as *const Encoding).hash(state);
3362     }
3363 }
3364 
3365 impl std::fmt::Debug for Encoding {
3366     #[inline]
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result3367     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
3368         write!(f, "Encoding {{ {} }}", self.name)
3369     }
3370 }
3371 
/// Serializes an `Encoding` as a string holding the value of its `name`
/// field. Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.serialize_str(self.name)
    }
}
3382 
/// Stateless serde visitor whose `Visitor::Value` is `&'static Encoding`.
/// Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3385 
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Resolves `value` with `Encoding::for_label()`, reporting a custom
    /// deserialization error when the label is not recognized.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format!("invalid encoding label: {}", value)))
    }
}
3405 
/// Deserializes a `&'static Encoding` by requesting a string from the
/// deserializer and handing it to `EncodingVisitor`. Only compiled when the
/// `serde` feature is enabled.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_str(EncodingVisitor)
    }
}
3415 
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// `Decoder::new` chooses the initial state from the requested `BomHandling`:
/// `AtStart` when sniffing, `AtUtf8Start` / `AtUtf16BeStart` /
/// `AtUtf16LeStart` when the BOM of the decoder's own encoding is to be
/// removed and that encoding is UTF-8 / UTF-16BE / UTF-16LE, and `Converting`
/// in every other case. The buffer-length queries on `Decoder` panic when the
/// state is `Finished`.
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE.
    AtUtf16LeStart,
    /// The decoder has seen EF.
    SeenUtf8First,
    /// The decoder has seen EF, BB.
    SeenUtf8Second,
    /// The decoder has seen FE.
    SeenUtf16BeFirst,
    /// The decoder has seen FF.
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen.
    Converting,
    /// EOF has been seen.
    Finished,
}
3444 
/// Communicate the BOM handling mode.
///
/// Passed to `Decoder::new`, which uses it to select the decoder's initial
/// `DecoderLifeCycle` state.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM (the decoder starts out in `Converting`).
    Off,
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM (the decoder starts out in
    /// `AtStart`).
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding (the decoder
    /// starts out in the matching `At*Start` state for UTF-8, UTF-16BE and
    /// UTF-16LE, and in `Converting` for any other encoding).
    Remove,
}
3455 
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// Unlike `DecoderResult`, there is no variant for malformed input:
/// `Decoder::decode_to_utf8`, for example, writes a REPLACEMENT CHARACTER for
/// each malformed sequence and keeps going instead of returning.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3475 
/// Result of a (potentially partial) decode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3512 
/// A converter that decodes a byte stream into Unicode according to a
/// character encoding in a streaming (incremental) manner.
///
/// The various `decode_*` methods take an input buffer (`src`) and an output
/// buffer (`dst`), both of which are caller-allocated. There are variants for
/// both UTF-8 and UTF-16 output buffers.
///
/// A `decode_*` method decodes bytes from `src` into Unicode characters stored
/// into `dst` until one of the following three things happens:
///
/// 1. A malformed byte sequence is encountered (`*_without_replacement`
///    variants only).
///
/// 2. The output buffer has been filled so near capacity that the decoder
///    cannot be sure that processing an additional byte of input wouldn't
///    cause so much output that the output buffer would overflow.
///
/// 3. All the input bytes have been processed.
///
/// The `decode_*` method then returns a tuple of a status indicating which one
/// of the three reasons to return happened, how many input bytes were read,
/// how many output code units (`u8` when decoding into UTF-8 and `u16`
/// when decoding to UTF-16) were written (except when decoding into `String`,
/// whose length change indicates this), and in the case of the
/// variants performing replacement, a boolean indicating whether an error was
/// replaced with the REPLACEMENT CHARACTER during the call.
///
/// The number of bytes "written" is what's logically written. Garbage may be
/// written in the output buffer beyond the point logically written to.
/// Therefore, if you wish to decode into an `&mut str`, you should use the
/// methods that take an `&mut str` argument instead of the ones that take an
/// `&mut [u8]` argument. The former take care of overwriting the trailing
/// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
/// latter don't.
///
/// In the case of the `*_without_replacement` variants, the status is a
/// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
/// `InputEmpty` corresponding to the three cases listed above).
///
/// In the case of methods whose name does not end with
/// `*_without_replacement`, malformed sequences are automatically replaced
/// with the REPLACEMENT CHARACTER and errors do not cause the methods to
/// return early.
///
/// When decoding to UTF-8, the output buffer must have at least 4 bytes of
/// space. When decoding to UTF-16, the output buffer must have at least two
/// UTF-16 code units (`u16`) of space.
///
/// When decoding to UTF-8 without replacement, the methods are guaranteed
/// not to return indicating that more output space is needed if the length
/// of the output buffer is at least the length returned by
/// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
/// with replacement, the length of the output buffer that guarantees the
/// methods not to return indicating that more output space is needed is given
/// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
/// or without replacement, the length of the output buffer that guarantees
/// the methods not to return indicating that more output space is needed is
/// given by [`max_utf16_buffer_length()`][4].
///
/// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
/// and the output after each `decode_*` call is guaranteed to consist of
/// complete characters. (I.e. the code unit sequence for the last character is
/// guaranteed not to be split across output buffers.)
///
/// The boolean argument `last` indicates that the end of the stream is reached
/// when all the bytes in `src` have been consumed.
///
/// A `Decoder` object can be used to incrementally decode a byte stream.
///
/// During the processing of a single stream, the caller must call `decode_*`
/// zero or more times with `last` set to `false` and then call `decode_*` at
/// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
/// the processing of the stream has ended. Otherwise, the caller must call
/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
/// a fatal error).
///
/// Once the stream has ended, the `Decoder` object must not be used anymore.
/// That is, you need to create another one to process another stream.
///
/// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
/// the caller does not wish to treat it as a fatal error, the input buffer
/// `src` may not have been completely consumed. In that case, the caller must
/// pass the unconsumed contents of `src` to `decode_*` again upon the next
/// call.
///
/// [1]: enum.DecoderResult.html
/// [2]: #method.max_utf8_buffer_length_without_replacement
/// [3]: #method.max_utf8_buffer_length
/// [4]: #method.max_utf16_buffer_length
///
/// # Infinite loops
///
/// When converting with a fixed-size output buffer whose size is too small to
/// accommodate one character or (when applicable) one numeric character
/// reference of output, an infinite loop ensues. When converting with a
/// fixed-size output buffer, it generally makes sense to make the buffer
/// fairly large (e.g. couple of kilobytes).
pub struct Decoder {
    /// The encoding reported by `encoding()`. Per that method's
    /// documentation, BOM sniffing can change it during the decoder's life.
    encoding: &'static Encoding,
    /// The encoding-specific decoder that the buffer-length queries delegate
    /// to.
    variant: VariantDecoder,
    /// Where the decoder currently is between BOM handling, conversion and
    /// end of stream.
    life_cycle: DecoderLifeCycle,
}
3615 
3616 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3617     fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3618         Decoder {
3619             encoding: enc,
3620             variant: decoder,
3621             life_cycle: match sniffing {
3622                 BomHandling::Off => DecoderLifeCycle::Converting,
3623                 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3624                 BomHandling::Remove => {
3625                     if enc == UTF_8 {
3626                         DecoderLifeCycle::AtUtf8Start
3627                     } else if enc == UTF_16BE {
3628                         DecoderLifeCycle::AtUtf16BeStart
3629                     } else if enc == UTF_16LE {
3630                         DecoderLifeCycle::AtUtf16LeStart
3631                     } else {
3632                         DecoderLifeCycle::Converting
3633                     }
3634                 }
3635             },
3636         }
3637     }
3638 
    /// The `Encoding` this `Decoder` is for.
    ///
    /// BOM sniffing can change the return value of this method during the life
    /// of the decoder.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        // Plain read of the `encoding` field; no computation is involved.
        self.encoding
    }
3649 
3650     /// Query the worst-case UTF-8 output size _with replacement_.
3651     ///
3652     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3653     /// that will not overflow given the current state of the decoder and
3654     /// `byte_length` number of additional input bytes when decoding with
3655     /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3656     /// sequence or `None` if `usize` would overflow.
3657     ///
3658     /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3659     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3660         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3661         // BOM getting pushed to the underlying decoder.
3662         match self.life_cycle {
3663             DecoderLifeCycle::Converting
3664             | DecoderLifeCycle::AtUtf8Start
3665             | DecoderLifeCycle::AtUtf16LeStart
3666             | DecoderLifeCycle::AtUtf16BeStart => {
3667                 return self.variant.max_utf8_buffer_length(byte_length);
3668             }
3669             DecoderLifeCycle::AtStart => {
3670                 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3671                     if let Some(utf16_bom) = checked_add(
3672                         1,
3673                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3674                     ) {
3675                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3676                         let encoding = self.encoding();
3677                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3678                             // No need to consider the internal state of the underlying decoder,
3679                             // because it is at start, because no data has reached it yet.
3680                             return Some(utf_bom);
3681                         } else if let Some(non_bom) =
3682                             self.variant.max_utf8_buffer_length(byte_length)
3683                         {
3684                             return Some(std::cmp::max(utf_bom, non_bom));
3685                         }
3686                     }
3687                 }
3688             }
3689             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3690                 // Add two bytes even when only one byte has been seen,
3691                 // because the one byte can become a lead byte in multibyte
3692                 // decoders, but only after the decoder has been queried
3693                 // for max length, so the decoder's own logic for adding
3694                 // one for a pending lead cannot work.
3695                 if let Some(sum) = byte_length.checked_add(2) {
3696                     if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3697                         if self.encoding() == UTF_8 {
3698                             // No need to consider the internal state of the underlying decoder,
3699                             // because it is at start, because no data has reached it yet.
3700                             return Some(utf8_bom);
3701                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3702                             return Some(std::cmp::max(utf8_bom, non_bom));
3703                         }
3704                     }
3705                 }
3706             }
3707             DecoderLifeCycle::ConvertingWithPendingBB => {
3708                 if let Some(sum) = byte_length.checked_add(2) {
3709                     return self.variant.max_utf8_buffer_length(sum);
3710                 }
3711             }
3712             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3713                 // Add two bytes even when only one byte has been seen,
3714                 // because the one byte can become a lead byte in multibyte
3715                 // decoders, but only after the decoder has been queried
3716                 // for max length, so the decoder's own logic for adding
3717                 // one for a pending lead cannot work.
3718                 if let Some(sum) = byte_length.checked_add(2) {
3719                     if let Some(utf16_bom) =
3720                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3721                     {
3722                         let encoding = self.encoding();
3723                         if encoding == UTF_16LE || encoding == UTF_16BE {
3724                             // No need to consider the internal state of the underlying decoder,
3725                             // because it is at start, because no data has reached it yet.
3726                             return Some(utf16_bom);
3727                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3728                             return Some(std::cmp::max(utf16_bom, non_bom));
3729                         }
3730                     }
3731                 }
3732             }
3733             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3734         }
3735         None
3736     }
3737 
3738     /// Query the worst-case UTF-8 output size _without replacement_.
3739     ///
3740     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3741     /// that will not overflow given the current state of the decoder and
3742     /// `byte_length` number of additional input bytes when decoding without
3743     /// replacement error handling or `None` if `usize` would overflow.
3744     ///
3745     /// Note that this value may be too small for the `_with_replacement` case.
3746     /// Use `max_utf8_buffer_length()` for that case.
3747     ///
3748     /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3749     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3750         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3751         // BOM getting pushed to the underlying decoder.
3752         match self.life_cycle {
3753             DecoderLifeCycle::Converting
3754             | DecoderLifeCycle::AtUtf8Start
3755             | DecoderLifeCycle::AtUtf16LeStart
3756             | DecoderLifeCycle::AtUtf16BeStart => {
3757                 return self
3758                     .variant
3759                     .max_utf8_buffer_length_without_replacement(byte_length);
3760             }
3761             DecoderLifeCycle::AtStart => {
3762                 if let Some(utf8_bom) = byte_length.checked_add(3) {
3763                     if let Some(utf16_bom) = checked_add(
3764                         1,
3765                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3766                     ) {
3767                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3768                         let encoding = self.encoding();
3769                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3770                             // No need to consider the internal state of the underlying decoder,
3771                             // because it is at start, because no data has reached it yet.
3772                             return Some(utf_bom);
3773                         } else if let Some(non_bom) = self
3774                             .variant
3775                             .max_utf8_buffer_length_without_replacement(byte_length)
3776                         {
3777                             return Some(std::cmp::max(utf_bom, non_bom));
3778                         }
3779                     }
3780                 }
3781             }
3782             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3783                 // Add two bytes even when only one byte has been seen,
3784                 // because the one byte can become a lead byte in multibyte
3785                 // decoders, but only after the decoder has been queried
3786                 // for max length, so the decoder's own logic for adding
3787                 // one for a pending lead cannot work.
3788                 if let Some(sum) = byte_length.checked_add(2) {
3789                     if let Some(utf8_bom) = sum.checked_add(3) {
3790                         if self.encoding() == UTF_8 {
3791                             // No need to consider the internal state of the underlying decoder,
3792                             // because it is at start, because no data has reached it yet.
3793                             return Some(utf8_bom);
3794                         } else if let Some(non_bom) =
3795                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3796                         {
3797                             return Some(std::cmp::max(utf8_bom, non_bom));
3798                         }
3799                     }
3800                 }
3801             }
3802             DecoderLifeCycle::ConvertingWithPendingBB => {
3803                 if let Some(sum) = byte_length.checked_add(2) {
3804                     return self.variant.max_utf8_buffer_length_without_replacement(sum);
3805                 }
3806             }
3807             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3808                 // Add two bytes even when only one byte has been seen,
3809                 // because the one byte can become a lead byte in multibyte
3810                 // decoders, but only after the decoder has been queried
3811                 // for max length, so the decoder's own logic for adding
3812                 // one for a pending lead cannot work.
3813                 if let Some(sum) = byte_length.checked_add(2) {
3814                     if let Some(utf16_bom) =
3815                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3816                     {
3817                         let encoding = self.encoding();
3818                         if encoding == UTF_16LE || encoding == UTF_16BE {
3819                             // No need to consider the internal state of the underlying decoder,
3820                             // because it is at start, because no data has reached it yet.
3821                             return Some(utf16_bom);
3822                         } else if let Some(non_bom) =
3823                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3824                         {
3825                             return Some(std::cmp::max(utf16_bom, non_bom));
3826                         }
3827                     }
3828                 }
3829             }
3830             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3831         }
3832         None
3833     }
3834 
3835     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3836     /// replaced with the REPLACEMENT CHARACTER.
3837     ///
3838     /// See the documentation of the struct for documentation for `decode_*`
3839     /// methods collectively.
3840     ///
3841     /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3842     pub fn decode_to_utf8(
3843         &mut self,
3844         src: &[u8],
3845         dst: &mut [u8],
3846         last: bool,
3847     ) -> (CoderResult, usize, usize, bool) {
3848         let mut had_errors = false;
3849         let mut total_read = 0usize;
3850         let mut total_written = 0usize;
3851         loop {
3852             let (result, read, written) = self.decode_to_utf8_without_replacement(
3853                 &src[total_read..],
3854                 &mut dst[total_written..],
3855                 last,
3856             );
3857             total_read += read;
3858             total_written += written;
3859             match result {
3860                 DecoderResult::InputEmpty => {
3861                     return (
3862                         CoderResult::InputEmpty,
3863                         total_read,
3864                         total_written,
3865                         had_errors,
3866                     );
3867                 }
3868                 DecoderResult::OutputFull => {
3869                     return (
3870                         CoderResult::OutputFull,
3871                         total_read,
3872                         total_written,
3873                         had_errors,
3874                     );
3875                 }
3876                 DecoderResult::Malformed(_, _) => {
3877                     had_errors = true;
3878                     // There should always be space for the U+FFFD, because
3879                     // otherwise we'd have gotten OutputFull already.
3880                     // XXX: is the above comment actually true for UTF-8 itself?
3881                     // TODO: Consider having fewer bound checks here.
3882                     dst[total_written] = 0xEFu8;
3883                     total_written += 1;
3884                     dst[total_written] = 0xBFu8;
3885                     total_written += 1;
3886                     dst[total_written] = 0xBDu8;
3887                     total_written += 1;
3888                 }
3889             }
3890         }
3891     }
3892 
3893     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3894     /// replaced with the REPLACEMENT CHARACTER with type system signaling
3895     /// of UTF-8 validity.
3896     ///
3897     /// This methods calls `decode_to_utf8` and then zeroes
3898     /// out up to three bytes that aren't logically part of the write in order
3899     /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3900     ///
3901     /// See the documentation of the struct for documentation for `decode_*`
3902     /// methods collectively.
3903     ///
3904     /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3905     pub fn decode_to_str(
3906         &mut self,
3907         src: &[u8],
3908         dst: &mut str,
3909         last: bool,
3910     ) -> (CoderResult, usize, usize, bool) {
3911         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3912         let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3913         let len = bytes.len();
3914         let mut trail = written;
3915         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3916         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3917         // encodings to avoid overwriting here.
3918         if self.encoding != UTF_8 {
3919             let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3920             while trail < max {
3921                 bytes[trail] = 0;
3922                 trail += 1;
3923             }
3924         }
3925         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3926             bytes[trail] = 0;
3927             trail += 1;
3928         }
3929         (result, read, written, replaced)
3930     }
3931 
    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
    /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
    ///
    /// Like the others, this method follows the logic that the output buffer is
    /// caller-allocated. This method treats the capacity of the `String` as
    /// the output limit. That is, this method guarantees not to cause a
    /// reallocation of the backing buffer of `String`.
    ///
    /// The return value is a tuple that contains the `CoderResult`, the
    /// number of bytes read and a boolean indicating whether replacements
    /// were done. The number of bytes written is signaled via the length of
    /// the `String` changing.
    ///
    /// See the documentation of the struct for documentation for `decode_*`
    /// methods collectively.
    ///
    /// Available to Rust only.
    pub fn decode_to_string(
        &mut self,
        src: &[u8],
        dst: &mut String,
        last: bool,
    ) -> (CoderResult, usize, bool) {
        unsafe {
            let vec = dst.as_mut_vec();
            let old_len = vec.len();
            let capacity = vec.capacity();
            // Temporarily extend the length to the full capacity so that the
            // spare capacity after the existing content can be passed to
            // `decode_to_utf8` as the output slice.
            // NOTE(review): the length stays at `capacity` if the call below
            // panics, and the spare capacity is presumably uninitialized at
            // this point -- confirm that both are acceptable.
            vec.set_len(capacity);
            let (result, read, written, replaced) =
                self.decode_to_utf8(src, &mut vec[old_len..], last);
            // Keep only the existing content plus what was logically written.
            vec.set_len(old_len + written);
            (result, read, replaced)
        }
    }
3966 
    // NOTE(review): the macro definition is not visible from here. Presumably
    // this expands to the public `decode_to_utf8_without_replacement` method
    // (carrying the doc comment below) plus the helper methods named in the
    // remaining arguments, with `u8` as the output code unit type -- confirm
    // against the definition of `public_decode_function!`.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf8_without_replacement,
                            decode_to_utf8_raw,
                            decode_to_utf8_checking_end,
                            decode_to_utf8_after_one_potential_bom_byte,
                            decode_to_utf8_after_two_potential_bom_bytes,
                            decode_to_utf8_checking_end_with_offset,
                            u8);
3983 
3984     /// Incrementally decode a byte stream into UTF-8 with type system signaling
3985     /// of UTF-8 validity.
3986     ///
3987     /// This methods calls `decode_to_utf8` and then zeroes out up to three
3988     /// bytes that aren't logically part of the write in order to retain the
3989     /// UTF-8 validity even for the unwritten part of the buffer.
3990     ///
3991     /// See the documentation of the struct for documentation for `decode_*`
3992     /// methods collectively.
3993     ///
3994     /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)3995     pub fn decode_to_str_without_replacement(
3996         &mut self,
3997         src: &[u8],
3998         dst: &mut str,
3999         last: bool,
4000     ) -> (DecoderResult, usize, usize) {
4001         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4002         let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4003         let len = bytes.len();
4004         let mut trail = written;
4005         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4006         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4007         // encodings to avoid overwriting here.
4008         if self.encoding != UTF_8 {
4009             let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4010             while trail < max {
4011                 bytes[trail] = 0;
4012                 trail += 1;
4013             }
4014         }
4015         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4016             bytes[trail] = 0;
4017             trail += 1;
4018         }
4019         (result, read, written)
4020     }
4021 
4022     /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4023     ///
4024     /// Like the others, this method follows the logic that the output buffer is
4025     /// caller-allocated. This method treats the capacity of the `String` as
4026     /// the output limit. That is, this method guarantees not to cause a
4027     /// reallocation of the backing buffer of `String`.
4028     ///
4029     /// The return value is a pair that contains the `DecoderResult` and the
4030     /// number of bytes read. The number of bytes written is signaled via
4031     /// the length of the `String` changing.
4032     ///
4033     /// See the documentation of the struct for documentation for `decode_*`
4034     /// methods collectively.
4035     ///
4036     /// Available to Rust only.
decode_to_string_without_replacement( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (DecoderResult, usize)4037     pub fn decode_to_string_without_replacement(
4038         &mut self,
4039         src: &[u8],
4040         dst: &mut String,
4041         last: bool,
4042     ) -> (DecoderResult, usize) {
4043         unsafe {
4044             let vec = dst.as_mut_vec();
4045             let old_len = vec.len();
4046             let capacity = vec.capacity();
4047             vec.set_len(capacity);
4048             let (result, read, written) =
4049                 self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4050             vec.set_len(old_len + written);
4051             (result, read)
4052         }
4053     }
4054 
4055     /// Query the worst-case UTF-16 output size (with or without replacement).
4056     ///
4057     /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4058     /// that will not overflow given the current state of the decoder and
4059     /// `byte_length` number of additional input bytes or `None` if `usize`
4060     /// would overflow.
4061     ///
4062     /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4063     /// return value of this method applies also in the
4064     /// `_without_replacement` case.
4065     ///
4066     /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4067     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4068         // Need to consider a) the decoder morphing due to the BOM and b) a partial
4069         // BOM getting pushed to the underlying decoder.
4070         match self.life_cycle {
4071             DecoderLifeCycle::Converting
4072             | DecoderLifeCycle::AtUtf8Start
4073             | DecoderLifeCycle::AtUtf16LeStart
4074             | DecoderLifeCycle::AtUtf16BeStart => {
4075                 return self.variant.max_utf16_buffer_length(byte_length);
4076             }
4077             DecoderLifeCycle::AtStart => {
4078                 if let Some(utf8_bom) = byte_length.checked_add(1) {
4079                     if let Some(utf16_bom) =
4080                         checked_add(1, checked_div(byte_length.checked_add(1), 2))
4081                     {
4082                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
4083                         let encoding = self.encoding();
4084                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4085                             // No need to consider the internal state of the underlying decoder,
4086                             // because it is at start, because no data has reached it yet.
4087                             return Some(utf_bom);
4088                         } else if let Some(non_bom) =
4089                             self.variant.max_utf16_buffer_length(byte_length)
4090                         {
4091                             return Some(std::cmp::max(utf_bom, non_bom));
4092                         }
4093                     }
4094                 }
4095             }
4096             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4097                 // Add two bytes even when only one byte has been seen,
4098                 // because the one byte can become a lead byte in multibyte
4099                 // decoders, but only after the decoder has been queried
4100                 // for max length, so the decoder's own logic for adding
4101                 // one for a pending lead cannot work.
4102                 if let Some(sum) = byte_length.checked_add(2) {
4103                     if let Some(utf8_bom) = sum.checked_add(1) {
4104                         if self.encoding() == UTF_8 {
4105                             // No need to consider the internal state of the underlying decoder,
4106                             // because it is at start, because no data has reached it yet.
4107                             return Some(utf8_bom);
4108                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4109                             return Some(std::cmp::max(utf8_bom, non_bom));
4110                         }
4111                     }
4112                 }
4113             }
4114             DecoderLifeCycle::ConvertingWithPendingBB => {
4115                 if let Some(sum) = byte_length.checked_add(2) {
4116                     return self.variant.max_utf16_buffer_length(sum);
4117                 }
4118             }
4119             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4120                 // Add two bytes even when only one byte has been seen,
4121                 // because the one byte can become a lead byte in multibyte
4122                 // decoders, but only after the decoder has been queried
4123                 // for max length, so the decoder's own logic for adding
4124                 // one for a pending lead cannot work.
4125                 if let Some(sum) = byte_length.checked_add(2) {
4126                     if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4127                         let encoding = self.encoding();
4128                         if encoding == UTF_16LE || encoding == UTF_16BE {
4129                             // No need to consider the internal state of the underlying decoder,
4130                             // because it is at start, because no data has reached it yet.
4131                             return Some(utf16_bom);
4132                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4133                             return Some(std::cmp::max(utf16_bom, non_bom));
4134                         }
4135                     }
4136                 }
4137             }
4138             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4139         }
4140         None
4141     }
4142 
4143     /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4144     /// replaced with the REPLACEMENT CHARACTER.
4145     ///
4146     /// See the documentation of the struct for documentation for `decode_*`
4147     /// methods collectively.
4148     ///
4149     /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4150     pub fn decode_to_utf16(
4151         &mut self,
4152         src: &[u8],
4153         dst: &mut [u16],
4154         last: bool,
4155     ) -> (CoderResult, usize, usize, bool) {
4156         let mut had_errors = false;
4157         let mut total_read = 0usize;
4158         let mut total_written = 0usize;
4159         loop {
4160             let (result, read, written) = self.decode_to_utf16_without_replacement(
4161                 &src[total_read..],
4162                 &mut dst[total_written..],
4163                 last,
4164             );
4165             total_read += read;
4166             total_written += written;
4167             match result {
4168                 DecoderResult::InputEmpty => {
4169                     return (
4170                         CoderResult::InputEmpty,
4171                         total_read,
4172                         total_written,
4173                         had_errors,
4174                     );
4175                 }
4176                 DecoderResult::OutputFull => {
4177                     return (
4178                         CoderResult::OutputFull,
4179                         total_read,
4180                         total_written,
4181                         had_errors,
4182                     );
4183                 }
4184                 DecoderResult::Malformed(_, _) => {
4185                     had_errors = true;
4186                     // There should always be space for the U+FFFD, because
4187                     // otherwise we'd have gotten OutputFull already.
4188                     dst[total_written] = 0xFFFD;
4189                     total_written += 1;
4190                 }
4191             }
4192         }
4193     }
4194 
    // NOTE(review): `decode_to_utf16_without_replacement` is not spelled out
    // here. It is presumably generated by `public_decode_function!` (defined
    // outside this part of the file) from the doc comment, the public method
    // name, the names of the raw/BOM-handling helper methods and the output
    // code unit type (`u16`) passed below -- confirm against the macro
    // definition before changing the argument order.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf16_without_replacement,
                            decode_to_utf16_raw,
                            decode_to_utf16_checking_end,
                            decode_to_utf16_after_one_potential_bom_byte,
                            decode_to_utf16_after_two_potential_bom_bytes,
                            decode_to_utf16_checking_end_with_offset,
                            u16);
4211 
4212     /// Checks for compatibility with storing Unicode scalar values as unsigned
4213     /// bytes taking into account the state of the decoder.
4214     ///
4215     /// Returns `None` if the decoder is not in a neutral state, including waiting
4216     /// for the BOM or if the encoding is never Latin-byte-compatible.
4217     ///
4218     /// Otherwise returns the index of the first byte whose unsigned value doesn't
4219     /// directly correspond to the decoded Unicode scalar value, or the length
4220     /// of the input if all bytes in the input decode directly to scalar values
4221     /// corresponding to the unsigned byte values.
4222     ///
4223     /// Does not change the state of the decoder.
4224     ///
4225     /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4226     /// storage optimizations.
4227     ///
4228     /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4229     pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4230         match self.life_cycle {
4231             DecoderLifeCycle::Converting => {
4232                 return self.variant.latin1_byte_compatible_up_to(bytes);
4233             }
4234             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4235             _ => None,
4236         }
4237     }
4238 }
4239 
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Builds an `Unmappable` result from a Basic Multilingual Plane code
    /// unit.
    ///
    /// # Panics
    ///
    /// Panics if `bmp` is a surrogate, because a surrogate is not a Unicode
    /// scalar value and therefore has no `char` representation.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let code_point = u32::from(bmp);
        let scalar = ::std::char::from_u32(code_point).unwrap();
        EncoderResult::Unmappable(scalar)
    }
}
4271 
4272 /// A converter that encodes a Unicode stream into bytes according to a
4273 /// character encoding in a streaming (incremental) manner.
4274 ///
4275 /// The various `encode_*` methods take an input buffer (`src`) and an output
4276 /// buffer `dst` both of which are caller-allocated. There are variants for
4277 /// both UTF-8 and UTF-16 input buffers.
4278 ///
/// An `encode_*` method encodes characters from `src` into bytes
/// stored into `dst` until one of the following three things happens:
4281 ///
4282 /// 1. An unmappable character is encountered (`*_without_replacement` variants
4283 ///    only).
4284 ///
/// 2. The output buffer has been filled so near capacity that the encoder
///    cannot be sure that processing an additional character of input wouldn't
///    cause so much output that the output buffer would overflow.
4288 ///
4289 /// 3. All the input characters have been processed.
4290 ///
4291 /// The `encode_*` method then returns tuple of a status indicating which one
4292 /// of the three reasons to return happened, how many input code units (`u8`
4293 /// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4294 /// how many output bytes were written (except when encoding into `Vec<u8>`,
4295 /// whose length change indicates this), and in the case of the variants that
4296 /// perform replacement, a boolean indicating whether an unmappable
4297 /// character was replaced with a numeric character reference during the call.
4298 ///
4299 /// The number of bytes "written" is what's logically written. Garbage may be
4300 /// written in the output buffer beyond the point logically written to.
4301 ///
4302 /// In the case of the methods whose name ends with
4303 /// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4304 /// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4305 /// the three cases listed above).
4306 ///
4307 /// In the case of methods whose name does not end with
4308 /// `*_without_replacement`, unmappable characters are automatically replaced
4309 /// with the corresponding numeric character references and unmappable
4310 /// characters do not cause the methods to return early.
4311 ///
4312 /// When encoding from UTF-8 without replacement, the methods are guaranteed
4313 /// not to return indicating that more output space is needed if the length
4314 /// of the output buffer is at least the length returned by
4315 /// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4316 /// UTF-8 with replacement, the length of the output buffer that guarantees the
4317 /// methods not to return indicating that more output space is needed in the
4318 /// absence of unmappable characters is given by
4319 /// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4320 /// UTF-16 without replacement, the methods are guaranteed not to return
4321 /// indicating that more output space is needed if the length of the output
4322 /// buffer is at least the length returned by
4323 /// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4325 /// guarantees the methods not to return indicating that more output space is
4326 /// needed in the absence of unmappable characters is given by
4327 /// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4328 /// When encoding with replacement, applications are not expected to size the
4329 /// buffer for the worst case ahead of time but to resize the buffer if there
4330 /// are unmappable characters. This is why max length queries are only available
4331 /// for the case where there are no unmappable characters.
4332 ///
4333 /// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4334 /// calling from Rust, the type system takes care of this.) When encoding from
4335 /// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4336 /// CHARACTERS. Therefore, in order for astral characters not to turn into a
4337 /// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4338 /// are not split across input buffer boundaries.
4339 ///
4340 /// After an `encode_*` call returns, the output produced so far, taken as a
4341 /// whole from the start of the stream, is guaranteed to consist of a valid
4342 /// byte sequence in the target encoding. (I.e. the code unit sequence for a
4343 /// character is guaranteed not to be split across output buffers. However, due
4344 /// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4345 /// from the start for it to be valid. For other encodings, the validity holds
4346 /// on a per-output buffer basis.)
4347 ///
4348 /// The boolean argument `last` indicates that the end of the stream is reached
4349 /// when all the characters in `src` have been consumed. This argument is needed
4350 /// for ISO-2022-JP and is ignored for other encodings.
4351 ///
/// An `Encoder` object can be used to incrementally encode into a byte stream.
4353 ///
4354 /// During the processing of a single stream, the caller must call `encode_*`
4355 /// zero or more times with `last` set to `false` and then call `encode_*` at
4356 /// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4357 /// the processing of the stream has ended. Otherwise, the caller must call
4358 /// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4359 /// as a fatal error).
4360 ///
4361 /// Once the stream has ended, the `Encoder` object must not be used anymore.
4362 /// That is, you need to create another one to process another stream.
4363 ///
4364 /// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4365 /// and the caller does not wish to treat it as a fatal error, the input buffer
4366 /// `src` may not have been completely consumed. In that case, the caller must
4367 /// pass the unconsumed contents of `src` to `encode_*` again upon the next
4368 /// call.
4369 ///
4370 /// [1]: enum.EncoderResult.html
4371 /// [2]: #method.max_buffer_length_from_utf8_without_replacement
4372 /// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4373 /// [4]: #method.max_buffer_length_from_utf16_without_replacement
4374 /// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4375 ///
4376 /// # Infinite loops
4377 ///
4378 /// When converting with a fixed-size output buffer whose size is too small to
4379 /// accommodate one character of output, an infinite loop ensues. When
4380 /// converting with a fixed-size output buffer, it generally makes sense to
4381 /// make the buffer fairly large (e.g. couple of kilobytes).
pub struct Encoder {
    /// The `Encoding` this encoder is for; handed out by `encoding()`.
    encoding: &'static Encoding,
    /// The encoding-specific encoder that the methods of `Encoder` forward
    /// their work to.
    variant: VariantEncoder,
}
4386 
4387 impl Encoder {
new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder4388     fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4389         Encoder {
4390             encoding: enc,
4391             variant: encoder,
4392         }
4393     }
4394 
    /// The `Encoding` this `Encoder` is for.
    ///
    /// Returns the stored `&'static Encoding`; the encoder itself is not
    /// modified.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
4400 
    /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
    /// ASCII state and `false` otherwise.
    ///
    /// The answer comes from the encoding-specific variant; this method only
    /// forwards the query.
    #[inline]
    pub fn has_pending_state(&self) -> bool {
        self.variant.has_pending_state()
    }
4407 
4408     /// Query the worst-case output size when encoding from UTF-8 with
4409     /// replacement.
4410     ///
4411     /// Returns the size of the output buffer in bytes that will not overflow
4412     /// given the current state of the encoder and `byte_length` number of
4413     /// additional input code units if there are no unmappable characters in
4414     /// the input or `None` if `usize` would overflow.
4415     ///
4416     /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4417     pub fn max_buffer_length_from_utf8_if_no_unmappables(
4418         &self,
4419         byte_length: usize,
4420     ) -> Option<usize> {
4421         checked_add(
4422             if self.encoding().can_encode_everything() {
4423                 0
4424             } else {
4425                 NCR_EXTRA
4426             },
4427             self.max_buffer_length_from_utf8_without_replacement(byte_length),
4428         )
4429     }
4430 
    /// Query the worst-case output size when encoding from UTF-8 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `byte_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The computation is forwarded to the encoding-specific variant.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf8_without_replacement(byte_length)
    }
4446 
4447     /// Incrementally encode into byte stream from UTF-8 with unmappable
4448     /// characters replaced with HTML (decimal) numeric character references.
4449     ///
4450     /// See the documentation of the struct for documentation for `encode_*`
4451     /// methods collectively.
4452     ///
4453     /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4454     pub fn encode_from_utf8(
4455         &mut self,
4456         src: &str,
4457         dst: &mut [u8],
4458         last: bool,
4459     ) -> (CoderResult, usize, usize, bool) {
4460         let dst_len = dst.len();
4461         let effective_dst_len = if self.encoding().can_encode_everything() {
4462             dst_len
4463         } else {
4464             if dst_len < NCR_EXTRA {
4465                 if src.is_empty() && !(last && self.has_pending_state()) {
4466                     return (CoderResult::InputEmpty, 0, 0, false);
4467                 }
4468                 return (CoderResult::OutputFull, 0, 0, false);
4469             }
4470             dst_len - NCR_EXTRA
4471         };
4472         let mut had_unmappables = false;
4473         let mut total_read = 0usize;
4474         let mut total_written = 0usize;
4475         loop {
4476             let (result, read, written) = self.encode_from_utf8_without_replacement(
4477                 &src[total_read..],
4478                 &mut dst[total_written..effective_dst_len],
4479                 last,
4480             );
4481             total_read += read;
4482             total_written += written;
4483             match result {
4484                 EncoderResult::InputEmpty => {
4485                     return (
4486                         CoderResult::InputEmpty,
4487                         total_read,
4488                         total_written,
4489                         had_unmappables,
4490                     );
4491                 }
4492                 EncoderResult::OutputFull => {
4493                     return (
4494                         CoderResult::OutputFull,
4495                         total_read,
4496                         total_written,
4497                         had_unmappables,
4498                     );
4499                 }
4500                 EncoderResult::Unmappable(unmappable) => {
4501                     had_unmappables = true;
4502                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4503                     debug_assert_ne!(self.encoding(), UTF_16BE);
4504                     debug_assert_ne!(self.encoding(), UTF_16LE);
4505                     // Additionally, Iso2022JpEncoder is responsible for
4506                     // transitioning to ASCII when returning with Unmappable.
4507                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4508                     if total_written >= effective_dst_len {
4509                         if total_read == src.len() && !(last && self.has_pending_state()) {
4510                             return (
4511                                 CoderResult::InputEmpty,
4512                                 total_read,
4513                                 total_written,
4514                                 had_unmappables,
4515                             );
4516                         }
4517                         return (
4518                             CoderResult::OutputFull,
4519                             total_read,
4520                             total_written,
4521                             had_unmappables,
4522                         );
4523                     }
4524                 }
4525             }
4526         }
4527     }
4528 
4529     /// Incrementally encode into byte stream from UTF-8 with unmappable
4530     /// characters replaced with HTML (decimal) numeric character references.
4531     ///
4532     /// See the documentation of the struct for documentation for `encode_*`
4533     /// methods collectively.
4534     ///
4535     /// Available to Rust only.
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4536     pub fn encode_from_utf8_to_vec(
4537         &mut self,
4538         src: &str,
4539         dst: &mut Vec<u8>,
4540         last: bool,
4541     ) -> (CoderResult, usize, bool) {
4542         unsafe {
4543             let old_len = dst.len();
4544             let capacity = dst.capacity();
4545             dst.set_len(capacity);
4546             let (result, read, written, replaced) =
4547                 self.encode_from_utf8(src, &mut dst[old_len..], last);
4548             dst.set_len(old_len + written);
4549             (result, read, replaced)
4550         }
4551     }
4552 
    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
    ///
    /// Returns the `EncoderResult`, the number of input bytes read and the
    /// number of output bytes written. The work is forwarded to the
    /// encoding-specific variant.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf8_without_replacement(
        &mut self,
        src: &str,
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        self.variant.encode_from_utf8_raw(src, dst, last)
    }
4567 
4568     /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4569     ///
4570     /// See the documentation of the struct for documentation for `encode_*`
4571     /// methods collectively.
4572     ///
4573     /// Available to Rust only.
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4574     pub fn encode_from_utf8_to_vec_without_replacement(
4575         &mut self,
4576         src: &str,
4577         dst: &mut Vec<u8>,
4578         last: bool,
4579     ) -> (EncoderResult, usize) {
4580         unsafe {
4581             let old_len = dst.len();
4582             let capacity = dst.capacity();
4583             dst.set_len(capacity);
4584             let (result, read, written) =
4585                 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4586             dst.set_len(old_len + written);
4587             (result, read)
4588         }
4589     }
4590 
4591     /// Query the worst-case output size when encoding from UTF-16 with
4592     /// replacement.
4593     ///
4594     /// Returns the size of the output buffer in bytes that will not overflow
4595     /// given the current state of the encoder and `u16_length` number of
4596     /// additional input code units if there are no unmappable characters in
4597     /// the input or `None` if `usize` would overflow.
4598     ///
4599     /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4600     pub fn max_buffer_length_from_utf16_if_no_unmappables(
4601         &self,
4602         u16_length: usize,
4603     ) -> Option<usize> {
4604         checked_add(
4605             if self.encoding().can_encode_everything() {
4606                 0
4607             } else {
4608                 NCR_EXTRA
4609             },
4610             self.max_buffer_length_from_utf16_without_replacement(u16_length),
4611         )
4612     }
4613 
    /// Query the worst-case output size when encoding from UTF-16 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `u16_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The computation is forwarded to the encoding-specific variant.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf16_without_replacement(u16_length)
    }
4629 
4630     /// Incrementally encode into byte stream from UTF-16 with unmappable
4631     /// characters replaced with HTML (decimal) numeric character references.
4632     ///
4633     /// See the documentation of the struct for documentation for `encode_*`
4634     /// methods collectively.
4635     ///
4636     /// Available via the C wrapper.
encode_from_utf16( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4637     pub fn encode_from_utf16(
4638         &mut self,
4639         src: &[u16],
4640         dst: &mut [u8],
4641         last: bool,
4642     ) -> (CoderResult, usize, usize, bool) {
4643         let dst_len = dst.len();
4644         let effective_dst_len = if self.encoding().can_encode_everything() {
4645             dst_len
4646         } else {
4647             if dst_len < NCR_EXTRA {
4648                 if src.is_empty() && !(last && self.has_pending_state()) {
4649                     return (CoderResult::InputEmpty, 0, 0, false);
4650                 }
4651                 return (CoderResult::OutputFull, 0, 0, false);
4652             }
4653             dst_len - NCR_EXTRA
4654         };
4655         let mut had_unmappables = false;
4656         let mut total_read = 0usize;
4657         let mut total_written = 0usize;
4658         loop {
4659             let (result, read, written) = self.encode_from_utf16_without_replacement(
4660                 &src[total_read..],
4661                 &mut dst[total_written..effective_dst_len],
4662                 last,
4663             );
4664             total_read += read;
4665             total_written += written;
4666             match result {
4667                 EncoderResult::InputEmpty => {
4668                     return (
4669                         CoderResult::InputEmpty,
4670                         total_read,
4671                         total_written,
4672                         had_unmappables,
4673                     );
4674                 }
4675                 EncoderResult::OutputFull => {
4676                     return (
4677                         CoderResult::OutputFull,
4678                         total_read,
4679                         total_written,
4680                         had_unmappables,
4681                     );
4682                 }
4683                 EncoderResult::Unmappable(unmappable) => {
4684                     had_unmappables = true;
4685                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4686                     // There are no UTF-16 encoders and even if there were,
4687                     // they'd never have unmappables.
4688                     debug_assert_ne!(self.encoding(), UTF_16BE);
4689                     debug_assert_ne!(self.encoding(), UTF_16LE);
4690                     // Additionally, Iso2022JpEncoder is responsible for
4691                     // transitioning to ASCII when returning with Unmappable
4692                     // from the jis0208 state. That is, when we encode
4693                     // ISO-2022-JP and come here, the encoder is in either the
4694                     // ASCII or the Roman state. We are allowed to generate any
4695                     // printable ASCII excluding \ and ~.
4696                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4697                     if total_written >= effective_dst_len {
4698                         if total_read == src.len() && !(last && self.has_pending_state()) {
4699                             return (
4700                                 CoderResult::InputEmpty,
4701                                 total_read,
4702                                 total_written,
4703                                 had_unmappables,
4704                             );
4705                         }
4706                         return (
4707                             CoderResult::OutputFull,
4708                             total_read,
4709                             total_written,
4710                             had_unmappables,
4711                         );
4712                     }
4713                 }
4714             }
4715         }
4716     }
4717 
4718     /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4719     ///
4720     /// See the documentation of the struct for documentation for `encode_*`
4721     /// methods collectively.
4722     ///
4723     /// Available via the C wrapper.
encode_from_utf16_without_replacement( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4724     pub fn encode_from_utf16_without_replacement(
4725         &mut self,
4726         src: &[u16],
4727         dst: &mut [u8],
4728         last: bool,
4729     ) -> (EncoderResult, usize, usize) {
4730         self.variant.encode_from_utf16_raw(src, dst, last)
4731     }
4732 }
4733 
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#`, the decimal value of `unmappable` and `;` to the start of
/// `dst` and returns the number of bytes written (4 to 10).
///
/// Panics if `dst` is shorter than the reference being written.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    // len is the number of decimal digits needed to represent unmappable plus
    // 3 (the length of "&#" and ";").
    let mut number = unmappable as u32;
    let len = if number >= 1_000_000u32 {
        10usize
    } else if number >= 100_000u32 {
        9usize
    } else if number >= 10_000u32 {
        8usize
    } else if number >= 1_000u32 {
        7usize
    } else if number >= 100u32 {
        6usize
    } else if number >= 10u32 {
        // Review the outcome of https://github.com/whatwg/encoding/issues/15
        // to see if this case is possible
        5usize
    } else {
        // Single-digit code points get a correctly sized reference instead
        // of a 5-byte one with an unwritten digit slot.
        4usize
    };
    debug_assert!(len <= dst.len());
    // Write the terminator first so that a too-short `dst` panics before
    // anything has been modified.
    dst[len - 1] = b';';
    // Fill the digit slots from least significant to most significant.
    for slot in dst[2..len - 1].iter_mut().rev() {
        *slot = (number % 10) as u8 + b'0';
        number /= 10;
    }
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4772 
/// Returns `true` when `start <= i < end`, using a single unsigned
/// comparison. Callers must pass `start <= end`; otherwise the width
/// subtraction overflows.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset < width
}
4777 
/// Returns `true` when `start <= i < end`, using a single unsigned
/// comparison. Callers must pass `start <= end`; otherwise the width
/// subtraction overflows.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset < width
}
4782 
/// Returns `true` when `start <= i <= end`, using a single unsigned
/// comparison. Callers must pass `start <= end`; otherwise the width
/// subtraction overflows.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4787 
/// Returns `true` when `start <= i <= end`, using a single unsigned
/// comparison. Callers must pass `start <= end`; otherwise the width
/// subtraction overflows.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4792 
/// Returns `true` when `start <= i <= end`, using a single unsigned
/// comparison. Callers must pass `start <= end`; otherwise the width
/// subtraction overflows.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4797 
/// Returns `true` when `start <= i <= end`, using a single unsigned
/// comparison. Callers must pass `start <= end`; otherwise the width
/// subtraction overflows.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4802 
/// Adds `num` to the value inside `opt`, yielding `None` when `opt` is
/// `None` or the addition overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4811 
4812 #[inline(always)]
checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize>4813 fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4814     if let Some(n) = one {
4815         checked_add(n, other)
4816     } else {
4817         None
4818     }
4819 }
4820 
/// Multiplies the value inside `opt` by `num`, yielding `None` when `opt`
/// is `None` or the multiplication overflows `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    match opt {
        Some(n) => n.checked_mul(num),
        None => None,
    }
}
4829 
/// Divides the value inside `opt` by `num`, yielding `None` when `opt` is
/// `None` or `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    match opt {
        Some(n) => n.checked_div(num),
        None => None,
    }
}
4838 
/// Rounds the value inside `opt` up to the next power of two.
///
/// Yields `None` when `opt` is `None` or when the next power of two does
/// not fit in `usize`. The unchecked `next_power_of_two` would instead panic
/// in debug builds and wrap to zero in release builds on overflow, which
/// would defeat the overflow tracking the other `checked_*` helpers provide.
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4843 
/// Returns the smaller of two optional values. A `None` operand is ignored
/// rather than propagated: if only one side is `Some`, that side is
/// returned, and `None` results only when both are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::std::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4856 
4857 // ############## TESTS ###############
4858 
/// Test-only struct, compiled only when testing with the `serde` feature
/// enabled. It derives `Serialize`/`Deserialize` and holds a
/// `&'static Encoding` next to ordinary fields.
// NOTE(review): presumably used by serde round-trip tests elsewhere in this
// file; confirm against the tests that construct it.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    enc: &'static Encoding,
}
4866 
4867 #[cfg(test)]
4868 mod test_labels_names;
4869 
4870 #[cfg(test)]
4871 mod tests {
4872     use super::*;
4873     use std::borrow::Cow;
4874 
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4875     fn sniff_to_utf16(
4876         initial_encoding: &'static Encoding,
4877         expected_encoding: &'static Encoding,
4878         bytes: &[u8],
4879         expect: &[u16],
4880         breaks: &[usize],
4881     ) {
4882         let mut decoder = initial_encoding.new_decoder();
4883 
4884         let mut dest: Vec<u16> =
4885             Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4886         let capacity = dest.capacity();
4887         dest.resize(capacity, 0u16);
4888 
4889         let mut total_written = 0usize;
4890         let mut start = 0usize;
4891         for br in breaks {
4892             let (result, read, written, _) =
4893                 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4894             total_written += written;
4895             assert_eq!(read, *br - start);
4896             match result {
4897                 CoderResult::InputEmpty => {}
4898                 CoderResult::OutputFull => {
4899                     unreachable!();
4900                 }
4901             }
4902             start = *br;
4903         }
4904         let (result, read, written, _) =
4905             decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4906         total_written += written;
4907         match result {
4908             CoderResult::InputEmpty => {}
4909             CoderResult::OutputFull => {
4910                 unreachable!();
4911             }
4912         }
4913         assert_eq!(read, bytes.len() - start);
4914         assert_eq!(total_written, expect.len());
4915         assert_eq!(&dest[..total_written], expect);
4916         assert_eq!(decoder.encoding(), expected_encoding);
4917     }
4918 
4919     // Any copyright to the test code below this comment is dedicated to the
4920     // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4921 
4922     #[test]
test_bom_sniffing()4923     fn test_bom_sniffing() {
4924         // ASCII
4925         sniff_to_utf16(
4926             WINDOWS_1252,
4927             WINDOWS_1252,
4928             b"\x61\x62",
4929             &[0x0061u16, 0x0062u16],
4930             &[],
4931         );
4932         // UTF-8
4933         sniff_to_utf16(
4934             WINDOWS_1252,
4935             UTF_8,
4936             b"\xEF\xBB\xBF\x61\x62",
4937             &[0x0061u16, 0x0062u16],
4938             &[],
4939         );
4940         sniff_to_utf16(
4941             WINDOWS_1252,
4942             UTF_8,
4943             b"\xEF\xBB\xBF\x61\x62",
4944             &[0x0061u16, 0x0062u16],
4945             &[1],
4946         );
4947         sniff_to_utf16(
4948             WINDOWS_1252,
4949             UTF_8,
4950             b"\xEF\xBB\xBF\x61\x62",
4951             &[0x0061u16, 0x0062u16],
4952             &[2],
4953         );
4954         sniff_to_utf16(
4955             WINDOWS_1252,
4956             UTF_8,
4957             b"\xEF\xBB\xBF\x61\x62",
4958             &[0x0061u16, 0x0062u16],
4959             &[3],
4960         );
4961         sniff_to_utf16(
4962             WINDOWS_1252,
4963             UTF_8,
4964             b"\xEF\xBB\xBF\x61\x62",
4965             &[0x0061u16, 0x0062u16],
4966             &[4],
4967         );
4968         sniff_to_utf16(
4969             WINDOWS_1252,
4970             UTF_8,
4971             b"\xEF\xBB\xBF\x61\x62",
4972             &[0x0061u16, 0x0062u16],
4973             &[2, 3],
4974         );
4975         sniff_to_utf16(
4976             WINDOWS_1252,
4977             UTF_8,
4978             b"\xEF\xBB\xBF\x61\x62",
4979             &[0x0061u16, 0x0062u16],
4980             &[1, 2],
4981         );
4982         sniff_to_utf16(
4983             WINDOWS_1252,
4984             UTF_8,
4985             b"\xEF\xBB\xBF\x61\x62",
4986             &[0x0061u16, 0x0062u16],
4987             &[1, 3],
4988         );
4989         sniff_to_utf16(
4990             WINDOWS_1252,
4991             UTF_8,
4992             b"\xEF\xBB\xBF\x61\x62",
4993             &[0x0061u16, 0x0062u16],
4994             &[1, 2, 3, 4],
4995         );
4996         sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
4997         // Not UTF-8
4998         sniff_to_utf16(
4999             WINDOWS_1252,
5000             WINDOWS_1252,
5001             b"\xEF\xBB\x61\x62",
5002             &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5003             &[],
5004         );
5005         sniff_to_utf16(
5006             WINDOWS_1252,
5007             WINDOWS_1252,
5008             b"\xEF\xBB\x61\x62",
5009             &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5010             &[1],
5011         );
5012         sniff_to_utf16(
5013             WINDOWS_1252,
5014             WINDOWS_1252,
5015             b"\xEF\x61\x62",
5016             &[0x00EFu16, 0x0061u16, 0x0062u16],
5017             &[],
5018         );
5019         sniff_to_utf16(
5020             WINDOWS_1252,
5021             WINDOWS_1252,
5022             b"\xEF\x61\x62",
5023             &[0x00EFu16, 0x0061u16, 0x0062u16],
5024             &[1],
5025         );
5026         sniff_to_utf16(
5027             WINDOWS_1252,
5028             WINDOWS_1252,
5029             b"\xEF\xBB",
5030             &[0x00EFu16, 0x00BBu16],
5031             &[],
5032         );
5033         sniff_to_utf16(
5034             WINDOWS_1252,
5035             WINDOWS_1252,
5036             b"\xEF\xBB",
5037             &[0x00EFu16, 0x00BBu16],
5038             &[1],
5039         );
5040         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5041         // Not UTF-16
5042         sniff_to_utf16(
5043             WINDOWS_1252,
5044             WINDOWS_1252,
5045             b"\xFE\x61\x62",
5046             &[0x00FEu16, 0x0061u16, 0x0062u16],
5047             &[],
5048         );
5049         sniff_to_utf16(
5050             WINDOWS_1252,
5051             WINDOWS_1252,
5052             b"\xFE\x61\x62",
5053             &[0x00FEu16, 0x0061u16, 0x0062u16],
5054             &[1],
5055         );
5056         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5057         sniff_to_utf16(
5058             WINDOWS_1252,
5059             WINDOWS_1252,
5060             b"\xFF\x61\x62",
5061             &[0x00FFu16, 0x0061u16, 0x0062u16],
5062             &[],
5063         );
5064         sniff_to_utf16(
5065             WINDOWS_1252,
5066             WINDOWS_1252,
5067             b"\xFF\x61\x62",
5068             &[0x00FFu16, 0x0061u16, 0x0062u16],
5069             &[1],
5070         );
5071         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5072         // UTF-16
5073         sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5074         sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5075         sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5076         sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5077     }
5078 
5079     #[test]
test_output_encoding()5080     fn test_output_encoding() {
5081         assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5082         assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5083         assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5084         assert_eq!(UTF_8.output_encoding(), UTF_8);
5085         assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5086         assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5087         assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5088         assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5089         assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5090         assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5091     }
5092 
5093     #[test]
test_label_resolution()5094     fn test_label_resolution() {
5095         assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5096         assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5097         assert_eq!(
5098             Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5099             Some(UTF_8)
5100         );
5101         assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5102         assert_eq!(Encoding::for_label(b"bogus"), None);
5103         assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5104     }
5105 
5106     #[test]
test_decode_valid_windows_1257_to_cow()5107     fn test_decode_valid_windows_1257_to_cow() {
5108         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5109         match cow {
5110             Cow::Borrowed(_) => unreachable!(),
5111             Cow::Owned(s) => {
5112                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5113             }
5114         }
5115         assert_eq!(encoding, WINDOWS_1257);
5116         assert!(!had_errors);
5117     }
5118 
5119     #[test]
test_decode_invalid_windows_1257_to_cow()5120     fn test_decode_invalid_windows_1257_to_cow() {
5121         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5122         match cow {
5123             Cow::Borrowed(_) => unreachable!(),
5124             Cow::Owned(s) => {
5125                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5126             }
5127         }
5128         assert_eq!(encoding, WINDOWS_1257);
5129         assert!(had_errors);
5130     }
5131 
5132     #[test]
test_decode_ascii_only_windows_1257_to_cow()5133     fn test_decode_ascii_only_windows_1257_to_cow() {
5134         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5135         match cow {
5136             Cow::Borrowed(s) => {
5137                 assert_eq!(s, "abc");
5138             }
5139             Cow::Owned(_) => unreachable!(),
5140         }
5141         assert_eq!(encoding, WINDOWS_1257);
5142         assert!(!had_errors);
5143     }
5144 
5145     #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow()5146     fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5147         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5148         match cow {
5149             Cow::Borrowed(s) => {
5150                 assert_eq!(s, "\u{20AC}\u{00E4}");
5151             }
5152             Cow::Owned(_) => unreachable!(),
5153         }
5154         assert_eq!(encoding, UTF_8);
5155         assert!(!had_errors);
5156     }
5157 
5158     #[test]
test_decode_bomful_invalid_utf8_as_windows_1257_to_cow()5159     fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5160         let (cow, encoding, had_errors) =
5161             WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5162         match cow {
5163             Cow::Borrowed(_) => unreachable!(),
5164             Cow::Owned(s) => {
5165                 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5166             }
5167         }
5168         assert_eq!(encoding, UTF_8);
5169         assert!(had_errors);
5170     }
5171 
5172     #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow()5173     fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5174         let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5175         match cow {
5176             Cow::Borrowed(s) => {
5177                 assert_eq!(s, "\u{20AC}\u{00E4}");
5178             }
5179             Cow::Owned(_) => unreachable!(),
5180         }
5181         assert_eq!(encoding, UTF_8);
5182         assert!(!had_errors);
5183     }
5184 
5185     #[test]
test_decode_bomful_invalid_utf8_as_utf_8_to_cow()5186     fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5187         let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5188         match cow {
5189             Cow::Borrowed(_) => unreachable!(),
5190             Cow::Owned(s) => {
5191                 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5192             }
5193         }
5194         assert_eq!(encoding, UTF_8);
5195         assert!(had_errors);
5196     }
5197 
5198     #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal()5199     fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5200         let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5201         match cow {
5202             Cow::Borrowed(s) => {
5203                 assert_eq!(s, "\u{20AC}\u{00E4}");
5204             }
5205             Cow::Owned(_) => unreachable!(),
5206         }
5207         assert!(!had_errors);
5208     }
5209 
5210     #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal()5211     fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5212         let (cow, had_errors) =
5213             WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5214         match cow {
5215             Cow::Borrowed(_) => unreachable!(),
5216             Cow::Owned(s) => {
5217                 assert_eq!(
5218                     s,
5219                     "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5220                 );
5221             }
5222         }
5223         assert!(!had_errors);
5224     }
5225 
5226     #[test]
test_decode_valid_windows_1257_to_cow_with_bom_removal()5227     fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
5228         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
5229         match cow {
5230             Cow::Borrowed(_) => unreachable!(),
5231             Cow::Owned(s) => {
5232                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5233             }
5234         }
5235         assert!(!had_errors);
5236     }
5237 
5238     #[test]
test_decode_invalid_windows_1257_to_cow_with_bom_removal()5239     fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
5240         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
5241         match cow {
5242             Cow::Borrowed(_) => unreachable!(),
5243             Cow::Owned(s) => {
5244                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5245             }
5246         }
5247         assert!(had_errors);
5248     }
5249 
5250     #[test]
test_decode_ascii_only_windows_1257_to_cow_with_bom_removal()5251     fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
5252         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
5253         match cow {
5254             Cow::Borrowed(s) => {
5255                 assert_eq!(s, "abc");
5256             }
5257             Cow::Owned(_) => unreachable!(),
5258         }
5259         assert!(!had_errors);
5260     }
5261 
5262     #[test]
test_decode_bomful_valid_utf8_to_cow_without_bom_handling()5263     fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
5264         let (cow, had_errors) =
5265             UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5266         match cow {
5267             Cow::Borrowed(s) => {
5268                 assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5269             }
5270             Cow::Owned(_) => unreachable!(),
5271         }
5272         assert!(!had_errors);
5273     }
5274 
5275     #[test]
test_decode_bomful_invalid_utf8_to_cow_without_bom_handling()5276     fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
5277         let (cow, had_errors) =
5278             UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5279         match cow {
5280             Cow::Borrowed(_) => unreachable!(),
5281             Cow::Owned(s) => {
5282                 assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
5283             }
5284         }
5285         assert!(had_errors);
5286     }
5287 
5288     #[test]
test_decode_valid_windows_1257_to_cow_without_bom_handling()5289     fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
5290         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
5291         match cow {
5292             Cow::Borrowed(_) => unreachable!(),
5293             Cow::Owned(s) => {
5294                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5295             }
5296         }
5297         assert!(!had_errors);
5298     }
5299 
5300     #[test]
test_decode_invalid_windows_1257_to_cow_without_bom_handling()5301     fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
5302         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
5303         match cow {
5304             Cow::Borrowed(_) => unreachable!(),
5305             Cow::Owned(s) => {
5306                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5307             }
5308         }
5309         assert!(had_errors);
5310     }
5311 
5312     #[test]
test_decode_ascii_only_windows_1257_to_cow_without_bom_handling()5313     fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
5314         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
5315         match cow {
5316             Cow::Borrowed(s) => {
5317                 assert_eq!(s, "abc");
5318             }
5319             Cow::Owned(_) => unreachable!(),
5320         }
5321         assert!(!had_errors);
5322     }
5323 
5324     #[test]
test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement()5325     fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5326         match UTF_8.decode_without_bom_handling_and_without_replacement(
5327             b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
5328         ) {
5329             Some(cow) => match cow {
5330                 Cow::Borrowed(s) => {
5331                     assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5332                 }
5333                 Cow::Owned(_) => unreachable!(),
5334             },
5335             None => unreachable!(),
5336         }
5337     }
5338 
5339     #[test]
test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement()5340     fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5341         assert!(UTF_8
5342             .decode_without_bom_handling_and_without_replacement(
5343                 b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4"
5344             )
5345             .is_none());
5346     }
5347 
5348     #[test]
test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement()5349     fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5350         match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
5351             Some(cow) => match cow {
5352                 Cow::Borrowed(_) => unreachable!(),
5353                 Cow::Owned(s) => {
5354                     assert_eq!(s, "abc\u{20AC}\u{00E4}");
5355                 }
5356             },
5357             None => unreachable!(),
5358         }
5359     }
5360 
5361     #[test]
test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement()5362     fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5363         assert!(WINDOWS_1257
5364             .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4")
5365             .is_none());
5366     }
5367 
5368     #[test]
test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement()5369     fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5370         match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
5371             Some(cow) => match cow {
5372                 Cow::Borrowed(s) => {
5373                     assert_eq!(s, "abc");
5374                 }
5375                 Cow::Owned(_) => unreachable!(),
5376             },
5377             None => unreachable!(),
5378         }
5379     }
5380 
5381     #[test]
test_encode_ascii_only_windows_1257_to_cow()5382     fn test_encode_ascii_only_windows_1257_to_cow() {
5383         let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
5384         match cow {
5385             Cow::Borrowed(s) => {
5386                 assert_eq!(s, b"abc");
5387             }
5388             Cow::Owned(_) => unreachable!(),
5389         }
5390         assert_eq!(encoding, WINDOWS_1257);
5391         assert!(!had_errors);
5392     }
5393 
5394     #[test]
test_encode_valid_windows_1257_to_cow()5395     fn test_encode_valid_windows_1257_to_cow() {
5396         let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
5397         match cow {
5398             Cow::Borrowed(_) => unreachable!(),
5399             Cow::Owned(s) => {
5400                 assert_eq!(s, b"abc\x80\xE4");
5401             }
5402         }
5403         assert_eq!(encoding, WINDOWS_1257);
5404         assert!(!had_errors);
5405     }
5406 
5407     #[test]
test_utf16_space_with_one_bom_byte()5408     fn test_utf16_space_with_one_bom_byte() {
5409         let mut decoder = UTF_16LE.new_decoder();
5410         let mut dst = [0u16; 12];
5411         {
5412             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5413             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5414             assert_eq!(result, CoderResult::InputEmpty);
5415         }
5416         {
5417             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5418             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5419             assert_eq!(result, CoderResult::InputEmpty);
5420         }
5421     }
5422 
5423     #[test]
test_utf8_space_with_one_bom_byte()5424     fn test_utf8_space_with_one_bom_byte() {
5425         let mut decoder = UTF_8.new_decoder();
5426         let mut dst = [0u16; 12];
5427         {
5428             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5429             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5430             assert_eq!(result, CoderResult::InputEmpty);
5431         }
5432         {
5433             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5434             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5435             assert_eq!(result, CoderResult::InputEmpty);
5436         }
5437     }
5438 
5439     #[test]
test_utf16_space_with_two_bom_bytes()5440     fn test_utf16_space_with_two_bom_bytes() {
5441         let mut decoder = UTF_16LE.new_decoder();
5442         let mut dst = [0u16; 12];
5443         {
5444             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5445             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5446             assert_eq!(result, CoderResult::InputEmpty);
5447         }
5448         {
5449             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5450             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5451             assert_eq!(result, CoderResult::InputEmpty);
5452         }
5453         {
5454             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5455             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5456             assert_eq!(result, CoderResult::InputEmpty);
5457         }
5458     }
5459 
5460     #[test]
test_utf8_space_with_two_bom_bytes()5461     fn test_utf8_space_with_two_bom_bytes() {
5462         let mut decoder = UTF_8.new_decoder();
5463         let mut dst = [0u16; 12];
5464         {
5465             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5466             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5467             assert_eq!(result, CoderResult::InputEmpty);
5468         }
5469         {
5470             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5471             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5472             assert_eq!(result, CoderResult::InputEmpty);
5473         }
5474         {
5475             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5476             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5477             assert_eq!(result, CoderResult::InputEmpty);
5478         }
5479     }
5480 
5481     #[test]
test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call()5482     fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
5483         let mut decoder = UTF_16LE.new_decoder();
5484         let mut dst = [0u16; 12];
5485         {
5486             let needed = decoder.max_utf16_buffer_length(2).unwrap();
5487             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
5488             assert_eq!(result, CoderResult::InputEmpty);
5489         }
5490     }
5491 
5492     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8()5493     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
5494         let mut dst = [0u8; 8];
5495         let mut encoder = ISO_2022_JP.new_encoder();
5496         {
5497             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], false);
5498             assert_eq!(result, CoderResult::InputEmpty);
5499         }
5500         {
5501             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], true);
5502             assert_eq!(result, CoderResult::InputEmpty);
5503         }
5504     }
5505 
5506     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf8()5507     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
5508         let mut dst = [0u8; 16];
5509         let mut encoder = ISO_2022_JP.new_encoder();
5510         {
5511             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
5512             assert_eq!(result, CoderResult::InputEmpty);
5513         }
5514         {
5515             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
5516             assert_eq!(result, CoderResult::InputEmpty);
5517         }
5518         {
5519             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
5520             assert_eq!(result, CoderResult::OutputFull);
5521         }
5522     }
5523 
5524     #[test]
test_buffer_end_iso_2022_jp_from_utf8()5525     fn test_buffer_end_iso_2022_jp_from_utf8() {
5526         let mut dst = [0u8; 18];
5527         {
5528             let mut encoder = ISO_2022_JP.new_encoder();
5529             let (result, _, _, _) =
5530                 encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], false);
5531             assert_eq!(result, CoderResult::InputEmpty);
5532         }
5533         {
5534             let mut encoder = ISO_2022_JP.new_encoder();
5535             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], true);
5536             assert_eq!(result, CoderResult::OutputFull);
5537         }
5538         {
5539             let mut encoder = ISO_2022_JP.new_encoder();
5540             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], false);
5541             assert_eq!(result, CoderResult::InputEmpty);
5542         }
5543         {
5544             let mut encoder = ISO_2022_JP.new_encoder();
5545             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], true);
5546             assert_eq!(result, CoderResult::InputEmpty);
5547         }
5548     }
5549 
5550     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16()5551     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
5552         let mut dst = [0u8; 8];
5553         let mut encoder = ISO_2022_JP.new_encoder();
5554         {
5555             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], false);
5556             assert_eq!(result, CoderResult::InputEmpty);
5557         }
5558         {
5559             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], true);
5560             assert_eq!(result, CoderResult::InputEmpty);
5561         }
5562     }
5563 
5564     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf16()5565     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
5566         let mut dst = [0u8; 16];
5567         let mut encoder = ISO_2022_JP.new_encoder();
5568         {
5569             let (result, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
5570             assert_eq!(result, CoderResult::InputEmpty);
5571         }
5572         {
5573             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], false);
5574             assert_eq!(result, CoderResult::InputEmpty);
5575         }
5576         {
5577             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], true);
5578             assert_eq!(result, CoderResult::OutputFull);
5579         }
5580     }
5581 
5582     #[test]
test_buffer_end_iso_2022_jp_from_utf16()5583     fn test_buffer_end_iso_2022_jp_from_utf16() {
5584         let mut dst = [0u8; 18];
5585         {
5586             let mut encoder = ISO_2022_JP.new_encoder();
5587             let (result, _, _, _) =
5588                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], false);
5589             assert_eq!(result, CoderResult::InputEmpty);
5590         }
5591         {
5592             let mut encoder = ISO_2022_JP.new_encoder();
5593             let (result, _, _, _) =
5594                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], true);
5595             assert_eq!(result, CoderResult::OutputFull);
5596         }
5597         {
5598             let mut encoder = ISO_2022_JP.new_encoder();
5599             let (result, _, _, _) =
5600                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], false);
5601             assert_eq!(result, CoderResult::InputEmpty);
5602         }
5603         {
5604             let mut encoder = ISO_2022_JP.new_encoder();
5605             let (result, _, _, _) =
5606                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], true);
5607             assert_eq!(result, CoderResult::InputEmpty);
5608         }
5609     }
5610 
5611     #[test]
test_hash()5612     fn test_hash() {
5613         let mut encodings = ::std::collections::HashSet::new();
5614         encodings.insert(UTF_8);
5615         encodings.insert(ISO_2022_JP);
5616         assert!(encodings.contains(UTF_8));
5617         assert!(encodings.contains(ISO_2022_JP));
5618         assert!(!encodings.contains(WINDOWS_1252));
5619         encodings.remove(ISO_2022_JP);
5620         assert!(!encodings.contains(ISO_2022_JP));
5621     }
5622 
5623     #[test]
test_iso_2022_jp_ncr_extra_from_utf16()5624     fn test_iso_2022_jp_ncr_extra_from_utf16() {
5625         let mut dst = [0u8; 17];
5626         {
5627             let mut encoder = ISO_2022_JP.new_encoder();
5628             let (result, _, _, _) =
5629                 encoder.encode_from_utf16(&[0x3041u16, 0xFFFFu16], &mut dst[..], true);
5630             assert_eq!(result, CoderResult::OutputFull);
5631         }
5632     }
5633 
5634     #[test]
test_iso_2022_jp_ncr_extra_from_utf8()5635     fn test_iso_2022_jp_ncr_extra_from_utf8() {
5636         let mut dst = [0u8; 17];
5637         {
5638             let mut encoder = ISO_2022_JP.new_encoder();
5639             let (result, _, _, _) =
5640                 encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
5641             assert_eq!(result, CoderResult::OutputFull);
5642         }
5643     }
5644 
5645     #[test]
test_max_length_with_bom_to_utf8()5646     fn test_max_length_with_bom_to_utf8() {
5647         let mut output = [0u8; 20];
5648         let mut decoder = REPLACEMENT.new_decoder();
5649         let input = b"\xEF\xBB\xBFA";
5650         {
5651             let needed = decoder
5652                 .max_utf8_buffer_length_without_replacement(input.len())
5653                 .unwrap();
5654             let (result, read, written) =
5655                 decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);
5656             assert_eq!(result, DecoderResult::InputEmpty);
5657             assert_eq!(read, input.len());
5658             assert_eq!(written, 1);
5659             assert_eq!(output[0], 0x41);
5660         }
5661     }
5662 
5663     #[cfg(feature = "serde")]
5664     #[test]
test_serde()5665     fn test_serde() {
5666         let demo = Demo {
5667             num: 42,
5668             name: "foo".into(),
5669             enc: UTF_8,
5670         };
5671 
5672         let serialized = serde_json::to_string(&demo).unwrap();
5673 
5674         let deserialized: Demo = serde_json::from_str(&serialized).unwrap();
5675         assert_eq!(deserialized, demo);
5676 
5677         let bincoded = bincode::serialize(&demo).unwrap();
5678         let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap();
5679         assert_eq!(debincoded, demo);
5680     }
5681 
5682     #[test]
test_is_single_byte()5683     fn test_is_single_byte() {
5684         assert!(!BIG5.is_single_byte());
5685         assert!(!EUC_JP.is_single_byte());
5686         assert!(!EUC_KR.is_single_byte());
5687         assert!(!GB18030.is_single_byte());
5688         assert!(!GBK.is_single_byte());
5689         assert!(!REPLACEMENT.is_single_byte());
5690         assert!(!SHIFT_JIS.is_single_byte());
5691         assert!(!UTF_8.is_single_byte());
5692         assert!(!UTF_16BE.is_single_byte());
5693         assert!(!UTF_16LE.is_single_byte());
5694         assert!(!ISO_2022_JP.is_single_byte());
5695 
5696         assert!(IBM866.is_single_byte());
5697         assert!(ISO_8859_2.is_single_byte());
5698         assert!(ISO_8859_3.is_single_byte());
5699         assert!(ISO_8859_4.is_single_byte());
5700         assert!(ISO_8859_5.is_single_byte());
5701         assert!(ISO_8859_6.is_single_byte());
5702         assert!(ISO_8859_7.is_single_byte());
5703         assert!(ISO_8859_8.is_single_byte());
5704         assert!(ISO_8859_10.is_single_byte());
5705         assert!(ISO_8859_13.is_single_byte());
5706         assert!(ISO_8859_14.is_single_byte());
5707         assert!(ISO_8859_15.is_single_byte());
5708         assert!(ISO_8859_16.is_single_byte());
5709         assert!(ISO_8859_8_I.is_single_byte());
5710         assert!(KOI8_R.is_single_byte());
5711         assert!(KOI8_U.is_single_byte());
5712         assert!(MACINTOSH.is_single_byte());
5713         assert!(WINDOWS_874.is_single_byte());
5714         assert!(WINDOWS_1250.is_single_byte());
5715         assert!(WINDOWS_1251.is_single_byte());
5716         assert!(WINDOWS_1252.is_single_byte());
5717         assert!(WINDOWS_1253.is_single_byte());
5718         assert!(WINDOWS_1254.is_single_byte());
5719         assert!(WINDOWS_1255.is_single_byte());
5720         assert!(WINDOWS_1256.is_single_byte());
5721         assert!(WINDOWS_1257.is_single_byte());
5722         assert!(WINDOWS_1258.is_single_byte());
5723         assert!(X_MAC_CYRILLIC.is_single_byte());
5724         assert!(X_USER_DEFINED.is_single_byte());
5725     }
5726 
5727     #[test]
test_latin1_byte_compatible_up_to()5728     fn test_latin1_byte_compatible_up_to() {
5729         let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";
5730         assert_eq!(
5731             BIG5.new_decoder_without_bom_handling()
5732                 .latin1_byte_compatible_up_to(buffer)
5733                 .unwrap(),
5734             1
5735         );
5736         assert_eq!(
5737             EUC_JP
5738                 .new_decoder_without_bom_handling()
5739                 .latin1_byte_compatible_up_to(buffer)
5740                 .unwrap(),
5741             1
5742         );
5743         assert_eq!(
5744             EUC_KR
5745                 .new_decoder_without_bom_handling()
5746                 .latin1_byte_compatible_up_to(buffer)
5747                 .unwrap(),
5748             1
5749         );
5750         assert_eq!(
5751             GB18030
5752                 .new_decoder_without_bom_handling()
5753                 .latin1_byte_compatible_up_to(buffer)
5754                 .unwrap(),
5755             1
5756         );
5757         assert_eq!(
5758             GBK.new_decoder_without_bom_handling()
5759                 .latin1_byte_compatible_up_to(buffer)
5760                 .unwrap(),
5761             1
5762         );
5763         assert!(REPLACEMENT
5764             .new_decoder_without_bom_handling()
5765             .latin1_byte_compatible_up_to(buffer)
5766             .is_none());
5767         assert_eq!(
5768             SHIFT_JIS
5769                 .new_decoder_without_bom_handling()
5770                 .latin1_byte_compatible_up_to(buffer)
5771                 .unwrap(),
5772             1
5773         );
5774         assert_eq!(
5775             UTF_8
5776                 .new_decoder_without_bom_handling()
5777                 .latin1_byte_compatible_up_to(buffer)
5778                 .unwrap(),
5779             1
5780         );
5781         assert!(UTF_16BE
5782             .new_decoder_without_bom_handling()
5783             .latin1_byte_compatible_up_to(buffer)
5784             .is_none());
5785         assert!(UTF_16LE
5786             .new_decoder_without_bom_handling()
5787             .latin1_byte_compatible_up_to(buffer)
5788             .is_none());
5789         assert_eq!(
5790             ISO_2022_JP
5791                 .new_decoder_without_bom_handling()
5792                 .latin1_byte_compatible_up_to(buffer)
5793                 .unwrap(),
5794             1
5795         );
5796 
5797         assert_eq!(
5798             IBM866
5799                 .new_decoder_without_bom_handling()
5800                 .latin1_byte_compatible_up_to(buffer)
5801                 .unwrap(),
5802             1
5803         );
5804         assert_eq!(
5805             ISO_8859_2
5806                 .new_decoder_without_bom_handling()
5807                 .latin1_byte_compatible_up_to(buffer)
5808                 .unwrap(),
5809             2
5810         );
5811         assert_eq!(
5812             ISO_8859_3
5813                 .new_decoder_without_bom_handling()
5814                 .latin1_byte_compatible_up_to(buffer)
5815                 .unwrap(),
5816             2
5817         );
5818         assert_eq!(
5819             ISO_8859_4
5820                 .new_decoder_without_bom_handling()
5821                 .latin1_byte_compatible_up_to(buffer)
5822                 .unwrap(),
5823             2
5824         );
5825         assert_eq!(
5826             ISO_8859_5
5827                 .new_decoder_without_bom_handling()
5828                 .latin1_byte_compatible_up_to(buffer)
5829                 .unwrap(),
5830             2
5831         );
5832         assert_eq!(
5833             ISO_8859_6
5834                 .new_decoder_without_bom_handling()
5835                 .latin1_byte_compatible_up_to(buffer)
5836                 .unwrap(),
5837             2
5838         );
5839         assert_eq!(
5840             ISO_8859_7
5841                 .new_decoder_without_bom_handling()
5842                 .latin1_byte_compatible_up_to(buffer)
5843                 .unwrap(),
5844             2
5845         );
5846         assert_eq!(
5847             ISO_8859_8
5848                 .new_decoder_without_bom_handling()
5849                 .latin1_byte_compatible_up_to(buffer)
5850                 .unwrap(),
5851             3
5852         );
5853         assert_eq!(
5854             ISO_8859_10
5855                 .new_decoder_without_bom_handling()
5856                 .latin1_byte_compatible_up_to(buffer)
5857                 .unwrap(),
5858             2
5859         );
5860         assert_eq!(
5861             ISO_8859_13
5862                 .new_decoder_without_bom_handling()
5863                 .latin1_byte_compatible_up_to(buffer)
5864                 .unwrap(),
5865             4
5866         );
5867         assert_eq!(
5868             ISO_8859_14
5869                 .new_decoder_without_bom_handling()
5870                 .latin1_byte_compatible_up_to(buffer)
5871                 .unwrap(),
5872             4
5873         );
5874         assert_eq!(
5875             ISO_8859_15
5876                 .new_decoder_without_bom_handling()
5877                 .latin1_byte_compatible_up_to(buffer)
5878                 .unwrap(),
5879             6
5880         );
5881         assert_eq!(
5882             ISO_8859_16
5883                 .new_decoder_without_bom_handling()
5884                 .latin1_byte_compatible_up_to(buffer)
5885                 .unwrap(),
5886             4
5887         );
5888         assert_eq!(
5889             ISO_8859_8_I
5890                 .new_decoder_without_bom_handling()
5891                 .latin1_byte_compatible_up_to(buffer)
5892                 .unwrap(),
5893             3
5894         );
5895         assert_eq!(
5896             KOI8_R
5897                 .new_decoder_without_bom_handling()
5898                 .latin1_byte_compatible_up_to(buffer)
5899                 .unwrap(),
5900             1
5901         );
5902         assert_eq!(
5903             KOI8_U
5904                 .new_decoder_without_bom_handling()
5905                 .latin1_byte_compatible_up_to(buffer)
5906                 .unwrap(),
5907             1
5908         );
5909         assert_eq!(
5910             MACINTOSH
5911                 .new_decoder_without_bom_handling()
5912                 .latin1_byte_compatible_up_to(buffer)
5913                 .unwrap(),
5914             1
5915         );
5916         assert_eq!(
5917             WINDOWS_874
5918                 .new_decoder_without_bom_handling()
5919                 .latin1_byte_compatible_up_to(buffer)
5920                 .unwrap(),
5921             2
5922         );
5923         assert_eq!(
5924             WINDOWS_1250
5925                 .new_decoder_without_bom_handling()
5926                 .latin1_byte_compatible_up_to(buffer)
5927                 .unwrap(),
5928             4
5929         );
5930         assert_eq!(
5931             WINDOWS_1251
5932                 .new_decoder_without_bom_handling()
5933                 .latin1_byte_compatible_up_to(buffer)
5934                 .unwrap(),
5935             1
5936         );
5937         assert_eq!(
5938             WINDOWS_1252
5939                 .new_decoder_without_bom_handling()
5940                 .latin1_byte_compatible_up_to(buffer)
5941                 .unwrap(),
5942             5
5943         );
5944         assert_eq!(
5945             WINDOWS_1253
5946                 .new_decoder_without_bom_handling()
5947                 .latin1_byte_compatible_up_to(buffer)
5948                 .unwrap(),
5949             3
5950         );
5951         assert_eq!(
5952             WINDOWS_1254
5953                 .new_decoder_without_bom_handling()
5954                 .latin1_byte_compatible_up_to(buffer)
5955                 .unwrap(),
5956             4
5957         );
5958         assert_eq!(
5959             WINDOWS_1255
5960                 .new_decoder_without_bom_handling()
5961                 .latin1_byte_compatible_up_to(buffer)
5962                 .unwrap(),
5963             3
5964         );
5965         assert_eq!(
5966             WINDOWS_1256
5967                 .new_decoder_without_bom_handling()
5968                 .latin1_byte_compatible_up_to(buffer)
5969                 .unwrap(),
5970             1
5971         );
5972         assert_eq!(
5973             WINDOWS_1257
5974                 .new_decoder_without_bom_handling()
5975                 .latin1_byte_compatible_up_to(buffer)
5976                 .unwrap(),
5977             4
5978         );
5979         assert_eq!(
5980             WINDOWS_1258
5981                 .new_decoder_without_bom_handling()
5982                 .latin1_byte_compatible_up_to(buffer)
5983                 .unwrap(),
5984             4
5985         );
5986         assert_eq!(
5987             X_MAC_CYRILLIC
5988                 .new_decoder_without_bom_handling()
5989                 .latin1_byte_compatible_up_to(buffer)
5990                 .unwrap(),
5991             1
5992         );
5993         assert_eq!(
5994             X_USER_DEFINED
5995                 .new_decoder_without_bom_handling()
5996                 .latin1_byte_compatible_up_to(buffer)
5997                 .unwrap(),
5998             1
5999         );
6000 
6001         assert!(UTF_8
6002             .new_decoder()
6003             .latin1_byte_compatible_up_to(buffer)
6004             .is_none());
6005 
6006         let mut decoder = UTF_8.new_decoder();
6007         let mut output = [0u16; 4];
6008         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6009         assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());
6010         let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
6011         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));
6012         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6013         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
6014     }
6015 }
6016