1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 #![cfg_attr(
11     feature = "cargo-clippy",
12     allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14 
15 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17 //! Gecko-oriented means that converting to and from UTF-16 is supported in
18 //! addition to converting to and from UTF-8, that the performance and
19 //! streamability goals are browser-oriented, and that FFI-friendliness is a
20 //! goal.
21 //!
22 //! Additionally, the `mem` module provides functions that are useful for
23 //! applications that need to be able to deal with legacy in-memory
24 //! representations of Unicode.
25 //!
26 //! For expectation setting, please be sure to read the sections
27 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29 //!
30 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31 //! design and internals of the crate.
32 //!
33 //! # Availability
34 //!
35 //! The code is available under the
36 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38 //! See the
39 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40 //! file for details.
41 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43 //!
44 //! # Integration with `std::io`
45 //!
46 //! This crate doesn't implement traits from `std::io`. However, the case of
47 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50 //!
51 //! # Examples
52 //!
53 //! Example programs:
54 //!
55 //! * [Rust](https://github.com/hsivonen/recode_rs)
56 //! * [C](https://github.com/hsivonen/recode_c)
57 //! * [C++](https://github.com/hsivonen/recode_cpp)
58 //!
59 //! Decode using the non-streaming API:
60 //!
61 //! ```
62 //! use encoding_rs::*;
63 //!
64 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
65 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
66 //!
67 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
68 //! assert_eq!(&cow[..], expectation);
69 //! assert_eq!(encoding_used, SHIFT_JIS);
70 //! assert!(!had_errors);
71 //! ```
72 //!
73 //! Decode using the streaming API with minimal `unsafe`:
74 //!
75 //! ```
76 //! use encoding_rs::*;
77 //!
78 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
79 //!
80 //! // Use an array of byte slices to demonstrate content arriving piece by
81 //! // piece from the network.
82 //! let bytes: [&'static [u8]; 4] = [b"\x83",
83 //!                                  b"n\x83\x8D\x81",
84 //!                                  b"[\x81E\x83\x8F\x81[\x83",
85 //!                                  b"\x8B\x83h"];
86 //!
87 //! // Very short output buffer to demonstrate the output buffer getting full.
88 //! // Normally, you'd use something like `[0u8; 2048]`.
89 //! let mut buffer_bytes = [0u8; 8];
90 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
91 //!
92 //! // How many bytes in the buffer currently hold significant data.
93 //! let mut bytes_in_buffer = 0usize;
94 //!
95 //! // Collect the output to a string for demonstration purposes.
96 //! let mut output = String::new();
97 //!
98 //! // The `Decoder`
99 //! let mut decoder = SHIFT_JIS.new_decoder();
100 //!
101 //! // Track whether we see errors.
102 //! let mut total_had_errors = false;
103 //!
104 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
105 //! // use of a fixed-size buffer; normally when the output of an incremental
106 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
107 //! // avoid the intermediate buffer).
108 //! for input in &bytes[..] {
109 //!     // The number of bytes already read from current `input` in total.
110 //!     let mut total_read_from_current_input = 0usize;
111 //!
112 //!     loop {
113 //!         let (result, read, written, had_errors) =
114 //!             decoder.decode_to_str(&input[total_read_from_current_input..],
115 //!                                   &mut buffer[bytes_in_buffer..],
116 //!                                   false);
117 //!         total_read_from_current_input += read;
118 //!         bytes_in_buffer += written;
119 //!         total_had_errors |= had_errors;
120 //!         match result {
121 //!             CoderResult::InputEmpty => {
122 //!                 // We have consumed the current input buffer. Break out of
123 //!                 // the inner loop to get the next input buffer from the
124 //!                 // outer loop.
125 //!                 break;
126 //!             },
127 //!             CoderResult::OutputFull => {
128 //!                 // Write the current buffer out and consider the buffer
129 //!                 // empty.
130 //!                 output.push_str(&buffer[..bytes_in_buffer]);
131 //!                 bytes_in_buffer = 0usize;
132 //!                 continue;
133 //!             }
134 //!         }
135 //!     }
136 //! }
137 //!
138 //! // Process EOF
139 //! loop {
140 //!     let (result, _, written, had_errors) =
141 //!         decoder.decode_to_str(b"",
142 //!                               &mut buffer[bytes_in_buffer..],
143 //!                               true);
144 //!     bytes_in_buffer += written;
145 //!     total_had_errors |= had_errors;
146 //!     // Write the current buffer out and consider the buffer empty.
147 //!     // Need to do this here for both `match` arms, because we exit the
148 //!     // loop on `CoderResult::InputEmpty`.
149 //!     output.push_str(&buffer[..bytes_in_buffer]);
150 //!     bytes_in_buffer = 0usize;
151 //!     match result {
152 //!         CoderResult::InputEmpty => {
153 //!             // Done!
154 //!             break;
155 //!         },
156 //!         CoderResult::OutputFull => {
157 //!             continue;
158 //!         }
159 //!     }
160 //! }
161 //!
162 //! assert_eq!(&output[..], expectation);
163 //! assert!(!total_had_errors);
164 //! ```
165 //!
166 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
167 //!
168 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
169 //! __so this crate does not provide encoders for those encodings__!
170 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
171 //! so you get a UTF-8 encoder if you request an encoder for them.
172 //!
173 //! Additionally, the Encoding Standard factors BOM handling into wrapper
174 //! algorithms so that BOM handling isn't part of the definition of the
175 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
176 //! Standard define BOM handling or lack thereof as part of the encoding
177 //! scheme.
178 //!
179 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
180 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
181 //! the Unicode Standard.
182 //!
183 //! When used with the `_with_bom_removal` entry points, the UTF-8
184 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
185 //! Standard.
186 //!
187 //! This crate does not provide a mode that matches the UTF-16 _encoding
188 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
189 //! the entry points without `_bom_` qualifiers is the closest match,
190 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
191 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
192 //! Unicode Standard.
193 //!
194 //! The UTF-32 family of Unicode encoding schemes is not supported
195 //! by this crate. The Encoding Standard doesn't define any UTF-32
196 //! family encodings, since they aren't necessary for consuming Web
197 //! content.
198 //!
199 //! ## ISO-8859-1
200 //!
201 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
202 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
203 //! byte value to the same Unicode scalar value is not available via
204 //! `Encoding` in this crate.
205 //!
206 //! However, the functions whose name starts with `convert` and contains
207 //! `latin1` in the `mem` module support such conversions, which are known as
208 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
209 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
210 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
211 //!
212 //! ## Web / Browser Focus
213 //!
214 //! Both in terms of scope and performance, the focus is on the Web. For scope,
215 //! this means that encoding_rs implements the Encoding Standard fully and
216 //! doesn't implement encodings that are not specified in the Encoding
217 //! Standard. For performance, this means that decoding performance is
218 //! important as well as performance for encoding into UTF-8 or encoding the
219 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
220 //! be encoded into legacy encodings in only two places in the Web platform: in
221 //! the query part of URLs, in which case it's a matter of relatively rare
222 //! error handling, and in form submission, in which case the user action and
223 //! networking tend to hide the performance of the encoder.
224 //!
225 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
226 //! encodings enables smaller code size thanks to the encoder side using the
227 //! decode-optimized data tables without having encode-optimized data tables at
228 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
229 //! multiplication operations.
230 //!
231 //! Additionally, performance is a non-goal for the ASCII-incompatible
232 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
233 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
234 //! of implementation.
235 //!
236 //! Despite the browser focus, the hope is that non-browser applications
237 //! that wish to consume Web content or submit Web forms in a Web-compatible
238 //! way will find encoding_rs useful. While encoding_rs does not try to match
239 //! Windows behavior, many of the encodings are close enough to legacy
240 //! encodings implemented by Windows that applications that need to consume
241 //! data in legacy Windows encodings may find encoding_rs useful. The
242 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
243 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
244 //!
245 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
246 //! to the encodings defined in the Encoding Standard. The
247 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
248 //! UTF-7 decoding for email purposes.
249 //!
250 //! # Preparing Text for the Encoders
251 //!
252 //! Normalizing text into Unicode Normalization Form C prior to encoding text
253 //! into a legacy encoding minimizes unmappable characters. Text can be
254 //! normalized to Unicode Normalization Form C using the
255 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
256 //!
257 //! The exception is windows-1258, which after normalizing to Unicode
258 //! Normalization Form C requires tone marks to be decomposed in order to
259 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
260 //! using the [`detone`](https://crates.io/crates/detone) crate.
261 //!
262 //! # Streaming & Non-Streaming; Rust & C/C++
263 //!
264 //! The API in Rust has two modes of operation: streaming and non-streaming.
265 //! The streaming API is the foundation of the implementation and should be
266 //! used when processing data that arrives piecemeal from an i/o stream. The
267 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
268 //! to C callers. The non-streaming part of the API is for Rust callers only and
269 //! is smart about borrowing instead of copying when possible. When
270 //! streamability is not needed, the non-streaming API should be preferred in
271 //! order to avoid copying data when a borrow suffices.
272 //!
273 //! The non-streaming API has no analogous C API exposed via FFI, mainly because C doesn't have
274 //! standard types for growable byte buffers and Unicode strings that know
275 //! their length.
276 //!
277 //! The C API (header file generated at `target/include/encoding_rs.h` when
278 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
279 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
280 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
281 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
282 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
283 //! as part of Mozilla [bug 1261841][4].
284 //!
285 //! The `Encoding` type is common to both the streaming and non-streaming
286 //! modes. In the streaming mode, decoding operations are performed with a
287 //! `Decoder` and encoding operations with an `Encoder` object obtained via
288 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
289 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
290 //! and `Encoder` objects are not used at all.
291 //!
292 //! [1]: https://github.com/hsivonen/encoding_c
293 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
294 //! [3]: https://github.com/Microsoft/GSL/
295 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
296 //!
297 //! # Memory management
298 //!
299 //! The streaming mode never performs heap allocations (even the methods
300 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
301 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
302 //! is, the streaming mode uses caller-allocated buffers exclusively.
303 //!
304 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
305 //! perform heap allocations but only to allocate the backing buffer of the
306 //! `Vec<u8>` or the `String`.
307 //!
308 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
309 //! `Drop` cleanup.
310 //!
311 //! # Buffer reading and writing behavior
312 //!
313 //! Based on experience gained with the `java.nio.charset` encoding converter
314 //! API and with the Gecko uconv encoding converter API, the buffer reading
315 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
316 //! fully drained but output buffers are not always fully filled.
317 //!
318 //! When reading from an input buffer, encoding_rs always consumes all input
319 //! up to the next error or to the end of the buffer. In particular, when
320 //! decoding, even if the input buffer ends in the middle of a byte sequence
321 //! for a character, the decoder consumes all input. This has the benefit that
322 //! the caller of the API can always fill the next buffer from the start from
323 //! whatever source the bytes come from and never has to first copy the last
324 //! bytes of the previous buffer to the start of the next buffer. However, when
325 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
326 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
327 //! boundaries falling in the middle of a surrogate pair result in both
328 //! surrogates being treated individually as unpaired surrogates.
329 //!
330 //! Additionally, decoders guarantee that they can be fed even one byte at a
331 //! time and encoders guarantee that they can be fed even one code point at a
332 //! time. This has the benefit of not placing restrictions on the size of
333 //! chunks in which the content arrives, e.g. from the network.
334 //!
335 //! When writing into an output buffer, encoding_rs makes sure that the code
336 //! unit sequence for a character is never split across output buffer
337 //! boundaries. This may result in wasted space at the end of an output buffer,
338 //! but the advantages are that the output side of both decoders and encoders
339 //! is greatly simplified compared to designs that attempt to fill output
340 //! buffers exactly even when that entails splitting a code unit sequence and
341 //! when encoding_rs methods return to the caller, the output produced thus
342 //! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
343 //! the output needs to be considered as a whole, because the latest output
344 //! buffer taken alone might not be valid if the transition away
345 //! from the ASCII state occurred in an earlier output buffer. However, since
346 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
347 //! state as being in error despite the encoder generating a transition to the
348 //! ASCII state at the end, the claim about the partial output taken as a whole
349 //! being valid is true even for ISO-2022-JP.)
350 //!
351 //! # Error Reporting
352 //!
353 //! Based on experience gained with the `java.nio.charset` encoding converter
354 //! API and with the Gecko uconv encoding converter API, the error reporting
355 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
356 //! that leave it up to the caller to extract the erroneous bytes from the
357 //! input stream if the caller wishes to do so but encoder errors provide the
358 //! code point associated with the error without requiring the caller to
359 //! extract it from the input on its own.
360 //!
361 //! On the encoder side, an error is always triggered by the most recently
362 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
363 //! caller. Also, it's very typical for the caller to wish to do something with
364 //! this data: generate a numeric escape for the character. Additionally, the
365 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
366 //! certain cases, so requiring the caller to extract the character from the
367 //! input buffer would require the caller to handle ISO-2022-JP details.
368 //! Furthermore, requiring the caller to extract the character from the input
369 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
370 //! the job of an encoding conversion library.
371 //!
372 //! On the decoder side, errors are triggered in more complex ways. For
373 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
374 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
375 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
376 //! be the ones most recently pushed to the decoder and the error might not even
377 //! be in the current buffer.
378 //!
379 //! Some encoding conversion APIs address the problem by not acknowledging
380 //! trailing bytes of an input buffer as consumed if it's still possible for
381 //! future bytes to cause the trailing bytes to be in error. This way, error
382 //! reporting can always refer to the most recently pushed buffer. This has the
383 //! problem that the caller of the API has to copy the unconsumed trailing
384 //! bytes to the start of the next buffer before being able to fill the rest
385 //! of the next buffer. This is annoying, error-prone and inefficient.
386 //!
387 //! A possible solution would be making the decoder remember recently consumed
388 //! bytes in order to be able to include a copy of the erroneous bytes when
389 //! reporting an error. This has two problems: First, callers are rarely
390 //! interested in the erroneous bytes, so attempts to identify them are most
391 //! often just overhead anyway. Second, the rare applications that are
392 //! interested typically care about the location of the error in the input
393 //! stream.
394 //!
395 //! To keep the API convenient for common uses and the overhead low while making
396 //! it possible to develop applications, such as HTML validators, that care
397 //! about which bytes were in error, encoding_rs reports the length of the
398 //! erroneous sequence and the number of bytes consumed after the erroneous
399 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
400 //! this makes it possible for callers that care about the erroneous bytes to
401 //! locate them.
402 //!
403 //! # No Convenience API for Custom Replacements
404 //!
405 //! The Web Platform and, therefore, the Encoding Standard supports only one
406 //! error recovery mode for decoders and only one error recovery mode for
407 //! encoders. The supported error recovery mode for decoders is emitting the
408 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
409 //! encoders is emitting an HTML decimal numeric character reference for
410 //! unmappable characters.
411 //!
412 //! Since encoding_rs is Web-focused, these are the only error recovery modes
413 //! for which convenient support is provided. Moreover, on the decoder side,
414 //! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
415 //! on error (other than treating errors as fatal). In particular, simply
416 //! ignoring errors is a
417 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
418 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
419 //! callers to ignore errors.
420 //!
421 //! On the encoder side, there are plausible alternatives for HTML decimal
422 //! numeric character references. For example, when outputting CSS, CSS-style
423 //! escapes would seem to make sense. However, instead of facilitating the
424 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
425 //! position that you shouldn't generate output in encodings other than UTF-8,
426 //! except where backward compatibility with interacting with the legacy Web
427 //! requires it. The legacy Web requires it only when parsing the query strings
428 //! of URLs and when submitting forms, and those two both use HTML decimal
429 //! numeric character references.
430 //!
431 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
432 //! numeric character references easy, it does make them _possible_.
433 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
434 //! for unmappable characters, is implemented on top of
435 //! `encode_from_utf8_without_replacement()`. Applications that really, really
436 //! want other replacement schemes for unmappable characters can likewise
437 //! implement them on top of `encode_from_utf8_without_replacement()`.
438 //!
439 //! # No Extensibility by Design
440 //!
441 //! The set of encodings supported by encoding_rs is not extensible by design.
442 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
443 //! rather than `trait`s. encoding_rs takes the design position that all future
444 //! text interchange should be done using UTF-8, which can represent all of
445 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
446 //! Standard and encoding_rs that can represent all of Unicode and that has
447 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
448 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
449 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
450 //! other than being able to consume legacy content.
451 //!
452 //! Considering that UTF-8 can represent all of Unicode and is already supported
453 //! by all Web browsers, introducing a new encoding wouldn't add to the
454 //! expressiveness but would add to compatibility problems. In that sense,
455 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
456 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
457 //! the Web Platform. On the other hand, the set of legacy encodings that must
458 //! be supported for a Web browser to be able to be successful is not going to
459 //! expand. Empirically, the set of encodings specified in the Encoding Standard
460 //! is already sufficient and the set of legacy encodings won't grow
461 //! retroactively.
462 //!
463 //! Since extensibility doesn't make sense considering the Web focus of
464 //! encoding_rs and adding encodings to Web clients would be actively harmful,
465 //! it makes sense to make the set of encodings that encoding_rs supports
466 //! non-extensible and to take the (admittedly small) benefits arising from
467 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
468 //!  of time, which enables stack allocation thereof.
469 //!
470 //! This does have downsides for applications that might want to put encoding_rs
471 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
472 //! needed for Web uses. The needs of such applications should not complicate
473 //! encoding_rs itself, though. It is up to those applications to provide a
474 //! framework that delegates the operations with encodings that encoding_rs
475 //! supports to encoding_rs and operations with other encodings to something
476 //! else (as opposed to encoding_rs itself providing an extensibility
477 //! framework).
478 //!
479 //! # Panics
480 //!
481 //! Methods in encoding_rs can panic if the API is used against the requirements
482 //! stated in the documentation, if a state that's supposed to be impossible
483 //! is reached due to an internal bug or on integer overflow. When used
484 //! according to documentation with buffer sizes that stay below integer
485 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
486 //!
487 //! Panics arising from API misuse aren't documented beyond this on individual
488 //! methods.
489 //!
490 //! # At-Risk Parts of the API
491 //!
492 //! The foreseeable source of partially backward-incompatible API change is the
493 //! way the instances of `Encoding` are made available.
494 //!
495 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
496 //! initialized with `static`s of type `&'static Encoding`, the non-reference
497 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
498 //!
499 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
500 //! unique when the constant is used in different crates, the reference-typed
501 //! `static`s for the encoding instances will be changed from `static` to
502 //! `const` and the non-reference-typed `_INIT` instances will be removed.
503 //!
504 //! # Mapping Spec Concepts onto the API
505 //!
506 //! <table>
507 //! <thead>
508 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
509 //! </thead>
510 //! <tbody>
511 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
512 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
513 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
514 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
515 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
522 //! </tbody>
523 //! </table>
524 //!
525 //! # Compatibility with the rust-encoding API
526 //!
527 //! The crate
528 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
529 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
530 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
531 //!
532 //! # Mapping rust-encoding concepts to encoding_rs concepts
533 //!
534 //! The following table provides a mapping from rust-encoding constructs to
535 //! encoding_rs ones.
536 //!
537 //! <table>
538 //! <thead>
539 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
540 //! </thead>
541 //! <tbody>
542 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
543 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
544 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
545 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
546 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
547 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
548 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
549 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
550 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
551 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
552 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
553 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
554 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
555 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
556 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
557 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
558 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
559 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
563 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
564 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
565 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
567 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
568 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
569 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
570 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
571 //! </tbody>
572 //! </table>
573 //!
574 //! # Relationship with Windows Code Pages
575 //!
576 //! Despite the Web and browser focus, the encodings defined by the Encoding
577 //! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the encodings
//! that have a closely related Windows code page, the number of the closest
581 //! code page, a column indicating whether Windows maps unassigned code points
582 //! to the Unicode Private Use Area instead of U+FFFD and a remark number
583 //! indicating remarks in the list after the table.
584 //!
585 //! <table>
586 //! <thead>
587 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
588 //! </thead>
589 //! <tbody>
590 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
591 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
592 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
593 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
594 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
595 //! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
596 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
597 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
598 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
599 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
600 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
601 //! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
602 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
603 //! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
604 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
605 //! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
606 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
607 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
608 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
609 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
610 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
611 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
612 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
613 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
614 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
615 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
616 //! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
617 //! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
618 //! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
619 //! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
620 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
621 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
622 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
623 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
624 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
625 //! </tbody>
626 //! </table>
627 //!
628 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
629 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
630 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
631 //!    which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
632 //!    decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
633 //!    LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
634 //!    instead of U+2019 RIGHT SINGLE QUOTATION MARK.
//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to PUA instead
//!    of LRM and RLM, respectively.
637 //! 5. Remarks from the previous item apply.
638 //!
639 //! The differences between this crate and Windows in the case of multibyte encodings
640 //! are not yet fully documented here. The lack of remarks above should not be taken
641 //! as indication of lack of differences.
642 //!
643 //! # Notable Differences from IANA Naming
644 //!
645 //! In some cases, the Encoding Standard specifies the popular unextended encoding
646 //! name where in IANA terms one of the other labels would be more precise considering
647 //! the extensions that the Encoding Standard has unified into the encoding.
648 //!
649 //! <table>
650 //! <thead>
651 //! <tr><th>Encoding</th><th>IANA</th></tr>
652 //! </thead>
653 //! <tbody>
654 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
655 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
656 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
657 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
658 //! </tbody>
659 //! </table>
660 //!
661 //! In other cases where the Encoding Standard unifies unextended and extended
662 //! variants of an encoding, the encoding gets the name of the extended
663 //! variant.
664 //!
665 //! <table>
666 //! <thead>
667 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
668 //! </thead>
669 //! <tbody>
670 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
671 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
672 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
673 //! </tbody>
674 //! </table>
675 //!
676 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
677 //! for discussion about the UTF-16 family.
678 
679 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
680 
681 #[macro_use]
682 extern crate cfg_if;
683 
684 #[cfg(all(
685     feature = "simd-accel",
686     any(
687         target_feature = "sse2",
688         all(target_endian = "little", target_arch = "aarch64"),
689         all(target_endian = "little", target_feature = "neon")
690     )
691 ))]
692 #[macro_use(shuffle)]
693 extern crate packed_simd;
694 
695 #[cfg(feature = "serde")]
696 extern crate serde;
697 
698 #[cfg(all(test, feature = "serde"))]
699 extern crate bincode;
700 #[cfg(all(test, feature = "serde"))]
701 #[macro_use]
702 extern crate serde_derive;
703 #[cfg(all(test, feature = "serde"))]
704 extern crate serde_json;
705 
706 #[macro_use]
707 mod macros;
708 
709 #[cfg(all(
710     feature = "simd-accel",
711     any(
712         target_feature = "sse2",
713         all(target_endian = "little", target_arch = "aarch64"),
714         all(target_endian = "little", target_feature = "neon")
715     )
716 ))]
717 mod simd_funcs;
718 
719 #[cfg(test)]
720 mod testing;
721 
722 mod big5;
723 mod euc_jp;
724 mod euc_kr;
725 mod gb18030;
726 mod iso_2022_jp;
727 mod replacement;
728 mod shift_jis;
729 mod single_byte;
730 mod utf_16;
731 mod utf_8;
732 mod x_user_defined;
733 
734 mod ascii;
735 mod data;
736 mod handles;
737 mod variant;
738 
739 pub mod mem;
740 
741 use ascii::ascii_valid_up_to;
742 use ascii::iso_2022_jp_ascii_valid_up_to;
743 use utf_8::utf8_valid_up_to;
744 use variant::*;
745 
746 use std::borrow::Cow;
747 use std::cmp::Ordering;
748 use std::hash::Hash;
749 use std::hash::Hasher;
750 
751 #[cfg(feature = "serde")]
752 use serde::de::Visitor;
753 #[cfg(feature = "serde")]
754 use serde::{Deserialize, Deserializer, Serialize, Serializer};
755 
/// Maximum length, in ASCII characters, of a numeric character reference
/// (NCR).
///
/// This has to be the max length of an NCR instead of max
/// minus one, because we can't rely on getting the minus
/// one from the space reserved for the current unmappable,
/// because the ISO-2022-JP encoder can fill up that space
/// with a state transition escape.
///
/// The longest decimal NCR is the one for U+10FFFF (1114111 in decimal),
/// i.e. `&#1114111;`, which is 10 characters long.
const NCR_EXTRA: usize = 10; // &#1114111;
762 
763 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
764 // Instead, please regenerate using generate-encoding-data.py
765 
// Length of the longest label; the trailing comment shows a label of
// exactly that length (19 ASCII characters).
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
767 
/// The initializer for the [Big5](static.BIG5.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`BIG5`](static.BIG5.html)) directly, such as in initializers
/// of other `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static BIG5_INIT: Encoding = Encoding {
    name: "Big5",
    variant: VariantEncoding::Big5,
};
784 
/// The Big5 encoding.
///
/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
/// instead of the Private Use Area code points that have been used historically.
/// It is believed to be able to decode existing Web content in a way that makes
/// sense.
///
/// To avoid form submissions generating data that Web servers don't understand,
/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
/// Big5 in the lexical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
///
/// This encoding is designed to be suited for decoding the Windows code page 950
/// and its HKSCS patched "951" variant such that the text makes sense, given
/// assignments that Unicode has made after those encodings used Private Use
/// Area characters.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`BIG5_INIT`](static.BIG5_INIT.html).
pub static BIG5: &'static Encoding = &BIG5_INIT;
809 
/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`EUC_JP`](static.EUC_JP.html)) directly, such as in initializers
/// of other `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static EUC_JP_INIT: Encoding = Encoding {
    name: "EUC-JP",
    variant: VariantEncoding::EucJp,
};
826 
/// The EUC-JP encoding.
///
/// This is the legacy Unix encoding for Japanese.
///
/// For compatibility with Web servers that don't expect three-byte sequences
/// in form submissions, the encoder doesn't generate three-byte sequences.
/// That is, the JIS X 0212 support is decode-only.
///
/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
///
/// This encoding roughly matches the Windows code page 20932. There are error
/// handling differences and a handful of 2-byte sequences that decode differently.
/// Additionally, Windows doesn't support 3-byte sequences.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`EUC_JP_INIT`](static.EUC_JP_INIT.html).
pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
847 
/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`EUC_KR`](static.EUC_KR.html)) directly, such as in initializers
/// of other `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static EUC_KR_INIT: Encoding = Encoding {
    name: "EUC-KR",
    variant: VariantEncoding::EucKr,
};
864 
/// The EUC-KR encoding.
///
/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
/// Classic), with all the characters from the Hangul Syllables block of Unicode.
///
/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
///
/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
/// the question mark or the Private Use Area.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`EUC_KR_INIT`](static.EUC_KR_INIT.html).
pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
883 
/// The initializer for the [GBK](static.GBK.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`GBK`](static.GBK.html)) directly, such as in initializers
/// of other `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GBK_INIT: Encoding = Encoding {
    name: "GBK",
    variant: VariantEncoding::Gbk,
};
900 
/// The GBK encoding.
///
/// The decoder for this encoding is the same as the decoder for gb18030.
/// The encoder side of this encoding is GBK with Windows code page 936 euro
/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
/// Unicode block as well as a handful of ideographs from the CJK Unified
/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
///
/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, the GBK encoder
/// wasn't unified with the gb18030 encoder in the Encoding Standard out of
/// concern that servers that expect GBK form submissions might not be able
/// to handle the four-byte sequences.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// The encoder of this encoding roughly matches the Windows code page 936.
/// The decoder side is a superset.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`GBK_INIT`](static.GBK_INIT.html).
pub static GBK: &'static Encoding = &GBK_INIT;
925 
/// The initializer for the [IBM866](static.IBM866.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`IBM866`](static.IBM866.html)) directly, such as in initializers
/// of other `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static IBM866_INIT: Encoding = Encoding {
    name: "IBM866",
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in the mapping table (start code point, byte offset,
    // run length) -- confirm against the definition of
    // `VariantEncoding::SingleByte`, which is not visible here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
};
942 
/// The IBM866 encoding.
///
/// This is the most notable one of the DOS Cyrillic code pages. It has the same
/// box drawing characters as code page 437, so it can be used for decoding
/// DOS-era ASCII + box drawing data.
///
/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
///
/// This encoding matches the Windows code page 866.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`IBM866_INIT`](static.IBM866_INIT.html).
pub static IBM866: &'static Encoding = &IBM866_INIT;
959 
/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`ISO_2022_JP`](static.ISO_2022_JP.html)) directly, such as in
/// initializers of other `static`s. If in doubt, use the corresponding
/// non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_2022_JP_INIT: Encoding = Encoding {
    name: "ISO-2022-JP",
    variant: VariantEncoding::Iso2022Jp,
};
976 
/// The ISO-2022-JP encoding.
///
/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
/// byte range to encode non-Basic Latin characters. It's the only encoding
/// supported by this crate whose encoder is stateful.
///
/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
///
/// This encoding roughly matches the Windows code page 50220. Notably, Windows
/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
/// error handling.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`ISO_2022_JP_INIT`](static.ISO_2022_JP_INIT.html).
pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
995 
/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`ISO_8859_10`](static.ISO_8859_10.html)) directly, such as in
/// initializers of other `static`s. If in doubt, use the corresponding
/// non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_10_INIT: Encoding = Encoding {
    name: "ISO-8859-10",
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in the mapping table (start code point, byte offset,
    // run length) -- confirm against the definition of
    // `VariantEncoding::SingleByte`, which is not visible here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
};
1012 
/// The ISO-8859-10 encoding.
///
/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 6.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
///
/// The Windows code page number for this encoding is 28600, but kernel32.dll
/// does not support this encoding.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`ISO_8859_10_INIT`](static.ISO_8859_10_INIT.html).
pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1029 
/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`ISO_8859_13`](static.ISO_8859_13.html)) directly, such as in
/// initializers of other `static`s. If in doubt, use the corresponding
/// non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_13_INIT: Encoding = Encoding {
    name: "ISO-8859-13",
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in the mapping table (start code point, byte offset,
    // run length) -- confirm against the definition of
    // `VariantEncoding::SingleByte`, which is not visible here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
};
1046 
/// The ISO-8859-13 encoding.
///
/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 7.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
///
/// This encoding matches the Windows code page 28603, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`ISO_8859_13_INIT`](static.ISO_8859_13_INIT.html).
pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1063 
/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`ISO_8859_14`](static.ISO_8859_14.html)) directly, such as in
/// initializers of other `static`s. If in doubt, use the corresponding
/// non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_14_INIT: Encoding = Encoding {
    name: "ISO-8859-14",
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in the mapping table (start code point, byte offset,
    // run length) -- confirm against the definition of
    // `VariantEncoding::SingleByte`, which is not visible here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
};
1080 
/// The ISO-8859-14 encoding.
///
/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 8.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
///
/// The Windows code page number for this encoding is 28604, but kernel32.dll
/// does not support this encoding.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`ISO_8859_14_INIT`](static.ISO_8859_14_INIT.html).
pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1097 
/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`ISO_8859_15`](static.ISO_8859_15.html)) directly, such as in
/// initializers of other `static`s. If in doubt, use the corresponding
/// non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_15_INIT: Encoding = Encoding {
    name: "ISO-8859-15",
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in the mapping table (start code point, byte offset,
    // run length) -- confirm against the definition of
    // `VariantEncoding::SingleByte`, which is not visible here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
};
1114 
/// The ISO-8859-15 encoding.
///
/// This is the revised Western European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 9.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
///
/// This encoding matches the Windows code page 28605.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where the address of the `Encoding` itself is needed,
/// use [`ISO_8859_15_INIT`](static.ISO_8859_15_INIT.html).
pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1130 
/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
///
/// Use this item only when you need the address of the `Encoding`
/// value in a place where Rust prohibits the use of the non-`_INIT`
/// form ([`ISO_8859_16`](static.ISO_8859_16.html)) directly, such as in
/// initializers of other `static`s. If in doubt, use the corresponding
/// non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_16_INIT: Encoding = Encoding {
    name: "ISO-8859-16",
    // NOTE(review): the three trailing arguments presumably describe a
    // contiguous run in the mapping table (start code point, byte offset,
    // run length) -- confirm against the definition of
    // `VariantEncoding::SingleByte`, which is not visible here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
};
1147 
/// The ISO-8859-16 encoding.
///
/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 10.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
///
/// The Windows code page number for this encoding is 28606, but kernel32.dll
/// does not support this encoding.
///
/// This `static` is a reference to
/// [`ISO_8859_16_INIT`](static.ISO_8859_16_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1164 
/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_2_INIT: Encoding = Encoding {
    name: "ISO-8859-2",
    // Single-byte variant referring to the `iso_8859_2` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
};
1181 
/// The ISO-8859-2 encoding.
///
/// This is the Central European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 2.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
///
/// This encoding matches the Windows code page 28592.
///
/// This `static` is a reference to
/// [`ISO_8859_2_INIT`](static.ISO_8859_2_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1196 
/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_3_INIT: Encoding = Encoding {
    name: "ISO-8859-3",
    // Single-byte variant referring to the `iso_8859_3` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
};
1213 
/// The ISO-8859-3 encoding.
///
/// This is the South European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 3.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
///
/// This encoding matches the Windows code page 28593.
///
/// This `static` is a reference to
/// [`ISO_8859_3_INIT`](static.ISO_8859_3_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1228 
/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_4_INIT: Encoding = Encoding {
    name: "ISO-8859-4",
    // Single-byte variant referring to the `iso_8859_4` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
};
1245 
/// The ISO-8859-4 encoding.
///
/// This is the North European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 4.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
///
/// This encoding matches the Windows code page 28594.
///
/// This `static` is a reference to
/// [`ISO_8859_4_INIT`](static.ISO_8859_4_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1260 
/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_5_INIT: Encoding = Encoding {
    name: "ISO-8859-5",
    // Single-byte variant referring to the `iso_8859_5` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
};
1277 
/// The ISO-8859-5 encoding.
///
/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
///
/// This encoding matches the Windows code page 28595.
///
/// This `static` is a reference to
/// [`ISO_8859_5_INIT`](static.ISO_8859_5_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1292 
/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_6_INIT: Encoding = Encoding {
    name: "ISO-8859-6",
    // Single-byte variant referring to the `iso_8859_6` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
};
1309 
/// The ISO-8859-6 encoding.
///
/// This is the Arabic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
///
/// This encoding matches the Windows code page 28596, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`ISO_8859_6_INIT`](static.ISO_8859_6_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1325 
/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_7_INIT: Encoding = Encoding {
    name: "ISO-8859-7",
    // Single-byte variant referring to the `iso_8859_7` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
};
1342 
/// The ISO-8859-7 encoding.
///
/// This is the Greek part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
///
/// This encoding roughly matches the Windows code page 28597. Windows decodes
/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
///
/// This `static` is a reference to
/// [`ISO_8859_7_INIT`](static.ISO_8859_7_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1362 
/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_INIT: Encoding = Encoding {
    name: "ISO-8859-8",
    // Single-byte variant referring to the `iso_8859_8` field of
    // `data::SINGLE_BYTE_DATA`. `ISO_8859_8_I_INIT` refers to the same field
    // with the same numeric arguments, so keep the two in sync.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1379 
/// The ISO-8859-8 encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 28598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the Private Use Area.
///
/// This `static` is a reference to
/// [`ISO_8859_8_INIT`](static.ISO_8859_8_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1397 
/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
    name: "ISO-8859-8-I",
    // Deliberately refers to the `iso_8859_8` field (there is no separate
    // `-I` field used here): the data reference and numeric arguments are
    // the same as in `ISO_8859_8_INIT`; only `name` differs.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1414 
/// The ISO-8859-8-I encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// Note that the links above are to the ISO-8859-8 visualizations; the
/// initializer of this encoding refers to the same `iso_8859_8` data as
/// [ISO-8859-8](static.ISO_8859_8.html).
///
/// This encoding roughly matches the Windows code page 38598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the Private Use Area.
///
/// This `static` is a reference to
/// [`ISO_8859_8_I_INIT`](static.ISO_8859_8_I_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1432 
/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_R_INIT: Encoding = Encoding {
    name: "KOI8-R",
    // Single-byte variant referring to the `koi8_r` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
};
1449 
/// The KOI8-R encoding.
///
/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
///
/// This encoding matches the Windows code page 20866.
///
/// This `static` is a reference to
/// [`KOI8_R_INIT`](static.KOI8_R_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1464 
/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_U_INIT: Encoding = Encoding {
    name: "KOI8-U",
    // Single-byte variant referring to the `koi8_u` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
};
1481 
/// The KOI8-U encoding.
///
/// This is an encoding for Ukrainian adapted from KOI8-R.
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
///
/// This encoding matches the Windows code page 21866.
///
/// This `static` is a reference to
/// [`KOI8_U_INIT`](static.KOI8_U_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1496 
/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static SHIFT_JIS_INIT: Encoding = Encoding {
    name: "Shift_JIS",
    // Unlike the `SingleByte` initializers in this file, this variant is
    // written without a data reference or numeric arguments.
    variant: VariantEncoding::ShiftJis,
};
1513 
/// The Shift_JIS encoding.
///
/// This is the Japanese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
///
/// This encoding matches the Windows code page 932, except Windows decodes
/// some byte sequences that are errors per the Encoding Standard to the
/// question mark or the Private Use Area and generally uses U+30FB in place
/// of the REPLACEMENT CHARACTER.
///
/// This `static` is a reference to
/// [`SHIFT_JIS_INIT`](static.SHIFT_JIS_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1530 
/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16BE_INIT: Encoding = Encoding {
    name: "UTF-16BE",
    // Unlike the `SingleByte` initializers in this file, this variant is
    // written without a data reference or numeric arguments.
    variant: VariantEncoding::Utf16Be,
};
1547 
/// The UTF-16BE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the big endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1201.
///
/// This `static` is a reference to
/// [`UTF_16BE_INIT`](static.UTF_16BE_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1564 
/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16LE_INIT: Encoding = Encoding {
    name: "UTF-16LE",
    // Unlike the `SingleByte` initializers in this file, this variant is
    // written without a data reference or numeric arguments.
    variant: VariantEncoding::Utf16Le,
};
1581 
/// The UTF-16LE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the little endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1200.
///
/// This `static` is a reference to
/// [`UTF_16LE_INIT`](static.UTF_16LE_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1598 
/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_8_INIT: Encoding = Encoding {
    name: "UTF-8",
    // Unlike the `SingleByte` initializers in this file, this variant is
    // written without a data reference or numeric arguments.
    variant: VariantEncoding::Utf8,
};
1615 
/// The UTF-8 encoding.
///
/// This is the encoding that should be used for all new development. It can
/// represent all of Unicode.
///
/// This encoding matches the Windows code page 65001, except Windows differs
/// in the number of errors generated for some erroneous byte sequences.
///
/// This `static` is a reference to
/// [`UTF_8_INIT`](static.UTF_8_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1629 
/// The initializer for the [gb18030](static.GB18030.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GB18030_INIT: Encoding = Encoding {
    name: "gb18030",
    // Unlike the `SingleByte` initializers in this file, this variant is
    // written without a data reference or numeric arguments.
    variant: VariantEncoding::Gb18030,
};
1646 
/// The gb18030 encoding.
///
/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
/// maps to U+3000 for compatibility with existing Web content. As a result,
/// this encoding can represent all of Unicode except for the private-use
/// character U+E5E5.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// This encoding matches the Windows code page 54936.
///
/// This `static` is a reference to
/// [`GB18030_INIT`](static.GB18030_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static GB18030: &'static Encoding = &GB18030_INIT;
1664 
/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static MACINTOSH_INIT: Encoding = Encoding {
    name: "macintosh",
    // Single-byte variant referring to the `macintosh` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
};
1681 
/// The macintosh encoding.
///
/// This is the MacRoman encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
///
/// This encoding matches the Windows code page 10000, except Windows decodes
/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
///
/// This `static` is a reference to
/// [`MACINTOSH_INIT`](static.MACINTOSH_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1697 
/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static REPLACEMENT_INIT: Encoding = Encoding {
    name: "replacement",
    // Unlike the `SingleByte` initializers in this file, this variant is
    // written without a data reference or numeric arguments.
    variant: VariantEncoding::Replacement,
};
1714 
/// The replacement encoding.
///
/// This decode-only encoding decodes all non-zero-length streams to a single
/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
/// ASCII-compatible fallback encoding (typically windows-1252) for some
/// encodings that are no longer supported by the Web Platform and that
/// would be dangerous to treat as ASCII-compatible.
///
/// There is no corresponding encoder. The output encoding of this encoding
/// is UTF-8.
///
/// This encoding does not have a Windows code page number.
///
/// This `static` is a reference to
/// [`REPLACEMENT_INIT`](static.REPLACEMENT_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1733 
/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1250_INIT: Encoding = Encoding {
    name: "windows-1250",
    // Single-byte variant referring to the `windows_1250` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
};
1750 
/// The windows-1250 encoding.
///
/// This is the Central European encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
///
/// This encoding matches the Windows code page 1250.
///
/// This `static` is a reference to
/// [`WINDOWS_1250_INIT`](static.WINDOWS_1250_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1765 
/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1251_INIT: Encoding = Encoding {
    name: "windows-1251",
    // Single-byte variant referring to the `windows_1251` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
};
1782 
/// The windows-1251 encoding.
///
/// This is the Cyrillic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
///
/// This encoding matches the Windows code page 1251.
///
/// This `static` is a reference to
/// [`WINDOWS_1251_INIT`](static.WINDOWS_1251_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1797 
/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1252_INIT: Encoding = Encoding {
    name: "windows-1252",
    // Single-byte variant referring to the `windows_1252` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the three trailing numeric arguments are interpreted by
    // `VariantEncoding::SingleByte`, which is defined elsewhere; confirm
    // their meaning there before changing them.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
};
1814 
1815 /// The windows-1252 encoding.
1816 ///
1817 /// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1818 /// which is known as Latin 1.
1819 ///
1820 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1821 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1822 ///
1823 /// This encoding matches the Windows code page 1252.
1824 ///
1825 /// This will change from `static` to `const` if Rust changes
1826 /// to make the referent of `pub const FOO: &'static Encoding`
1827 /// unique cross-crate, so don't take the address of this
1828 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`WINDOWS_1252_INIT`](static.WINDOWS_1252_INIT.html), which this points to.
1829 pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1830 
1831 /// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1832 ///
1833 /// For use only for taking the address of this form when
1834 /// Rust prohibits the use of the non-`_INIT` form directly,
1835 /// such as in initializers of other `static`s. If in doubt,
1836 /// use the corresponding non-`_INIT` reference-typed `static`.
1837 ///
1838 /// This part of the public API will go away if Rust changes
1839 /// to make the referent of `pub const FOO: &'static Encoding`
1840 /// unique cross-crate or if Rust starts allowing static arrays
1841 /// to be initialized with `pub static FOO: &'static Encoding`
1842 /// items.
1843 pub static WINDOWS_1253_INIT: Encoding = Encoding {
1844     name: "windows-1253",
    // NOTE(review): the three numbers appear to describe a contiguous run in
    // the table (first code point, offset of the run within the table, run
    // length) -- confirm against the `VariantEncoding::SingleByte` declaration.
1845     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1846 };
1847 
1848 /// The windows-1253 encoding.
1849 ///
1850 /// This is the Greek encoding for Windows. It is mostly an extension of
1851 /// ISO-8859-7, but U+0386 is mapped to a different byte.
1852 ///
1853 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
1854 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
1855 ///
1856 /// This encoding matches the Windows code page 1253, except Windows decodes
1857 /// unassigned code points to the Private Use Area of Unicode.
1858 ///
1859 /// This will change from `static` to `const` if Rust changes
1860 /// to make the referent of `pub const FOO: &'static Encoding`
1861 /// unique cross-crate, so don't take the address of this
1862 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html), which this points to.
1863 pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1864 
1865 /// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
1866 ///
1867 /// For use only for taking the address of this form when
1868 /// Rust prohibits the use of the non-`_INIT` form directly,
1869 /// such as in initializers of other `static`s. If in doubt,
1870 /// use the corresponding non-`_INIT` reference-typed `static`.
1871 ///
1872 /// This part of the public API will go away if Rust changes
1873 /// to make the referent of `pub const FOO: &'static Encoding`
1874 /// unique cross-crate or if Rust starts allowing static arrays
1875 /// to be initialized with `pub static FOO: &'static Encoding`
1876 /// items.
1877 pub static WINDOWS_1254_INIT: Encoding = Encoding {
1878     name: "windows-1254",
    // NOTE(review): the three numbers appear to describe a contiguous run in
    // the table (first code point, offset of the run within the table, run
    // length) -- confirm against the `VariantEncoding::SingleByte` declaration.
1879     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
1880 };
1881 
1882 /// The windows-1254 encoding.
1883 ///
1884 /// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
1885 /// which is known as Latin 5.
1886 ///
1887 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
1888 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
1889 ///
1890 /// This encoding matches the Windows code page 1254.
1891 ///
1892 /// This will change from `static` to `const` if Rust changes
1893 /// to make the referent of `pub const FOO: &'static Encoding`
1894 /// unique cross-crate, so don't take the address of this
1895 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html), which this points to.
1896 pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1897 
1898 /// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
1899 ///
1900 /// For use only for taking the address of this form when
1901 /// Rust prohibits the use of the non-`_INIT` form directly,
1902 /// such as in initializers of other `static`s. If in doubt,
1903 /// use the corresponding non-`_INIT` reference-typed `static`.
1904 ///
1905 /// This part of the public API will go away if Rust changes
1906 /// to make the referent of `pub const FOO: &'static Encoding`
1907 /// unique cross-crate or if Rust starts allowing static arrays
1908 /// to be initialized with `pub static FOO: &'static Encoding`
1909 /// items.
1910 pub static WINDOWS_1255_INIT: Encoding = Encoding {
1911     name: "windows-1255",
    // NOTE(review): the three numbers appear to describe a contiguous run in
    // the table (first code point, offset of the run within the table, run
    // length) -- confirm against the `VariantEncoding::SingleByte` declaration.
1912     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
1913 };
1914 
1915 /// The windows-1255 encoding.
1916 ///
1917 /// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
1918 /// except for a currency sign swap.
1919 ///
1920 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
1921 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
1922 ///
1923 /// This encoding matches the Windows code page 1255, except Windows decodes
1924 /// unassigned code points to the Private Use Area of Unicode.
1925 ///
1926 /// This will change from `static` to `const` if Rust changes
1927 /// to make the referent of `pub const FOO: &'static Encoding`
1928 /// unique cross-crate, so don't take the address of this
1929 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html), which this points to.
1930 pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1931 
1932 /// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
1933 ///
1934 /// For use only for taking the address of this form when
1935 /// Rust prohibits the use of the non-`_INIT` form directly,
1936 /// such as in initializers of other `static`s. If in doubt,
1937 /// use the corresponding non-`_INIT` reference-typed `static`.
1938 ///
1939 /// This part of the public API will go away if Rust changes
1940 /// to make the referent of `pub const FOO: &'static Encoding`
1941 /// unique cross-crate or if Rust starts allowing static arrays
1942 /// to be initialized with `pub static FOO: &'static Encoding`
1943 /// items.
1944 pub static WINDOWS_1256_INIT: Encoding = Encoding {
1945     name: "windows-1256",
    // NOTE(review): the three numbers appear to describe a contiguous run in
    // the table (first code point, offset of the run within the table, run
    // length) -- confirm against the `VariantEncoding::SingleByte` declaration.
1946     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
1947 };
1948 
1949 /// The windows-1256 encoding.
1950 ///
1951 /// This is the Arabic encoding for Windows.
1952 ///
1953 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
1954 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
1955 ///
1956 /// This encoding matches the Windows code page 1256.
1957 ///
1958 /// This will change from `static` to `const` if Rust changes
1959 /// to make the referent of `pub const FOO: &'static Encoding`
1960 /// unique cross-crate, so don't take the address of this
1961 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html), which this points to.
1962 pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1963 
1964 /// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
1965 ///
1966 /// For use only for taking the address of this form when
1967 /// Rust prohibits the use of the non-`_INIT` form directly,
1968 /// such as in initializers of other `static`s. If in doubt,
1969 /// use the corresponding non-`_INIT` reference-typed `static`.
1970 ///
1971 /// This part of the public API will go away if Rust changes
1972 /// to make the referent of `pub const FOO: &'static Encoding`
1973 /// unique cross-crate or if Rust starts allowing static arrays
1974 /// to be initialized with `pub static FOO: &'static Encoding`
1975 /// items.
1976 pub static WINDOWS_1257_INIT: Encoding = Encoding {
1977     name: "windows-1257",
    // NOTE(review): the three numbers appear to describe a contiguous run in
    // the table (first code point, offset of the run within the table, run
    // length) -- confirm against the `VariantEncoding::SingleByte` declaration.
1978     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
1979 };
1980 
1981 /// The windows-1257 encoding.
1982 ///
1983 /// This is the Baltic encoding for Windows.
1984 ///
1985 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
1986 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
1987 ///
1988 /// This encoding matches the Windows code page 1257, except Windows decodes
1989 /// unassigned code points to the Private Use Area of Unicode.
1990 ///
1991 /// This will change from `static` to `const` if Rust changes
1992 /// to make the referent of `pub const FOO: &'static Encoding`
1993 /// unique cross-crate, so don't take the address of this
1994 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html), which this points to.
1995 pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
1996 
1997 /// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
1998 ///
1999 /// For use only for taking the address of this form when
2000 /// Rust prohibits the use of the non-`_INIT` form directly,
2001 /// such as in initializers of other `static`s. If in doubt,
2002 /// use the corresponding non-`_INIT` reference-typed `static`.
2003 ///
2004 /// This part of the public API will go away if Rust changes
2005 /// to make the referent of `pub const FOO: &'static Encoding`
2006 /// unique cross-crate or if Rust starts allowing static arrays
2007 /// to be initialized with `pub static FOO: &'static Encoding`
2008 /// items.
2009 pub static WINDOWS_1258_INIT: Encoding = Encoding {
2010     name: "windows-1258",
    // NOTE(review): the three numbers appear to describe a contiguous run in
    // the table (first code point, offset of the run within the table, run
    // length) -- confirm against the `VariantEncoding::SingleByte` declaration.
2011     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
2012 };
2013 
2014 /// The windows-1258 encoding.
2015 ///
2016 /// This is the Vietnamese encoding for Windows.
2017 ///
2018 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
2019 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
2020 ///
2021 /// This encoding matches the Windows code page 1258 when used in the
2022 /// non-normalizing mode. Unlike with the other single-byte encodings, the
2023 /// result of decoding is not necessarily in Normalization Form C. On the
2024 /// other hand, input in the Normalization Form C is not encoded without
2025 /// replacement. In general, it's a bad idea to encode to encodings other
2026 /// than UTF-8, but this encoding is especially hazardous to encode to.
2027 ///
2028 /// This will change from `static` to `const` if Rust changes
2029 /// to make the referent of `pub const FOO: &'static Encoding`
2030 /// unique cross-crate, so don't take the address of this
2031 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html), which this points to.
2032 pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2033 
2034 /// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
2035 ///
2036 /// For use only for taking the address of this form when
2037 /// Rust prohibits the use of the non-`_INIT` form directly,
2038 /// such as in initializers of other `static`s. If in doubt,
2039 /// use the corresponding non-`_INIT` reference-typed `static`.
2040 ///
2041 /// This part of the public API will go away if Rust changes
2042 /// to make the referent of `pub const FOO: &'static Encoding`
2043 /// unique cross-crate or if Rust starts allowing static arrays
2044 /// to be initialized with `pub static FOO: &'static Encoding`
2045 /// items.
2046 pub static WINDOWS_874_INIT: Encoding = Encoding {
2047     name: "windows-874",
    // NOTE(review): the three numbers appear to describe a contiguous run in
    // the table (first code point, offset of the run within the table, run
    // length) -- confirm against the `VariantEncoding::SingleByte` declaration.
2048     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
2049 };
2050 
2051 /// The windows-874 encoding.
2052 ///
2053 /// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
2054 ///
2055 /// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
2056 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
2057 ///
2058 /// This encoding matches the Windows code page 874, except Windows decodes
2059 /// unassigned code points to the Private Use Area of Unicode.
2060 ///
2061 /// This will change from `static` to `const` if Rust changes
2062 /// to make the referent of `pub const FOO: &'static Encoding`
2063 /// unique cross-crate, so don't take the address of this
2064 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html), which this points to.
2065 pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2066 
2067 /// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
2068 ///
2069 /// For use only for taking the address of this form when
2070 /// Rust prohibits the use of the non-`_INIT` form directly,
2071 /// such as in initializers of other `static`s. If in doubt,
2072 /// use the corresponding non-`_INIT` reference-typed `static`.
2073 ///
2074 /// This part of the public API will go away if Rust changes
2075 /// to make the referent of `pub const FOO: &'static Encoding`
2076 /// unique cross-crate or if Rust starts allowing static arrays
2077 /// to be initialized with `pub static FOO: &'static Encoding`
2078 /// items.
2079 pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
2080     name: "x-mac-cyrillic",
    // NOTE(review): the three numbers appear to describe a contiguous run in
    // the table (first code point, offset of the run within the table, run
    // length) -- confirm against the `VariantEncoding::SingleByte` declaration.
2081     variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
2082 };
2083 
2084 /// The x-mac-cyrillic encoding.
2085 ///
2086 /// This is the MacUkrainian encoding from Mac OS Classic.
2087 ///
2088 /// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
2089 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
2090 ///
2091 /// This encoding matches the Windows code page 10017.
2092 ///
2093 /// This will change from `static` to `const` if Rust changes
2094 /// to make the referent of `pub const FOO: &'static Encoding`
2095 /// unique cross-crate, so don't take the address of this
2096 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html), which this points to.
2097 pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2098 
2099 /// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
2100 ///
2101 /// For use only for taking the address of this form when
2102 /// Rust prohibits the use of the non-`_INIT` form directly,
2103 /// such as in initializers of other `static`s. If in doubt,
2104 /// use the corresponding non-`_INIT` reference-typed `static`.
2105 ///
2106 /// This part of the public API will go away if Rust changes
2107 /// to make the referent of `pub const FOO: &'static Encoding`
2108 /// unique cross-crate or if Rust starts allowing static arrays
2109 /// to be initialized with `pub static FOO: &'static Encoding`
2110 /// items.
2111 pub static X_USER_DEFINED_INIT: Encoding = Encoding {
2112     name: "x-user-defined",
    // Unlike the `SingleByte` initializers, this variant carries no table
    // reference and no numeric arguments.
2113     variant: VariantEncoding::UserDefined,
2114 };
2115 
2116 /// The x-user-defined encoding.
2117 ///
2118 /// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
2119 /// them to the Private Use Area of Unicode. It was used for loading binary
2120 /// data into a JavaScript string using `XMLHttpRequest` before XHR supported
2121 /// the `"arraybuffer"` response type.
2122 ///
2123 /// This encoding does not have a Windows code page number.
2124 ///
2125 /// This will change from `static` to `const` if Rust changes
2126 /// to make the referent of `pub const FOO: &'static Encoding`
2127 /// unique cross-crate, so don't take the address of this
2128 /// `static`.
///
/// Where Rust does not accept this reference-typed `static` (such as in the
/// initializer of another `static`), use
/// [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html), which this points to.
2129 pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2130 
/// The encoding labels, all in ASCII lowercase.
///
/// The entries are ordered first by length and then by comparing their bytes
/// starting from the last byte and moving backwards (for example, `"cp1251"`
/// precedes `"latin1"`: both end in `1`, and `5` sorts before `n`).
/// NOTE(review): the lookup code that relies on this order is not next to
/// this table -- confirm how it searches before reordering or adding entries
/// by hand. This table is inside the generated-code section of the file.
///
/// Entry `i` is presumably paired with entry `i` of
/// `ENCODINGS_IN_LABEL_SORT`, which has the same length.
2131 static LABELS_SORTED: [&'static str; 219] = [
2132     "l1",
2133     "l2",
2134     "l3",
2135     "l4",
2136     "l5",
2137     "l6",
2138     "l9",
2139     "866",
2140     "mac",
2141     "koi",
2142     "gbk",
2143     "big5",
2144     "utf8",
2145     "koi8",
2146     "sjis",
2147     "ms932",
2148     "cp866",
2149     "utf-8",
2150     "cp819",
2151     "ascii",
2152     "x-gbk",
2153     "greek",
2154     "cp1250",
2155     "cp1251",
2156     "latin1",
2157     "gb2312",
2158     "cp1252",
2159     "latin2",
2160     "cp1253",
2161     "latin3",
2162     "cp1254",
2163     "latin4",
2164     "cp1255",
2165     "csbig5",
2166     "latin5",
2167     "utf-16",
2168     "cp1256",
2169     "ibm866",
2170     "latin6",
2171     "cp1257",
2172     "cp1258",
2173     "greek8",
2174     "ibm819",
2175     "arabic",
2176     "visual",
2177     "korean",
2178     "euc-jp",
2179     "koi8-r",
2180     "koi8_r",
2181     "euc-kr",
2182     "x-sjis",
2183     "koi8-u",
2184     "hebrew",
2185     "tis-620",
2186     "gb18030",
2187     "ksc5601",
2188     "gb_2312",
2189     "dos-874",
2190     "cn-big5",
2191     "chinese",
2192     "logical",
2193     "cskoi8r",
2194     "cseuckr",
2195     "koi8-ru",
2196     "x-cp1250",
2197     "ksc_5601",
2198     "x-cp1251",
2199     "iso88591",
2200     "csgb2312",
2201     "x-cp1252",
2202     "iso88592",
2203     "x-cp1253",
2204     "iso88593",
2205     "ecma-114",
2206     "x-cp1254",
2207     "iso88594",
2208     "x-cp1255",
2209     "iso88595",
2210     "x-x-big5",
2211     "x-cp1256",
2212     "csibm866",
2213     "iso88596",
2214     "x-cp1257",
2215     "iso88597",
2216     "asmo-708",
2217     "ecma-118",
2218     "elot_928",
2219     "x-cp1258",
2220     "iso88598",
2221     "iso88599",
2222     "cyrillic",
2223     "utf-16be",
2224     "utf-16le",
2225     "us-ascii",
2226     "ms_kanji",
2227     "x-euc-jp",
2228     "iso885910",
2229     "iso8859-1",
2230     "iso885911",
2231     "iso8859-2",
2232     "iso8859-3",
2233     "iso885913",
2234     "iso8859-4",
2235     "iso885914",
2236     "iso8859-5",
2237     "iso885915",
2238     "iso8859-6",
2239     "iso8859-7",
2240     "iso8859-8",
2241     "iso-ir-58",
2242     "iso8859-9",
2243     "macintosh",
2244     "shift-jis",
2245     "shift_jis",
2246     "iso-ir-100",
2247     "iso8859-10",
2248     "iso-ir-110",
2249     "gb_2312-80",
2250     "iso-8859-1",
2251     "iso_8859-1",
2252     "iso-ir-101",
2253     "iso8859-11",
2254     "iso-8859-2",
2255     "iso_8859-2",
2256     "hz-gb-2312",
2257     "iso-8859-3",
2258     "iso_8859-3",
2259     "iso8859-13",
2260     "iso-8859-4",
2261     "iso_8859-4",
2262     "iso8859-14",
2263     "iso-ir-144",
2264     "iso-8859-5",
2265     "iso_8859-5",
2266     "iso8859-15",
2267     "iso-8859-6",
2268     "iso_8859-6",
2269     "iso-ir-126",
2270     "iso-8859-7",
2271     "iso_8859-7",
2272     "iso-ir-127",
2273     "iso-ir-157",
2274     "iso-8859-8",
2275     "iso_8859-8",
2276     "iso-ir-138",
2277     "iso-ir-148",
2278     "iso-8859-9",
2279     "iso_8859-9",
2280     "iso-ir-109",
2281     "iso-ir-149",
2282     "big5-hkscs",
2283     "csshiftjis",
2284     "iso-8859-10",
2285     "iso-8859-11",
2286     "csisolatin1",
2287     "csisolatin2",
2288     "iso-8859-13",
2289     "csisolatin3",
2290     "iso-8859-14",
2291     "windows-874",
2292     "csisolatin4",
2293     "iso-8859-15",
2294     "iso_8859-15",
2295     "csisolatin5",
2296     "iso-8859-16",
2297     "csisolatin6",
2298     "windows-949",
2299     "csisolatin9",
2300     "csiso88596e",
2301     "csiso88598e",
2302     "csmacintosh",
2303     "csiso88596i",
2304     "csiso88598i",
2305     "windows-31j",
2306     "x-mac-roman",
2307     "iso-2022-cn",
2308     "iso-2022-jp",
2309     "csiso2022jp",
2310     "iso-2022-kr",
2311     "csiso2022kr",
2312     "replacement",
2313     "windows-1250",
2314     "windows-1251",
2315     "windows-1252",
2316     "windows-1253",
2317     "windows-1254",
2318     "windows-1255",
2319     "windows-1256",
2320     "windows-1257",
2321     "windows-1258",
2322     "iso-8859-6-e",
2323     "iso-8859-8-e",
2324     "iso-8859-6-i",
2325     "iso-8859-8-i",
2326     "sun_eu_greek",
2327     "csksc56011987",
2328     "ks_c_5601-1987",
2329     "ansi_x3.4-1968",
2330     "ks_c_5601-1989",
2331     "x-mac-cyrillic",
2332     "x-user-defined",
2333     "csiso58gb231280",
2334     "iso_8859-1:1987",
2335     "iso_8859-2:1987",
2336     "iso_8859-6:1987",
2337     "iso_8859-7:1987",
2338     "iso_8859-3:1988",
2339     "iso_8859-4:1988",
2340     "iso_8859-5:1988",
2341     "iso_8859-8:1988",
2342     "iso_8859-9:1989",
2343     "csisolatingreek",
2344     "x-mac-ukrainian",
2345     "iso-2022-cn-ext",
2346     "csisolatinarabic",
2347     "csisolatinhebrew",
2348     "unicode-1-1-utf-8",
2349     "csisolatincyrillic",
2350     "cseucpkdfmtjapanese",
2351 ];
2352 
/// The encodings that the labels in `LABELS_SORTED` stand for.
///
/// This array has the same length as `LABELS_SORTED`; entry `i` here is
/// presumably the encoding for label `i` there (NOTE(review): the code that
/// pairs the two arrays is not next to these tables -- confirm before
/// editing either one by hand). Several labels share an encoding, so the same
/// `_INIT` item appears more than once.
///
/// The `_INIT` forms are referenced because, as their documentation states,
/// the reference-typed `static`s cannot be used in the initializer of
/// another `static`.
2353 static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
2354     &WINDOWS_1252_INIT,
2355     &ISO_8859_2_INIT,
2356     &ISO_8859_3_INIT,
2357     &ISO_8859_4_INIT,
2358     &WINDOWS_1254_INIT,
2359     &ISO_8859_10_INIT,
2360     &ISO_8859_15_INIT,
2361     &IBM866_INIT,
2362     &MACINTOSH_INIT,
2363     &KOI8_R_INIT,
2364     &GBK_INIT,
2365     &BIG5_INIT,
2366     &UTF_8_INIT,
2367     &KOI8_R_INIT,
2368     &SHIFT_JIS_INIT,
2369     &SHIFT_JIS_INIT,
2370     &IBM866_INIT,
2371     &UTF_8_INIT,
2372     &WINDOWS_1252_INIT,
2373     &WINDOWS_1252_INIT,
2374     &GBK_INIT,
2375     &ISO_8859_7_INIT,
2376     &WINDOWS_1250_INIT,
2377     &WINDOWS_1251_INIT,
2378     &WINDOWS_1252_INIT,
2379     &GBK_INIT,
2380     &WINDOWS_1252_INIT,
2381     &ISO_8859_2_INIT,
2382     &WINDOWS_1253_INIT,
2383     &ISO_8859_3_INIT,
2384     &WINDOWS_1254_INIT,
2385     &ISO_8859_4_INIT,
2386     &WINDOWS_1255_INIT,
2387     &BIG5_INIT,
2388     &WINDOWS_1254_INIT,
2389     &UTF_16LE_INIT,
2390     &WINDOWS_1256_INIT,
2391     &IBM866_INIT,
2392     &ISO_8859_10_INIT,
2393     &WINDOWS_1257_INIT,
2394     &WINDOWS_1258_INIT,
2395     &ISO_8859_7_INIT,
2396     &WINDOWS_1252_INIT,
2397     &ISO_8859_6_INIT,
2398     &ISO_8859_8_INIT,
2399     &EUC_KR_INIT,
2400     &EUC_JP_INIT,
2401     &KOI8_R_INIT,
2402     &KOI8_R_INIT,
2403     &EUC_KR_INIT,
2404     &SHIFT_JIS_INIT,
2405     &KOI8_U_INIT,
2406     &ISO_8859_8_INIT,
2407     &WINDOWS_874_INIT,
2408     &GB18030_INIT,
2409     &EUC_KR_INIT,
2410     &GBK_INIT,
2411     &WINDOWS_874_INIT,
2412     &BIG5_INIT,
2413     &GBK_INIT,
2414     &ISO_8859_8_I_INIT,
2415     &KOI8_R_INIT,
2416     &EUC_KR_INIT,
2417     &KOI8_U_INIT,
2418     &WINDOWS_1250_INIT,
2419     &EUC_KR_INIT,
2420     &WINDOWS_1251_INIT,
2421     &WINDOWS_1252_INIT,
2422     &GBK_INIT,
2423     &WINDOWS_1252_INIT,
2424     &ISO_8859_2_INIT,
2425     &WINDOWS_1253_INIT,
2426     &ISO_8859_3_INIT,
2427     &ISO_8859_6_INIT,
2428     &WINDOWS_1254_INIT,
2429     &ISO_8859_4_INIT,
2430     &WINDOWS_1255_INIT,
2431     &ISO_8859_5_INIT,
2432     &BIG5_INIT,
2433     &WINDOWS_1256_INIT,
2434     &IBM866_INIT,
2435     &ISO_8859_6_INIT,
2436     &WINDOWS_1257_INIT,
2437     &ISO_8859_7_INIT,
2438     &ISO_8859_6_INIT,
2439     &ISO_8859_7_INIT,
2440     &ISO_8859_7_INIT,
2441     &WINDOWS_1258_INIT,
2442     &ISO_8859_8_INIT,
2443     &WINDOWS_1254_INIT,
2444     &ISO_8859_5_INIT,
2445     &UTF_16BE_INIT,
2446     &UTF_16LE_INIT,
2447     &WINDOWS_1252_INIT,
2448     &SHIFT_JIS_INIT,
2449     &EUC_JP_INIT,
2450     &ISO_8859_10_INIT,
2451     &WINDOWS_1252_INIT,
2452     &WINDOWS_874_INIT,
2453     &ISO_8859_2_INIT,
2454     &ISO_8859_3_INIT,
2455     &ISO_8859_13_INIT,
2456     &ISO_8859_4_INIT,
2457     &ISO_8859_14_INIT,
2458     &ISO_8859_5_INIT,
2459     &ISO_8859_15_INIT,
2460     &ISO_8859_6_INIT,
2461     &ISO_8859_7_INIT,
2462     &ISO_8859_8_INIT,
2463     &GBK_INIT,
2464     &WINDOWS_1254_INIT,
2465     &MACINTOSH_INIT,
2466     &SHIFT_JIS_INIT,
2467     &SHIFT_JIS_INIT,
2468     &WINDOWS_1252_INIT,
2469     &ISO_8859_10_INIT,
2470     &ISO_8859_4_INIT,
2471     &GBK_INIT,
2472     &WINDOWS_1252_INIT,
2473     &WINDOWS_1252_INIT,
2474     &ISO_8859_2_INIT,
2475     &WINDOWS_874_INIT,
2476     &ISO_8859_2_INIT,
2477     &ISO_8859_2_INIT,
2478     &REPLACEMENT_INIT,
2479     &ISO_8859_3_INIT,
2480     &ISO_8859_3_INIT,
2481     &ISO_8859_13_INIT,
2482     &ISO_8859_4_INIT,
2483     &ISO_8859_4_INIT,
2484     &ISO_8859_14_INIT,
2485     &ISO_8859_5_INIT,
2486     &ISO_8859_5_INIT,
2487     &ISO_8859_5_INIT,
2488     &ISO_8859_15_INIT,
2489     &ISO_8859_6_INIT,
2490     &ISO_8859_6_INIT,
2491     &ISO_8859_7_INIT,
2492     &ISO_8859_7_INIT,
2493     &ISO_8859_7_INIT,
2494     &ISO_8859_6_INIT,
2495     &ISO_8859_10_INIT,
2496     &ISO_8859_8_INIT,
2497     &ISO_8859_8_INIT,
2498     &ISO_8859_8_INIT,
2499     &WINDOWS_1254_INIT,
2500     &WINDOWS_1254_INIT,
2501     &WINDOWS_1254_INIT,
2502     &ISO_8859_3_INIT,
2503     &EUC_KR_INIT,
2504     &BIG5_INIT,
2505     &SHIFT_JIS_INIT,
2506     &ISO_8859_10_INIT,
2507     &WINDOWS_874_INIT,
2508     &WINDOWS_1252_INIT,
2509     &ISO_8859_2_INIT,
2510     &ISO_8859_13_INIT,
2511     &ISO_8859_3_INIT,
2512     &ISO_8859_14_INIT,
2513     &WINDOWS_874_INIT,
2514     &ISO_8859_4_INIT,
2515     &ISO_8859_15_INIT,
2516     &ISO_8859_15_INIT,
2517     &WINDOWS_1254_INIT,
2518     &ISO_8859_16_INIT,
2519     &ISO_8859_10_INIT,
2520     &EUC_KR_INIT,
2521     &ISO_8859_15_INIT,
2522     &ISO_8859_6_INIT,
2523     &ISO_8859_8_INIT,
2524     &MACINTOSH_INIT,
2525     &ISO_8859_6_INIT,
2526     &ISO_8859_8_I_INIT,
2527     &SHIFT_JIS_INIT,
2528     &MACINTOSH_INIT,
2529     &REPLACEMENT_INIT,
2530     &ISO_2022_JP_INIT,
2531     &ISO_2022_JP_INIT,
2532     &REPLACEMENT_INIT,
2533     &REPLACEMENT_INIT,
2534     &REPLACEMENT_INIT,
2535     &WINDOWS_1250_INIT,
2536     &WINDOWS_1251_INIT,
2537     &WINDOWS_1252_INIT,
2538     &WINDOWS_1253_INIT,
2539     &WINDOWS_1254_INIT,
2540     &WINDOWS_1255_INIT,
2541     &WINDOWS_1256_INIT,
2542     &WINDOWS_1257_INIT,
2543     &WINDOWS_1258_INIT,
2544     &ISO_8859_6_INIT,
2545     &ISO_8859_8_INIT,
2546     &ISO_8859_6_INIT,
2547     &ISO_8859_8_I_INIT,
2548     &ISO_8859_7_INIT,
2549     &EUC_KR_INIT,
2550     &EUC_KR_INIT,
2551     &WINDOWS_1252_INIT,
2552     &EUC_KR_INIT,
2553     &X_MAC_CYRILLIC_INIT,
2554     &X_USER_DEFINED_INIT,
2555     &GBK_INIT,
2556     &WINDOWS_1252_INIT,
2557     &ISO_8859_2_INIT,
2558     &ISO_8859_6_INIT,
2559     &ISO_8859_7_INIT,
2560     &ISO_8859_3_INIT,
2561     &ISO_8859_4_INIT,
2562     &ISO_8859_5_INIT,
2563     &ISO_8859_8_INIT,
2564     &WINDOWS_1254_INIT,
2565     &ISO_8859_7_INIT,
2566     &X_MAC_CYRILLIC_INIT,
2567     &REPLACEMENT_INIT,
2568     &ISO_8859_6_INIT,
2569     &ISO_8859_8_INIT,
2570     &UTF_8_INIT,
2571     &ISO_8859_5_INIT,
2572     &EUC_JP_INIT,
2573 ];
2574 
2575 // END GENERATED CODE
2576 
2577 /// An encoding as defined in the [Encoding Standard][1].
2578 ///
2579 /// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
2580 /// and, in most cases, vice versa. Each encoding has a name, an output
2581 /// encoding, and one or more labels.
2582 ///
2583 /// _Labels_ are ASCII-case-insensitive strings that are used to identify an
2584 /// encoding in formats and protocols. The _name_ of the encoding is the
2585 /// preferred label in the case appropriate for returning from the
2586 /// [`characterSet`][2] property of the `Document` DOM interface.
2587 ///
2588 /// The _output encoding_ is the encoding used for form submission and URL
2589 /// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
2590 /// UTF-16LE and UTF-16BE encodings and the encoding itself for other
2591 /// encodings.
2592 ///
2593 /// [1]: https://encoding.spec.whatwg.org/
2594 /// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
2595 ///
2596 /// # Streaming vs. Non-Streaming
2597 ///
2598 /// When you have the entire input in a single buffer, you can use the
2599 /// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
2600 /// [`decode_without_bom_handling()`][5],
2601 /// [`decode_without_bom_handling_and_without_replacement()`][6] and
2602 /// [`encode()`][7]. (These methods are available to Rust callers only and are
2603 /// not available in the C API.) Unlike the rest of the API available to Rust,
2604 /// these methods perform heap allocations. You should use the `Decoder` and
2605 /// `Encoder` objects when your input is split into multiple buffers or when
2606 /// you want to control the allocation of the output buffers.
2607 ///
2608 /// [3]: #method.decode
2609 /// [4]: #method.decode_with_bom_removal
2610 /// [5]: #method.decode_without_bom_handling
2611 /// [6]: #method.decode_without_bom_handling_and_without_replacement
2612 /// [7]: #method.encode
2613 ///
2614 /// # Instances
2615 ///
2616 /// All instances of `Encoding` are statically allocated and have the `'static`
2617 /// lifetime. There is precisely one unique `Encoding` instance for each
2618 /// encoding defined in the Encoding Standard.
2619 ///
2620 /// To obtain a reference to a particular encoding whose identity you know at
2621 /// compile time, use a `static` that refers to the encoding. There is a `static`
2622 /// for each encoding. The `static`s are named in all caps with hyphens
2623 /// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
2624 /// name). For example, if you know at compile time that you will want to
2625 /// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
2626 /// in C/C++).
2627 ///
2628 /// Additionally, there are non-reference-typed forms ending with `_INIT` to
2629 /// work around the problem that `static`s of the type `&'static Encoding`
2630 /// cannot be used to initialize items of an array whose type is
2631 /// `[&'static Encoding; N]`.
2632 ///
2633 /// If you don't know what encoding you need at compile time and need to
2634 /// dynamically get an encoding by label, use
2635 /// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
2636 ///
2637 /// Instances of `Encoding` can be compared with `==` (in both Rust and in
2638 /// C/C++).
2639 pub struct Encoding {
    // The name of the encoding, e.g. "windows-1252".
2640     name: &'static str,
    // Which `VariantEncoding` case (and its associated data, if any) backs
    // this encoding.
2641     variant: VariantEncoding,
2642 }
2643 
2644 impl Encoding {
2645     /// Implements the
2646     /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2647     /// algorithm.
2648     ///
2649     /// If, after ASCII-lowercasing and removing leading and trailing
2650     /// whitespace, the argument matches a label defined in the Encoding
2651     /// Standard, `Some(&'static Encoding)` representing the corresponding
2652     /// encoding is returned. If there is no match, `None` is returned.
2653     ///
2654     /// This is the right method to use if the action upon the method returning
2655     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2656     /// When the action upon the method returning `None` is not to proceed with
2657     /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2658     /// appropriate.
2659     ///
2660     /// The argument is of type `&[u8]` instead of `&str` to save callers
2661     /// that are extracting the label from a non-UTF-8 protocol the trouble
2662     /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2663     /// on it.)
2664     ///
2665     /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2666     pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2667         let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2668         let mut trimmed_pos = 0usize;
2669         let mut iter = label.into_iter();
2670         // before
2671         loop {
2672             match iter.next() {
2673                 None => {
2674                     return None;
2675                 }
2676                 Some(byte) => {
2677                     // The characters used in labels are:
2678                     // a-z (except q, but excluding it below seems excessive)
2679                     // 0-9
2680                     // . _ - :
2681                     match *byte {
2682                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2683                             continue;
2684                         }
2685                         b'A'...b'Z' => {
2686                             trimmed[trimmed_pos] = *byte + 0x20u8;
2687                             trimmed_pos = 1usize;
2688                             break;
2689                         }
2690                         b'a'...b'z' | b'0'...b'9' | b'-' | b'_' | b':' | b'.' => {
2691                             trimmed[trimmed_pos] = *byte;
2692                             trimmed_pos = 1usize;
2693                             break;
2694                         }
2695                         _ => {
2696                             return None;
2697                         }
2698                     }
2699                 }
2700             }
2701         }
2702         // inside
2703         loop {
2704             match iter.next() {
2705                 None => {
2706                     break;
2707                 }
2708                 Some(byte) => {
2709                     match *byte {
2710                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2711                             break;
2712                         }
2713                         b'A'...b'Z' => {
2714                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2715                                 // There's no encoding with a label this long
2716                                 return None;
2717                             }
2718                             trimmed[trimmed_pos] = *byte + 0x20u8;
2719                             trimmed_pos += 1usize;
2720                             continue;
2721                         }
2722                         b'a'...b'z' | b'0'...b'9' | b'-' | b'_' | b':' | b'.' => {
2723                             if trimmed_pos == LONGEST_LABEL_LENGTH {
2724                                 // There's no encoding with a label this long
2725                                 return None;
2726                             }
2727                             trimmed[trimmed_pos] = *byte;
2728                             trimmed_pos += 1usize;
2729                             continue;
2730                         }
2731                         _ => {
2732                             return None;
2733                         }
2734                     }
2735                 }
2736             }
2737         }
2738         // after
2739         loop {
2740             match iter.next() {
2741                 None => {
2742                     break;
2743                 }
2744                 Some(byte) => {
2745                     match *byte {
2746                         0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2747                             continue;
2748                         }
2749                         _ => {
2750                             // There's no label with space in the middle
2751                             return None;
2752                         }
2753                     }
2754                 }
2755             }
2756         }
2757         let candidate = &trimmed[..trimmed_pos];
2758         match LABELS_SORTED.binary_search_by(|probe| {
2759             let bytes = probe.as_bytes();
2760             let c = bytes.len().cmp(&candidate.len());
2761             if c != Ordering::Equal {
2762                 return c;
2763             }
2764             let probe_iter = bytes.iter().rev();
2765             let candidate_iter = candidate.iter().rev();
2766             probe_iter.cmp(candidate_iter)
2767         }) {
2768             Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2769             Err(_) => None,
2770         }
2771     }
2772 
2773     /// This method behaves the same as `for_label()`, except when `for_label()`
2774     /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2775     ///
2776     /// This method is useful in scenarios where a fatal error is required
2777     /// upon invalid label, because in those cases the caller typically wishes
2778     /// to treat the labels that map to the replacement encoding as fatal
2779     /// errors, too.
2780     ///
2781     /// It is not OK to use this method when the action upon the method returning
2782     /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2783     /// case, the `for_label()` method should be used instead in order to avoid
2784     /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2785     ///
2786     /// Available via the C wrapper.
2787     #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2788     pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2789         match Encoding::for_label(label) {
2790             None => None,
2791             Some(encoding) => {
2792                 if encoding == REPLACEMENT {
2793                     None
2794                 } else {
2795                     Some(encoding)
2796                 }
2797             }
2798         }
2799     }
2800 
2801     /// Performs non-incremental BOM sniffing.
2802     ///
2803     /// The argument must either be a buffer representing the entire input
2804     /// stream (non-streaming case) or a buffer representing at least the first
2805     /// three bytes of the input stream (streaming case).
2806     ///
2807     /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2808     /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2809     /// or UTF-16BE BOM or `None` otherwise.
2810     ///
2811     /// Available via the C wrapper.
2812     #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2813     pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2814         if buffer.starts_with(b"\xEF\xBB\xBF") {
2815             Some((UTF_8, 3))
2816         } else if buffer.starts_with(b"\xFF\xFE") {
2817             Some((UTF_16LE, 2))
2818         } else if buffer.starts_with(b"\xFE\xFF") {
2819             Some((UTF_16BE, 2))
2820         } else {
2821             None
2822         }
2823     }
2824 
2825     /// Returns the name of this encoding.
2826     ///
2827     /// This name is appropriate to return as-is from the DOM
2828     /// `document.characterSet` property.
2829     ///
2830     /// Available via the C wrapper.
2831     #[inline]
name(&'static self) -> &'static str2832     pub fn name(&'static self) -> &'static str {
2833         self.name
2834     }
2835 
2836     /// Checks whether the _output encoding_ of this encoding can encode every
2837     /// `char`. (Only true if the output encoding is UTF-8.)
2838     ///
2839     /// Available via the C wrapper.
2840     #[inline]
can_encode_everything(&'static self) -> bool2841     pub fn can_encode_everything(&'static self) -> bool {
2842         self.output_encoding() == UTF_8
2843     }
2844 
2845     /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2846     /// U+0000...U+007F and vice versa.
2847     ///
2848     /// Available via the C wrapper.
2849     #[inline]
is_ascii_compatible(&'static self) -> bool2850     pub fn is_ascii_compatible(&'static self) -> bool {
2851         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2852     }
2853 
2854     /// Checks whether this encoding maps one byte to one Basic Multilingual
2855     /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2856     /// vice versa (for mappable characters).
2857     ///
2858     /// `true` iff this encoding is on the list of [Legacy single-byte
2859     /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2860     /// in the spec or x-user-defined.
2861     ///
2862     /// Available via the C wrapper.
2863     #[inline]
is_single_byte(&'static self) -> bool2864     pub fn is_single_byte(&'static self) -> bool {
2865         self.variant.is_single_byte()
2866     }
2867 
2868     /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2869     /// U+0000...U+007F and vice versa.
2870     #[inline]
is_potentially_borrowable(&'static self) -> bool2871     fn is_potentially_borrowable(&'static self) -> bool {
2872         !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2873     }
2874 
2875     /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2876     /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2877     ///
2878     /// Available via the C wrapper.
2879     #[inline]
output_encoding(&'static self) -> &'static Encoding2880     pub fn output_encoding(&'static self) -> &'static Encoding {
2881         if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2882             UTF_8
2883         } else {
2884             self
2885         }
2886     }
2887 
2888     /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2889     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2890     /// entire input is available as a single buffer (i.e. the end of the
2891     /// buffer marks the end of the stream).
2892     ///
2893     /// This method implements the (non-streaming version of) the
2894     /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2895     ///
2896     /// The second item in the returned tuple is the encoding that was actually
2897     /// used (which may differ from this encoding thanks to BOM sniffing).
2898     ///
2899     /// The third item in the returned tuple indicates whether there were
2900     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2901     ///
2902     /// _Note:_ It is wrong to use this when the input buffer represents only
2903     /// a segment of the input instead of the whole input. Use `new_decoder()`
2904     /// when decoding segmented input.
2905     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if no
    /// errors and potentially another one in the presence of errors.) The
2909     /// first allocation assumes jemalloc and may not be optimal with
2910     /// allocators that do not use power-of-two buckets. A borrow is performed
2911     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2912     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2913     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2914     /// transitions.
2915     ///
2916     /// # Panics
2917     ///
2918     /// If the size calculation for a heap-allocated backing buffer overflows
2919     /// `usize`.
2920     ///
2921     /// Available to Rust only.
2922     #[inline]
decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool)2923     pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2924         let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2925             Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2926             None => (self, bytes),
2927         };
2928         let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2929         (cow, encoding, had_errors)
2930     }
2931 
2932     /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2933     /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2934     /// entire input is available as a single buffer (i.e. the end of the
2935     /// buffer marks the end of the stream).
2936     ///
2937     /// When invoked on `UTF_8`, this method implements the (non-streaming
2938     /// version of) the
2939     /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2940     /// concept.
2941     ///
2942     /// The second item in the returned pair indicates whether there were
2943     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2944     ///
2945     /// _Note:_ It is wrong to use this when the input buffer represents only
2946     /// a segment of the input instead of the whole input. Use
2947     /// `new_decoder_with_bom_removal()` when decoding segmented input.
2948     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if no
    /// errors and potentially another one in the presence of errors.) The
2952     /// first allocation assumes jemalloc and may not be optimal with
2953     /// allocators that do not use power-of-two buckets. A borrow is performed
2954     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2955     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2956     /// ISO-2022-JP and the input is entirely in the ASCII state without state
2957     /// transitions.
2958     ///
2959     /// # Panics
2960     ///
2961     /// If the size calculation for a heap-allocated backing buffer overflows
2962     /// `usize`.
2963     ///
2964     /// Available to Rust only.
2965     #[inline]
decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)2966     pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2967         let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2968             &bytes[3..]
2969         } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2970             || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2971         {
2972             &bytes[2..]
2973         } else {
2974             bytes
2975         };
2976         self.decode_without_bom_handling(without_bom)
2977     }
2978 
2979     /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
2980     /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
2981     /// the entire input is available as a single buffer (i.e. the end of the
2982     /// buffer marks the end of the stream).
2983     ///
2984     /// When invoked on `UTF_8`, this method implements the (non-streaming
2985     /// version of) the
2986     /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
2987     /// spec concept.
2988     ///
2989     /// The second item in the returned pair indicates whether there were
2990     /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2991     ///
2992     /// _Note:_ It is wrong to use this when the input buffer represents only
2993     /// a segment of the input instead of the whole input. Use
2994     /// `new_decoder_without_bom_handling()` when decoding segmented input.
2995     ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if no
    /// errors and potentially another one in the presence of errors.) The
2999     /// first allocation assumes jemalloc and may not be optimal with
3000     /// allocators that do not use power-of-two buckets. A borrow is performed
3001     /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3002     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3003     /// ISO-2022-JP and the input is entirely in the ASCII state without state
3004     /// transitions.
3005     ///
3006     /// # Panics
3007     ///
3008     /// If the size calculation for a heap-allocated backing buffer overflows
3009     /// `usize`.
3010     ///
3011     /// Available to Rust only.
decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)3012     pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3013         let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3014             let valid_up_to = if self == UTF_8 {
3015                 utf8_valid_up_to(bytes)
3016             } else if self == ISO_2022_JP {
3017                 iso_2022_jp_ascii_valid_up_to(bytes)
3018             } else {
3019                 ascii_valid_up_to(bytes)
3020             };
3021             if valid_up_to == bytes.len() {
3022                 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3023                 return (Cow::Borrowed(str), false);
3024             }
3025             let decoder = self.new_decoder_without_bom_handling();
3026 
3027             let rounded_without_replacement = checked_next_power_of_two(checked_add(
3028                 valid_up_to,
3029                 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3030             ));
3031             let with_replacement = checked_add(
3032                 valid_up_to,
3033                 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3034             );
3035             let mut string = String::with_capacity(
3036                 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3037             );
3038             unsafe {
3039                 let vec = string.as_mut_vec();
3040                 vec.set_len(valid_up_to);
3041                 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3042             }
3043             (decoder, string, valid_up_to)
3044         } else {
3045             let decoder = self.new_decoder_without_bom_handling();
3046             let rounded_without_replacement = checked_next_power_of_two(
3047                 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3048             );
3049             let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3050             let string = String::with_capacity(
3051                 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3052             );
3053             (decoder, string, 0)
3054         };
3055 
3056         let mut total_had_errors = false;
3057         loop {
3058             let (result, read, had_errors) =
3059                 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3060             total_read += read;
3061             total_had_errors |= had_errors;
3062             match result {
3063                 CoderResult::InputEmpty => {
3064                     debug_assert_eq!(total_read, bytes.len());
3065                     return (Cow::Owned(string), total_had_errors);
3066                 }
3067                 CoderResult::OutputFull => {
3068                     // Allocate for the worst case. That is, we should come
3069                     // here at most once per invocation of this method.
3070                     let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3071                     string.reserve(needed.unwrap());
3072                 }
3073             }
3074         }
3075     }
3076 
3077     /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3078     /// _with malformed sequences treated as fatal_ when the entire input is
3079     /// available as a single buffer (i.e. the end of the buffer marks the end
3080     /// of the stream).
3081     ///
3082     /// When invoked on `UTF_8`, this method implements the (non-streaming
3083     /// version of) the
3084     /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3085     /// spec concept.
3086     ///
    /// Returns `None` if a malformed sequence was encountered and the result
    /// of the decode as `Some(Cow<'a, str>)` otherwise.
3089     ///
3090     /// _Note:_ It is wrong to use this when the input buffer represents only
3091     /// a segment of the input instead of the whole input. Use
3092     /// `new_decoder_without_bom_handling()` when decoding segmented input.
3093     ///
3094     /// This method performs a single heap allocation for the backing
3095     /// buffer of the `String` when unable to borrow. A borrow is performed if
3096     /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3097     /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3098     /// ISO-2022-JP and the input is entirely in the ASCII state without state
3099     /// transitions.
3100     ///
3101     /// # Panics
3102     ///
3103     /// If the size calculation for a heap-allocated backing buffer overflows
3104     /// `usize`.
3105     ///
3106     /// Available to Rust only.
decode_without_bom_handling_and_without_replacement<'a>( &'static self, bytes: &'a [u8], ) -> Option<Cow<'a, str>>3107     pub fn decode_without_bom_handling_and_without_replacement<'a>(
3108         &'static self,
3109         bytes: &'a [u8],
3110     ) -> Option<Cow<'a, str>> {
3111         if self == UTF_8 {
3112             let valid_up_to = utf8_valid_up_to(bytes);
3113             if valid_up_to == bytes.len() {
3114                 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3115                 return Some(Cow::Borrowed(str));
3116             }
3117             return None;
3118         }
3119         let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3120             let valid_up_to = if self == ISO_2022_JP {
3121                 iso_2022_jp_ascii_valid_up_to(bytes)
3122             } else {
3123                 ascii_valid_up_to(bytes)
3124             };
3125             if valid_up_to == bytes.len() {
3126                 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3127                 return Some(Cow::Borrowed(str));
3128             }
3129             let decoder = self.new_decoder_without_bom_handling();
3130             let mut string = String::with_capacity(
3131                 checked_add(
3132                     valid_up_to,
3133                     decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3134                 )
3135                 .unwrap(),
3136             );
3137             unsafe {
3138                 let vec = string.as_mut_vec();
3139                 vec.set_len(valid_up_to);
3140                 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3141             }
3142             (decoder, string, &bytes[valid_up_to..])
3143         } else {
3144             let decoder = self.new_decoder_without_bom_handling();
3145             let string = String::with_capacity(
3146                 decoder
3147                     .max_utf8_buffer_length_without_replacement(bytes.len())
3148                     .unwrap(),
3149             );
3150             (decoder, string, bytes)
3151         };
3152         let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3153         match result {
3154             DecoderResult::InputEmpty => {
3155                 debug_assert_eq!(read, input.len());
3156                 Some(Cow::Owned(string))
3157             }
3158             DecoderResult::Malformed(_, _) => None,
3159             DecoderResult::OutputFull => unreachable!(),
3160         }
3161     }
3162 
3163     /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3164     /// replaced with decimal numeric character references when the entire input
3165     /// is available as a single buffer (i.e. the end of the buffer marks the
3166     /// end of the stream).
3167     ///
3168     /// This method implements the (non-streaming version of) the
3169     /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3170     /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3171     /// spec concept, it is slightly more efficient to use
3172     /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3173     /// method on `UTF_8`.
3174     ///
3175     /// The second item in the returned tuple is the encoding that was actually
3176     /// used (which may differ from this encoding thanks to some encodings
3177     /// having UTF-8 as their output encoding).
3178     ///
3179     /// The third item in the returned tuple indicates whether there were
3180     /// unmappable characters (that were replaced with HTML numeric character
3181     /// references).
3182     ///
3183     /// _Note:_ It is wrong to use this when the input buffer represents only
3184     /// a segment of the input instead of the whole input. Use `new_encoder()`
3185     /// when encoding segmented output.
3186     ///
    /// When encoding to UTF-8 or when encoding an ASCII-only input to an
    /// ASCII-compatible encoding, this method returns a borrow of the input
3189     /// without a heap allocation. Otherwise, this method performs a single
3190     /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3191     /// unmappable characters and potentially multiple heap allocations if
3192     /// there are. These allocations are tuned for jemalloc and may not be
3193     /// optimal when using a different allocator that doesn't use power-of-two
3194     /// buckets.
3195     ///
3196     /// # Panics
3197     ///
3198     /// If the size calculation for a heap-allocated backing buffer overflows
3199     /// `usize`.
3200     ///
3201     /// Available to Rust only.
encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool)3202     pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3203         let output_encoding = self.output_encoding();
3204         if output_encoding == UTF_8 {
3205             return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3206         }
3207         debug_assert!(output_encoding.is_potentially_borrowable());
3208         let bytes = string.as_bytes();
3209         let valid_up_to = if output_encoding == ISO_2022_JP {
3210             iso_2022_jp_ascii_valid_up_to(bytes)
3211         } else {
3212             ascii_valid_up_to(bytes)
3213         };
3214         if valid_up_to == bytes.len() {
3215             return (Cow::Borrowed(bytes), output_encoding, false);
3216         }
3217         let mut encoder = output_encoding.new_encoder();
3218         let mut vec: Vec<u8> = Vec::with_capacity(
3219             (checked_add(
3220                 valid_up_to,
3221                 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3222             ))
3223             .unwrap()
3224             .next_power_of_two(),
3225         );
3226         unsafe {
3227             vec.set_len(valid_up_to);
3228             std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3229         }
3230         let mut total_read = valid_up_to;
3231         let mut total_had_errors = false;
3232         loop {
3233             let (result, read, had_errors) =
3234                 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3235             total_read += read;
3236             total_had_errors |= had_errors;
3237             match result {
3238                 CoderResult::InputEmpty => {
3239                     debug_assert_eq!(total_read, string.len());
3240                     return (Cow::Owned(vec), output_encoding, total_had_errors);
3241                 }
3242                 CoderResult::OutputFull => {
3243                     // reserve_exact wants to know how much more on top of current
3244                     // length--not current capacity.
3245                     let needed = encoder
3246                         .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3247                     let rounded = (checked_add(vec.capacity(), needed))
3248                         .unwrap()
3249                         .next_power_of_two();
3250                     let additional = rounded - vec.len();
3251                     vec.reserve_exact(additional);
3252                 }
3253             }
3254         }
3255     }
3256 
new_variant_decoder(&'static self) -> VariantDecoder3257     fn new_variant_decoder(&'static self) -> VariantDecoder {
3258         self.variant.new_variant_decoder()
3259     }
3260 
3261     /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3262     ///
3263     /// BOM sniffing may cause the returned decoder to morph into a decoder
3264     /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3265     ///
3266     /// Available via the C wrapper.
3267     #[inline]
new_decoder(&'static self) -> Decoder3268     pub fn new_decoder(&'static self) -> Decoder {
3269         Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3270     }
3271 
3272     /// Instantiates a new decoder for this encoding with BOM removal.
3273     ///
3274     /// If the input starts with bytes that are the BOM for this encoding,
3275     /// those bytes are removed. However, the decoder never morphs into a
3276     /// decoder for another encoding: A BOM for another encoding is treated as
3277     /// (potentially malformed) input to the decoding algorithm for this
3278     /// encoding.
3279     ///
3280     /// Available via the C wrapper.
3281     #[inline]
new_decoder_with_bom_removal(&'static self) -> Decoder3282     pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3283         Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3284     }
3285 
3286     /// Instantiates a new decoder for this encoding with BOM handling disabled.
3287     ///
3288     /// If the input starts with bytes that look like a BOM, those bytes are
3289     /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3290     /// for another encoding.)
3291     ///
3292     /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3293     /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3294     /// instead of this method to cause the BOM to be removed.
3295     ///
3296     /// Available via the C wrapper.
3297     #[inline]
new_decoder_without_bom_handling(&'static self) -> Decoder3298     pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3299         Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3300     }
3301 
3302     /// Instantiates a new encoder for the output encoding of this encoding.
3303     ///
3304     /// Available via the C wrapper.
3305     #[inline]
new_encoder(&'static self) -> Encoder3306     pub fn new_encoder(&'static self) -> Encoder {
3307         let enc = self.output_encoding();
3308         enc.variant.new_encoder(enc)
3309     }
3310 
3311     /// Validates UTF-8.
3312     ///
3313     /// Returns the index of the first byte that makes the input malformed as
3314     /// UTF-8 or the length of the slice if the slice is entirely valid.
3315     ///
3316     /// This is currently faster than the corresponding standard library
3317     /// functionality. If this implementation gets upstreamed to the standard
3318     /// library, this method may be removed in the future.
3319     ///
3320     /// Available via the C wrapper.
utf8_valid_up_to(bytes: &[u8]) -> usize3321     pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
3322         utf8_valid_up_to(bytes)
3323     }
3324 
3325     /// Validates ASCII.
3326     ///
3327     /// Returns the index of the first byte that makes the input malformed as
3328     /// ASCII or the length of the slice if the slice is entirely valid.
3329     ///
3330     /// Available via the C wrapper.
ascii_valid_up_to(bytes: &[u8]) -> usize3331     pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
3332         ascii_valid_up_to(bytes)
3333     }
3334 
    /// Validates ISO-2022-JP ASCII-state data.
    ///
    /// Returns the index of the first byte that makes the input not
    /// representable in the ASCII state of ISO-2022-JP or the length of the
    /// slice if the slice is entirely representable in the ASCII state of
    /// ISO-2022-JP.
    ///
    /// Available via the C wrapper.
    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
        // The unqualified call resolves to the free function of the same name,
        // not to this associated function, so this is not recursion.
        iso_2022_jp_ascii_valid_up_to(bytes)
    }
3346 }
3347 
3348 impl PartialEq for Encoding {
3349     #[inline]
eq(&self, other: &Encoding) -> bool3350     fn eq(&self, other: &Encoding) -> bool {
3351         (self as *const Encoding) == (other as *const Encoding)
3352     }
3353 }
3354 
// `eq` compares addresses, which is a full equivalence relation, so the
// marker trait can be implemented without further code.
impl Eq for Encoding {}
3356 
3357 impl Hash for Encoding {
3358     #[inline]
hash<H: Hasher>(&self, state: &mut H)3359     fn hash<H: Hasher>(&self, state: &mut H) {
3360         (self as *const Encoding).hash(state);
3361     }
3362 }
3363 
3364 impl std::fmt::Debug for Encoding {
3365     #[inline]
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result3366     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
3367         write!(f, "Encoding {{ {} }}", self.name)
3368     }
3369 }
3370 
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    /// Serializes the encoding as a string holding its `name`.
    #[inline]
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        serializer.serialize_str(self.name)
    }
}
3381 
/// Serde visitor that turns an encoding label string into a
/// `&'static Encoding`.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3384 
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Looks `value` up as an encoding label and fails with a custom error
    /// that quotes the label when no encoding matches.
    fn visit_str<E: serde::de::Error>(self, value: &str) -> Result<&'static Encoding, E> {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format!("invalid encoding label: {}", value)))
    }
}
3404 
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    /// Deserializes a string and resolves it through `EncodingVisitor`.
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<&'static Encoding, D::Error> {
        let visitor = EncodingVisitor;
        deserializer.deserialize_str(visitor)
    }
}
3414 
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// `Decoder::new` chooses the initial state from the requested
/// `BomHandling`: `AtStart` for sniffing, one of the `At*Start` states for
/// BOM removal on UTF-8 / UTF-16BE / UTF-16LE, and `Converting` otherwise.
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE.
    AtUtf16LeStart,
    /// The decoder has seen EF.
    SeenUtf8First,
    /// The decoder has seen EF, BB.
    SeenUtf8Second,
    /// The decoder has seen FE.
    SeenUtf16BeFirst,
    /// The decoder has seen FF.
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen.
    Converting,
    /// EOF has been seen.
    ///
    /// The buffer length queries panic when the decoder is in this state.
    Finished,
}
3443 
/// Communicates the BOM handling mode to `Decoder::new`, which maps it to
/// the decoder's initial `DecoderLifeCycle` state.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM; the decoder starts out in the converting state.
    Off,
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding; for encodings
    /// other than UTF-8, UTF-16BE and UTF-16LE this behaves like `Off`.
    Remove,
}
3454 
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// Unlike `DecoderResult`, there is no variant for malformed input: when
/// decoding with replacement, malformed sequences are replaced with the
/// REPLACEMENT CHARACTER instead of being reported.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3474 
/// Result of a (potentially partial) decode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3511 
3512 /// A converter that decodes a byte stream into Unicode according to a
3513 /// character encoding in a streaming (incremental) manner.
3514 ///
3515 /// The various `decode_*` methods take an input buffer (`src`) and an output
3516 /// buffer `dst` both of which are caller-allocated. There are variants for
3517 /// both UTF-8 and UTF-16 output buffers.
3518 ///
3519 /// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3520 /// into `dst` until one of the following three things happens:
3521 ///
3522 /// 1. A malformed byte sequence is encountered (`*_without_replacement`
3523 ///    variants only).
3524 ///
3525 /// 2. The output buffer has been filled so near capacity that the decoder
3526 ///    cannot be sure that processing an additional byte of input wouldn't
3527 ///    cause so much output that the output buffer would overflow.
3528 ///
3529 /// 3. All the input bytes have been processed.
3530 ///
/// The `decode_*` method then returns a tuple of a status indicating which one
3532 /// of the three reasons to return happened, how many input bytes were read,
3533 /// how many output code units (`u8` when decoding into UTF-8 and `u16`
3534 /// when decoding to UTF-16) were written (except when decoding into `String`,
3535 /// whose length change indicates this), and in the case of the
3536 /// variants performing replacement, a boolean indicating whether an error was
3537 /// replaced with the REPLACEMENT CHARACTER during the call.
3538 ///
3539 /// The number of bytes "written" is what's logically written. Garbage may be
3540 /// written in the output buffer beyond the point logically written to.
3541 /// Therefore, if you wish to decode into an `&mut str`, you should use the
3542 /// methods that take an `&mut str` argument instead of the ones that take an
3543 /// `&mut [u8]` argument. The former take care of overwriting the trailing
3544 /// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3545 /// latter don't.
3546 ///
3547 /// In the case of the `*_without_replacement` variants, the status is a
3548 /// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3549 /// `InputEmpty` corresponding to the three cases listed above).
3550 ///
3551 /// In the case of methods whose name does not end with
3552 /// `*_without_replacement`, malformed sequences are automatically replaced
3553 /// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3554 /// return early.
3555 ///
3556 /// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3557 /// space. When decoding to UTF-16, the output buffer must have at least two
3558 /// UTF-16 code units (`u16`) of space.
3559 ///
3560 /// When decoding to UTF-8 without replacement, the methods are guaranteed
3561 /// not to return indicating that more output space is needed if the length
3562 /// of the output buffer is at least the length returned by
3563 /// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3564 /// with replacement, the length of the output buffer that guarantees the
3565 /// methods not to return indicating that more output space is needed is given
3566 /// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3567 /// or without replacement, the length of the output buffer that guarantees
3568 /// the methods not to return indicating that more output space is needed is
3569 /// given by [`max_utf16_buffer_length()`][4].
3570 ///
3571 /// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3572 /// and the output after each `decode_*` call is guaranteed to consist of
3573 /// complete characters. (I.e. the code unit sequence for the last character is
3574 /// guaranteed not to be split across output buffers.)
3575 ///
3576 /// The boolean argument `last` indicates that the end of the stream is reached
3577 /// when all the bytes in `src` have been consumed.
3578 ///
3579 /// A `Decoder` object can be used to incrementally decode a byte stream.
3580 ///
3581 /// During the processing of a single stream, the caller must call `decode_*`
3582 /// zero or more times with `last` set to `false` and then call `decode_*` at
3583 /// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3584 /// the processing of the stream has ended. Otherwise, the caller must call
/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
/// a fatal error).
3587 ///
3588 /// Once the stream has ended, the `Decoder` object must not be used anymore.
3589 /// That is, you need to create another one to process another stream.
3590 ///
3591 /// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3592 /// the caller does not wish to treat it as a fatal error, the input buffer
3593 /// `src` may not have been completely consumed. In that case, the caller must
3594 /// pass the unconsumed contents of `src` to `decode_*` again upon the next
3595 /// call.
3596 ///
3597 /// [1]: enum.DecoderResult.html
3598 /// [2]: #method.max_utf8_buffer_length_without_replacement
3599 /// [3]: #method.max_utf8_buffer_length
3600 /// [4]: #method.max_utf16_buffer_length
3601 ///
3602 /// # Infinite loops
3603 ///
3604 /// When converting with a fixed-size output buffer whose size is too small to
3605 /// accommodate one character or (when applicable) one numeric character
3606 /// reference of output, an infinite loop ensues. When converting with a
3607 /// fixed-size output buffer, it generally makes sense to make the buffer
3608 /// fairly large (e.g. couple of kilobytes).
pub struct Decoder {
    /// The encoding reported by `encoding()`.
    encoding: &'static Encoding,
    /// The underlying encoding-specific decoder; the buffer length queries
    /// delegate to it.
    variant: VariantDecoder,
    /// Where the decoder is between BOM handling, conversion and end of
    /// stream.
    life_cycle: DecoderLifeCycle,
}
3614 
3615 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3616     fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3617         Decoder {
3618             encoding: enc,
3619             variant: decoder,
3620             life_cycle: match sniffing {
3621                 BomHandling::Off => DecoderLifeCycle::Converting,
3622                 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3623                 BomHandling::Remove => {
3624                     if enc == UTF_8 {
3625                         DecoderLifeCycle::AtUtf8Start
3626                     } else if enc == UTF_16BE {
3627                         DecoderLifeCycle::AtUtf16BeStart
3628                     } else if enc == UTF_16LE {
3629                         DecoderLifeCycle::AtUtf16LeStart
3630                     } else {
3631                         DecoderLifeCycle::Converting
3632                     }
3633                 }
3634             },
3635         }
3636     }
3637 
    /// The `Encoding` this `Decoder` is for.
    ///
    /// BOM sniffing can change the return value of this method during the life
    /// of the decoder.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        // Plain read of the stored reference; no state is modified.
        self.encoding
    }
3648 
3649     /// Query the worst-case UTF-8 output size _with replacement_.
3650     ///
3651     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3652     /// that will not overflow given the current state of the decoder and
3653     /// `byte_length` number of additional input bytes when decoding with
3654     /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3655     /// sequence or `None` if `usize` would overflow.
3656     ///
3657     /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3658     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3659         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3660         // BOM getting pushed to the underlying decoder.
3661         match self.life_cycle {
3662             DecoderLifeCycle::Converting
3663             | DecoderLifeCycle::AtUtf8Start
3664             | DecoderLifeCycle::AtUtf16LeStart
3665             | DecoderLifeCycle::AtUtf16BeStart => {
3666                 return self.variant.max_utf8_buffer_length(byte_length);
3667             }
3668             DecoderLifeCycle::AtStart => {
3669                 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3670                     if let Some(utf16_bom) = checked_add(
3671                         1,
3672                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3673                     ) {
3674                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3675                         let encoding = self.encoding();
3676                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3677                             // No need to consider the internal state of the underlying decoder,
3678                             // because it is at start, because no data has reached it yet.
3679                             return Some(utf_bom);
3680                         } else if let Some(non_bom) =
3681                             self.variant.max_utf8_buffer_length(byte_length)
3682                         {
3683                             return Some(std::cmp::max(utf_bom, non_bom));
3684                         }
3685                     }
3686                 }
3687             }
3688             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3689                 // Add two bytes even when only one byte has been seen,
3690                 // because the one byte can become a lead byte in multibyte
3691                 // decoders, but only after the decoder has been queried
3692                 // for max length, so the decoder's own logic for adding
3693                 // one for a pending lead cannot work.
3694                 if let Some(sum) = byte_length.checked_add(2) {
3695                     if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3696                         if self.encoding() == UTF_8 {
3697                             // No need to consider the internal state of the underlying decoder,
3698                             // because it is at start, because no data has reached it yet.
3699                             return Some(utf8_bom);
3700                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3701                             return Some(std::cmp::max(utf8_bom, non_bom));
3702                         }
3703                     }
3704                 }
3705             }
3706             DecoderLifeCycle::ConvertingWithPendingBB => {
3707                 if let Some(sum) = byte_length.checked_add(2) {
3708                     return self.variant.max_utf8_buffer_length(sum);
3709                 }
3710             }
3711             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3712                 // Add two bytes even when only one byte has been seen,
3713                 // because the one byte can become a lead byte in multibyte
3714                 // decoders, but only after the decoder has been queried
3715                 // for max length, so the decoder's own logic for adding
3716                 // one for a pending lead cannot work.
3717                 if let Some(sum) = byte_length.checked_add(2) {
3718                     if let Some(utf16_bom) =
3719                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3720                     {
3721                         let encoding = self.encoding();
3722                         if encoding == UTF_16LE || encoding == UTF_16BE {
3723                             // No need to consider the internal state of the underlying decoder,
3724                             // because it is at start, because no data has reached it yet.
3725                             return Some(utf16_bom);
3726                         } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3727                             return Some(std::cmp::max(utf16_bom, non_bom));
3728                         }
3729                     }
3730                 }
3731             }
3732             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3733         }
3734         None
3735     }
3736 
3737     /// Query the worst-case UTF-8 output size _without replacement_.
3738     ///
3739     /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3740     /// that will not overflow given the current state of the decoder and
3741     /// `byte_length` number of additional input bytes when decoding without
3742     /// replacement error handling or `None` if `usize` would overflow.
3743     ///
3744     /// Note that this value may be too small for the `_with_replacement` case.
3745     /// Use `max_utf8_buffer_length()` for that case.
3746     ///
3747     /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3748     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3749         // Need to consider a) the decoder morphing due to the BOM and b) a partial
3750         // BOM getting pushed to the underlying decoder.
3751         match self.life_cycle {
3752             DecoderLifeCycle::Converting
3753             | DecoderLifeCycle::AtUtf8Start
3754             | DecoderLifeCycle::AtUtf16LeStart
3755             | DecoderLifeCycle::AtUtf16BeStart => {
3756                 return self
3757                     .variant
3758                     .max_utf8_buffer_length_without_replacement(byte_length);
3759             }
3760             DecoderLifeCycle::AtStart => {
3761                 if let Some(utf8_bom) = byte_length.checked_add(3) {
3762                     if let Some(utf16_bom) = checked_add(
3763                         1,
3764                         checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3765                     ) {
3766                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3767                         let encoding = self.encoding();
3768                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3769                             // No need to consider the internal state of the underlying decoder,
3770                             // because it is at start, because no data has reached it yet.
3771                             return Some(utf_bom);
3772                         } else if let Some(non_bom) = self
3773                             .variant
3774                             .max_utf8_buffer_length_without_replacement(byte_length)
3775                         {
3776                             return Some(std::cmp::max(utf_bom, non_bom));
3777                         }
3778                     }
3779                 }
3780             }
3781             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3782                 // Add two bytes even when only one byte has been seen,
3783                 // because the one byte can become a lead byte in multibyte
3784                 // decoders, but only after the decoder has been queried
3785                 // for max length, so the decoder's own logic for adding
3786                 // one for a pending lead cannot work.
3787                 if let Some(sum) = byte_length.checked_add(2) {
3788                     if let Some(utf8_bom) = sum.checked_add(3) {
3789                         if self.encoding() == UTF_8 {
3790                             // No need to consider the internal state of the underlying decoder,
3791                             // because it is at start, because no data has reached it yet.
3792                             return Some(utf8_bom);
3793                         } else if let Some(non_bom) =
3794                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3795                         {
3796                             return Some(std::cmp::max(utf8_bom, non_bom));
3797                         }
3798                     }
3799                 }
3800             }
3801             DecoderLifeCycle::ConvertingWithPendingBB => {
3802                 if let Some(sum) = byte_length.checked_add(2) {
3803                     return self.variant.max_utf8_buffer_length_without_replacement(sum);
3804                 }
3805             }
3806             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3807                 // Add two bytes even when only one byte has been seen,
3808                 // because the one byte can become a lead byte in multibyte
3809                 // decoders, but only after the decoder has been queried
3810                 // for max length, so the decoder's own logic for adding
3811                 // one for a pending lead cannot work.
3812                 if let Some(sum) = byte_length.checked_add(2) {
3813                     if let Some(utf16_bom) =
3814                         checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3815                     {
3816                         let encoding = self.encoding();
3817                         if encoding == UTF_16LE || encoding == UTF_16BE {
3818                             // No need to consider the internal state of the underlying decoder,
3819                             // because it is at start, because no data has reached it yet.
3820                             return Some(utf16_bom);
3821                         } else if let Some(non_bom) =
3822                             self.variant.max_utf8_buffer_length_without_replacement(sum)
3823                         {
3824                             return Some(std::cmp::max(utf16_bom, non_bom));
3825                         }
3826                     }
3827                 }
3828             }
3829             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3830         }
3831         None
3832     }
3833 
3834     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3835     /// replaced with the REPLACEMENT CHARACTER.
3836     ///
3837     /// See the documentation of the struct for documentation for `decode_*`
3838     /// methods collectively.
3839     ///
3840     /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3841     pub fn decode_to_utf8(
3842         &mut self,
3843         src: &[u8],
3844         dst: &mut [u8],
3845         last: bool,
3846     ) -> (CoderResult, usize, usize, bool) {
3847         let mut had_errors = false;
3848         let mut total_read = 0usize;
3849         let mut total_written = 0usize;
3850         loop {
3851             let (result, read, written) = self.decode_to_utf8_without_replacement(
3852                 &src[total_read..],
3853                 &mut dst[total_written..],
3854                 last,
3855             );
3856             total_read += read;
3857             total_written += written;
3858             match result {
3859                 DecoderResult::InputEmpty => {
3860                     return (
3861                         CoderResult::InputEmpty,
3862                         total_read,
3863                         total_written,
3864                         had_errors,
3865                     );
3866                 }
3867                 DecoderResult::OutputFull => {
3868                     return (
3869                         CoderResult::OutputFull,
3870                         total_read,
3871                         total_written,
3872                         had_errors,
3873                     );
3874                 }
3875                 DecoderResult::Malformed(_, _) => {
3876                     had_errors = true;
3877                     // There should always be space for the U+FFFD, because
3878                     // otherwise we'd have gotten OutputFull already.
3879                     // XXX: is the above comment actually true for UTF-8 itself?
3880                     // TODO: Consider having fewer bound checks here.
3881                     dst[total_written] = 0xEFu8;
3882                     total_written += 1;
3883                     dst[total_written] = 0xBFu8;
3884                     total_written += 1;
3885                     dst[total_written] = 0xBDu8;
3886                     total_written += 1;
3887                 }
3888             }
3889         }
3890     }
3891 
3892     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3893     /// replaced with the REPLACEMENT CHARACTER with type system signaling
3894     /// of UTF-8 validity.
3895     ///
3896     /// This methods calls `decode_to_utf8` and then zeroes
3897     /// out up to three bytes that aren't logically part of the write in order
3898     /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3899     ///
3900     /// See the documentation of the struct for documentation for `decode_*`
3901     /// methods collectively.
3902     ///
3903     /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3904     pub fn decode_to_str(
3905         &mut self,
3906         src: &[u8],
3907         dst: &mut str,
3908         last: bool,
3909     ) -> (CoderResult, usize, usize, bool) {
3910         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3911         let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3912         let len = bytes.len();
3913         let mut trail = written;
3914         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3915         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3916         // encodings to avoid overwriting here.
3917         if self.encoding != UTF_8 {
3918             let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3919             while trail < max {
3920                 bytes[trail] = 0;
3921                 trail += 1;
3922             }
3923         }
3924         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3925             bytes[trail] = 0;
3926             trail += 1;
3927         }
3928         (result, read, written, replaced)
3929     }
3930 
3931     /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3932     /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
3933     ///
3934     /// Like the others, this method follows the logic that the output buffer is
3935     /// caller-allocated. This method treats the capacity of the `String` as
3936     /// the output limit. That is, this method guarantees not to cause a
3937     /// reallocation of the backing buffer of `String`.
3938     ///
3939     /// The return value is a tuple that contains the `DecoderResult`, the
3940     /// number of bytes read and a boolean indicating whether replacements
3941     /// were done. The number of bytes written is signaled via the length of
3942     /// the `String` changing.
3943     ///
3944     /// See the documentation of the struct for documentation for `decode_*`
3945     /// methods collectively.
3946     ///
3947     /// Available to Rust only.
decode_to_string( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (CoderResult, usize, bool)3948     pub fn decode_to_string(
3949         &mut self,
3950         src: &[u8],
3951         dst: &mut String,
3952         last: bool,
3953     ) -> (CoderResult, usize, bool) {
3954         unsafe {
3955             let vec = dst.as_mut_vec();
3956             let old_len = vec.len();
3957             let capacity = vec.capacity();
3958             vec.set_len(capacity);
3959             let (result, read, written, replaced) =
3960                 self.decode_to_utf8(src, &mut vec[old_len..], last);
3961             vec.set_len(old_len + written);
3962             (result, read, replaced)
3963         }
3964     }
3965 
    // NOTE(review): `public_decode_function!` is defined outside this part of
    // the file. Presumably it expands to the public method
    // `decode_to_utf8_without_replacement` carrying the doc comment below,
    // with the remaining identifiers naming the helper methods it uses and
    // `u8` being the output code unit type - confirm against the macro.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf8_without_replacement,
                            decode_to_utf8_raw,
                            decode_to_utf8_checking_end,
                            decode_to_utf8_after_one_potential_bom_byte,
                            decode_to_utf8_after_two_potential_bom_bytes,
                            decode_to_utf8_checking_end_with_offset,
                            u8);
3982 
3983     /// Incrementally decode a byte stream into UTF-8 with type system signaling
3984     /// of UTF-8 validity.
3985     ///
3986     /// This methods calls `decode_to_utf8` and then zeroes out up to three
3987     /// bytes that aren't logically part of the write in order to retain the
3988     /// UTF-8 validity even for the unwritten part of the buffer.
3989     ///
3990     /// See the documentation of the struct for documentation for `decode_*`
3991     /// methods collectively.
3992     ///
3993     /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)3994     pub fn decode_to_str_without_replacement(
3995         &mut self,
3996         src: &[u8],
3997         dst: &mut str,
3998         last: bool,
3999     ) -> (DecoderResult, usize, usize) {
4000         let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4001         let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4002         let len = bytes.len();
4003         let mut trail = written;
4004         // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4005         // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4006         // encodings to avoid overwriting here.
4007         if self.encoding != UTF_8 {
4008             let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4009             while trail < max {
4010                 bytes[trail] = 0;
4011                 trail += 1;
4012             }
4013         }
4014         while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4015             bytes[trail] = 0;
4016             trail += 1;
4017         }
4018         (result, read, written)
4019     }
4020 
4021     /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4022     ///
4023     /// Like the others, this method follows the logic that the output buffer is
4024     /// caller-allocated. This method treats the capacity of the `String` as
4025     /// the output limit. That is, this method guarantees not to cause a
4026     /// reallocation of the backing buffer of `String`.
4027     ///
4028     /// The return value is a pair that contains the `DecoderResult` and the
4029     /// number of bytes read. The number of bytes written is signaled via
4030     /// the length of the `String` changing.
4031     ///
4032     /// See the documentation of the struct for documentation for `decode_*`
4033     /// methods collectively.
4034     ///
4035     /// Available to Rust only.
decode_to_string_without_replacement( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (DecoderResult, usize)4036     pub fn decode_to_string_without_replacement(
4037         &mut self,
4038         src: &[u8],
4039         dst: &mut String,
4040         last: bool,
4041     ) -> (DecoderResult, usize) {
4042         unsafe {
4043             let vec = dst.as_mut_vec();
4044             let old_len = vec.len();
4045             let capacity = vec.capacity();
4046             vec.set_len(capacity);
4047             let (result, read, written) =
4048                 self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4049             vec.set_len(old_len + written);
4050             (result, read)
4051         }
4052     }
4053 
4054     /// Query the worst-case UTF-16 output size (with or without replacement).
4055     ///
4056     /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4057     /// that will not overflow given the current state of the decoder and
4058     /// `byte_length` number of additional input bytes or `None` if `usize`
4059     /// would overflow.
4060     ///
4061     /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4062     /// return value of this method applies also in the
4063     /// `_without_replacement` case.
4064     ///
4065     /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4066     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4067         // Need to consider a) the decoder morphing due to the BOM and b) a partial
4068         // BOM getting pushed to the underlying decoder.
4069         match self.life_cycle {
4070             DecoderLifeCycle::Converting
4071             | DecoderLifeCycle::AtUtf8Start
4072             | DecoderLifeCycle::AtUtf16LeStart
4073             | DecoderLifeCycle::AtUtf16BeStart => {
4074                 return self.variant.max_utf16_buffer_length(byte_length);
4075             }
4076             DecoderLifeCycle::AtStart => {
4077                 if let Some(utf8_bom) = byte_length.checked_add(1) {
4078                     if let Some(utf16_bom) =
4079                         checked_add(1, checked_div(byte_length.checked_add(1), 2))
4080                     {
4081                         let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
4082                         let encoding = self.encoding();
4083                         if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4084                             // No need to consider the internal state of the underlying decoder,
4085                             // because it is at start, because no data has reached it yet.
4086                             return Some(utf_bom);
4087                         } else if let Some(non_bom) =
4088                             self.variant.max_utf16_buffer_length(byte_length)
4089                         {
4090                             return Some(std::cmp::max(utf_bom, non_bom));
4091                         }
4092                     }
4093                 }
4094             }
4095             DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4096                 // Add two bytes even when only one byte has been seen,
4097                 // because the one byte can become a lead byte in multibyte
4098                 // decoders, but only after the decoder has been queried
4099                 // for max length, so the decoder's own logic for adding
4100                 // one for a pending lead cannot work.
4101                 if let Some(sum) = byte_length.checked_add(2) {
4102                     if let Some(utf8_bom) = sum.checked_add(1) {
4103                         if self.encoding() == UTF_8 {
4104                             // No need to consider the internal state of the underlying decoder,
4105                             // because it is at start, because no data has reached it yet.
4106                             return Some(utf8_bom);
4107                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4108                             return Some(std::cmp::max(utf8_bom, non_bom));
4109                         }
4110                     }
4111                 }
4112             }
4113             DecoderLifeCycle::ConvertingWithPendingBB => {
4114                 if let Some(sum) = byte_length.checked_add(2) {
4115                     return self.variant.max_utf16_buffer_length(sum);
4116                 }
4117             }
4118             DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4119                 // Add two bytes even when only one byte has been seen,
4120                 // because the one byte can become a lead byte in multibyte
4121                 // decoders, but only after the decoder has been queried
4122                 // for max length, so the decoder's own logic for adding
4123                 // one for a pending lead cannot work.
4124                 if let Some(sum) = byte_length.checked_add(2) {
4125                     if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4126                         let encoding = self.encoding();
4127                         if encoding == UTF_16LE || encoding == UTF_16BE {
4128                             // No need to consider the internal state of the underlying decoder,
4129                             // because it is at start, because no data has reached it yet.
4130                             return Some(utf16_bom);
4131                         } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4132                             return Some(std::cmp::max(utf16_bom, non_bom));
4133                         }
4134                     }
4135                 }
4136             }
4137             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4138         }
4139         None
4140     }
4141 
4142     /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4143     /// replaced with the REPLACEMENT CHARACTER.
4144     ///
4145     /// See the documentation of the struct for documentation for `decode_*`
4146     /// methods collectively.
4147     ///
4148     /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4149     pub fn decode_to_utf16(
4150         &mut self,
4151         src: &[u8],
4152         dst: &mut [u16],
4153         last: bool,
4154     ) -> (CoderResult, usize, usize, bool) {
4155         let mut had_errors = false;
4156         let mut total_read = 0usize;
4157         let mut total_written = 0usize;
4158         loop {
4159             let (result, read, written) = self.decode_to_utf16_without_replacement(
4160                 &src[total_read..],
4161                 &mut dst[total_written..],
4162                 last,
4163             );
4164             total_read += read;
4165             total_written += written;
4166             match result {
4167                 DecoderResult::InputEmpty => {
4168                     return (
4169                         CoderResult::InputEmpty,
4170                         total_read,
4171                         total_written,
4172                         had_errors,
4173                     );
4174                 }
4175                 DecoderResult::OutputFull => {
4176                     return (
4177                         CoderResult::OutputFull,
4178                         total_read,
4179                         total_written,
4180                         had_errors,
4181                     );
4182                 }
4183                 DecoderResult::Malformed(_, _) => {
4184                     had_errors = true;
4185                     // There should always be space for the U+FFFD, because
4186                     // otherwise we'd have gotten OutputFull already.
4187                     dst[total_written] = 0xFFFD;
4188                     total_written += 1;
4189                 }
4190             }
4191         }
4192     }
4193 
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                            /// _without replacement_.
                            ///
                            /// The return value is a tuple of the
                            /// `DecoderResult`, the number of bytes read
                            /// from `src` and the number of UTF-16 code
                            /// units written to `dst`.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf16_without_replacement,
                            decode_to_utf16_raw,
                            decode_to_utf16_checking_end,
                            decode_to_utf16_after_one_potential_bom_byte,
                            decode_to_utf16_after_two_potential_bom_bytes,
                            decode_to_utf16_checking_end_with_offset,
                            u16);
4210 
4211     /// Checks for compatibility with storing Unicode scalar values as unsigned
4212     /// bytes taking into account the state of the decoder.
4213     ///
4214     /// Returns `None` if the decoder is not in a neutral state, including waiting
4215     /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4216     ///
4217     /// Otherwise returns the index of the first byte whose unsigned value doesn't
4218     /// directly correspond to the decoded Unicode scalar value, or the length
4219     /// of the input if all bytes in the input decode directly to scalar values
4220     /// corresponding to the unsigned byte values.
4221     ///
4222     /// Does not change the state of the decoder.
4223     ///
4224     /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4225     /// storage optimizations.
4226     ///
4227     /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4228     pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4229         match self.life_cycle {
4230             DecoderLifeCycle::Converting => {
4231                 return self.variant.latin1_byte_compatible_up_to(bytes);
4232             }
4233             DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4234             _ => None,
4235         }
4236     }
4237 }
4238 
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps a Basic Multilingual Plane code unit as an `Unmappable` result.
    ///
    /// Panics if `bmp` is not a Unicode scalar value, i.e. if it is a
    /// surrogate.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let scalar = u32::from(bmp);
        let unmappable = ::std::char::from_u32(scalar).unwrap();
        EncoderResult::Unmappable(unmappable)
    }
}
4270 
4271 /// A converter that encodes a Unicode stream into bytes according to a
4272 /// character encoding in a streaming (incremental) manner.
4273 ///
4274 /// The various `encode_*` methods take an input buffer (`src`) and an output
4275 /// buffer `dst` both of which are caller-allocated. There are variants for
4276 /// both UTF-8 and UTF-16 input buffers.
4277 ///
/// An `encode_*` method encodes characters from `src` into bytes stored into
/// `dst` until one of the following three things happens:
4280 ///
4281 /// 1. An unmappable character is encountered (`*_without_replacement` variants
4282 ///    only).
4283 ///
/// 2. The output buffer has been filled so near capacity that the encoder
///    cannot be sure that processing an additional character of input wouldn't
///    cause so much output that the output buffer would overflow.
4287 ///
4288 /// 3. All the input characters have been processed.
4289 ///
/// The `encode_*` method then returns a tuple of a status indicating which one
4291 /// of the three reasons to return happened, how many input code units (`u8`
4292 /// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4293 /// how many output bytes were written (except when encoding into `Vec<u8>`,
4294 /// whose length change indicates this), and in the case of the variants that
4295 /// perform replacement, a boolean indicating whether an unmappable
4296 /// character was replaced with a numeric character reference during the call.
4297 ///
4298 /// The number of bytes "written" is what's logically written. Garbage may be
4299 /// written in the output buffer beyond the point logically written to.
4300 ///
4301 /// In the case of the methods whose name ends with
4302 /// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4303 /// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4304 /// the three cases listed above).
4305 ///
4306 /// In the case of methods whose name does not end with
4307 /// `*_without_replacement`, unmappable characters are automatically replaced
4308 /// with the corresponding numeric character references and unmappable
4309 /// characters do not cause the methods to return early.
4310 ///
4311 /// When encoding from UTF-8 without replacement, the methods are guaranteed
4312 /// not to return indicating that more output space is needed if the length
4313 /// of the output buffer is at least the length returned by
4314 /// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4315 /// UTF-8 with replacement, the length of the output buffer that guarantees the
4316 /// methods not to return indicating that more output space is needed in the
4317 /// absence of unmappable characters is given by
4318 /// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4319 /// UTF-16 without replacement, the methods are guaranteed not to return
4320 /// indicating that more output space is needed if the length of the output
4321 /// buffer is at least the length returned by
4322 /// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4324 /// guarantees the methods not to return indicating that more output space is
4325 /// needed in the absence of unmappable characters is given by
4326 /// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4327 /// When encoding with replacement, applications are not expected to size the
4328 /// buffer for the worst case ahead of time but to resize the buffer if there
4329 /// are unmappable characters. This is why max length queries are only available
4330 /// for the case where there are no unmappable characters.
4331 ///
4332 /// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4333 /// calling from Rust, the type system takes care of this.) When encoding from
4334 /// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4335 /// CHARACTERS. Therefore, in order for astral characters not to turn into a
4336 /// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4337 /// are not split across input buffer boundaries.
4338 ///
4339 /// After an `encode_*` call returns, the output produced so far, taken as a
4340 /// whole from the start of the stream, is guaranteed to consist of a valid
4341 /// byte sequence in the target encoding. (I.e. the code unit sequence for a
4342 /// character is guaranteed not to be split across output buffers. However, due
4343 /// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4344 /// from the start for it to be valid. For other encodings, the validity holds
4345 /// on a per-output buffer basis.)
4346 ///
4347 /// The boolean argument `last` indicates that the end of the stream is reached
4348 /// when all the characters in `src` have been consumed. This argument is needed
4349 /// for ISO-2022-JP and is ignored for other encodings.
4350 ///
/// An `Encoder` object can be used to incrementally encode a Unicode stream
/// into a byte stream.
4352 ///
4353 /// During the processing of a single stream, the caller must call `encode_*`
4354 /// zero or more times with `last` set to `false` and then call `encode_*` at
4355 /// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4356 /// the processing of the stream has ended. Otherwise, the caller must call
4357 /// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4358 /// as a fatal error).
4359 ///
4360 /// Once the stream has ended, the `Encoder` object must not be used anymore.
4361 /// That is, you need to create another one to process another stream.
4362 ///
4363 /// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4364 /// and the caller does not wish to treat it as a fatal error, the input buffer
4365 /// `src` may not have been completely consumed. In that case, the caller must
4366 /// pass the unconsumed contents of `src` to `encode_*` again upon the next
4367 /// call.
4368 ///
4369 /// [1]: enum.EncoderResult.html
4370 /// [2]: #method.max_buffer_length_from_utf8_without_replacement
4371 /// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4372 /// [4]: #method.max_buffer_length_from_utf16_without_replacement
4373 /// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4374 ///
4375 /// # Infinite loops
4376 ///
4377 /// When converting with a fixed-size output buffer whose size is too small to
4378 /// accommodate one character of output, an infinite loop ensues. When
4379 /// converting with a fixed-size output buffer, it generally makes sense to
4380 /// make the buffer fairly large (e.g. couple of kilobytes).
pub struct Encoder {
    // The `Encoding` this `Encoder` is for; handed out by `encoding()`.
    encoding: &'static Encoding,
    // The encoding-specific encoder that the methods of `Encoder` delegate
    // the actual work to.
    variant: VariantEncoder,
}
4385 
4386 impl Encoder {
new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder4387     fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4388         Encoder {
4389             encoding: enc,
4390             variant: encoder,
4391         }
4392     }
4393 
    /// The `Encoding` this `Encoder` is for.
    ///
    /// Returns the `&'static Encoding` that was stored when the encoder was
    /// created; the encoder itself is left untouched.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
4399 
    /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
    /// ASCII state and `false` otherwise.
    ///
    /// The answer is obtained from the underlying variant encoder and does
    /// not modify the `Encoder`.
    #[inline]
    pub fn has_pending_state(&self) -> bool {
        self.variant.has_pending_state()
    }
4406 
4407     /// Query the worst-case output size when encoding from UTF-8 with
4408     /// replacement.
4409     ///
4410     /// Returns the size of the output buffer in bytes that will not overflow
4411     /// given the current state of the encoder and `byte_length` number of
4412     /// additional input code units if there are no unmappable characters in
4413     /// the input or `None` if `usize` would overflow.
4414     ///
4415     /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4416     pub fn max_buffer_length_from_utf8_if_no_unmappables(
4417         &self,
4418         byte_length: usize,
4419     ) -> Option<usize> {
4420         checked_add(
4421             if self.encoding().can_encode_everything() {
4422                 0
4423             } else {
4424                 NCR_EXTRA
4425             },
4426             self.max_buffer_length_from_utf8_without_replacement(byte_length),
4427         )
4428     }
4429 
    /// Query the worst-case output size when encoding from UTF-8 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `byte_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The computation is delegated to the underlying variant encoder.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf8_without_replacement(byte_length)
    }
4445 
4446     /// Incrementally encode into byte stream from UTF-8 with unmappable
4447     /// characters replaced with HTML (decimal) numeric character references.
4448     ///
4449     /// See the documentation of the struct for documentation for `encode_*`
4450     /// methods collectively.
4451     ///
4452     /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4453     pub fn encode_from_utf8(
4454         &mut self,
4455         src: &str,
4456         dst: &mut [u8],
4457         last: bool,
4458     ) -> (CoderResult, usize, usize, bool) {
4459         let dst_len = dst.len();
4460         let effective_dst_len = if self.encoding().can_encode_everything() {
4461             dst_len
4462         } else {
4463             if dst_len < NCR_EXTRA {
4464                 if src.is_empty() && !(last && self.has_pending_state()) {
4465                     return (CoderResult::InputEmpty, 0, 0, false);
4466                 }
4467                 return (CoderResult::OutputFull, 0, 0, false);
4468             }
4469             dst_len - NCR_EXTRA
4470         };
4471         let mut had_unmappables = false;
4472         let mut total_read = 0usize;
4473         let mut total_written = 0usize;
4474         loop {
4475             let (result, read, written) = self.encode_from_utf8_without_replacement(
4476                 &src[total_read..],
4477                 &mut dst[total_written..effective_dst_len],
4478                 last,
4479             );
4480             total_read += read;
4481             total_written += written;
4482             match result {
4483                 EncoderResult::InputEmpty => {
4484                     return (
4485                         CoderResult::InputEmpty,
4486                         total_read,
4487                         total_written,
4488                         had_unmappables,
4489                     );
4490                 }
4491                 EncoderResult::OutputFull => {
4492                     return (
4493                         CoderResult::OutputFull,
4494                         total_read,
4495                         total_written,
4496                         had_unmappables,
4497                     );
4498                 }
4499                 EncoderResult::Unmappable(unmappable) => {
4500                     had_unmappables = true;
4501                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4502                     debug_assert_ne!(self.encoding(), UTF_16BE);
4503                     debug_assert_ne!(self.encoding(), UTF_16LE);
4504                     // Additionally, Iso2022JpEncoder is responsible for
4505                     // transitioning to ASCII when returning with Unmappable.
4506                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4507                     if total_written >= effective_dst_len {
4508                         if total_read == src.len() && !(last && self.has_pending_state()) {
4509                             return (
4510                                 CoderResult::InputEmpty,
4511                                 total_read,
4512                                 total_written,
4513                                 had_unmappables,
4514                             );
4515                         }
4516                         return (
4517                             CoderResult::OutputFull,
4518                             total_read,
4519                             total_written,
4520                             had_unmappables,
4521                         );
4522                     }
4523                 }
4524             }
4525         }
4526     }
4527 
4528     /// Incrementally encode into byte stream from UTF-8 with unmappable
4529     /// characters replaced with HTML (decimal) numeric character references.
4530     ///
4531     /// See the documentation of the struct for documentation for `encode_*`
4532     /// methods collectively.
4533     ///
4534     /// Available to Rust only.
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4535     pub fn encode_from_utf8_to_vec(
4536         &mut self,
4537         src: &str,
4538         dst: &mut Vec<u8>,
4539         last: bool,
4540     ) -> (CoderResult, usize, bool) {
4541         unsafe {
4542             let old_len = dst.len();
4543             let capacity = dst.capacity();
4544             dst.set_len(capacity);
4545             let (result, read, written, replaced) =
4546                 self.encode_from_utf8(src, &mut dst[old_len..], last);
4547             dst.set_len(old_len + written);
4548             (result, read, replaced)
4549         }
4550     }
4551 
    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
    ///
    /// The return value is a tuple of the `EncoderResult`, the number of
    /// bytes read from `src` and the number of bytes written to `dst`. The
    /// work is delegated to the underlying variant encoder.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf8_without_replacement(
        &mut self,
        src: &str,
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        self.variant.encode_from_utf8_raw(src, dst, last)
    }
4566 
4567     /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4568     ///
4569     /// See the documentation of the struct for documentation for `encode_*`
4570     /// methods collectively.
4571     ///
4572     /// Available to Rust only.
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4573     pub fn encode_from_utf8_to_vec_without_replacement(
4574         &mut self,
4575         src: &str,
4576         dst: &mut Vec<u8>,
4577         last: bool,
4578     ) -> (EncoderResult, usize) {
4579         unsafe {
4580             let old_len = dst.len();
4581             let capacity = dst.capacity();
4582             dst.set_len(capacity);
4583             let (result, read, written) =
4584                 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4585             dst.set_len(old_len + written);
4586             (result, read)
4587         }
4588     }
4589 
4590     /// Query the worst-case output size when encoding from UTF-16 with
4591     /// replacement.
4592     ///
4593     /// Returns the size of the output buffer in bytes that will not overflow
4594     /// given the current state of the encoder and `u16_length` number of
4595     /// additional input code units if there are no unmappable characters in
4596     /// the input or `None` if `usize` would overflow.
4597     ///
4598     /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4599     pub fn max_buffer_length_from_utf16_if_no_unmappables(
4600         &self,
4601         u16_length: usize,
4602     ) -> Option<usize> {
4603         checked_add(
4604             if self.encoding().can_encode_everything() {
4605                 0
4606             } else {
4607                 NCR_EXTRA
4608             },
4609             self.max_buffer_length_from_utf16_without_replacement(u16_length),
4610         )
4611     }
4612 
    /// Query the worst-case output size when encoding from UTF-16 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `u16_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The computation is delegated to the underlying variant encoder.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf16_without_replacement(u16_length)
    }
4628 
4629     /// Incrementally encode into byte stream from UTF-16 with unmappable
4630     /// characters replaced with HTML (decimal) numeric character references.
4631     ///
4632     /// See the documentation of the struct for documentation for `encode_*`
4633     /// methods collectively.
4634     ///
4635     /// Available via the C wrapper.
encode_from_utf16( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4636     pub fn encode_from_utf16(
4637         &mut self,
4638         src: &[u16],
4639         dst: &mut [u8],
4640         last: bool,
4641     ) -> (CoderResult, usize, usize, bool) {
4642         let dst_len = dst.len();
4643         let effective_dst_len = if self.encoding().can_encode_everything() {
4644             dst_len
4645         } else {
4646             if dst_len < NCR_EXTRA {
4647                 if src.is_empty() && !(last && self.has_pending_state()) {
4648                     return (CoderResult::InputEmpty, 0, 0, false);
4649                 }
4650                 return (CoderResult::OutputFull, 0, 0, false);
4651             }
4652             dst_len - NCR_EXTRA
4653         };
4654         let mut had_unmappables = false;
4655         let mut total_read = 0usize;
4656         let mut total_written = 0usize;
4657         loop {
4658             let (result, read, written) = self.encode_from_utf16_without_replacement(
4659                 &src[total_read..],
4660                 &mut dst[total_written..effective_dst_len],
4661                 last,
4662             );
4663             total_read += read;
4664             total_written += written;
4665             match result {
4666                 EncoderResult::InputEmpty => {
4667                     return (
4668                         CoderResult::InputEmpty,
4669                         total_read,
4670                         total_written,
4671                         had_unmappables,
4672                     );
4673                 }
4674                 EncoderResult::OutputFull => {
4675                     return (
4676                         CoderResult::OutputFull,
4677                         total_read,
4678                         total_written,
4679                         had_unmappables,
4680                     );
4681                 }
4682                 EncoderResult::Unmappable(unmappable) => {
4683                     had_unmappables = true;
4684                     debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4685                     // There are no UTF-16 encoders and even if there were,
4686                     // they'd never have unmappables.
4687                     debug_assert_ne!(self.encoding(), UTF_16BE);
4688                     debug_assert_ne!(self.encoding(), UTF_16LE);
4689                     // Additionally, Iso2022JpEncoder is responsible for
4690                     // transitioning to ASCII when returning with Unmappable
4691                     // from the jis0208 state. That is, when we encode
4692                     // ISO-2022-JP and come here, the encoder is in either the
4693                     // ASCII or the Roman state. We are allowed to generate any
4694                     // printable ASCII excluding \ and ~.
4695                     total_written += write_ncr(unmappable, &mut dst[total_written..]);
4696                     if total_written >= effective_dst_len {
4697                         if total_read == src.len() && !(last && self.has_pending_state()) {
4698                             return (
4699                                 CoderResult::InputEmpty,
4700                                 total_read,
4701                                 total_written,
4702                                 had_unmappables,
4703                             );
4704                         }
4705                         return (
4706                             CoderResult::OutputFull,
4707                             total_read,
4708                             total_written,
4709                             had_unmappables,
4710                         );
4711                     }
4712                 }
4713             }
4714         }
4715     }
4716 
4717     /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4718     ///
4719     /// See the documentation of the struct for documentation for `encode_*`
4720     /// methods collectively.
4721     ///
4722     /// Available via the C wrapper.
encode_from_utf16_without_replacement( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4723     pub fn encode_from_utf16_without_replacement(
4724         &mut self,
4725         src: &[u16],
4726         dst: &mut [u8],
4727         last: bool,
4728     ) -> (EncoderResult, usize, usize) {
4729         self.variant.encode_from_utf16_raw(src, dst, last)
4730     }
4731 }
4732 
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#` followed by the decimal scalar value of `unmappable` and `;`
/// to the start of `dst` and returns the number of bytes written (between 5
/// and 10). The caller must supply a `dst` that is long enough.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let code_point = unmappable as u32;
    // Count the decimal digits, treating anything below 100 as two digits.
    // (Review the outcome of https://github.com/whatwg/encoding/issues/15
    // to see if the two-digit case is possible.) Seven digits is the most a
    // `char` can need.
    let mut digits = 2usize;
    let mut threshold = 100u32;
    while digits < 7 && code_point >= threshold {
        digits += 1;
        threshold *= 10;
    }
    // Three more bytes for "&#" and ";".
    let len = digits + 3;
    debug_assert!(code_point >= 10u32);
    debug_assert!(len <= dst.len());
    dst[len - 1] = b';';
    // Emit the digits right to left, starting just before the semicolon.
    let mut remaining = code_point;
    let mut idx = len - 2;
    loop {
        dst[idx] = (remaining % 10) as u8 + b'0';
        idx -= 1;
        if remaining < 10 {
            break;
        }
        remaining /= 10;
    }
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4771 
/// Returns `true` if `start <= i < end`, using a single wrapping comparison.
/// Expects `start <= end`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4776 
/// Returns `true` if `start <= i < end`, using a single wrapping comparison.
/// Expects `start <= end`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4781 
/// Returns `true` if `start <= i <= end`, using a single wrapping comparison.
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4786 
/// Returns `true` if `start <= i <= end`, using a single wrapping comparison.
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4791 
/// Returns `true` if `start <= i <= end`, using a single wrapping comparison.
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4796 
/// Returns `true` if `start <= i <= end`, using a single wrapping comparison.
/// Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4801 
/// Adds `num` to the value in `opt`, yielding `None` if `opt` is `None` or
/// the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4810 
4811 #[inline(always)]
checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize>4812 fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4813     if let Some(n) = one {
4814         checked_add(n, other)
4815     } else {
4816         None
4817     }
4818 }
4819 
/// Multiplies the value in `opt` by `num`, yielding `None` if `opt` is `None`
/// or the product overflows `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4828 
/// Divides the value in `opt` by `num`, yielding `None` if `opt` is `None` or
/// `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4837 
/// Rounds the value in `opt` up to the next power of two.
///
/// Yields `None` if `opt` is `None` or if the next power of two does not fit
/// in `usize`. The overflow case is detected with
/// `usize::checked_next_power_of_two`; plain `next_power_of_two` would panic
/// in debug builds and return 0 in release builds instead.
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4842 
/// Returns the smaller of two optional values. A `None` operand is ignored,
/// so the result is `None` only when both operands are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::std::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4855 
4856 // ############## TESTS ###############
4857 
/// Test-only record that stores an `&'static Encoding` next to ordinary
/// fields. Compiled only for test builds with the `serde` feature enabled;
/// presumably exercised by serde round-trip tests elsewhere in this file.
#[cfg(all(test, feature = "serde"))]
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Demo {
    num: u32,
    name: String,
    enc: &'static Encoding,
}
4865 
4866 #[cfg(test)]
4867 mod test_labels_names;
4868 
4869 #[cfg(test)]
4870 mod tests {
4871     use super::*;
4872     use std::borrow::Cow;
4873 
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4874     fn sniff_to_utf16(
4875         initial_encoding: &'static Encoding,
4876         expected_encoding: &'static Encoding,
4877         bytes: &[u8],
4878         expect: &[u16],
4879         breaks: &[usize],
4880     ) {
4881         let mut decoder = initial_encoding.new_decoder();
4882 
4883         let mut dest: Vec<u16> =
4884             Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4885         let capacity = dest.capacity();
4886         dest.resize(capacity, 0u16);
4887 
4888         let mut total_written = 0usize;
4889         let mut start = 0usize;
4890         for br in breaks {
4891             let (result, read, written, _) =
4892                 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4893             total_written += written;
4894             assert_eq!(read, *br - start);
4895             match result {
4896                 CoderResult::InputEmpty => {}
4897                 CoderResult::OutputFull => {
4898                     unreachable!();
4899                 }
4900             }
4901             start = *br;
4902         }
4903         let (result, read, written, _) =
4904             decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4905         total_written += written;
4906         match result {
4907             CoderResult::InputEmpty => {}
4908             CoderResult::OutputFull => {
4909                 unreachable!();
4910             }
4911         }
4912         assert_eq!(read, bytes.len() - start);
4913         assert_eq!(total_written, expect.len());
4914         assert_eq!(&dest[..total_written], expect);
4915         assert_eq!(decoder.encoding(), expected_encoding);
4916     }
4917 
4918     // Any copyright to the test code below this comment is dedicated to the
4919     // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4920 
4921     #[test]
test_bom_sniffing()4922     fn test_bom_sniffing() {
4923         // ASCII
4924         sniff_to_utf16(
4925             WINDOWS_1252,
4926             WINDOWS_1252,
4927             b"\x61\x62",
4928             &[0x0061u16, 0x0062u16],
4929             &[],
4930         );
4931         // UTF-8
4932         sniff_to_utf16(
4933             WINDOWS_1252,
4934             UTF_8,
4935             b"\xEF\xBB\xBF\x61\x62",
4936             &[0x0061u16, 0x0062u16],
4937             &[],
4938         );
4939         sniff_to_utf16(
4940             WINDOWS_1252,
4941             UTF_8,
4942             b"\xEF\xBB\xBF\x61\x62",
4943             &[0x0061u16, 0x0062u16],
4944             &[1],
4945         );
4946         sniff_to_utf16(
4947             WINDOWS_1252,
4948             UTF_8,
4949             b"\xEF\xBB\xBF\x61\x62",
4950             &[0x0061u16, 0x0062u16],
4951             &[2],
4952         );
4953         sniff_to_utf16(
4954             WINDOWS_1252,
4955             UTF_8,
4956             b"\xEF\xBB\xBF\x61\x62",
4957             &[0x0061u16, 0x0062u16],
4958             &[3],
4959         );
4960         sniff_to_utf16(
4961             WINDOWS_1252,
4962             UTF_8,
4963             b"\xEF\xBB\xBF\x61\x62",
4964             &[0x0061u16, 0x0062u16],
4965             &[4],
4966         );
4967         sniff_to_utf16(
4968             WINDOWS_1252,
4969             UTF_8,
4970             b"\xEF\xBB\xBF\x61\x62",
4971             &[0x0061u16, 0x0062u16],
4972             &[2, 3],
4973         );
4974         sniff_to_utf16(
4975             WINDOWS_1252,
4976             UTF_8,
4977             b"\xEF\xBB\xBF\x61\x62",
4978             &[0x0061u16, 0x0062u16],
4979             &[1, 2],
4980         );
4981         sniff_to_utf16(
4982             WINDOWS_1252,
4983             UTF_8,
4984             b"\xEF\xBB\xBF\x61\x62",
4985             &[0x0061u16, 0x0062u16],
4986             &[1, 3],
4987         );
4988         sniff_to_utf16(
4989             WINDOWS_1252,
4990             UTF_8,
4991             b"\xEF\xBB\xBF\x61\x62",
4992             &[0x0061u16, 0x0062u16],
4993             &[1, 2, 3, 4],
4994         );
4995         sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
4996         // Not UTF-8
4997         sniff_to_utf16(
4998             WINDOWS_1252,
4999             WINDOWS_1252,
5000             b"\xEF\xBB\x61\x62",
5001             &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5002             &[],
5003         );
5004         sniff_to_utf16(
5005             WINDOWS_1252,
5006             WINDOWS_1252,
5007             b"\xEF\xBB\x61\x62",
5008             &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5009             &[1],
5010         );
5011         sniff_to_utf16(
5012             WINDOWS_1252,
5013             WINDOWS_1252,
5014             b"\xEF\x61\x62",
5015             &[0x00EFu16, 0x0061u16, 0x0062u16],
5016             &[],
5017         );
5018         sniff_to_utf16(
5019             WINDOWS_1252,
5020             WINDOWS_1252,
5021             b"\xEF\x61\x62",
5022             &[0x00EFu16, 0x0061u16, 0x0062u16],
5023             &[1],
5024         );
5025         sniff_to_utf16(
5026             WINDOWS_1252,
5027             WINDOWS_1252,
5028             b"\xEF\xBB",
5029             &[0x00EFu16, 0x00BBu16],
5030             &[],
5031         );
5032         sniff_to_utf16(
5033             WINDOWS_1252,
5034             WINDOWS_1252,
5035             b"\xEF\xBB",
5036             &[0x00EFu16, 0x00BBu16],
5037             &[1],
5038         );
5039         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5040         // Not UTF-16
5041         sniff_to_utf16(
5042             WINDOWS_1252,
5043             WINDOWS_1252,
5044             b"\xFE\x61\x62",
5045             &[0x00FEu16, 0x0061u16, 0x0062u16],
5046             &[],
5047         );
5048         sniff_to_utf16(
5049             WINDOWS_1252,
5050             WINDOWS_1252,
5051             b"\xFE\x61\x62",
5052             &[0x00FEu16, 0x0061u16, 0x0062u16],
5053             &[1],
5054         );
5055         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5056         sniff_to_utf16(
5057             WINDOWS_1252,
5058             WINDOWS_1252,
5059             b"\xFF\x61\x62",
5060             &[0x00FFu16, 0x0061u16, 0x0062u16],
5061             &[],
5062         );
5063         sniff_to_utf16(
5064             WINDOWS_1252,
5065             WINDOWS_1252,
5066             b"\xFF\x61\x62",
5067             &[0x00FFu16, 0x0061u16, 0x0062u16],
5068             &[1],
5069         );
5070         sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5071         // UTF-16
5072         sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5073         sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5074         sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5075         sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5076     }
5077 
5078     #[test]
test_output_encoding()5079     fn test_output_encoding() {
5080         assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5081         assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5082         assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5083         assert_eq!(UTF_8.output_encoding(), UTF_8);
5084         assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5085         assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5086         assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5087         assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5088         assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5089         assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5090     }
5091 
5092     #[test]
test_label_resolution()5093     fn test_label_resolution() {
5094         assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5095         assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5096         assert_eq!(
5097             Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5098             Some(UTF_8)
5099         );
5100         assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5101         assert_eq!(Encoding::for_label(b"bogus"), None);
5102         assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5103     }
5104 
5105     #[test]
test_decode_valid_windows_1257_to_cow()5106     fn test_decode_valid_windows_1257_to_cow() {
5107         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5108         match cow {
5109             Cow::Borrowed(_) => unreachable!(),
5110             Cow::Owned(s) => {
5111                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5112             }
5113         }
5114         assert_eq!(encoding, WINDOWS_1257);
5115         assert!(!had_errors);
5116     }
5117 
5118     #[test]
test_decode_invalid_windows_1257_to_cow()5119     fn test_decode_invalid_windows_1257_to_cow() {
5120         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5121         match cow {
5122             Cow::Borrowed(_) => unreachable!(),
5123             Cow::Owned(s) => {
5124                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5125             }
5126         }
5127         assert_eq!(encoding, WINDOWS_1257);
5128         assert!(had_errors);
5129     }
5130 
5131     #[test]
test_decode_ascii_only_windows_1257_to_cow()5132     fn test_decode_ascii_only_windows_1257_to_cow() {
5133         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5134         match cow {
5135             Cow::Borrowed(s) => {
5136                 assert_eq!(s, "abc");
5137             }
5138             Cow::Owned(_) => unreachable!(),
5139         }
5140         assert_eq!(encoding, WINDOWS_1257);
5141         assert!(!had_errors);
5142     }
5143 
5144     #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow()5145     fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5146         let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5147         match cow {
5148             Cow::Borrowed(s) => {
5149                 assert_eq!(s, "\u{20AC}\u{00E4}");
5150             }
5151             Cow::Owned(_) => unreachable!(),
5152         }
5153         assert_eq!(encoding, UTF_8);
5154         assert!(!had_errors);
5155     }
5156 
5157     #[test]
test_decode_bomful_invalid_utf8_as_windows_1257_to_cow()5158     fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5159         let (cow, encoding, had_errors) =
5160             WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5161         match cow {
5162             Cow::Borrowed(_) => unreachable!(),
5163             Cow::Owned(s) => {
5164                 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5165             }
5166         }
5167         assert_eq!(encoding, UTF_8);
5168         assert!(had_errors);
5169     }
5170 
5171     #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow()5172     fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5173         let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5174         match cow {
5175             Cow::Borrowed(s) => {
5176                 assert_eq!(s, "\u{20AC}\u{00E4}");
5177             }
5178             Cow::Owned(_) => unreachable!(),
5179         }
5180         assert_eq!(encoding, UTF_8);
5181         assert!(!had_errors);
5182     }
5183 
5184     #[test]
test_decode_bomful_invalid_utf8_as_utf_8_to_cow()5185     fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5186         let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5187         match cow {
5188             Cow::Borrowed(_) => unreachable!(),
5189             Cow::Owned(s) => {
5190                 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5191             }
5192         }
5193         assert_eq!(encoding, UTF_8);
5194         assert!(had_errors);
5195     }
5196 
5197     #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal()5198     fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5199         let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5200         match cow {
5201             Cow::Borrowed(s) => {
5202                 assert_eq!(s, "\u{20AC}\u{00E4}");
5203             }
5204             Cow::Owned(_) => unreachable!(),
5205         }
5206         assert!(!had_errors);
5207     }
5208 
5209     #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal()5210     fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5211         let (cow, had_errors) =
5212             WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5213         match cow {
5214             Cow::Borrowed(_) => unreachable!(),
5215             Cow::Owned(s) => {
5216                 assert_eq!(
5217                     s,
5218                     "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5219                 );
5220             }
5221         }
5222         assert!(!had_errors);
5223     }
5224 
5225     #[test]
test_decode_valid_windows_1257_to_cow_with_bom_removal()5226     fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
5227         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
5228         match cow {
5229             Cow::Borrowed(_) => unreachable!(),
5230             Cow::Owned(s) => {
5231                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5232             }
5233         }
5234         assert!(!had_errors);
5235     }
5236 
5237     #[test]
test_decode_invalid_windows_1257_to_cow_with_bom_removal()5238     fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
5239         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
5240         match cow {
5241             Cow::Borrowed(_) => unreachable!(),
5242             Cow::Owned(s) => {
5243                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5244             }
5245         }
5246         assert!(had_errors);
5247     }
5248 
5249     #[test]
test_decode_ascii_only_windows_1257_to_cow_with_bom_removal()5250     fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
5251         let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
5252         match cow {
5253             Cow::Borrowed(s) => {
5254                 assert_eq!(s, "abc");
5255             }
5256             Cow::Owned(_) => unreachable!(),
5257         }
5258         assert!(!had_errors);
5259     }
5260 
5261     #[test]
test_decode_bomful_valid_utf8_to_cow_without_bom_handling()5262     fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
5263         let (cow, had_errors) =
5264             UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5265         match cow {
5266             Cow::Borrowed(s) => {
5267                 assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5268             }
5269             Cow::Owned(_) => unreachable!(),
5270         }
5271         assert!(!had_errors);
5272     }
5273 
5274     #[test]
test_decode_bomful_invalid_utf8_to_cow_without_bom_handling()5275     fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
5276         let (cow, had_errors) =
5277             UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5278         match cow {
5279             Cow::Borrowed(_) => unreachable!(),
5280             Cow::Owned(s) => {
5281                 assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
5282             }
5283         }
5284         assert!(had_errors);
5285     }
5286 
5287     #[test]
test_decode_valid_windows_1257_to_cow_without_bom_handling()5288     fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
5289         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
5290         match cow {
5291             Cow::Borrowed(_) => unreachable!(),
5292             Cow::Owned(s) => {
5293                 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5294             }
5295         }
5296         assert!(!had_errors);
5297     }
5298 
5299     #[test]
test_decode_invalid_windows_1257_to_cow_without_bom_handling()5300     fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
5301         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
5302         match cow {
5303             Cow::Borrowed(_) => unreachable!(),
5304             Cow::Owned(s) => {
5305                 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5306             }
5307         }
5308         assert!(had_errors);
5309     }
5310 
5311     #[test]
test_decode_ascii_only_windows_1257_to_cow_without_bom_handling()5312     fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
5313         let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
5314         match cow {
5315             Cow::Borrowed(s) => {
5316                 assert_eq!(s, "abc");
5317             }
5318             Cow::Owned(_) => unreachable!(),
5319         }
5320         assert!(!had_errors);
5321     }
5322 
5323     #[test]
test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement()5324     fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5325         match UTF_8.decode_without_bom_handling_and_without_replacement(
5326             b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
5327         ) {
5328             Some(cow) => match cow {
5329                 Cow::Borrowed(s) => {
5330                     assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5331                 }
5332                 Cow::Owned(_) => unreachable!(),
5333             },
5334             None => unreachable!(),
5335         }
5336     }
5337 
5338     #[test]
test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement()5339     fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5340         assert!(UTF_8
5341             .decode_without_bom_handling_and_without_replacement(
5342                 b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4"
5343             )
5344             .is_none());
5345     }
5346 
5347     #[test]
test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement()5348     fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5349         match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
5350             Some(cow) => match cow {
5351                 Cow::Borrowed(_) => unreachable!(),
5352                 Cow::Owned(s) => {
5353                     assert_eq!(s, "abc\u{20AC}\u{00E4}");
5354                 }
5355             },
5356             None => unreachable!(),
5357         }
5358     }
5359 
5360     #[test]
test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement()5361     fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5362         assert!(WINDOWS_1257
5363             .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4")
5364             .is_none());
5365     }
5366 
5367     #[test]
test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement()5368     fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5369         match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
5370             Some(cow) => match cow {
5371                 Cow::Borrowed(s) => {
5372                     assert_eq!(s, "abc");
5373                 }
5374                 Cow::Owned(_) => unreachable!(),
5375             },
5376             None => unreachable!(),
5377         }
5378     }
5379 
5380     #[test]
test_encode_ascii_only_windows_1257_to_cow()5381     fn test_encode_ascii_only_windows_1257_to_cow() {
5382         let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
5383         match cow {
5384             Cow::Borrowed(s) => {
5385                 assert_eq!(s, b"abc");
5386             }
5387             Cow::Owned(_) => unreachable!(),
5388         }
5389         assert_eq!(encoding, WINDOWS_1257);
5390         assert!(!had_errors);
5391     }
5392 
5393     #[test]
test_encode_valid_windows_1257_to_cow()5394     fn test_encode_valid_windows_1257_to_cow() {
5395         let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
5396         match cow {
5397             Cow::Borrowed(_) => unreachable!(),
5398             Cow::Owned(s) => {
5399                 assert_eq!(s, b"abc\x80\xE4");
5400             }
5401         }
5402         assert_eq!(encoding, WINDOWS_1257);
5403         assert!(!had_errors);
5404     }
5405 
5406     #[test]
test_utf16_space_with_one_bom_byte()5407     fn test_utf16_space_with_one_bom_byte() {
5408         let mut decoder = UTF_16LE.new_decoder();
5409         let mut dst = [0u16; 12];
5410         {
5411             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5412             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5413             assert_eq!(result, CoderResult::InputEmpty);
5414         }
5415         {
5416             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5417             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5418             assert_eq!(result, CoderResult::InputEmpty);
5419         }
5420     }
5421 
5422     #[test]
test_utf8_space_with_one_bom_byte()5423     fn test_utf8_space_with_one_bom_byte() {
5424         let mut decoder = UTF_8.new_decoder();
5425         let mut dst = [0u16; 12];
5426         {
5427             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5428             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5429             assert_eq!(result, CoderResult::InputEmpty);
5430         }
5431         {
5432             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5433             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5434             assert_eq!(result, CoderResult::InputEmpty);
5435         }
5436     }
5437 
5438     #[test]
test_utf16_space_with_two_bom_bytes()5439     fn test_utf16_space_with_two_bom_bytes() {
5440         let mut decoder = UTF_16LE.new_decoder();
5441         let mut dst = [0u16; 12];
5442         {
5443             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5444             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5445             assert_eq!(result, CoderResult::InputEmpty);
5446         }
5447         {
5448             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5449             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5450             assert_eq!(result, CoderResult::InputEmpty);
5451         }
5452         {
5453             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5454             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5455             assert_eq!(result, CoderResult::InputEmpty);
5456         }
5457     }
5458 
5459     #[test]
test_utf8_space_with_two_bom_bytes()5460     fn test_utf8_space_with_two_bom_bytes() {
5461         let mut decoder = UTF_8.new_decoder();
5462         let mut dst = [0u16; 12];
5463         {
5464             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5465             let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5466             assert_eq!(result, CoderResult::InputEmpty);
5467         }
5468         {
5469             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5470             let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5471             assert_eq!(result, CoderResult::InputEmpty);
5472         }
5473         {
5474             let needed = decoder.max_utf16_buffer_length(1).unwrap();
5475             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5476             assert_eq!(result, CoderResult::InputEmpty);
5477         }
5478     }
5479 
5480     #[test]
test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call()5481     fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
5482         let mut decoder = UTF_16LE.new_decoder();
5483         let mut dst = [0u16; 12];
5484         {
5485             let needed = decoder.max_utf16_buffer_length(2).unwrap();
5486             let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
5487             assert_eq!(result, CoderResult::InputEmpty);
5488         }
5489     }
5490 
5491     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8()5492     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
5493         let mut dst = [0u8; 8];
5494         let mut encoder = ISO_2022_JP.new_encoder();
5495         {
5496             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], false);
5497             assert_eq!(result, CoderResult::InputEmpty);
5498         }
5499         {
5500             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], true);
5501             assert_eq!(result, CoderResult::InputEmpty);
5502         }
5503     }
5504 
5505     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf8()5506     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
5507         let mut dst = [0u8; 16];
5508         let mut encoder = ISO_2022_JP.new_encoder();
5509         {
5510             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
5511             assert_eq!(result, CoderResult::InputEmpty);
5512         }
5513         {
5514             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
5515             assert_eq!(result, CoderResult::InputEmpty);
5516         }
5517         {
5518             let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
5519             assert_eq!(result, CoderResult::OutputFull);
5520         }
5521     }
5522 
5523     #[test]
test_buffer_end_iso_2022_jp_from_utf8()5524     fn test_buffer_end_iso_2022_jp_from_utf8() {
5525         let mut dst = [0u8; 18];
5526         {
5527             let mut encoder = ISO_2022_JP.new_encoder();
5528             let (result, _, _, _) =
5529                 encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], false);
5530             assert_eq!(result, CoderResult::InputEmpty);
5531         }
5532         {
5533             let mut encoder = ISO_2022_JP.new_encoder();
5534             let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], true);
5535             assert_eq!(result, CoderResult::OutputFull);
5536         }
5537         {
5538             let mut encoder = ISO_2022_JP.new_encoder();
5539             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], false);
5540             assert_eq!(result, CoderResult::InputEmpty);
5541         }
5542         {
5543             let mut encoder = ISO_2022_JP.new_encoder();
5544             let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], true);
5545             assert_eq!(result, CoderResult::InputEmpty);
5546         }
5547     }
5548 
5549     #[test]
test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16()5550     fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
5551         let mut dst = [0u8; 8];
5552         let mut encoder = ISO_2022_JP.new_encoder();
5553         {
5554             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], false);
5555             assert_eq!(result, CoderResult::InputEmpty);
5556         }
5557         {
5558             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], true);
5559             assert_eq!(result, CoderResult::InputEmpty);
5560         }
5561     }
5562 
5563     #[test]
test_too_short_buffer_with_iso_2022_jp_roman_from_utf16()5564     fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
5565         let mut dst = [0u8; 16];
5566         let mut encoder = ISO_2022_JP.new_encoder();
5567         {
5568             let (result, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
5569             assert_eq!(result, CoderResult::InputEmpty);
5570         }
5571         {
5572             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], false);
5573             assert_eq!(result, CoderResult::InputEmpty);
5574         }
5575         {
5576             let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], true);
5577             assert_eq!(result, CoderResult::OutputFull);
5578         }
5579     }
5580 
5581     #[test]
test_buffer_end_iso_2022_jp_from_utf16()5582     fn test_buffer_end_iso_2022_jp_from_utf16() {
5583         let mut dst = [0u8; 18];
5584         {
5585             let mut encoder = ISO_2022_JP.new_encoder();
5586             let (result, _, _, _) =
5587                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], false);
5588             assert_eq!(result, CoderResult::InputEmpty);
5589         }
5590         {
5591             let mut encoder = ISO_2022_JP.new_encoder();
5592             let (result, _, _, _) =
5593                 encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], true);
5594             assert_eq!(result, CoderResult::OutputFull);
5595         }
5596         {
5597             let mut encoder = ISO_2022_JP.new_encoder();
5598             let (result, _, _, _) =
5599                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], false);
5600             assert_eq!(result, CoderResult::InputEmpty);
5601         }
5602         {
5603             let mut encoder = ISO_2022_JP.new_encoder();
5604             let (result, _, _, _) =
5605                 encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], true);
5606             assert_eq!(result, CoderResult::InputEmpty);
5607         }
5608     }
5609 
5610     #[test]
test_buffer_end_utf16be()5611     fn test_buffer_end_utf16be() {
5612         let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
5613         let mut dest = [0u8; 4];
5614 
5615         assert_eq!(
5616             decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, false),
5617             (CoderResult::InputEmpty, 2, 0, false)
5618         );
5619 
5620         let _ = decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, true);
5621     }
5622 
5623     #[test]
test_hash()5624     fn test_hash() {
5625         let mut encodings = ::std::collections::HashSet::new();
5626         encodings.insert(UTF_8);
5627         encodings.insert(ISO_2022_JP);
5628         assert!(encodings.contains(UTF_8));
5629         assert!(encodings.contains(ISO_2022_JP));
5630         assert!(!encodings.contains(WINDOWS_1252));
5631         encodings.remove(ISO_2022_JP);
5632         assert!(!encodings.contains(ISO_2022_JP));
5633     }
5634 
5635     #[test]
test_iso_2022_jp_ncr_extra_from_utf16()5636     fn test_iso_2022_jp_ncr_extra_from_utf16() {
5637         let mut dst = [0u8; 17];
5638         {
5639             let mut encoder = ISO_2022_JP.new_encoder();
5640             let (result, _, _, _) =
5641                 encoder.encode_from_utf16(&[0x3041u16, 0xFFFFu16], &mut dst[..], true);
5642             assert_eq!(result, CoderResult::OutputFull);
5643         }
5644     }
5645 
5646     #[test]
test_iso_2022_jp_ncr_extra_from_utf8()5647     fn test_iso_2022_jp_ncr_extra_from_utf8() {
5648         let mut dst = [0u8; 17];
5649         {
5650             let mut encoder = ISO_2022_JP.new_encoder();
5651             let (result, _, _, _) =
5652                 encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
5653             assert_eq!(result, CoderResult::OutputFull);
5654         }
5655     }
5656 
5657     #[test]
test_max_length_with_bom_to_utf8()5658     fn test_max_length_with_bom_to_utf8() {
5659         let mut output = [0u8; 20];
5660         let mut decoder = REPLACEMENT.new_decoder();
5661         let input = b"\xEF\xBB\xBFA";
5662         {
5663             let needed = decoder
5664                 .max_utf8_buffer_length_without_replacement(input.len())
5665                 .unwrap();
5666             let (result, read, written) =
5667                 decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);
5668             assert_eq!(result, DecoderResult::InputEmpty);
5669             assert_eq!(read, input.len());
5670             assert_eq!(written, 1);
5671             assert_eq!(output[0], 0x41);
5672         }
5673     }
5674 
5675     #[cfg(feature = "serde")]
5676     #[test]
test_serde()5677     fn test_serde() {
5678         let demo = Demo {
5679             num: 42,
5680             name: "foo".into(),
5681             enc: UTF_8,
5682         };
5683 
5684         let serialized = serde_json::to_string(&demo).unwrap();
5685 
5686         let deserialized: Demo = serde_json::from_str(&serialized).unwrap();
5687         assert_eq!(deserialized, demo);
5688 
5689         let bincoded = bincode::serialize(&demo).unwrap();
5690         let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap();
5691         assert_eq!(debincoded, demo);
5692     }
5693 
5694     #[test]
test_is_single_byte()5695     fn test_is_single_byte() {
5696         assert!(!BIG5.is_single_byte());
5697         assert!(!EUC_JP.is_single_byte());
5698         assert!(!EUC_KR.is_single_byte());
5699         assert!(!GB18030.is_single_byte());
5700         assert!(!GBK.is_single_byte());
5701         assert!(!REPLACEMENT.is_single_byte());
5702         assert!(!SHIFT_JIS.is_single_byte());
5703         assert!(!UTF_8.is_single_byte());
5704         assert!(!UTF_16BE.is_single_byte());
5705         assert!(!UTF_16LE.is_single_byte());
5706         assert!(!ISO_2022_JP.is_single_byte());
5707 
5708         assert!(IBM866.is_single_byte());
5709         assert!(ISO_8859_2.is_single_byte());
5710         assert!(ISO_8859_3.is_single_byte());
5711         assert!(ISO_8859_4.is_single_byte());
5712         assert!(ISO_8859_5.is_single_byte());
5713         assert!(ISO_8859_6.is_single_byte());
5714         assert!(ISO_8859_7.is_single_byte());
5715         assert!(ISO_8859_8.is_single_byte());
5716         assert!(ISO_8859_10.is_single_byte());
5717         assert!(ISO_8859_13.is_single_byte());
5718         assert!(ISO_8859_14.is_single_byte());
5719         assert!(ISO_8859_15.is_single_byte());
5720         assert!(ISO_8859_16.is_single_byte());
5721         assert!(ISO_8859_8_I.is_single_byte());
5722         assert!(KOI8_R.is_single_byte());
5723         assert!(KOI8_U.is_single_byte());
5724         assert!(MACINTOSH.is_single_byte());
5725         assert!(WINDOWS_874.is_single_byte());
5726         assert!(WINDOWS_1250.is_single_byte());
5727         assert!(WINDOWS_1251.is_single_byte());
5728         assert!(WINDOWS_1252.is_single_byte());
5729         assert!(WINDOWS_1253.is_single_byte());
5730         assert!(WINDOWS_1254.is_single_byte());
5731         assert!(WINDOWS_1255.is_single_byte());
5732         assert!(WINDOWS_1256.is_single_byte());
5733         assert!(WINDOWS_1257.is_single_byte());
5734         assert!(WINDOWS_1258.is_single_byte());
5735         assert!(X_MAC_CYRILLIC.is_single_byte());
5736         assert!(X_USER_DEFINED.is_single_byte());
5737     }
5738 
5739     #[test]
test_latin1_byte_compatible_up_to()5740     fn test_latin1_byte_compatible_up_to() {
5741         let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";
5742         assert_eq!(
5743             BIG5.new_decoder_without_bom_handling()
5744                 .latin1_byte_compatible_up_to(buffer)
5745                 .unwrap(),
5746             1
5747         );
5748         assert_eq!(
5749             EUC_JP
5750                 .new_decoder_without_bom_handling()
5751                 .latin1_byte_compatible_up_to(buffer)
5752                 .unwrap(),
5753             1
5754         );
5755         assert_eq!(
5756             EUC_KR
5757                 .new_decoder_without_bom_handling()
5758                 .latin1_byte_compatible_up_to(buffer)
5759                 .unwrap(),
5760             1
5761         );
5762         assert_eq!(
5763             GB18030
5764                 .new_decoder_without_bom_handling()
5765                 .latin1_byte_compatible_up_to(buffer)
5766                 .unwrap(),
5767             1
5768         );
5769         assert_eq!(
5770             GBK.new_decoder_without_bom_handling()
5771                 .latin1_byte_compatible_up_to(buffer)
5772                 .unwrap(),
5773             1
5774         );
5775         assert!(REPLACEMENT
5776             .new_decoder_without_bom_handling()
5777             .latin1_byte_compatible_up_to(buffer)
5778             .is_none());
5779         assert_eq!(
5780             SHIFT_JIS
5781                 .new_decoder_without_bom_handling()
5782                 .latin1_byte_compatible_up_to(buffer)
5783                 .unwrap(),
5784             1
5785         );
5786         assert_eq!(
5787             UTF_8
5788                 .new_decoder_without_bom_handling()
5789                 .latin1_byte_compatible_up_to(buffer)
5790                 .unwrap(),
5791             1
5792         );
5793         assert!(UTF_16BE
5794             .new_decoder_without_bom_handling()
5795             .latin1_byte_compatible_up_to(buffer)
5796             .is_none());
5797         assert!(UTF_16LE
5798             .new_decoder_without_bom_handling()
5799             .latin1_byte_compatible_up_to(buffer)
5800             .is_none());
5801         assert_eq!(
5802             ISO_2022_JP
5803                 .new_decoder_without_bom_handling()
5804                 .latin1_byte_compatible_up_to(buffer)
5805                 .unwrap(),
5806             1
5807         );
5808 
5809         assert_eq!(
5810             IBM866
5811                 .new_decoder_without_bom_handling()
5812                 .latin1_byte_compatible_up_to(buffer)
5813                 .unwrap(),
5814             1
5815         );
5816         assert_eq!(
5817             ISO_8859_2
5818                 .new_decoder_without_bom_handling()
5819                 .latin1_byte_compatible_up_to(buffer)
5820                 .unwrap(),
5821             2
5822         );
5823         assert_eq!(
5824             ISO_8859_3
5825                 .new_decoder_without_bom_handling()
5826                 .latin1_byte_compatible_up_to(buffer)
5827                 .unwrap(),
5828             2
5829         );
5830         assert_eq!(
5831             ISO_8859_4
5832                 .new_decoder_without_bom_handling()
5833                 .latin1_byte_compatible_up_to(buffer)
5834                 .unwrap(),
5835             2
5836         );
5837         assert_eq!(
5838             ISO_8859_5
5839                 .new_decoder_without_bom_handling()
5840                 .latin1_byte_compatible_up_to(buffer)
5841                 .unwrap(),
5842             2
5843         );
5844         assert_eq!(
5845             ISO_8859_6
5846                 .new_decoder_without_bom_handling()
5847                 .latin1_byte_compatible_up_to(buffer)
5848                 .unwrap(),
5849             2
5850         );
5851         assert_eq!(
5852             ISO_8859_7
5853                 .new_decoder_without_bom_handling()
5854                 .latin1_byte_compatible_up_to(buffer)
5855                 .unwrap(),
5856             2
5857         );
5858         assert_eq!(
5859             ISO_8859_8
5860                 .new_decoder_without_bom_handling()
5861                 .latin1_byte_compatible_up_to(buffer)
5862                 .unwrap(),
5863             3
5864         );
5865         assert_eq!(
5866             ISO_8859_10
5867                 .new_decoder_without_bom_handling()
5868                 .latin1_byte_compatible_up_to(buffer)
5869                 .unwrap(),
5870             2
5871         );
5872         assert_eq!(
5873             ISO_8859_13
5874                 .new_decoder_without_bom_handling()
5875                 .latin1_byte_compatible_up_to(buffer)
5876                 .unwrap(),
5877             4
5878         );
5879         assert_eq!(
5880             ISO_8859_14
5881                 .new_decoder_without_bom_handling()
5882                 .latin1_byte_compatible_up_to(buffer)
5883                 .unwrap(),
5884             4
5885         );
5886         assert_eq!(
5887             ISO_8859_15
5888                 .new_decoder_without_bom_handling()
5889                 .latin1_byte_compatible_up_to(buffer)
5890                 .unwrap(),
5891             6
5892         );
5893         assert_eq!(
5894             ISO_8859_16
5895                 .new_decoder_without_bom_handling()
5896                 .latin1_byte_compatible_up_to(buffer)
5897                 .unwrap(),
5898             4
5899         );
5900         assert_eq!(
5901             ISO_8859_8_I
5902                 .new_decoder_without_bom_handling()
5903                 .latin1_byte_compatible_up_to(buffer)
5904                 .unwrap(),
5905             3
5906         );
5907         assert_eq!(
5908             KOI8_R
5909                 .new_decoder_without_bom_handling()
5910                 .latin1_byte_compatible_up_to(buffer)
5911                 .unwrap(),
5912             1
5913         );
5914         assert_eq!(
5915             KOI8_U
5916                 .new_decoder_without_bom_handling()
5917                 .latin1_byte_compatible_up_to(buffer)
5918                 .unwrap(),
5919             1
5920         );
5921         assert_eq!(
5922             MACINTOSH
5923                 .new_decoder_without_bom_handling()
5924                 .latin1_byte_compatible_up_to(buffer)
5925                 .unwrap(),
5926             1
5927         );
5928         assert_eq!(
5929             WINDOWS_874
5930                 .new_decoder_without_bom_handling()
5931                 .latin1_byte_compatible_up_to(buffer)
5932                 .unwrap(),
5933             2
5934         );
5935         assert_eq!(
5936             WINDOWS_1250
5937                 .new_decoder_without_bom_handling()
5938                 .latin1_byte_compatible_up_to(buffer)
5939                 .unwrap(),
5940             4
5941         );
5942         assert_eq!(
5943             WINDOWS_1251
5944                 .new_decoder_without_bom_handling()
5945                 .latin1_byte_compatible_up_to(buffer)
5946                 .unwrap(),
5947             1
5948         );
5949         assert_eq!(
5950             WINDOWS_1252
5951                 .new_decoder_without_bom_handling()
5952                 .latin1_byte_compatible_up_to(buffer)
5953                 .unwrap(),
5954             5
5955         );
5956         assert_eq!(
5957             WINDOWS_1253
5958                 .new_decoder_without_bom_handling()
5959                 .latin1_byte_compatible_up_to(buffer)
5960                 .unwrap(),
5961             3
5962         );
5963         assert_eq!(
5964             WINDOWS_1254
5965                 .new_decoder_without_bom_handling()
5966                 .latin1_byte_compatible_up_to(buffer)
5967                 .unwrap(),
5968             4
5969         );
5970         assert_eq!(
5971             WINDOWS_1255
5972                 .new_decoder_without_bom_handling()
5973                 .latin1_byte_compatible_up_to(buffer)
5974                 .unwrap(),
5975             3
5976         );
5977         assert_eq!(
5978             WINDOWS_1256
5979                 .new_decoder_without_bom_handling()
5980                 .latin1_byte_compatible_up_to(buffer)
5981                 .unwrap(),
5982             1
5983         );
5984         assert_eq!(
5985             WINDOWS_1257
5986                 .new_decoder_without_bom_handling()
5987                 .latin1_byte_compatible_up_to(buffer)
5988                 .unwrap(),
5989             4
5990         );
5991         assert_eq!(
5992             WINDOWS_1258
5993                 .new_decoder_without_bom_handling()
5994                 .latin1_byte_compatible_up_to(buffer)
5995                 .unwrap(),
5996             4
5997         );
5998         assert_eq!(
5999             X_MAC_CYRILLIC
6000                 .new_decoder_without_bom_handling()
6001                 .latin1_byte_compatible_up_to(buffer)
6002                 .unwrap(),
6003             1
6004         );
6005         assert_eq!(
6006             X_USER_DEFINED
6007                 .new_decoder_without_bom_handling()
6008                 .latin1_byte_compatible_up_to(buffer)
6009                 .unwrap(),
6010             1
6011         );
6012 
6013         assert!(UTF_8
6014             .new_decoder()
6015             .latin1_byte_compatible_up_to(buffer)
6016             .is_none());
6017 
6018         let mut decoder = UTF_8.new_decoder();
6019         let mut output = [0u16; 4];
6020         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6021         assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());
6022         let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
6023         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));
6024         let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6025         assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
6026     }
6027 }
6028