1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 #![cfg_attr(
11 feature = "cargo-clippy",
12 allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14
15 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17 //! Gecko-oriented means that converting to and from UTF-16 is supported in
18 //! addition to converting to and from UTF-8, that the performance and
19 //! streamability goals are browser-oriented, and that FFI-friendliness is a
20 //! goal.
21 //!
22 //! Additionally, the `mem` module provides functions that are useful for
23 //! applications that need to be able to deal with legacy in-memory
24 //! representations of Unicode.
25 //!
26 //! For expectation setting, please be sure to read the sections
27 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29 //!
30 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31 //! design and internals of the crate.
32 //!
33 //! # Availability
34 //!
35 //! The code is available under the
36 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38 //! See the
39 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40 //! file for details.
41 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43 //!
44 //! # Integration with `std::io`
45 //!
46 //! This crate doesn't implement traits from `std::io`. However, the case of
47 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50 //!
51 //! # Examples
52 //!
53 //! Example programs:
54 //!
55 //! * [Rust](https://github.com/hsivonen/recode_rs)
56 //! * [C](https://github.com/hsivonen/recode_c)
57 //! * [C++](https://github.com/hsivonen/recode_cpp)
58 //!
59 //! Decode using the non-streaming API:
60 //!
61 //! ```
62 //! use encoding_rs::*;
63 //!
64 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
65 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
66 //!
67 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
68 //! assert_eq!(&cow[..], expectation);
69 //! assert_eq!(encoding_used, SHIFT_JIS);
70 //! assert!(!had_errors);
71 //! ```
72 //!
73 //! Decode using the streaming API with minimal `unsafe`:
74 //!
75 //! ```
76 //! use encoding_rs::*;
77 //!
78 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
79 //!
80 //! // Use an array of byte slices to demonstrate content arriving piece by
81 //! // piece from the network.
82 //! let bytes: [&'static [u8]; 4] = [b"\x83",
83 //! b"n\x83\x8D\x81",
84 //! b"[\x81E\x83\x8F\x81[\x83",
85 //! b"\x8B\x83h"];
86 //!
87 //! // Very short output buffer to demonstrate the output buffer getting full.
88 //! // Normally, you'd use something like `[0u8; 2048]`.
89 //! let mut buffer_bytes = [0u8; 8];
90 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
91 //!
92 //! // How many bytes in the buffer currently hold significant data.
93 //! let mut bytes_in_buffer = 0usize;
94 //!
95 //! // Collect the output to a string for demonstration purposes.
96 //! let mut output = String::new();
97 //!
98 //! // The `Decoder`
99 //! let mut decoder = SHIFT_JIS.new_decoder();
100 //!
101 //! // Track whether we see errors.
102 //! let mut total_had_errors = false;
103 //!
104 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
105 //! // use of a fixed-size buffer; normally when the output of an incremental
106 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
107 //! // avoid the intermediate buffer).
108 //! for input in &bytes[..] {
109 //! // The number of bytes already read from current `input` in total.
110 //! let mut total_read_from_current_input = 0usize;
111 //!
112 //! loop {
113 //! let (result, read, written, had_errors) =
114 //! decoder.decode_to_str(&input[total_read_from_current_input..],
115 //! &mut buffer[bytes_in_buffer..],
116 //! false);
117 //! total_read_from_current_input += read;
118 //! bytes_in_buffer += written;
119 //! total_had_errors |= had_errors;
120 //! match result {
121 //! CoderResult::InputEmpty => {
122 //! // We have consumed the current input buffer. Break out of
123 //! // the inner loop to get the next input buffer from the
124 //! // outer loop.
125 //! break;
126 //! },
127 //! CoderResult::OutputFull => {
128 //! // Write the current buffer out and consider the buffer
129 //! // empty.
130 //! output.push_str(&buffer[..bytes_in_buffer]);
131 //! bytes_in_buffer = 0usize;
132 //! continue;
133 //! }
134 //! }
135 //! }
136 //! }
137 //!
138 //! // Process EOF
139 //! loop {
140 //! let (result, _, written, had_errors) =
141 //! decoder.decode_to_str(b"",
142 //! &mut buffer[bytes_in_buffer..],
143 //! true);
144 //! bytes_in_buffer += written;
145 //! total_had_errors |= had_errors;
146 //! // Write the current buffer out and consider the buffer empty.
147 //! // Need to do this here for both `match` arms, because we exit the
148 //! // loop on `CoderResult::InputEmpty`.
149 //! output.push_str(&buffer[..bytes_in_buffer]);
150 //! bytes_in_buffer = 0usize;
151 //! match result {
152 //! CoderResult::InputEmpty => {
153 //! // Done!
154 //! break;
155 //! },
156 //! CoderResult::OutputFull => {
157 //! continue;
158 //! }
159 //! }
160 //! }
161 //!
162 //! assert_eq!(&output[..], expectation);
163 //! assert!(!total_had_errors);
164 //! ```
165 //!
166 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
167 //!
168 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
169 //! __so this crate does not provide encoders for those encodings__!
170 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
171 //! so you get a UTF-8 encoder if you request an encoder for them.
172 //!
173 //! Additionally, the Encoding Standard factors BOM handling into wrapper
174 //! algorithms so that BOM handling isn't part of the definition of the
175 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
176 //! Standard define BOM handling or lack thereof as part of the encoding
177 //! scheme.
178 //!
179 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
180 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
181 //! the Unicode Standard.
182 //!
183 //! When used with the `_with_bom_removal` entry points, the UTF-8
184 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
185 //! Standard.
186 //!
187 //! This crate does not provide a mode that matches the UTF-16 _encoding
188 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
189 //! the entry points without `_bom_` qualifiers is the closest match,
190 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
191 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
192 //! Unicode Standard.
193 //!
194 //! The UTF-32 family of Unicode encoding schemes is not supported
195 //! by this crate. The Encoding Standard doesn't define any UTF-32
196 //! family encodings, since they aren't necessary for consuming Web
197 //! content.
198 //!
199 //! ## ISO-8859-1
200 //!
201 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
202 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
203 //! byte value to the same Unicode scalar value is not available via
204 //! `Encoding` in this crate.
205 //!
206 //! However, the functions whose name starts with `convert` and contains
207 //! `latin1` in the `mem` module support such conversions, which are known as
208 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
209 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
210 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
211 //!
212 //! ## Web / Browser Focus
213 //!
214 //! Both in terms of scope and performance, the focus is on the Web. For scope,
215 //! this means that encoding_rs implements the Encoding Standard fully and
216 //! doesn't implement encodings that are not specified in the Encoding
217 //! Standard. For performance, this means that decoding performance is
218 //! important as well as performance for encoding into UTF-8 or encoding the
219 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
220 //! be encoded into legacy encodings in only two places in the Web platform: in
221 //! the query part of URLs, in which case it's a matter of relatively rare
222 //! error handling, and in form submission, in which case the user action and
223 //! networking tend to hide the performance of the encoder.
224 //!
225 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
226 //! encodings enables smaller code size thanks to the encoder side using the
227 //! decode-optimized data tables without having encode-optimized data tables at
228 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
229 //! multiplication operations.
230 //!
231 //! Additionally, performance is a non-goal for the ASCII-incompatible
232 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
233 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
234 //! of implementation.
235 //!
236 //! Despite the browser focus, the hope is that non-browser applications
237 //! that wish to consume Web content or submit Web forms in a Web-compatible
238 //! way will find encoding_rs useful. While encoding_rs does not try to match
239 //! Windows behavior, many of the encodings are close enough to legacy
240 //! encodings implemented by Windows that applications that need to consume
241 //! data in legacy Windows encodings may find encoding_rs useful. The
242 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
243 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
244 //!
245 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
246 //! to the encodings defined in the Encoding Standard. The
247 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
248 //! UTF-7 decoding for email purposes.
249 //!
250 //! # Preparing Text for the Encoders
251 //!
252 //! Normalizing text into Unicode Normalization Form C prior to encoding text
253 //! into a legacy encoding minimizes unmappable characters. Text can be
254 //! normalized to Unicode Normalization Form C using the
255 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
256 //!
257 //! The exception is windows-1258, which after normalizing to Unicode
258 //! Normalization Form C requires tone marks to be decomposed in order to
259 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
260 //! using the [`detone`](https://crates.io/crates/detone) crate.
261 //!
262 //! # Streaming & Non-Streaming; Rust & C/C++
263 //!
264 //! The API in Rust has two modes of operation: streaming and non-streaming.
265 //! The streaming API is the foundation of the implementation and should be
266 //! used when processing data that arrives piecemeal from an i/o stream. The
267 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
268 //! to C callers. The non-streaming part of the API is for Rust callers only and
269 //! is smart about borrowing instead of copying when possible. When
270 //! streamability is not needed, the non-streaming API should be preferred in
271 //! order to avoid copying data when a borrow suffices.
272 //!
273 //! There is no analogous C API exposed via FFI, mainly because C doesn't have
274 //! standard types for growable byte buffers and Unicode strings that know
275 //! their length.
276 //!
277 //! The C API (header file generated at `target/include/encoding_rs.h` when
278 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
279 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
280 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
281 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
282 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
283 //! as part of Mozilla [bug 1261841][4].
284 //!
285 //! The `Encoding` type is common to both the streaming and non-streaming
286 //! modes. In the streaming mode, decoding operations are performed with a
287 //! `Decoder` and encoding operations with an `Encoder` object obtained via
288 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
289 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
290 //! and `Encoder` objects are not used at all.
291 //!
292 //! [1]: https://github.com/hsivonen/encoding_c
293 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
294 //! [3]: https://github.com/Microsoft/GSL/
295 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
296 //!
297 //! # Memory management
298 //!
299 //! The streaming mode never performs heap allocations (even the methods
300 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
301 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
302 //! is, the streaming mode uses caller-allocated buffers exclusively.
303 //!
304 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
305 //! perform heap allocations but only to allocate the backing buffer of the
306 //! `Vec<u8>` or the `String`.
307 //!
308 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
309 //! `Drop` cleanup.
310 //!
311 //! # Buffer reading and writing behavior
312 //!
313 //! Based on experience gained with the `java.nio.charset` encoding converter
314 //! API and with the Gecko uconv encoding converter API, the buffer reading
315 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
316 //! fully drained but output buffers are not always fully filled.
317 //!
318 //! When reading from an input buffer, encoding_rs always consumes all input
319 //! up to the next error or to the end of the buffer. In particular, when
320 //! decoding, even if the input buffer ends in the middle of a byte sequence
321 //! for a character, the decoder consumes all input. This has the benefit that
322 //! the caller of the API can always fill the next buffer from the start from
323 //! whatever source the bytes come from and never has to first copy the last
324 //! bytes of the previous buffer to the start of the next buffer. However, when
325 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
326 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
327 //! boundaries falling in the middle of a surrogate pair result in both
328 //! surrogates being treated individually as unpaired surrogates.
329 //!
330 //! Additionally, decoders guarantee that they can be fed even one byte at a
331 //! time and encoders guarantee that they can be fed even one code point at a
332 //! time. This has the benefit of not placing restrictions on the size of
333 //! chunks the content arrives in, e.g. from the network.
334 //!
335 //! When writing into an output buffer, encoding_rs makes sure that the code
336 //! unit sequence for a character is never split across output buffer
337 //! boundaries. This may result in wasted space at the end of an output buffer,
338 //! but the advantages are that the output side of both decoders and encoders
339 //! is greatly simplified compared to designs that attempt to fill output
340 //! buffers exactly even when that entails splitting a code unit sequence and
341 //! when encoding_rs methods return to the caller, the output produced thus
342 //! far is always valid taken as whole. (In the case of encoding to ISO-2022-JP,
343 //! the output needs to be considered as a whole, because the latest output
344 //! buffer might not be valid taken alone if the transition away
345 //! from the ASCII state occurred in an earlier output buffer. However, since
346 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
347 //! state as being in error despite the encoder generating a transition to the
348 //! ASCII state at the end, the claim about the partial output taken as a whole
349 //! being valid is true even for ISO-2022-JP.)
350 //!
351 //! # Error Reporting
352 //!
353 //! Based on experience gained with the `java.nio.charset` encoding converter
354 //! API and with the Gecko uconv encoding converter API, the error reporting
355 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
356 //! that leave it up to the caller to extract the erroneous bytes from the
357 //! input stream if the caller wishes to do so but encoder errors provide the
358 //! code point associated with the error without requiring the caller to
359 //! extract it from the input on its own.
360 //!
361 //! On the encoder side, an error is always triggered by the most recently
362 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
363 //! caller. Also, it's very typical for the caller to wish to do something with
364 //! this data: generate a numeric escape for the character. Additionally, the
365 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
366 //! certain cases, so requiring the caller to extract the character from the
367 //! input buffer would require the caller to handle ISO-2022-JP details.
368 //! Furthermore, requiring the caller to extract the character from the input
369 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
370 //! the job of an encoding conversion library.
371 //!
372 //! On the decoder side, errors are triggered in more complex ways. For
373 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
374 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
375 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
376 //! be the ones most recently pushed to the decoder and the error might not even
377 //! be in the current buffer.
378 //!
379 //! Some encoding conversion APIs address the problem by not acknowledging
380 //! trailing bytes of an input buffer as consumed if it's still possible for
381 //! future bytes to cause the trailing bytes to be in error. This way, error
382 //! reporting can always refer to the most recently pushed buffer. This has the
383 //! problem that the caller of the API has to copy the unconsumed trailing
384 //! bytes to the start of the next buffer before being able to fill the rest
385 //! of the next buffer. This is annoying, error-prone and inefficient.
386 //!
387 //! A possible solution would be making the decoder remember recently consumed
388 //! bytes in order to be able to include a copy of the erroneous bytes when
389 //! reporting an error. This has two problems: First, callers are rarely
390 //! interested in the erroneous bytes, so attempts to identify them are most
391 //! often just overhead anyway. Second, the rare applications that are
392 //! interested typically care about the location of the error in the input
393 //! stream.
394 //!
395 //! To keep the API convenient for common uses and the overhead low while making
396 //! it possible to develop applications, such as HTML validators, that care
397 //! about which bytes were in error, encoding_rs reports the length of the
398 //! erroneous sequence and the number of bytes consumed after the erroneous
399 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
400 //! this makes it possible for callers that care about the erroneous bytes to
401 //! locate them.
402 //!
403 //! # No Convenience API for Custom Replacements
404 //!
405 //! The Web Platform and, therefore, the Encoding Standard supports only one
406 //! error recovery mode for decoders and only one error recovery mode for
407 //! encoders. The supported error recovery mode for decoders is emitting the
408 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
409 //! encoders is emitting an HTML decimal numeric character reference for
410 //! unmappable characters.
411 //!
412 //! Since encoding_rs is Web-focused, these are the only error recovery modes
413 //! for which convenient support is provided. Moreover, on the decoder side,
414 //! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
415 //! on error (other than treating errors as fatal). In particular, simply
416 //! ignoring errors is a
417 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
418 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
419 //! callers to ignore errors.
420 //!
421 //! On the encoder side, there are plausible alternatives for HTML decimal
422 //! numeric character references. For example, when outputting CSS, CSS-style
423 //! escapes would seem to make sense. However, instead of facilitating the
424 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
425 //! position that you shouldn't generate output in encodings other than UTF-8,
426 //! except where backward compatibility with interacting with the legacy Web
427 //! requires it. The legacy Web requires it only when parsing the query strings
428 //! of URLs and when submitting forms, and those two both use HTML decimal
429 //! numeric character references.
430 //!
431 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
432 //! numeric character references easy, it does make them _possible_.
433 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
434 //! for unmappable characters, is implemented on top of
435 //! `encode_from_utf8_without_replacement()`. Applications that really, really
436 //! want other replacement schemes for unmappable characters can likewise
437 //! implement them on top of `encode_from_utf8_without_replacement()`.
438 //!
439 //! # No Extensibility by Design
440 //!
441 //! The set of encodings supported by encoding_rs is not extensible by design.
442 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
443 //! rather than `trait`s. encoding_rs takes the design position that all future
444 //! text interchange should be done using UTF-8, which can represent all of
445 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
446 //! Standard and encoding_rs that can represent all of Unicode and that has
447 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
448 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
449 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
450 //! other than being able to consume legacy content.
451 //!
452 //! Considering that UTF-8 can represent all of Unicode and is already supported
453 //! by all Web browsers, introducing a new encoding wouldn't add to the
454 //! expressiveness but would add to compatibility problems. In that sense,
455 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
456 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
457 //! the Web Platform. On the other hand, the set of legacy encodings that must
458 //! be supported for a Web browser to be able to be successful is not going to
459 //! expand. Empirically, the set of encodings specified in the Encoding Standard
460 //! is already sufficient and the set of legacy encodings won't grow
461 //! retroactively.
462 //!
463 //! Since extensibility doesn't make sense considering the Web focus of
464 //! encoding_rs and adding encodings to Web clients would be actively harmful,
465 //! it makes sense to make the set of encodings that encoding_rs supports
466 //! non-extensible and to take the (admittedly small) benefits arising from
467 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
468 //! of time, which enables stack allocation thereof.
469 //!
470 //! This does have downsides for applications that might want to put encoding_rs
471 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
472 //! needed for Web uses. The needs of such applications should not complicate
473 //! encoding_rs itself, though. It is up to those applications to provide a
474 //! framework that delegates the operations with encodings that encoding_rs
475 //! supports to encoding_rs and operations with other encodings to something
476 //! else (as opposed to encoding_rs itself providing an extensibility
477 //! framework).
478 //!
479 //! # Panics
480 //!
481 //! Methods in encoding_rs can panic if the API is used against the requirements
482 //! stated in the documentation, if a state that's supposed to be impossible
483 //! is reached due to an internal bug or on integer overflow. When used
484 //! according to documentation with buffer sizes that stay below integer
485 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
486 //!
487 //! Panics arising from API misuse aren't documented beyond this on individual
488 //! methods.
489 //!
490 //! # At-Risk Parts of the API
491 //!
492 //! The foreseeable source of partially backward-incompatible API change is the
493 //! way the instances of `Encoding` are made available.
494 //!
495 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
496 //! initialized with `static`s of type `&'static Encoding`, the non-reference
497 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
498 //!
499 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
500 //! unique when the constant is used in different crates, the reference-typed
501 //! `static`s for the encoding instances will be changed from `static` to
502 //! `const` and the non-reference-typed `_INIT` instances will be removed.
503 //!
504 //! # Mapping Spec Concepts onto the API
505 //!
506 //! <table>
507 //! <thead>
508 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
509 //! </thead>
510 //! <tbody>
511 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&'static Encoding</code></td><td><code>&'static Encoding</code></td></tr>
512 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
513 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
514 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
515 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// … (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// …</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
522 //! </tbody>
523 //! </table>
524 //!
525 //! # Compatibility with the rust-encoding API
526 //!
527 //! The crate
528 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
529 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
530 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
531 //!
532 //! # Mapping rust-encoding concepts to encoding_rs concepts
533 //!
534 //! The following table provides a mapping from rust-encoding constructs to
535 //! encoding_rs ones.
536 //!
537 //! <table>
538 //! <thead>
539 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
540 //! </thead>
541 //! <tbody>
542 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&'static encoding_rs::Encoding</code></td></tr>
543 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
544 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
545 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
546 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
547 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
548 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
549 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
550 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
551 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
552 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
553 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
554 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
555 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
556 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
557 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
558 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
559 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
560 //! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
561 //! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
562 //! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
563 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
564 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
565 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
566 //! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
567 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
568 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
569 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
570 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
571 //! </tbody>
572 //! </table>
573 //!
574 //! # Relationship with Windows Code Pages
575 //!
576 //! Despite the Web and browser focus, the encodings defined by the Encoding
577 //! Standard and implemented by this crate may be useful for decoding legacy
578 //! data that uses Windows code pages. The following table names the
579 //! encodings
580 //! that have a closely related Windows code page, the number of the closest
581 //! code page, a column indicating whether Windows maps unassigned code points
582 //! to the Unicode Private Use Area instead of U+FFFD and a remark number
583 //! indicating remarks in the list after the table.
584 //!
585 //! <table>
586 //! <thead>
587 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
588 //! </thead>
589 //! <tbody>
590 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
591 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
592 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
593 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
594 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
595 //! <tr><td>windows-874</td><td>874</td><td>•</td><td></td></tr>
596 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
597 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
598 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
599 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
600 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
601 //! <tr><td>windows-1253</td><td>1253</td><td>•</td><td></td></tr>
602 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
603 //! <tr><td>windows-1255</td><td>1255</td><td>•</td><td></td></tr>
604 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
605 //! <tr><td>windows-1257</td><td>1257</td><td>•</td><td></td></tr>
606 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
607 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
608 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
609 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
610 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
611 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
612 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
613 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
614 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
615 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
616 //! <tr><td>ISO-8859-6</td><td>28596</td><td>•</td><td></td></tr>
617 //! <tr><td>ISO-8859-7</td><td>28597</td><td>•</td><td>3</td></tr>
618 //! <tr><td>ISO-8859-8</td><td>28598</td><td>•</td><td>4</td></tr>
619 //! <tr><td>ISO-8859-13</td><td>28603</td><td>•</td><td></td></tr>
620 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
621 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
622 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
623 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
624 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
625 //! </tbody>
626 //! </table>
627 //!
628 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
629 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
630 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
631 //! which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
632 //! decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
633 //! LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
634 //! instead of U+2019 RIGHT SINGLE QUOTATION MARK.
635 //! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to PUA instead
636 //! of LRM and RLM, respectively.
637 //! 5. Remarks from the previous item apply.
638 //!
639 //! The differences between this crate and Windows in the case of multibyte encodings
640 //! are not yet fully documented here. The lack of remarks above should not be taken
641 //! as indication of lack of differences.
642 //!
643 //! # Notable Differences from IANA Naming
644 //!
645 //! In some cases, the Encoding Standard specifies the popular unextended encoding
646 //! name where in IANA terms one of the other labels would be more precise considering
647 //! the extensions that the Encoding Standard has unified into the encoding.
648 //!
649 //! <table>
650 //! <thead>
651 //! <tr><th>Encoding</th><th>IANA</th></tr>
652 //! </thead>
653 //! <tbody>
654 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
655 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
656 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
657 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
658 //! </tbody>
659 //! </table>
660 //!
661 //! In other cases where the Encoding Standard unifies unextended and extended
662 //! variants of an encoding, the encoding gets the name of the extended
663 //! variant.
664 //!
665 //! <table>
666 //! <thead>
667 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
668 //! </thead>
669 //! <tbody>
670 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
671 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
672 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
673 //! </tbody>
674 //! </table>
675 //!
676 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
677 //! for discussion about the UTF-16 family.
678
679 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
680
681 #[macro_use]
682 extern crate cfg_if;
683
684 #[cfg(all(
685 feature = "simd-accel",
686 any(
687 target_feature = "sse2",
688 all(target_endian = "little", target_arch = "aarch64"),
689 all(target_endian = "little", target_feature = "neon")
690 )
691 ))]
692 #[macro_use(shuffle)]
693 extern crate packed_simd;
694
695 #[cfg(feature = "serde")]
696 extern crate serde;
697
698 #[cfg(all(test, feature = "serde"))]
699 extern crate bincode;
700 #[cfg(all(test, feature = "serde"))]
701 #[macro_use]
702 extern crate serde_derive;
703 #[cfg(all(test, feature = "serde"))]
704 extern crate serde_json;
705
706 #[macro_use]
707 mod macros;
708
709 #[cfg(all(
710 feature = "simd-accel",
711 any(
712 target_feature = "sse2",
713 all(target_endian = "little", target_arch = "aarch64"),
714 all(target_endian = "little", target_feature = "neon")
715 )
716 ))]
717 mod simd_funcs;
718
719 #[cfg(test)]
720 mod testing;
721
722 mod big5;
723 mod euc_jp;
724 mod euc_kr;
725 mod gb18030;
726 mod iso_2022_jp;
727 mod replacement;
728 mod shift_jis;
729 mod single_byte;
730 mod utf_16;
731 mod utf_8;
732 mod x_user_defined;
733
734 mod ascii;
735 mod data;
736 mod handles;
737 mod variant;
738
739 pub mod mem;
740
741 use ascii::ascii_valid_up_to;
742 use ascii::iso_2022_jp_ascii_valid_up_to;
743 use utf_8::utf8_valid_up_to;
744 use variant::*;
745
746 use std::borrow::Cow;
747 use std::cmp::Ordering;
748 use std::hash::Hash;
749 use std::hash::Hasher;
750
751 #[cfg(feature = "serde")]
752 use serde::de::Visitor;
753 #[cfg(feature = "serde")]
754 use serde::{Deserialize, Deserializer, Serialize, Serializer};
755
756 /// This has to be the max length of an NCR instead of max
757 /// minus one, because we can't rely on getting the minus
758 /// one from the space reserved for the current unmappable,
759 /// because the ISO-2022-JP encoder can fill up that space
760 /// with a state transition escape.
761 const NCR_EXTRA: usize = 10; // "&#" + "1114111" (U+10FFFF in decimal) + ";"
762
763 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
764 // Instead, please regenerate using generate-encoding-data.py
765
766 const LONGEST_LABEL_LENGTH: usize = 19; // "cseucpkdfmtjapanese".len()
767
768 /// The initializer for the [Big5](static.BIG5.html) encoding.
769 ///
770 /// For use only for taking the address of this form when
771 /// Rust prohibits the use of the non-`_INIT` form directly,
772 /// such as in initializers of other `static`s. If in doubt,
773 /// use the corresponding non-`_INIT` reference-typed `static`.
774 ///
775 /// This part of the public API will go away if Rust changes
776 /// to make the referent of `pub const FOO: &'static Encoding`
777 /// unique cross-crate or if Rust starts allowing static arrays
778 /// to be initialized with `pub static FOO: &'static Encoding`
779 /// items.
780 pub static BIG5_INIT: Encoding = Encoding {
781 name: "Big5",
782 variant: VariantEncoding::Big5,
783 };
784
785 /// The Big5 encoding.
786 ///
787 /// This is Big5 with HKSCS with mappings to more recent Unicode assignments
788 /// instead of the Private Use Area code points that have been used historically.
789 /// It is believed to be able to decode existing Web content in a way that makes
790 /// sense.
791 ///
792 /// To avoid form submissions generating data that Web servers don't understand,
793 /// the encoder doesn't use the HKSCS byte sequences that precede the unextended
794 /// Big5 in the lexical order.
795 ///
796 /// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
797 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
798 ///
799 /// This encoding is designed to be suited for decoding the Windows code page 950
800 /// and its HKSCS-patched "951" variant such that the text makes sense, given
801 /// assignments that Unicode has made after those encodings used Private Use
802 /// Area characters.
803 ///
804 /// This will change from `static` to `const` if Rust changes
805 /// to make the referent of `pub const FOO: &'static Encoding`
806 /// unique cross-crate, so don't take the address of this
807 /// `static`.
808 pub static BIG5: &'static Encoding = &BIG5_INIT;
809
810 /// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
811 ///
812 /// For use only for taking the address of this form when
813 /// Rust prohibits the use of the non-`_INIT` form directly,
814 /// such as in initializers of other `static`s. If in doubt,
815 /// use the corresponding non-`_INIT` reference-typed `static`.
816 ///
817 /// This part of the public API will go away if Rust changes
818 /// to make the referent of `pub const FOO: &'static Encoding`
819 /// unique cross-crate or if Rust starts allowing static arrays
820 /// to be initialized with `pub static FOO: &'static Encoding`
821 /// items.
822 pub static EUC_JP_INIT: Encoding = Encoding {
823 name: "EUC-JP",
824 variant: VariantEncoding::EucJp,
825 };
826
827 /// The EUC-JP encoding.
828 ///
829 /// This is the legacy Unix encoding for Japanese.
830 ///
831 /// For compatibility with Web servers that don't expect three-byte sequences
832 /// in form submissions, the encoder doesn't generate three-byte sequences.
833 /// That is, the JIS X 0212 support is decode-only.
834 ///
835 /// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
836 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
837 ///
838 /// This encoding roughly matches the Windows code page 20932. There are error
839 /// handling differences and a handful of 2-byte sequences that decode differently.
840 /// Additionally, Windows doesn't support 3-byte sequences.
841 ///
842 /// This will change from `static` to `const` if Rust changes
843 /// to make the referent of `pub const FOO: &'static Encoding`
844 /// unique cross-crate, so don't take the address of this
845 /// `static`.
846 pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
847
848 /// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
849 ///
850 /// For use only for taking the address of this form when
851 /// Rust prohibits the use of the non-`_INIT` form directly,
852 /// such as in initializers of other `static`s. If in doubt,
853 /// use the corresponding non-`_INIT` reference-typed `static`.
854 ///
855 /// This part of the public API will go away if Rust changes
856 /// to make the referent of `pub const FOO: &'static Encoding`
857 /// unique cross-crate or if Rust starts allowing static arrays
858 /// to be initialized with `pub static FOO: &'static Encoding`
859 /// items.
860 pub static EUC_KR_INIT: Encoding = Encoding {
861 name: "EUC-KR",
862 variant: VariantEncoding::EucKr,
863 };
864
865 /// The EUC-KR encoding.
866 ///
867 /// This is the Korean encoding for Windows. It extends the Unix legacy encoding
868 /// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
869 /// Classic), with all the characters from the Hangul Syllables block of Unicode.
870 ///
871 /// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
872 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
873 ///
874 /// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
875 /// to U+0080 and some byte sequences that are errors per the Encoding Standard to
876 /// the question mark or the Private Use Area.
877 ///
878 /// This will change from `static` to `const` if Rust changes
879 /// to make the referent of `pub const FOO: &'static Encoding`
880 /// unique cross-crate, so don't take the address of this
881 /// `static`.
882 pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
883
884 /// The initializer for the [GBK](static.GBK.html) encoding.
885 ///
886 /// For use only for taking the address of this form when
887 /// Rust prohibits the use of the non-`_INIT` form directly,
888 /// such as in initializers of other `static`s. If in doubt,
889 /// use the corresponding non-`_INIT` reference-typed `static`.
890 ///
891 /// This part of the public API will go away if Rust changes
892 /// to make the referent of `pub const FOO: &'static Encoding`
893 /// unique cross-crate or if Rust starts allowing static arrays
894 /// to be initialized with `pub static FOO: &'static Encoding`
895 /// items.
896 pub static GBK_INIT: Encoding = Encoding {
897 name: "GBK",
898 variant: VariantEncoding::Gbk,
899 };
900
901 /// The GBK encoding.
902 ///
903 /// The decoder for this encoding is the same as the decoder for gb18030.
904 /// The encoder side of this encoding is GBK with Windows code page 936 euro
905 /// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
906 /// Unicode block as well as a handful of ideographs from the CJK Unified
907 /// Ideographs Extension A and CJK Compatibility Ideographs blocks.
908 ///
909 /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, the GBK encoder wasn't
910 /// unified with the gb18030 encoder in the Encoding Standard out of concern
911 /// that servers that expect GBK form submissions might not be able to handle
912 /// the four-byte sequences.
913 ///
914 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
915 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
916 ///
917 /// The encoder of this encoding roughly matches the Windows code page 936.
918 /// The decoder side is a superset.
919 ///
920 /// This will change from `static` to `const` if Rust changes
921 /// to make the referent of `pub const FOO: &'static Encoding`
922 /// unique cross-crate, so don't take the address of this
923 /// `static`.
924 pub static GBK: &'static Encoding = &GBK_INIT;
925
926 /// The initializer for the [IBM866](static.IBM866.html) encoding.
927 ///
928 /// For use only for taking the address of this form when
929 /// Rust prohibits the use of the non-`_INIT` form directly,
930 /// such as in initializers of other `static`s. If in doubt,
931 /// use the corresponding non-`_INIT` reference-typed `static`.
932 ///
933 /// This part of the public API will go away if Rust changes
934 /// to make the referent of `pub const FOO: &'static Encoding`
935 /// unique cross-crate or if Rust starts allowing static arrays
936 /// to be initialized with `pub static FOO: &'static Encoding`
937 /// items.
938 pub static IBM866_INIT: Encoding = Encoding {
939 name: "IBM866",
940 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16), // NOTE(review): presumably (first code point, table index, length) of a contiguous run of code points; confirm against VariantEncoding::SingleByte
941 };
942
943 /// The IBM866 encoding.
944 ///
945 /// This is the most notable one of the DOS Cyrillic code pages. It has the same
946 /// box drawing characters as code page 437, so it can be used for decoding
947 /// DOS-era ASCII + box drawing data.
948 ///
949 /// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
950 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
951 ///
952 /// This encoding matches the Windows code page 866.
953 ///
954 /// This will change from `static` to `const` if Rust changes
955 /// to make the referent of `pub const FOO: &'static Encoding`
956 /// unique cross-crate, so don't take the address of this
957 /// `static`.
958 pub static IBM866: &'static Encoding = &IBM866_INIT;
959
960 /// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
961 ///
962 /// For use only for taking the address of this form when
963 /// Rust prohibits the use of the non-`_INIT` form directly,
964 /// such as in initializers of other `static`s. If in doubt,
965 /// use the corresponding non-`_INIT` reference-typed `static`.
966 ///
967 /// This part of the public API will go away if Rust changes
968 /// to make the referent of `pub const FOO: &'static Encoding`
969 /// unique cross-crate or if Rust starts allowing static arrays
970 /// to be initialized with `pub static FOO: &'static Encoding`
971 /// items.
972 pub static ISO_2022_JP_INIT: Encoding = Encoding {
973 name: "ISO-2022-JP",
974 variant: VariantEncoding::Iso2022Jp,
975 };
976
977 /// The ISO-2022-JP encoding.
978 ///
979 /// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
980 /// byte range to encode non-Basic Latin characters. It's the only encoding
981 /// supported by this crate whose encoder is stateful.
982 ///
983 /// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
984 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
985 ///
986 /// This encoding roughly matches the Windows code page 50220. Notably, Windows
987 /// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
988 /// error handling.
989 ///
990 /// This will change from `static` to `const` if Rust changes
991 /// to make the referent of `pub const FOO: &'static Encoding`
992 /// unique cross-crate, so don't take the address of this
993 /// `static`.
994 pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
995
996 /// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
997 ///
998 /// For use only for taking the address of this form when
999 /// Rust prohibits the use of the non-`_INIT` form directly,
1000 /// such as in initializers of other `static`s. If in doubt,
1001 /// use the corresponding non-`_INIT` reference-typed `static`.
1002 ///
1003 /// This part of the public API will go away if Rust changes
1004 /// to make the referent of `pub const FOO: &'static Encoding`
1005 /// unique cross-crate or if Rust starts allowing static arrays
1006 /// to be initialized with `pub static FOO: &'static Encoding`
1007 /// items.
1008 pub static ISO_8859_10_INIT: Encoding = Encoding {
1009 name: "ISO-8859-10",
1010 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6), // NOTE(review): presumably (first code point, table index, length) of a contiguous run of code points; confirm against VariantEncoding::SingleByte
1011 };
1012
1013 /// The ISO-8859-10 encoding.
1014 ///
1015 /// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1016 /// is also known as Latin 6.
1017 ///
1018 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1019 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1020 ///
1021 /// The Windows code page number for this encoding is 28600, but kernel32.dll
1022 /// does not support this encoding.
1023 ///
1024 /// This will change from `static` to `const` if Rust changes
1025 /// to make the referent of `pub const FOO: &'static Encoding`
1026 /// unique cross-crate, so don't take the address of this
1027 /// `static`.
1028 pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1029
1030 /// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1031 ///
1032 /// For use only for taking the address of this form when
1033 /// Rust prohibits the use of the non-`_INIT` form directly,
1034 /// such as in initializers of other `static`s. If in doubt,
1035 /// use the corresponding non-`_INIT` reference-typed `static`.
1036 ///
1037 /// This part of the public API will go away if Rust changes
1038 /// to make the referent of `pub const FOO: &'static Encoding`
1039 /// unique cross-crate or if Rust starts allowing static arrays
1040 /// to be initialized with `pub static FOO: &'static Encoding`
1041 /// items.
1042 pub static ISO_8859_13_INIT: Encoding = Encoding {
1043 name: "ISO-8859-13",
1044 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1), // NOTE(review): presumably (first code point, table index, length) of a contiguous run of code points; confirm against VariantEncoding::SingleByte
1045 };
1046
1047 /// The ISO-8859-13 encoding.
1048 ///
1049 /// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1050 /// is also known as Latin 7.
1051 ///
1052 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1053 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1054 ///
1055 /// This encoding matches the Windows code page 28603, except Windows decodes
1056 /// unassigned code points to the Private Use Area of Unicode.
1057 ///
1058 /// This will change from `static` to `const` if Rust changes
1059 /// to make the referent of `pub const FOO: &'static Encoding`
1060 /// unique cross-crate, so don't take the address of this
1061 /// `static`.
1062 pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1063
1064 /// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1065 ///
1066 /// For use only for taking the address of this form when
1067 /// Rust prohibits the use of the non-`_INIT` form directly,
1068 /// such as in initializers of other `static`s. If in doubt,
1069 /// use the corresponding non-`_INIT` reference-typed `static`.
1070 ///
1071 /// This part of the public API will go away if Rust changes
1072 /// to make the referent of `pub const FOO: &'static Encoding`
1073 /// unique cross-crate or if Rust starts allowing static arrays
1074 /// to be initialized with `pub static FOO: &'static Encoding`
1075 /// items.
1076 pub static ISO_8859_14_INIT: Encoding = Encoding {
1077 name: "ISO-8859-14",
1078 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17), // NOTE(review): presumably (first code point, table index, length) of a contiguous run of code points; confirm against VariantEncoding::SingleByte
1079 };
1080
1081 /// The ISO-8859-14 encoding.
1082 ///
1083 /// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1084 /// is also known as Latin 8.
1085 ///
1086 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1087 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1088 ///
1089 /// The Windows code page number for this encoding is 28604, but kernel32.dll
1090 /// does not support this encoding.
1091 ///
1092 /// This will change from `static` to `const` if Rust changes
1093 /// to make the referent of `pub const FOO: &'static Encoding`
1094 /// unique cross-crate, so don't take the address of this
1095 /// `static`.
1096 pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1097
1098 /// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1099 ///
1100 /// For use only for taking the address of this form when
1101 /// Rust prohibits the use of the non-`_INIT` form directly,
1102 /// such as in initializers of other `static`s. If in doubt,
1103 /// use the corresponding non-`_INIT` reference-typed `static`.
1104 ///
1105 /// This part of the public API will go away if Rust changes
1106 /// to make the referent of `pub const FOO: &'static Encoding`
1107 /// unique cross-crate or if Rust starts allowing static arrays
1108 /// to be initialized with `pub static FOO: &'static Encoding`
1109 /// items.
1110 pub static ISO_8859_15_INIT: Encoding = Encoding {
1111 name: "ISO-8859-15",
1112 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65), // NOTE(review): presumably (first code point, table index, length) of a contiguous run of code points; confirm against VariantEncoding::SingleByte
1113 };
1114
1115 /// The ISO-8859-15 encoding.
1116 ///
1117 /// This is the revised Western European part of the ISO/IEC 8859 encoding
1118 /// family. This encoding is also known as Latin 9.
1119 ///
1120 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1121 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1122 ///
1123 /// This encoding matches the Windows code page 28605.
1124 ///
1125 /// This will change from `static` to `const` if Rust changes
1126 /// to make the referent of `pub const FOO: &'static Encoding`
1127 /// unique cross-crate, so don't take the address of this
1128 /// `static`.
1129 pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1130
1131 /// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1132 ///
1133 /// For use only for taking the address of this form when
1134 /// Rust prohibits the use of the non-`_INIT` form directly,
1135 /// such as in initializers of other `static`s. If in doubt,
1136 /// use the corresponding non-`_INIT` reference-typed `static`.
1137 ///
1138 /// This part of the public API will go away if Rust changes
1139 /// to make the referent of `pub const FOO: &'static Encoding`
1140 /// unique cross-crate or if Rust starts allowing static arrays
1141 /// to be initialized with `pub static FOO: &'static Encoding`
1142 /// items.
1143 pub static ISO_8859_16_INIT: Encoding = Encoding {
1144 name: "ISO-8859-16",
1145 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4), // NOTE(review): presumably (first code point, table index, length) of a contiguous run of code points; confirm against VariantEncoding::SingleByte
1146 };
1147
1148 /// The ISO-8859-16 encoding.
1149 ///
1150 /// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1151 /// family. This encoding is also known as Latin 10.
1152 ///
1153 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1154 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1155 ///
1156 /// The Windows code page number for this encoding is 28606, but kernel32.dll
1157 /// does not support this encoding.
1158 ///
1159 /// This will change from `static` to `const` if Rust changes
1160 /// to make the referent of `pub const FOO: &'static Encoding`
1161 /// unique cross-crate, so don't take the address of this
1162 /// `static`.
1163 pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1164
1165 /// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1166 ///
1167 /// For use only for taking the address of this form when
1168 /// Rust prohibits the use of the non-`_INIT` form directly,
1169 /// such as in initializers of other `static`s. If in doubt,
1170 /// use the corresponding non-`_INIT` reference-typed `static`.
1171 ///
1172 /// This part of the public API will go away if Rust changes
1173 /// to make the referent of `pub const FOO: &'static Encoding`
1174 /// unique cross-crate or if Rust starts allowing static arrays
1175 /// to be initialized with `pub static FOO: &'static Encoding`
1176 /// items.
1177 pub static ISO_8859_2_INIT: Encoding = Encoding {
1178 name: "ISO-8859-2",
1179 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1), // NOTE(review): presumably (first code point, table index, length) of a contiguous run of code points; confirm against VariantEncoding::SingleByte
1180 };
1181
1182 /// The ISO-8859-2 encoding.
1183 ///
1184 /// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1185 ///
1186 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1187 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1188 ///
1189 /// This encoding matches the Windows code page 28592.
1190 ///
1191 /// This will change from `static` to `const` if Rust changes
1192 /// to make the referent of `pub const FOO: &'static Encoding`
1193 /// unique cross-crate, so don't take the address of this
1194 /// `static`.
1195 pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1196
/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_3_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `ISO_8859_3`
    // static points at this value.
    name: "ISO-8859-3",
    // Single-byte variant referencing the `iso_8859_3` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00DF, 95, 4` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
};
1213
1214 /// The ISO-8859-3 encoding.
1215 ///
1216 /// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1217 ///
1218 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1219 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1220 ///
1221 /// This encoding matches the Windows code page 28593.
1222 ///
1223 /// This will change from `static` to `const` if Rust changes
1224 /// to make the referent of `pub const FOO: &'static Encoding`
1225 /// unique cross-crate, so don't take the address of this
1226 /// `static`.
1227 pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1228
/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_4_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `ISO_8859_4`
    // static points at this value.
    name: "ISO-8859-4",
    // Single-byte variant referencing the `iso_8859_4` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00DF, 95, 1` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
};
1245
1246 /// The ISO-8859-4 encoding.
1247 ///
1248 /// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1249 ///
1250 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1251 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1252 ///
1253 /// This encoding matches the Windows code page 28594.
1254 ///
1255 /// This will change from `static` to `const` if Rust changes
1256 /// to make the referent of `pub const FOO: &'static Encoding`
1257 /// unique cross-crate, so don't take the address of this
1258 /// `static`.
1259 pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1260
/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_5_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `ISO_8859_5`
    // static points at this value.
    name: "ISO-8859-5",
    // Single-byte variant referencing the `iso_8859_5` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x040E, 46, 66` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
};
1277
1278 /// The ISO-8859-5 encoding.
1279 ///
1280 /// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1281 ///
1282 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1283 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1284 ///
1285 /// This encoding matches the Windows code page 28595.
1286 ///
1287 /// This will change from `static` to `const` if Rust changes
1288 /// to make the referent of `pub const FOO: &'static Encoding`
1289 /// unique cross-crate, so don't take the address of this
1290 /// `static`.
1291 pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1292
/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_6_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `ISO_8859_6`
    // static points at this value.
    name: "ISO-8859-6",
    // Single-byte variant referencing the `iso_8859_6` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x0621, 65, 26` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
};
1309
1310 /// The ISO-8859-6 encoding.
1311 ///
1312 /// This is the Arabic part of the ISO/IEC 8859 encoding family.
1313 ///
1314 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1315 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1316 ///
1317 /// This encoding matches the Windows code page 28596, except Windows decodes
1318 /// unassigned code points to the Private Use Area of Unicode.
1319 ///
1320 /// This will change from `static` to `const` if Rust changes
1321 /// to make the referent of `pub const FOO: &'static Encoding`
1322 /// unique cross-crate, so don't take the address of this
1323 /// `static`.
1324 pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1325
/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_7_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `ISO_8859_7`
    // static points at this value.
    name: "ISO-8859-7",
    // Single-byte variant referencing the `iso_8859_7` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x03A3, 83, 44` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
};
1342
1343 /// The ISO-8859-7 encoding.
1344 ///
1345 /// This is the Greek part of the ISO/IEC 8859 encoding family.
1346 ///
1347 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1348 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1349 ///
1350 /// This encoding roughly matches the Windows code page 28597. Windows decodes
1351 /// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1352 /// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1353 /// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1354 /// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1355 /// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1356 ///
1357 /// This will change from `static` to `const` if Rust changes
1358 /// to make the referent of `pub const FOO: &'static Encoding`
1359 /// unique cross-crate, so don't take the address of this
1360 /// `static`.
1361 pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1362
/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `ISO_8859_8`
    // static points at this value.
    name: "ISO-8859-8",
    // Single-byte variant referencing the `iso_8859_8` field of
    // `data::SINGLE_BYTE_DATA`; `ISO_8859_8_I_INIT` uses the same field and
    // the same numeric arguments.
    // NOTE(review): the meaning of the trailing `0x05D0, 96, 27` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1379
1380 /// The ISO-8859-8 encoding.
1381 ///
1382 /// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1383 ///
1384 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1385 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1386 ///
1387 /// This encoding roughly matches the Windows code page 28598. Windows decodes
1388 /// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1389 /// Area instead of LRM and RLM. Windows decodes unassigned code points to
1390 /// the private use area.
1391 ///
1392 /// This will change from `static` to `const` if Rust changes
1393 /// to make the referent of `pub const FOO: &'static Encoding`
1394 /// unique cross-crate, so don't take the address of this
1395 /// `static`.
1396 pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1397
/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `ISO_8859_8_I`
    // static points at this value.
    name: "ISO-8859-8-I",
    // Deliberately references the `iso_8859_8` field (there is no separate
    // `-I` field used here) with the same numeric arguments as
    // `ISO_8859_8_INIT`; within this item only `name` differs from it.
    // NOTE(review): the meaning of the trailing `0x05D0, 96, 27` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1414
1415 /// The ISO-8859-8-I encoding.
1416 ///
1417 /// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1418 ///
1419 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1420 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1421 ///
1422 /// This encoding roughly matches the Windows code page 38598. Windows decodes
1423 /// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1424 /// Area instead of LRM and RLM. Windows decodes unassigned code points to
1425 /// the private use area.
1426 ///
1427 /// This will change from `static` to `const` if Rust changes
1428 /// to make the referent of `pub const FOO: &'static Encoding`
1429 /// unique cross-crate, so don't take the address of this
1430 /// `static`.
1431 pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1432
/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_R_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `KOI8_R`
    // static points at this value.
    name: "KOI8-R",
    // Single-byte variant referencing the `koi8_r` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x044E, 64, 1` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
};
1449
1450 /// The KOI8-R encoding.
1451 ///
1452 /// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1453 ///
1454 /// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1455 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1456 ///
1457 /// This encoding matches the Windows code page 20866.
1458 ///
1459 /// This will change from `static` to `const` if Rust changes
1460 /// to make the referent of `pub const FOO: &'static Encoding`
1461 /// unique cross-crate, so don't take the address of this
1462 /// `static`.
1463 pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1464
/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_U_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `KOI8_U`
    // static points at this value.
    name: "KOI8-U",
    // Single-byte variant referencing the `koi8_u` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x044E, 64, 1` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
};
1481
1482 /// The KOI8-U encoding.
1483 ///
1484 /// This is an encoding for Ukrainian adapted from KOI8-R.
1485 ///
1486 /// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1487 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1488 ///
1489 /// This encoding matches the Windows code page 21866.
1490 ///
1491 /// This will change from `static` to `const` if Rust changes
1492 /// to make the referent of `pub const FOO: &'static Encoding`
1493 /// unique cross-crate, so don't take the address of this
1494 /// `static`.
1495 pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1496
/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static SHIFT_JIS_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `SHIFT_JIS`
    // static points at this value.
    name: "Shift_JIS",
    // Payload-free variant: no data reference or numeric arguments are
    // supplied here.
    variant: VariantEncoding::ShiftJis,
};
1513
1514 /// The Shift_JIS encoding.
1515 ///
1516 /// This is the Japanese encoding for Windows.
1517 ///
1518 /// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1519 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1520 ///
1521 /// This encoding matches the Windows code page 932, except Windows decodes some byte
1522 /// sequences that are error per the Encoding Standard to the question mark or the
1523 /// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1524 ///
1525 /// This will change from `static` to `const` if Rust changes
1526 /// to make the referent of `pub const FOO: &'static Encoding`
1527 /// unique cross-crate, so don't take the address of this
1528 /// `static`.
1529 pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1530
/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16BE_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `UTF_16BE`
    // static points at this value.
    name: "UTF-16BE",
    // Payload-free variant: no data reference or numeric arguments are
    // supplied here.
    variant: VariantEncoding::Utf16Be,
};
1547
1548 /// The UTF-16BE encoding.
1549 ///
1550 /// This decode-only encoding uses 16-bit code units due to Unicode originally
1551 /// having been designed as a 16-bit reportoire. In the absence of a byte order
1552 /// mark the big endian byte order is assumed.
1553 ///
1554 /// There is no corresponding encoder in this crate or in the Encoding
1555 /// Standard. The output encoding of this encoding is UTF-8.
1556 ///
1557 /// This encoding matches the Windows code page 1201.
1558 ///
1559 /// This will change from `static` to `const` if Rust changes
1560 /// to make the referent of `pub const FOO: &'static Encoding`
1561 /// unique cross-crate, so don't take the address of this
1562 /// `static`.
1563 pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1564
/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16LE_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `UTF_16LE`
    // static points at this value.
    name: "UTF-16LE",
    // Payload-free variant: no data reference or numeric arguments are
    // supplied here.
    variant: VariantEncoding::Utf16Le,
};
1581
1582 /// The UTF-16LE encoding.
1583 ///
1584 /// This decode-only encoding uses 16-bit code units due to Unicode originally
1585 /// having been designed as a 16-bit reportoire. In the absence of a byte order
1586 /// mark the little endian byte order is assumed.
1587 ///
1588 /// There is no corresponding encoder in this crate or in the Encoding
1589 /// Standard. The output encoding of this encoding is UTF-8.
1590 ///
1591 /// This encoding matches the Windows code page 1200.
1592 ///
1593 /// This will change from `static` to `const` if Rust changes
1594 /// to make the referent of `pub const FOO: &'static Encoding`
1595 /// unique cross-crate, so don't take the address of this
1596 /// `static`.
1597 pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1598
/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_8_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `UTF_8`
    // static points at this value.
    name: "UTF-8",
    // Payload-free variant: no data reference or numeric arguments are
    // supplied here.
    variant: VariantEncoding::Utf8,
};
1615
1616 /// The UTF-8 encoding.
1617 ///
1618 /// This is the encoding that should be used for all new development it can
1619 /// represent all of Unicode.
1620 ///
1621 /// This encoding matches the Windows code page 65001, except Windows differs
1622 /// in the number of errors generated for some erroneous byte sequences.
1623 ///
1624 /// This will change from `static` to `const` if Rust changes
1625 /// to make the referent of `pub const FOO: &'static Encoding`
1626 /// unique cross-crate, so don't take the address of this
1627 /// `static`.
1628 pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1629
/// The initializer for the [gb18030](static.GB18030.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GB18030_INIT: Encoding = Encoding {
    // Name string of this encoding (lower case, unlike the identifier); the
    // reference-typed `GB18030` static points at this value.
    name: "gb18030",
    // Payload-free variant: no data reference or numeric arguments are
    // supplied here.
    variant: VariantEncoding::Gb18030,
};
1646
1647 /// The gb18030 encoding.
1648 ///
1649 /// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
1650 /// maps to U+3000 for compatibility with existing Web content. As a result,
1651 /// this encoding can represent all of Unicode except for the private-use
1652 /// character U+E5E5.
1653 ///
1654 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1655 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1656 ///
1657 /// This encoding matches the Windows code page 54936.
1658 ///
1659 /// This will change from `static` to `const` if Rust changes
1660 /// to make the referent of `pub const FOO: &'static Encoding`
1661 /// unique cross-crate, so don't take the address of this
1662 /// `static`.
1663 pub static GB18030: &'static Encoding = &GB18030_INIT;
1664
/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static MACINTOSH_INIT: Encoding = Encoding {
    // Name string of this encoding (lower case, unlike the identifier); the
    // reference-typed `MACINTOSH` static points at this value.
    name: "macintosh",
    // Single-byte variant referencing the `macintosh` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00CD, 106, 3` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
};
1681
1682 /// The macintosh encoding.
1683 ///
1684 /// This is the MacRoman encoding from Mac OS Classic.
1685 ///
1686 /// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1687 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1688 ///
1689 /// This encoding matches the Windows code page 10000, except Windows decodes
1690 /// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1691 ///
1692 /// This will change from `static` to `const` if Rust changes
1693 /// to make the referent of `pub const FOO: &'static Encoding`
1694 /// unique cross-crate, so don't take the address of this
1695 /// `static`.
1696 pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1697
/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static REPLACEMENT_INIT: Encoding = Encoding {
    // Name string of this encoding (lower case, unlike the identifier); the
    // reference-typed `REPLACEMENT` static points at this value.
    name: "replacement",
    // Payload-free variant: no data reference or numeric arguments are
    // supplied here.
    variant: VariantEncoding::Replacement,
};
1714
1715 /// The replacement encoding.
1716 ///
1717 /// This decode-only encoding decodes all non-zero-length streams to a single
1718 /// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1719 /// ASCII-compatible fallback encoding (typically windows-1252) for some
1720 /// encodings that are no longer supported by the Web Platform and that
1721 /// would be dangerous to treat as ASCII-compatible.
1722 ///
1723 /// There is no corresponding encoder. The output encoding of this encoding
1724 /// is UTF-8.
1725 ///
1726 /// This encoding does not have a Windows code page number.
1727 ///
1728 /// This will change from `static` to `const` if Rust changes
1729 /// to make the referent of `pub const FOO: &'static Encoding`
1730 /// unique cross-crate, so don't take the address of this
1731 /// `static`.
1732 pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1733
/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1250_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `WINDOWS_1250`
    // static points at this value.
    name: "windows-1250",
    // Single-byte variant referencing the `windows_1250` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00DC, 92, 2` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
};
1750
1751 /// The windows-1250 encoding.
1752 ///
1753 /// This is the Central European encoding for Windows.
1754 ///
1755 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1756 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1757 ///
1758 /// This encoding matches the Windows code page 1250.
1759 ///
1760 /// This will change from `static` to `const` if Rust changes
1761 /// to make the referent of `pub const FOO: &'static Encoding`
1762 /// unique cross-crate, so don't take the address of this
1763 /// `static`.
1764 pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1765
/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1251_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `WINDOWS_1251`
    // static points at this value.
    name: "windows-1251",
    // Single-byte variant referencing the `windows_1251` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x0410, 64, 64` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
};
1782
1783 /// The windows-1251 encoding.
1784 ///
1785 /// This is the Cyrillic encoding for Windows.
1786 ///
1787 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1788 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1789 ///
1790 /// This encoding matches the Windows code page 1251.
1791 ///
1792 /// This will change from `static` to `const` if Rust changes
1793 /// to make the referent of `pub const FOO: &'static Encoding`
1794 /// unique cross-crate, so don't take the address of this
1795 /// `static`.
1796 pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1797
/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1252_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `WINDOWS_1252`
    // static points at this value.
    name: "windows-1252",
    // Single-byte variant referencing the `windows_1252` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x00A0, 32, 96` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
};
1814
1815 /// The windows-1252 encoding.
1816 ///
1817 /// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1818 /// which is known as Latin 1.
1819 ///
1820 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1821 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1822 ///
1823 /// This encoding matches the Windows code page 1252.
1824 ///
1825 /// This will change from `static` to `const` if Rust changes
1826 /// to make the referent of `pub const FOO: &'static Encoding`
1827 /// unique cross-crate, so don't take the address of this
1828 /// `static`.
1829 pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1830
/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1253_INIT: Encoding = Encoding {
    // Name string of this encoding; the reference-typed `WINDOWS_1253`
    // static points at this value.
    name: "windows-1253",
    // Single-byte variant referencing the `windows_1253` field of
    // `data::SINGLE_BYTE_DATA`.
    // NOTE(review): the meaning of the trailing `0x03A3, 83, 44` arguments is
    // not visible from this item; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
};
1847
/// The windows-1253 encoding.
///
/// This is the Greek encoding for Windows. It is mostly an extension of
/// ISO-8859-7, but U+0386 is mapped to a different byte.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
///
/// This encoding matches the Windows code page 1253, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1864
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    name: "windows-1254",
    // NOTE(review): the meaning of the three literals after the table
    // reference is not visible here; presumably they describe a contiguous
    // run in the index (first code point, offset, length). Confirm against
    // the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1881
/// The windows-1254 encoding.
///
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1897
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    name: "windows-1255",
    // NOTE(review): the meaning of the three literals after the table
    // reference is not visible here; presumably they describe a contiguous
    // run in the index (first code point, offset, length). Confirm against
    // the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1914
/// The windows-1255 encoding.
///
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1931
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    name: "windows-1256",
    // NOTE(review): the meaning of the three literals after the table
    // reference is not visible here; presumably they describe a contiguous
    // run in the index (first code point, offset, length). Confirm against
    // the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1948
/// The windows-1256 encoding.
///
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1963
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    name: "windows-1257",
    // NOTE(review): the meaning of the three literals after the table
    // reference is not visible here; presumably they describe a contiguous
    // run in the index (first code point, offset, length). Confirm against
    // the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
1980
/// The windows-1257 encoding.
///
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
1996
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    name: "windows-1258",
    // NOTE(review): the meaning of the three literals after the table
    // reference is not visible here; presumably they describe a contiguous
    // run in the index (first code point, offset, length). Confirm against
    // the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2013
/// The windows-1258 encoding.
///
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in the Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2033
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    name: "windows-874",
    // NOTE(review): the meaning of the three literals after the table
    // reference is not visible here; presumably they describe a contiguous
    // run in the index (first code point, offset, length). Confirm against
    // the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2050
/// The windows-874 encoding.
///
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2066
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    name: "x-mac-cyrillic",
    // NOTE(review): the meaning of the three literals after the table
    // reference is not visible here; presumably they describe a contiguous
    // run in the index (first code point, offset, length). Confirm against
    // the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2083
/// The x-mac-cyrillic encoding.
///
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2098
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    name: "x-user-defined",
    // Unlike the `SingleByte` encodings above, this variant carries no
    // index table or other payload.
    variant: VariantEncoding::UserDefined,
};
2115
/// The x-user-defined encoding.
///
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
///
/// Where this reference-typed form cannot be used (e.g. in the initializer
/// of another `static`), see
/// [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2130
// Every label that `Encoding::for_label()` can match, in lower case.
//
// `for_label()` binary-searches this array with a comparator that orders
// entries first by length and then by comparing their bytes from the last
// byte to the first, so the entries must be kept in exactly that order.
// The index of a label here is the index of its encoding in
// `ENCODINGS_IN_LABEL_SORT`.
//
// NOTE(review): this array precedes the `END GENERATED CODE` marker, so it is
// presumably produced by a generator; confirm before editing it by hand.
static LABELS_SORTED: [&'static str; 219] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2352
// The encoding for each label in `LABELS_SORTED`, position by position.
//
// `Encoding::for_label()` takes the index it found in `LABELS_SORTED` and
// uses it to index this array, so the two arrays must keep the same length
// and the same order.
//
// The entries point at the `_INIT` statics because, as the `Encoding` docs
// explain, the reference-typed statics cannot be used to initialize the items
// of a `[&'static Encoding; N]` array.
//
// NOTE(review): this array precedes the `END GENERATED CODE` marker, so it is
// presumably produced by a generator; confirm before editing it by hand.
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2574
2575 // END GENERATED CODE
2576
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    // The name of the encoding; this is what `name()` returns.
    name: &'static str,
    // The encoding-specific data; `is_single_byte()`, for example, delegates
    // to it.
    variant: VariantEncoding,
}
2643
2644 impl Encoding {
2645 /// Implements the
2646 /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2647 /// algorithm.
2648 ///
2649 /// If, after ASCII-lowercasing and removing leading and trailing
2650 /// whitespace, the argument matches a label defined in the Encoding
2651 /// Standard, `Some(&'static Encoding)` representing the corresponding
2652 /// encoding is returned. If there is no match, `None` is returned.
2653 ///
2654 /// This is the right method to use if the action upon the method returning
2655 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2656 /// When the action upon the method returning `None` is not to proceed with
2657 /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2658 /// appropriate.
2659 ///
2660 /// The argument is of type `&[u8]` instead of `&str` to save callers
2661 /// that are extracting the label from a non-UTF-8 protocol the trouble
2662 /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2663 /// on it.)
2664 ///
2665 /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2666 pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2667 let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2668 let mut trimmed_pos = 0usize;
2669 let mut iter = label.into_iter();
2670 // before
2671 loop {
2672 match iter.next() {
2673 None => {
2674 return None;
2675 }
2676 Some(byte) => {
2677 // The characters used in labels are:
2678 // a-z (except q, but excluding it below seems excessive)
2679 // 0-9
2680 // . _ - :
2681 match *byte {
2682 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2683 continue;
2684 }
2685 b'A'...b'Z' => {
2686 trimmed[trimmed_pos] = *byte + 0x20u8;
2687 trimmed_pos = 1usize;
2688 break;
2689 }
2690 b'a'...b'z' | b'0'...b'9' | b'-' | b'_' | b':' | b'.' => {
2691 trimmed[trimmed_pos] = *byte;
2692 trimmed_pos = 1usize;
2693 break;
2694 }
2695 _ => {
2696 return None;
2697 }
2698 }
2699 }
2700 }
2701 }
2702 // inside
2703 loop {
2704 match iter.next() {
2705 None => {
2706 break;
2707 }
2708 Some(byte) => {
2709 match *byte {
2710 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2711 break;
2712 }
2713 b'A'...b'Z' => {
2714 if trimmed_pos == LONGEST_LABEL_LENGTH {
2715 // There's no encoding with a label this long
2716 return None;
2717 }
2718 trimmed[trimmed_pos] = *byte + 0x20u8;
2719 trimmed_pos += 1usize;
2720 continue;
2721 }
2722 b'a'...b'z' | b'0'...b'9' | b'-' | b'_' | b':' | b'.' => {
2723 if trimmed_pos == LONGEST_LABEL_LENGTH {
2724 // There's no encoding with a label this long
2725 return None;
2726 }
2727 trimmed[trimmed_pos] = *byte;
2728 trimmed_pos += 1usize;
2729 continue;
2730 }
2731 _ => {
2732 return None;
2733 }
2734 }
2735 }
2736 }
2737 }
2738 // after
2739 loop {
2740 match iter.next() {
2741 None => {
2742 break;
2743 }
2744 Some(byte) => {
2745 match *byte {
2746 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2747 continue;
2748 }
2749 _ => {
2750 // There's no label with space in the middle
2751 return None;
2752 }
2753 }
2754 }
2755 }
2756 }
2757 let candidate = &trimmed[..trimmed_pos];
2758 match LABELS_SORTED.binary_search_by(|probe| {
2759 let bytes = probe.as_bytes();
2760 let c = bytes.len().cmp(&candidate.len());
2761 if c != Ordering::Equal {
2762 return c;
2763 }
2764 let probe_iter = bytes.iter().rev();
2765 let candidate_iter = candidate.iter().rev();
2766 probe_iter.cmp(candidate_iter)
2767 }) {
2768 Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2769 Err(_) => None,
2770 }
2771 }
2772
2773 /// This method behaves the same as `for_label()`, except when `for_label()`
2774 /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2775 ///
2776 /// This method is useful in scenarios where a fatal error is required
2777 /// upon invalid label, because in those cases the caller typically wishes
2778 /// to treat the labels that map to the replacement encoding as fatal
2779 /// errors, too.
2780 ///
2781 /// It is not OK to use this method when the action upon the method returning
2782 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2783 /// case, the `for_label()` method should be used instead in order to avoid
2784 /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2785 ///
2786 /// Available via the C wrapper.
2787 #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2788 pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2789 match Encoding::for_label(label) {
2790 None => None,
2791 Some(encoding) => {
2792 if encoding == REPLACEMENT {
2793 None
2794 } else {
2795 Some(encoding)
2796 }
2797 }
2798 }
2799 }
2800
2801 /// Performs non-incremental BOM sniffing.
2802 ///
2803 /// The argument must either be a buffer representing the entire input
2804 /// stream (non-streaming case) or a buffer representing at least the first
2805 /// three bytes of the input stream (streaming case).
2806 ///
2807 /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2808 /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2809 /// or UTF-16BE BOM or `None` otherwise.
2810 ///
2811 /// Available via the C wrapper.
2812 #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2813 pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2814 if buffer.starts_with(b"\xEF\xBB\xBF") {
2815 Some((UTF_8, 3))
2816 } else if buffer.starts_with(b"\xFF\xFE") {
2817 Some((UTF_16LE, 2))
2818 } else if buffer.starts_with(b"\xFE\xFF") {
2819 Some((UTF_16BE, 2))
2820 } else {
2821 None
2822 }
2823 }
2824
2825 /// Returns the name of this encoding.
2826 ///
2827 /// This name is appropriate to return as-is from the DOM
2828 /// `document.characterSet` property.
2829 ///
2830 /// Available via the C wrapper.
2831 #[inline]
name(&'static self) -> &'static str2832 pub fn name(&'static self) -> &'static str {
2833 self.name
2834 }
2835
2836 /// Checks whether the _output encoding_ of this encoding can encode every
2837 /// `char`. (Only true if the output encoding is UTF-8.)
2838 ///
2839 /// Available via the C wrapper.
2840 #[inline]
can_encode_everything(&'static self) -> bool2841 pub fn can_encode_everything(&'static self) -> bool {
2842 self.output_encoding() == UTF_8
2843 }
2844
2845 /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2846 /// U+0000...U+007F and vice versa.
2847 ///
2848 /// Available via the C wrapper.
2849 #[inline]
is_ascii_compatible(&'static self) -> bool2850 pub fn is_ascii_compatible(&'static self) -> bool {
2851 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2852 }
2853
2854 /// Checks whether this encoding maps one byte to one Basic Multilingual
2855 /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2856 /// vice versa (for mappable characters).
2857 ///
2858 /// `true` iff this encoding is on the list of [Legacy single-byte
2859 /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2860 /// in the spec or x-user-defined.
2861 ///
2862 /// Available via the C wrapper.
2863 #[inline]
is_single_byte(&'static self) -> bool2864 pub fn is_single_byte(&'static self) -> bool {
2865 self.variant.is_single_byte()
2866 }
2867
2868 /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2869 /// U+0000...U+007F and vice versa.
2870 #[inline]
is_potentially_borrowable(&'static self) -> bool2871 fn is_potentially_borrowable(&'static self) -> bool {
2872 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2873 }
2874
2875 /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2876 /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2877 ///
2878 /// Available via the C wrapper.
2879 #[inline]
output_encoding(&'static self) -> &'static Encoding2880 pub fn output_encoding(&'static self) -> &'static Encoding {
2881 if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2882 UTF_8
2883 } else {
2884 self
2885 }
2886 }
2887
2888 /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2889 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2890 /// entire input is available as a single buffer (i.e. the end of the
2891 /// buffer marks the end of the stream).
2892 ///
2893 /// This method implements the (non-streaming version of) the
2894 /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2895 ///
2896 /// The second item in the returned tuple is the encoding that was actually
2897 /// used (which may differ from this encoding thanks to BOM sniffing).
2898 ///
2899 /// The third item in the returned tuple indicates whether there were
2900 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2901 ///
2902 /// _Note:_ It is wrong to use this when the input buffer represents only
2903 /// a segment of the input instead of the whole input. Use `new_decoder()`
2904 /// when decoding segmented input.
2905 ///
2906 /// This method performs a one or two heap allocations for the backing
2907 /// buffer of the `String` when unable to borrow. (One allocation if not
2908 /// errors and potentially another one in the presence of errors.) The
2909 /// first allocation assumes jemalloc and may not be optimal with
2910 /// allocators that do not use power-of-two buckets. A borrow is performed
2911 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2912 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2913 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2914 /// transitions.
2915 ///
2916 /// # Panics
2917 ///
2918 /// If the size calculation for a heap-allocated backing buffer overflows
2919 /// `usize`.
2920 ///
2921 /// Available to Rust only.
2922 #[inline]
decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool)2923 pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2924 let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2925 Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2926 None => (self, bytes),
2927 };
2928 let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2929 (cow, encoding, had_errors)
2930 }
2931
2932 /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2933 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2934 /// entire input is available as a single buffer (i.e. the end of the
2935 /// buffer marks the end of the stream).
2936 ///
2937 /// When invoked on `UTF_8`, this method implements the (non-streaming
2938 /// version of) the
2939 /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2940 /// concept.
2941 ///
2942 /// The second item in the returned pair indicates whether there were
2943 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2944 ///
2945 /// _Note:_ It is wrong to use this when the input buffer represents only
2946 /// a segment of the input instead of the whole input. Use
2947 /// `new_decoder_with_bom_removal()` when decoding segmented input.
2948 ///
2949 /// This method performs a one or two heap allocations for the backing
2950 /// buffer of the `String` when unable to borrow. (One allocation if not
2951 /// errors and potentially another one in the presence of errors.) The
2952 /// first allocation assumes jemalloc and may not be optimal with
2953 /// allocators that do not use power-of-two buckets. A borrow is performed
2954 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2955 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2956 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2957 /// transitions.
2958 ///
2959 /// # Panics
2960 ///
2961 /// If the size calculation for a heap-allocated backing buffer overflows
2962 /// `usize`.
2963 ///
2964 /// Available to Rust only.
2965 #[inline]
decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)2966 pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2967 let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2968 &bytes[3..]
2969 } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2970 || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2971 {
2972 &bytes[2..]
2973 } else {
2974 bytes
2975 };
2976 self.decode_without_bom_handling(without_bom)
2977 }
2978
2979 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
2980 /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
2981 /// the entire input is available as a single buffer (i.e. the end of the
2982 /// buffer marks the end of the stream).
2983 ///
2984 /// When invoked on `UTF_8`, this method implements the (non-streaming
2985 /// version of) the
2986 /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
2987 /// spec concept.
2988 ///
2989 /// The second item in the returned pair indicates whether there were
2990 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2991 ///
2992 /// _Note:_ It is wrong to use this when the input buffer represents only
2993 /// a segment of the input instead of the whole input. Use
2994 /// `new_decoder_without_bom_handling()` when decoding segmented input.
2995 ///
2996 /// This method performs a one or two heap allocations for the backing
2997 /// buffer of the `String` when unable to borrow. (One allocation if not
2998 /// errors and potentially another one in the presence of errors.) The
2999 /// first allocation assumes jemalloc and may not be optimal with
3000 /// allocators that do not use power-of-two buckets. A borrow is performed
3001 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3002 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3003 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3004 /// transitions.
3005 ///
3006 /// # Panics
3007 ///
3008 /// If the size calculation for a heap-allocated backing buffer overflows
3009 /// `usize`.
3010 ///
3011 /// Available to Rust only.
decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)3012 pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3013 let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3014 let valid_up_to = if self == UTF_8 {
3015 utf8_valid_up_to(bytes)
3016 } else if self == ISO_2022_JP {
3017 iso_2022_jp_ascii_valid_up_to(bytes)
3018 } else {
3019 ascii_valid_up_to(bytes)
3020 };
3021 if valid_up_to == bytes.len() {
3022 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3023 return (Cow::Borrowed(str), false);
3024 }
3025 let decoder = self.new_decoder_without_bom_handling();
3026
3027 let rounded_without_replacement = checked_next_power_of_two(checked_add(
3028 valid_up_to,
3029 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3030 ));
3031 let with_replacement = checked_add(
3032 valid_up_to,
3033 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3034 );
3035 let mut string = String::with_capacity(
3036 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3037 );
3038 unsafe {
3039 let vec = string.as_mut_vec();
3040 vec.set_len(valid_up_to);
3041 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3042 }
3043 (decoder, string, valid_up_to)
3044 } else {
3045 let decoder = self.new_decoder_without_bom_handling();
3046 let rounded_without_replacement = checked_next_power_of_two(
3047 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3048 );
3049 let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3050 let string = String::with_capacity(
3051 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3052 );
3053 (decoder, string, 0)
3054 };
3055
3056 let mut total_had_errors = false;
3057 loop {
3058 let (result, read, had_errors) =
3059 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3060 total_read += read;
3061 total_had_errors |= had_errors;
3062 match result {
3063 CoderResult::InputEmpty => {
3064 debug_assert_eq!(total_read, bytes.len());
3065 return (Cow::Owned(string), total_had_errors);
3066 }
3067 CoderResult::OutputFull => {
3068 // Allocate for the worst case. That is, we should come
3069 // here at most once per invocation of this method.
3070 let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3071 string.reserve(needed.unwrap());
3072 }
3073 }
3074 }
3075 }
3076
3077 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3078 /// _with malformed sequences treated as fatal_ when the entire input is
3079 /// available as a single buffer (i.e. the end of the buffer marks the end
3080 /// of the stream).
3081 ///
3082 /// When invoked on `UTF_8`, this method implements the (non-streaming
3083 /// version of) the
3084 /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3085 /// spec concept.
3086 ///
3087 /// Returns `None` if a malformed sequence was encountered and the result
3088 /// of the decode as `Some(String)` otherwise.
3089 ///
3090 /// _Note:_ It is wrong to use this when the input buffer represents only
3091 /// a segment of the input instead of the whole input. Use
3092 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3093 ///
3094 /// This method performs a single heap allocation for the backing
3095 /// buffer of the `String` when unable to borrow. A borrow is performed if
3096 /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3097 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3098 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3099 /// transitions.
3100 ///
3101 /// # Panics
3102 ///
3103 /// If the size calculation for a heap-allocated backing buffer overflows
3104 /// `usize`.
3105 ///
3106 /// Available to Rust only.
decode_without_bom_handling_and_without_replacement<'a>( &'static self, bytes: &'a [u8], ) -> Option<Cow<'a, str>>3107 pub fn decode_without_bom_handling_and_without_replacement<'a>(
3108 &'static self,
3109 bytes: &'a [u8],
3110 ) -> Option<Cow<'a, str>> {
3111 if self == UTF_8 {
3112 let valid_up_to = utf8_valid_up_to(bytes);
3113 if valid_up_to == bytes.len() {
3114 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3115 return Some(Cow::Borrowed(str));
3116 }
3117 return None;
3118 }
3119 let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3120 let valid_up_to = if self == ISO_2022_JP {
3121 iso_2022_jp_ascii_valid_up_to(bytes)
3122 } else {
3123 ascii_valid_up_to(bytes)
3124 };
3125 if valid_up_to == bytes.len() {
3126 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3127 return Some(Cow::Borrowed(str));
3128 }
3129 let decoder = self.new_decoder_without_bom_handling();
3130 let mut string = String::with_capacity(
3131 checked_add(
3132 valid_up_to,
3133 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3134 )
3135 .unwrap(),
3136 );
3137 unsafe {
3138 let vec = string.as_mut_vec();
3139 vec.set_len(valid_up_to);
3140 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3141 }
3142 (decoder, string, &bytes[valid_up_to..])
3143 } else {
3144 let decoder = self.new_decoder_without_bom_handling();
3145 let string = String::with_capacity(
3146 decoder
3147 .max_utf8_buffer_length_without_replacement(bytes.len())
3148 .unwrap(),
3149 );
3150 (decoder, string, bytes)
3151 };
3152 let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3153 match result {
3154 DecoderResult::InputEmpty => {
3155 debug_assert_eq!(read, input.len());
3156 Some(Cow::Owned(string))
3157 }
3158 DecoderResult::Malformed(_, _) => None,
3159 DecoderResult::OutputFull => unreachable!(),
3160 }
3161 }
3162
3163 /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3164 /// replaced with decimal numeric character references when the entire input
3165 /// is available as a single buffer (i.e. the end of the buffer marks the
3166 /// end of the stream).
3167 ///
3168 /// This method implements the (non-streaming version of) the
3169 /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3170 /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3171 /// spec concept, it is slightly more efficient to use
3172 /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3173 /// method on `UTF_8`.
3174 ///
3175 /// The second item in the returned tuple is the encoding that was actually
3176 /// used (which may differ from this encoding thanks to some encodings
3177 /// having UTF-8 as their output encoding).
3178 ///
3179 /// The third item in the returned tuple indicates whether there were
3180 /// unmappable characters (that were replaced with HTML numeric character
3181 /// references).
3182 ///
3183 /// _Note:_ It is wrong to use this when the input buffer represents only
3184 /// a segment of the input instead of the whole input. Use `new_encoder()`
3185 /// when encoding segmented output.
3186 ///
3187 /// When encoding to UTF-8 or when encoding an ASCII-only input to a
3188 /// ASCII-compatible encoding, this method returns a borrow of the input
3189 /// without a heap allocation. Otherwise, this method performs a single
3190 /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3191 /// unmappable characters and potentially multiple heap allocations if
3192 /// there are. These allocations are tuned for jemalloc and may not be
3193 /// optimal when using a different allocator that doesn't use power-of-two
3194 /// buckets.
3195 ///
3196 /// # Panics
3197 ///
3198 /// If the size calculation for a heap-allocated backing buffer overflows
3199 /// `usize`.
3200 ///
3201 /// Available to Rust only.
encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool)3202 pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3203 let output_encoding = self.output_encoding();
3204 if output_encoding == UTF_8 {
3205 return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3206 }
3207 debug_assert!(output_encoding.is_potentially_borrowable());
3208 let bytes = string.as_bytes();
3209 let valid_up_to = if output_encoding == ISO_2022_JP {
3210 iso_2022_jp_ascii_valid_up_to(bytes)
3211 } else {
3212 ascii_valid_up_to(bytes)
3213 };
3214 if valid_up_to == bytes.len() {
3215 return (Cow::Borrowed(bytes), output_encoding, false);
3216 }
3217 let mut encoder = output_encoding.new_encoder();
3218 let mut vec: Vec<u8> = Vec::with_capacity(
3219 (checked_add(
3220 valid_up_to,
3221 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3222 ))
3223 .unwrap()
3224 .next_power_of_two(),
3225 );
3226 unsafe {
3227 vec.set_len(valid_up_to);
3228 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3229 }
3230 let mut total_read = valid_up_to;
3231 let mut total_had_errors = false;
3232 loop {
3233 let (result, read, had_errors) =
3234 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3235 total_read += read;
3236 total_had_errors |= had_errors;
3237 match result {
3238 CoderResult::InputEmpty => {
3239 debug_assert_eq!(total_read, string.len());
3240 return (Cow::Owned(vec), output_encoding, total_had_errors);
3241 }
3242 CoderResult::OutputFull => {
3243 // reserve_exact wants to know how much more on top of current
3244 // length--not current capacity.
3245 let needed = encoder
3246 .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3247 let rounded = (checked_add(vec.capacity(), needed))
3248 .unwrap()
3249 .next_power_of_two();
3250 let additional = rounded - vec.len();
3251 vec.reserve_exact(additional);
3252 }
3253 }
3254 }
3255 }
3256
new_variant_decoder(&'static self) -> VariantDecoder3257 fn new_variant_decoder(&'static self) -> VariantDecoder {
3258 self.variant.new_variant_decoder()
3259 }
3260
3261 /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3262 ///
3263 /// BOM sniffing may cause the returned decoder to morph into a decoder
3264 /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3265 ///
3266 /// Available via the C wrapper.
3267 #[inline]
new_decoder(&'static self) -> Decoder3268 pub fn new_decoder(&'static self) -> Decoder {
3269 Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3270 }
3271
3272 /// Instantiates a new decoder for this encoding with BOM removal.
3273 ///
3274 /// If the input starts with bytes that are the BOM for this encoding,
3275 /// those bytes are removed. However, the decoder never morphs into a
3276 /// decoder for another encoding: A BOM for another encoding is treated as
3277 /// (potentially malformed) input to the decoding algorithm for this
3278 /// encoding.
3279 ///
3280 /// Available via the C wrapper.
3281 #[inline]
new_decoder_with_bom_removal(&'static self) -> Decoder3282 pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3283 Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3284 }
3285
3286 /// Instantiates a new decoder for this encoding with BOM handling disabled.
3287 ///
3288 /// If the input starts with bytes that look like a BOM, those bytes are
3289 /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3290 /// for another encoding.)
3291 ///
3292 /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3293 /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3294 /// instead of this method to cause the BOM to be removed.
3295 ///
3296 /// Available via the C wrapper.
3297 #[inline]
new_decoder_without_bom_handling(&'static self) -> Decoder3298 pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3299 Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3300 }
3301
3302 /// Instantiates a new encoder for the output encoding of this encoding.
3303 ///
3304 /// Available via the C wrapper.
3305 #[inline]
new_encoder(&'static self) -> Encoder3306 pub fn new_encoder(&'static self) -> Encoder {
3307 let enc = self.output_encoding();
3308 enc.variant.new_encoder(enc)
3309 }
3310
3311 /// Validates UTF-8.
3312 ///
3313 /// Returns the index of the first byte that makes the input malformed as
3314 /// UTF-8 or the length of the slice if the slice is entirely valid.
3315 ///
3316 /// This is currently faster than the corresponding standard library
3317 /// functionality. If this implementation gets upstreamed to the standard
3318 /// library, this method may be removed in the future.
3319 ///
3320 /// Available via the C wrapper.
utf8_valid_up_to(bytes: &[u8]) -> usize3321 pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
3322 utf8_valid_up_to(bytes)
3323 }
3324
3325 /// Validates ASCII.
3326 ///
3327 /// Returns the index of the first byte that makes the input malformed as
3328 /// ASCII or the length of the slice if the slice is entirely valid.
3329 ///
3330 /// Available via the C wrapper.
ascii_valid_up_to(bytes: &[u8]) -> usize3331 pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
3332 ascii_valid_up_to(bytes)
3333 }
3334
3335 /// Validates ISO-2022-JP ASCII-state data.
3336 ///
3337 /// Returns the index of the first byte that makes the input not
3338 /// representable in the ASCII state of ISO-2022-JP or the length of the
3339 /// slice if the slice is entirely representable in the ASCII state of
3340 /// ISO-2022-JP.
3341 ///
3342 /// Available via the C wrapper.
iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize3343 pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
3344 iso_2022_jp_ascii_valid_up_to(bytes)
3345 }
3346 }
3347
3348 impl PartialEq for Encoding {
3349 #[inline]
eq(&self, other: &Encoding) -> bool3350 fn eq(&self, other: &Encoding) -> bool {
3351 (self as *const Encoding) == (other as *const Encoding)
3352 }
3353 }
3354
3355 impl Eq for Encoding {}
3356
3357 impl Hash for Encoding {
3358 #[inline]
hash<H: Hasher>(&self, state: &mut H)3359 fn hash<H: Hasher>(&self, state: &mut H) {
3360 (self as *const Encoding).hash(state);
3361 }
3362 }
3363
3364 impl std::fmt::Debug for Encoding {
3365 #[inline]
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result3366 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
3367 write!(f, "Encoding {{ {} }}", self.name)
3368 }
3369 }
3370
/// Serializes an encoding as a string holding the value of its `name` field.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let encoding_name: &str = self.name;
        serializer.serialize_str(encoding_name)
    }
}
3381
/// Serde visitor that turns a string holding an encoding label into a
/// `&'static Encoding`.
#[cfg(feature = "serde")]
struct EncodingVisitor;

#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Resolves `value` with `Encoding::for_label()`; a label that does not
    /// resolve is reported as a custom deserialization error.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format!("invalid encoding label: {}", value)))
    }
}
3404
/// Deserializes a `&'static Encoding` from a string by handing an
/// `EncodingVisitor` to the deserializer.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        let visitor = EncodingVisitor;
        deserializer.deserialize_str(visitor)
    }
}
3414
/// The stages a decoder passes through, from BOM sniffing via conversion to
/// the end of the stream.
#[derive(Debug, Clone, Copy, PartialEq)]
enum DecoderLifeCycle {
    /// No input has been seen yet.
    AtStart,
    /// No input has been seen yet, but UTF-8 is expected.
    AtUtf8Start,
    /// No input has been seen yet, but UTF-16BE is expected.
    AtUtf16BeStart,
    /// No input has been seen yet, but UTF-16LE is expected.
    AtUtf16LeStart,
    /// The byte EF has been seen.
    SeenUtf8First,
    /// The bytes EF, BB have been seen.
    SeenUtf8Second,
    /// The byte FE has been seen.
    SeenUtf16BeFirst,
    /// The byte FF has been seen.
    SeenUtf16LeFirst,
    /// EF, BB were seen but BF was not, a buffer boundary followed BB and the
    /// underlying decoder reported EF as an error; BB therefore still has to
    /// be pushed before the next buffer.
    ConvertingWithPendingBB,
    /// The search for a BOM is over and EOF has not been seen yet.
    Converting,
    /// EOF has been seen.
    Finished,
}
3443
/// Tells a decoder how to treat a BOM at the start of the input.
#[derive(Clone, Copy, Debug)]
enum BomHandling {
    /// Do not handle the BOM at all.
    Off,
    /// Sniff for a UTF-8, UTF-16BE or UTF-16LE BOM.
    Sniff,
    /// Remove the BOM, but only if it is the BOM for this encoding.
    Remove,
}
3454
/// Outcome of a (potentially partial) decode or encode operation that
/// performs replacement.
#[derive(PartialEq, Eq, Debug)]
#[must_use]
pub enum CoderResult {
    /// All of the input was consumed.
    ///
    /// When returned from a call where `last` was `true`, the conversion
    /// process is complete. Otherwise, the caller should call a decode or
    /// encode method again with more input.
    InputEmpty,

    /// The output buffer does not have enough space left for the converter
    /// to produce another unit of output.
    ///
    /// Upon the next call, the caller must provide more output space and
    /// re-push the remaining input to the converter.
    OutputFull,
}
3474
/// Outcome of a (potentially partial) decode operation that does not perform
/// replacement.
#[derive(PartialEq, Eq, Debug)]
#[must_use]
pub enum DecoderResult {
    /// All of the input was consumed.
    ///
    /// When returned from a call where `last` was `true`, the decoding
    /// process is complete. Otherwise, the caller should call a decode method
    /// again with more input.
    InputEmpty,

    /// The output buffer does not have enough space left for the decoder to
    /// produce another unit of output.
    ///
    /// Upon the next call, the caller must provide more output space and
    /// re-push the remaining input to the decoder.
    OutputFull,

    /// A malformed byte sequence was encountered.
    ///
    /// The caller must either treat this as a fatal error or append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer is the length of the malformed byte
    /// sequence. The second wrapped integer is the number of bytes that were
    /// consumed after the malformed sequence; when it is zero, the last byte
    /// that was consumed is the last byte of the malformed sequence. Note
    /// that the malformed bytes may have been part of an earlier input
    /// buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum of
    /// the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3511
3512 /// A converter that decodes a byte stream into Unicode according to a
3513 /// character encoding in a streaming (incremental) manner.
3514 ///
3515 /// The various `decode_*` methods take an input buffer (`src`) and an output
3516 /// buffer `dst` both of which are caller-allocated. There are variants for
3517 /// both UTF-8 and UTF-16 output buffers.
3518 ///
3519 /// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3520 /// into `dst` until one of the following three things happens:
3521 ///
3522 /// 1. A malformed byte sequence is encountered (`*_without_replacement`
3523 /// variants only).
3524 ///
3525 /// 2. The output buffer has been filled so near capacity that the decoder
3526 /// cannot be sure that processing an additional byte of input wouldn't
3527 /// cause so much output that the output buffer would overflow.
3528 ///
3529 /// 3. All the input bytes have been processed.
3530 ///
/// The `decode_*` method then returns a tuple of a status indicating which one
3532 /// of the three reasons to return happened, how many input bytes were read,
3533 /// how many output code units (`u8` when decoding into UTF-8 and `u16`
3534 /// when decoding to UTF-16) were written (except when decoding into `String`,
3535 /// whose length change indicates this), and in the case of the
3536 /// variants performing replacement, a boolean indicating whether an error was
3537 /// replaced with the REPLACEMENT CHARACTER during the call.
3538 ///
3539 /// The number of bytes "written" is what's logically written. Garbage may be
3540 /// written in the output buffer beyond the point logically written to.
3541 /// Therefore, if you wish to decode into an `&mut str`, you should use the
3542 /// methods that take an `&mut str` argument instead of the ones that take an
3543 /// `&mut [u8]` argument. The former take care of overwriting the trailing
3544 /// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3545 /// latter don't.
3546 ///
3547 /// In the case of the `*_without_replacement` variants, the status is a
3548 /// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3549 /// `InputEmpty` corresponding to the three cases listed above).
3550 ///
3551 /// In the case of methods whose name does not end with
3552 /// `*_without_replacement`, malformed sequences are automatically replaced
3553 /// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3554 /// return early.
3555 ///
3556 /// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3557 /// space. When decoding to UTF-16, the output buffer must have at least two
3558 /// UTF-16 code units (`u16`) of space.
3559 ///
3560 /// When decoding to UTF-8 without replacement, the methods are guaranteed
3561 /// not to return indicating that more output space is needed if the length
3562 /// of the output buffer is at least the length returned by
3563 /// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3564 /// with replacement, the length of the output buffer that guarantees the
3565 /// methods not to return indicating that more output space is needed is given
3566 /// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3567 /// or without replacement, the length of the output buffer that guarantees
3568 /// the methods not to return indicating that more output space is needed is
3569 /// given by [`max_utf16_buffer_length()`][4].
3570 ///
3571 /// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3572 /// and the output after each `decode_*` call is guaranteed to consist of
3573 /// complete characters. (I.e. the code unit sequence for the last character is
3574 /// guaranteed not to be split across output buffers.)
3575 ///
3576 /// The boolean argument `last` indicates that the end of the stream is reached
3577 /// when all the bytes in `src` have been consumed.
3578 ///
3579 /// A `Decoder` object can be used to incrementally decode a byte stream.
3580 ///
3581 /// During the processing of a single stream, the caller must call `decode_*`
3582 /// zero or more times with `last` set to `false` and then call `decode_*` at
3583 /// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3584 /// the processing of the stream has ended. Otherwise, the caller must call
3585 /// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3586 /// a fatal error).
3587 ///
3588 /// Once the stream has ended, the `Decoder` object must not be used anymore.
3589 /// That is, you need to create another one to process another stream.
3590 ///
3591 /// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3592 /// the caller does not wish to treat it as a fatal error, the input buffer
3593 /// `src` may not have been completely consumed. In that case, the caller must
3594 /// pass the unconsumed contents of `src` to `decode_*` again upon the next
3595 /// call.
3596 ///
3597 /// [1]: enum.DecoderResult.html
3598 /// [2]: #method.max_utf8_buffer_length_without_replacement
3599 /// [3]: #method.max_utf8_buffer_length
3600 /// [4]: #method.max_utf16_buffer_length
3601 ///
3602 /// # Infinite loops
3603 ///
3604 /// When converting with a fixed-size output buffer whose size is too small to
3605 /// accommodate one character or (when applicable) one numeric character
3606 /// reference of output, an infinite loop ensues. When converting with a
3607 /// fixed-size output buffer, it generally makes sense to make the buffer
/// fairly large (e.g. a couple of kilobytes).
pub struct Decoder {
    /// The encoding reported by `encoding()`. It is initialized from the
    /// `enc` argument of `new()`.
    encoding: &'static Encoding,
    /// The encoding-specific decoder that the `max_*_buffer_length` queries
    /// and `latin1_byte_compatible_up_to()` delegate to.
    variant: VariantDecoder,
    /// Where the decoder is in its life cycle. `new()` derives the initial
    /// value from the requested BOM handling, and the query methods panic
    /// once this is `DecoderLifeCycle::Finished`.
    life_cycle: DecoderLifeCycle,
}
3614
3615 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3616 fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3617 Decoder {
3618 encoding: enc,
3619 variant: decoder,
3620 life_cycle: match sniffing {
3621 BomHandling::Off => DecoderLifeCycle::Converting,
3622 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3623 BomHandling::Remove => {
3624 if enc == UTF_8 {
3625 DecoderLifeCycle::AtUtf8Start
3626 } else if enc == UTF_16BE {
3627 DecoderLifeCycle::AtUtf16BeStart
3628 } else if enc == UTF_16LE {
3629 DecoderLifeCycle::AtUtf16LeStart
3630 } else {
3631 DecoderLifeCycle::Converting
3632 }
3633 }
3634 },
3635 }
3636 }
3637
/// The `Encoding` this `Decoder` is for.
///
/// Returns the decoder's current `encoding` field. BOM sniffing can change
/// the return value of this method during the life of the decoder, so a
/// value read before decoding starts may no longer be current afterwards.
///
/// Available via the C wrapper.
#[inline]
pub fn encoding(&self) -> &'static Encoding {
    self.encoding
}
3648
3649 /// Query the worst-case UTF-8 output size _with replacement_.
3650 ///
3651 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3652 /// that will not overflow given the current state of the decoder and
3653 /// `byte_length` number of additional input bytes when decoding with
3654 /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3655 /// sequence or `None` if `usize` would overflow.
3656 ///
3657 /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3658 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3659 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3660 // BOM getting pushed to the underlying decoder.
3661 match self.life_cycle {
3662 DecoderLifeCycle::Converting
3663 | DecoderLifeCycle::AtUtf8Start
3664 | DecoderLifeCycle::AtUtf16LeStart
3665 | DecoderLifeCycle::AtUtf16BeStart => {
3666 return self.variant.max_utf8_buffer_length(byte_length);
3667 }
3668 DecoderLifeCycle::AtStart => {
3669 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3670 if let Some(utf16_bom) = checked_add(
3671 1,
3672 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3673 ) {
3674 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3675 let encoding = self.encoding();
3676 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3677 // No need to consider the internal state of the underlying decoder,
3678 // because it is at start, because no data has reached it yet.
3679 return Some(utf_bom);
3680 } else if let Some(non_bom) =
3681 self.variant.max_utf8_buffer_length(byte_length)
3682 {
3683 return Some(std::cmp::max(utf_bom, non_bom));
3684 }
3685 }
3686 }
3687 }
3688 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3689 // Add two bytes even when only one byte has been seen,
3690 // because the one byte can become a lead byte in multibyte
3691 // decoders, but only after the decoder has been queried
3692 // for max length, so the decoder's own logic for adding
3693 // one for a pending lead cannot work.
3694 if let Some(sum) = byte_length.checked_add(2) {
3695 if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3696 if self.encoding() == UTF_8 {
3697 // No need to consider the internal state of the underlying decoder,
3698 // because it is at start, because no data has reached it yet.
3699 return Some(utf8_bom);
3700 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3701 return Some(std::cmp::max(utf8_bom, non_bom));
3702 }
3703 }
3704 }
3705 }
3706 DecoderLifeCycle::ConvertingWithPendingBB => {
3707 if let Some(sum) = byte_length.checked_add(2) {
3708 return self.variant.max_utf8_buffer_length(sum);
3709 }
3710 }
3711 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3712 // Add two bytes even when only one byte has been seen,
3713 // because the one byte can become a lead byte in multibyte
3714 // decoders, but only after the decoder has been queried
3715 // for max length, so the decoder's own logic for adding
3716 // one for a pending lead cannot work.
3717 if let Some(sum) = byte_length.checked_add(2) {
3718 if let Some(utf16_bom) =
3719 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3720 {
3721 let encoding = self.encoding();
3722 if encoding == UTF_16LE || encoding == UTF_16BE {
3723 // No need to consider the internal state of the underlying decoder,
3724 // because it is at start, because no data has reached it yet.
3725 return Some(utf16_bom);
3726 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3727 return Some(std::cmp::max(utf16_bom, non_bom));
3728 }
3729 }
3730 }
3731 }
3732 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3733 }
3734 None
3735 }
3736
3737 /// Query the worst-case UTF-8 output size _without replacement_.
3738 ///
3739 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3740 /// that will not overflow given the current state of the decoder and
3741 /// `byte_length` number of additional input bytes when decoding without
3742 /// replacement error handling or `None` if `usize` would overflow.
3743 ///
3744 /// Note that this value may be too small for the `_with_replacement` case.
3745 /// Use `max_utf8_buffer_length()` for that case.
3746 ///
3747 /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3748 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3749 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3750 // BOM getting pushed to the underlying decoder.
3751 match self.life_cycle {
3752 DecoderLifeCycle::Converting
3753 | DecoderLifeCycle::AtUtf8Start
3754 | DecoderLifeCycle::AtUtf16LeStart
3755 | DecoderLifeCycle::AtUtf16BeStart => {
3756 return self
3757 .variant
3758 .max_utf8_buffer_length_without_replacement(byte_length);
3759 }
3760 DecoderLifeCycle::AtStart => {
3761 if let Some(utf8_bom) = byte_length.checked_add(3) {
3762 if let Some(utf16_bom) = checked_add(
3763 1,
3764 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3765 ) {
3766 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3767 let encoding = self.encoding();
3768 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3769 // No need to consider the internal state of the underlying decoder,
3770 // because it is at start, because no data has reached it yet.
3771 return Some(utf_bom);
3772 } else if let Some(non_bom) = self
3773 .variant
3774 .max_utf8_buffer_length_without_replacement(byte_length)
3775 {
3776 return Some(std::cmp::max(utf_bom, non_bom));
3777 }
3778 }
3779 }
3780 }
3781 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3782 // Add two bytes even when only one byte has been seen,
3783 // because the one byte can become a lead byte in multibyte
3784 // decoders, but only after the decoder has been queried
3785 // for max length, so the decoder's own logic for adding
3786 // one for a pending lead cannot work.
3787 if let Some(sum) = byte_length.checked_add(2) {
3788 if let Some(utf8_bom) = sum.checked_add(3) {
3789 if self.encoding() == UTF_8 {
3790 // No need to consider the internal state of the underlying decoder,
3791 // because it is at start, because no data has reached it yet.
3792 return Some(utf8_bom);
3793 } else if let Some(non_bom) =
3794 self.variant.max_utf8_buffer_length_without_replacement(sum)
3795 {
3796 return Some(std::cmp::max(utf8_bom, non_bom));
3797 }
3798 }
3799 }
3800 }
3801 DecoderLifeCycle::ConvertingWithPendingBB => {
3802 if let Some(sum) = byte_length.checked_add(2) {
3803 return self.variant.max_utf8_buffer_length_without_replacement(sum);
3804 }
3805 }
3806 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3807 // Add two bytes even when only one byte has been seen,
3808 // because the one byte can become a lead byte in multibyte
3809 // decoders, but only after the decoder has been queried
3810 // for max length, so the decoder's own logic for adding
3811 // one for a pending lead cannot work.
3812 if let Some(sum) = byte_length.checked_add(2) {
3813 if let Some(utf16_bom) =
3814 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3815 {
3816 let encoding = self.encoding();
3817 if encoding == UTF_16LE || encoding == UTF_16BE {
3818 // No need to consider the internal state of the underlying decoder,
3819 // because it is at start, because no data has reached it yet.
3820 return Some(utf16_bom);
3821 } else if let Some(non_bom) =
3822 self.variant.max_utf8_buffer_length_without_replacement(sum)
3823 {
3824 return Some(std::cmp::max(utf16_bom, non_bom));
3825 }
3826 }
3827 }
3828 }
3829 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3830 }
3831 None
3832 }
3833
3834 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3835 /// replaced with the REPLACEMENT CHARACTER.
3836 ///
3837 /// See the documentation of the struct for documentation for `decode_*`
3838 /// methods collectively.
3839 ///
3840 /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3841 pub fn decode_to_utf8(
3842 &mut self,
3843 src: &[u8],
3844 dst: &mut [u8],
3845 last: bool,
3846 ) -> (CoderResult, usize, usize, bool) {
3847 let mut had_errors = false;
3848 let mut total_read = 0usize;
3849 let mut total_written = 0usize;
3850 loop {
3851 let (result, read, written) = self.decode_to_utf8_without_replacement(
3852 &src[total_read..],
3853 &mut dst[total_written..],
3854 last,
3855 );
3856 total_read += read;
3857 total_written += written;
3858 match result {
3859 DecoderResult::InputEmpty => {
3860 return (
3861 CoderResult::InputEmpty,
3862 total_read,
3863 total_written,
3864 had_errors,
3865 );
3866 }
3867 DecoderResult::OutputFull => {
3868 return (
3869 CoderResult::OutputFull,
3870 total_read,
3871 total_written,
3872 had_errors,
3873 );
3874 }
3875 DecoderResult::Malformed(_, _) => {
3876 had_errors = true;
3877 // There should always be space for the U+FFFD, because
3878 // otherwise we'd have gotten OutputFull already.
3879 // XXX: is the above comment actually true for UTF-8 itself?
3880 // TODO: Consider having fewer bound checks here.
3881 dst[total_written] = 0xEFu8;
3882 total_written += 1;
3883 dst[total_written] = 0xBFu8;
3884 total_written += 1;
3885 dst[total_written] = 0xBDu8;
3886 total_written += 1;
3887 }
3888 }
3889 }
3890 }
3891
3892 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3893 /// replaced with the REPLACEMENT CHARACTER with type system signaling
3894 /// of UTF-8 validity.
3895 ///
3896 /// This methods calls `decode_to_utf8` and then zeroes
3897 /// out up to three bytes that aren't logically part of the write in order
3898 /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3899 ///
3900 /// See the documentation of the struct for documentation for `decode_*`
3901 /// methods collectively.
3902 ///
3903 /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3904 pub fn decode_to_str(
3905 &mut self,
3906 src: &[u8],
3907 dst: &mut str,
3908 last: bool,
3909 ) -> (CoderResult, usize, usize, bool) {
3910 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3911 let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3912 let len = bytes.len();
3913 let mut trail = written;
3914 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3915 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3916 // encodings to avoid overwriting here.
3917 if self.encoding != UTF_8 {
3918 let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3919 while trail < max {
3920 bytes[trail] = 0;
3921 trail += 1;
3922 }
3923 }
3924 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3925 bytes[trail] = 0;
3926 trail += 1;
3927 }
3928 (result, read, written, replaced)
3929 }
3930
/// Incrementally decode a byte stream into UTF-8 with malformed sequences
/// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a tuple that contains the `CoderResult`, the
/// number of bytes read and a boolean indicating whether replacements
/// were done. The number of bytes written is signaled via the length of
/// the `String` changing.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only.
pub fn decode_to_string(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (CoderResult, usize, bool) {
    // SAFETY: the length is temporarily raised to the full capacity so that
    // the spare capacity after `old_len` can be passed to `decode_to_utf8`
    // as the output slice. It is then set to `old_len + written`, so only
    // bytes reported as written remain part of the `String`.
    // NOTE(review): this relies on `decode_to_utf8` writing valid UTF-8 to
    // exactly those bytes and on nothing panicking between the two
    // `set_len` calls -- confirm.
    unsafe {
        let vec = dst.as_mut_vec();
        let old_len = vec.len();
        let capacity = vec.capacity();
        vec.set_len(capacity);
        let (result, read, written, replaced) =
            self.decode_to_utf8(src, &mut vec[old_len..], last);
        vec.set_len(old_len + written);
        (result, read, replaced)
    }
}
3965
// Invokes `public_decode_function!` (defined elsewhere in this file). The
// doc comment is passed as the leading macro argument. The identifiers that
// follow are presumably the name of the generated public method, the names
// of the helper methods it dispatches to, and the output code unit type --
// confirm against the macro definition.
public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf8_without_replacement,
                        decode_to_utf8_raw,
                        decode_to_utf8_checking_end,
                        decode_to_utf8_after_one_potential_bom_byte,
                        decode_to_utf8_after_two_potential_bom_bytes,
                        decode_to_utf8_checking_end_with_offset,
                        u8);
3982
3983 /// Incrementally decode a byte stream into UTF-8 with type system signaling
3984 /// of UTF-8 validity.
3985 ///
3986 /// This methods calls `decode_to_utf8` and then zeroes out up to three
3987 /// bytes that aren't logically part of the write in order to retain the
3988 /// UTF-8 validity even for the unwritten part of the buffer.
3989 ///
3990 /// See the documentation of the struct for documentation for `decode_*`
3991 /// methods collectively.
3992 ///
3993 /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)3994 pub fn decode_to_str_without_replacement(
3995 &mut self,
3996 src: &[u8],
3997 dst: &mut str,
3998 last: bool,
3999 ) -> (DecoderResult, usize, usize) {
4000 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4001 let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4002 let len = bytes.len();
4003 let mut trail = written;
4004 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4005 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4006 // encodings to avoid overwriting here.
4007 if self.encoding != UTF_8 {
4008 let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4009 while trail < max {
4010 bytes[trail] = 0;
4011 trail += 1;
4012 }
4013 }
4014 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4015 bytes[trail] = 0;
4016 trail += 1;
4017 }
4018 (result, read, written)
4019 }
4020
/// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a pair that contains the `DecoderResult` and the
/// number of bytes read. The number of bytes written is signaled via
/// the length of the `String` changing.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only.
pub fn decode_to_string_without_replacement(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (DecoderResult, usize) {
    // SAFETY: the length is temporarily raised to the full capacity so that
    // the spare capacity after `old_len` can be passed to
    // `decode_to_utf8_without_replacement` as the output slice. It is then
    // set to `old_len + written`, so only bytes reported as written remain
    // part of the `String`.
    // NOTE(review): this relies on the callee writing valid UTF-8 to exactly
    // those bytes and on nothing panicking between the two `set_len`
    // calls -- confirm.
    unsafe {
        let vec = dst.as_mut_vec();
        let old_len = vec.len();
        let capacity = vec.capacity();
        vec.set_len(capacity);
        let (result, read, written) =
            self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
        vec.set_len(old_len + written);
        (result, read)
    }
}
4053
4054 /// Query the worst-case UTF-16 output size (with or without replacement).
4055 ///
4056 /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4057 /// that will not overflow given the current state of the decoder and
4058 /// `byte_length` number of additional input bytes or `None` if `usize`
4059 /// would overflow.
4060 ///
4061 /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4062 /// return value of this method applies also in the
4063 /// `_without_replacement` case.
4064 ///
4065 /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4066 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4067 // Need to consider a) the decoder morphing due to the BOM and b) a partial
4068 // BOM getting pushed to the underlying decoder.
4069 match self.life_cycle {
4070 DecoderLifeCycle::Converting
4071 | DecoderLifeCycle::AtUtf8Start
4072 | DecoderLifeCycle::AtUtf16LeStart
4073 | DecoderLifeCycle::AtUtf16BeStart => {
4074 return self.variant.max_utf16_buffer_length(byte_length);
4075 }
4076 DecoderLifeCycle::AtStart => {
4077 if let Some(utf8_bom) = byte_length.checked_add(1) {
4078 if let Some(utf16_bom) =
4079 checked_add(1, checked_div(byte_length.checked_add(1), 2))
4080 {
4081 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
4082 let encoding = self.encoding();
4083 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4084 // No need to consider the internal state of the underlying decoder,
4085 // because it is at start, because no data has reached it yet.
4086 return Some(utf_bom);
4087 } else if let Some(non_bom) =
4088 self.variant.max_utf16_buffer_length(byte_length)
4089 {
4090 return Some(std::cmp::max(utf_bom, non_bom));
4091 }
4092 }
4093 }
4094 }
4095 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4096 // Add two bytes even when only one byte has been seen,
4097 // because the one byte can become a lead byte in multibyte
4098 // decoders, but only after the decoder has been queried
4099 // for max length, so the decoder's own logic for adding
4100 // one for a pending lead cannot work.
4101 if let Some(sum) = byte_length.checked_add(2) {
4102 if let Some(utf8_bom) = sum.checked_add(1) {
4103 if self.encoding() == UTF_8 {
4104 // No need to consider the internal state of the underlying decoder,
4105 // because it is at start, because no data has reached it yet.
4106 return Some(utf8_bom);
4107 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4108 return Some(std::cmp::max(utf8_bom, non_bom));
4109 }
4110 }
4111 }
4112 }
4113 DecoderLifeCycle::ConvertingWithPendingBB => {
4114 if let Some(sum) = byte_length.checked_add(2) {
4115 return self.variant.max_utf16_buffer_length(sum);
4116 }
4117 }
4118 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4119 // Add two bytes even when only one byte has been seen,
4120 // because the one byte can become a lead byte in multibyte
4121 // decoders, but only after the decoder has been queried
4122 // for max length, so the decoder's own logic for adding
4123 // one for a pending lead cannot work.
4124 if let Some(sum) = byte_length.checked_add(2) {
4125 if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4126 let encoding = self.encoding();
4127 if encoding == UTF_16LE || encoding == UTF_16BE {
4128 // No need to consider the internal state of the underlying decoder,
4129 // because it is at start, because no data has reached it yet.
4130 return Some(utf16_bom);
4131 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4132 return Some(std::cmp::max(utf16_bom, non_bom));
4133 }
4134 }
4135 }
4136 }
4137 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4138 }
4139 None
4140 }
4141
4142 /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4143 /// replaced with the REPLACEMENT CHARACTER.
4144 ///
4145 /// See the documentation of the struct for documentation for `decode_*`
4146 /// methods collectively.
4147 ///
4148 /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4149 pub fn decode_to_utf16(
4150 &mut self,
4151 src: &[u8],
4152 dst: &mut [u16],
4153 last: bool,
4154 ) -> (CoderResult, usize, usize, bool) {
4155 let mut had_errors = false;
4156 let mut total_read = 0usize;
4157 let mut total_written = 0usize;
4158 loop {
4159 let (result, read, written) = self.decode_to_utf16_without_replacement(
4160 &src[total_read..],
4161 &mut dst[total_written..],
4162 last,
4163 );
4164 total_read += read;
4165 total_written += written;
4166 match result {
4167 DecoderResult::InputEmpty => {
4168 return (
4169 CoderResult::InputEmpty,
4170 total_read,
4171 total_written,
4172 had_errors,
4173 );
4174 }
4175 DecoderResult::OutputFull => {
4176 return (
4177 CoderResult::OutputFull,
4178 total_read,
4179 total_written,
4180 had_errors,
4181 );
4182 }
4183 DecoderResult::Malformed(_, _) => {
4184 had_errors = true;
4185 // There should always be space for the U+FFFD, because
4186 // otherwise we'd have gotten OutputFull already.
4187 dst[total_written] = 0xFFFD;
4188 total_written += 1;
4189 }
4190 }
4191 }
4192 }
4193
// Invokes `public_decode_function!` (defined elsewhere in this file). The
// doc comment is passed as the leading macro argument. The identifiers that
// follow are presumably the name of the generated public method, the names
// of the helper methods it dispatches to, and the output code unit type --
// confirm against the macro definition.
public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf16_without_replacement,
                        decode_to_utf16_raw,
                        decode_to_utf16_checking_end,
                        decode_to_utf16_after_one_potential_bom_byte,
                        decode_to_utf16_after_two_potential_bom_bytes,
                        decode_to_utf16_checking_end_with_offset,
                        u16);
4210
4211 /// Checks for compatibility with storing Unicode scalar values as unsigned
4212 /// bytes taking into account the state of the decoder.
4213 ///
4214 /// Returns `None` if the decoder is not in a neutral state, including waiting
4215 /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4216 ///
4217 /// Otherwise returns the index of the first byte whose unsigned value doesn't
4218 /// directly correspond to the decoded Unicode scalar value, or the length
4219 /// of the input if all bytes in the input decode directly to scalar values
4220 /// corresponding to the unsigned byte values.
4221 ///
4222 /// Does not change the state of the decoder.
4223 ///
4224 /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4225 /// storage optimizations.
4226 ///
4227 /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4228 pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4229 match self.life_cycle {
4230 DecoderLifeCycle::Converting => {
4231 return self.variant.latin1_byte_compatible_up_to(bytes);
4232 }
4233 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4234 _ => None,
4235 }
4236 }
4237 }
4238
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and
    /// re-push the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to
    /// the encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps the Basic Multilingual Plane code unit `bmp` in `Unmappable`.
    ///
    /// Panics if `bmp` is not a Unicode scalar value, i.e. if it is a
    /// surrogate.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let scalar = u32::from(bmp);
        let unmappable = ::std::char::from_u32(scalar).unwrap();
        EncoderResult::Unmappable(unmappable)
    }
}
4270
4271 /// A converter that encodes a Unicode stream into bytes according to a
4272 /// character encoding in a streaming (incremental) manner.
4273 ///
4274 /// The various `encode_*` methods take an input buffer (`src`) and an output
4275 /// buffer `dst` both of which are caller-allocated. There are variants for
4276 /// both UTF-8 and UTF-16 input buffers.
4277 ///
/// An `encode_*` method encodes characters from `src` into bytes
/// stored into `dst` until one of the following three things happens:
4280 ///
4281 /// 1. An unmappable character is encountered (`*_without_replacement` variants
4282 /// only).
4283 ///
/// 2. The output buffer has been filled so near capacity that the encoder
4285 /// cannot be sure that processing an additional character of input wouldn't
4286 /// cause so much output that the output buffer would overflow.
4287 ///
4288 /// 3. All the input characters have been processed.
4289 ///
/// The `encode_*` method then returns a tuple of a status indicating which one
4291 /// of the three reasons to return happened, how many input code units (`u8`
4292 /// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4293 /// how many output bytes were written (except when encoding into `Vec<u8>`,
4294 /// whose length change indicates this), and in the case of the variants that
4295 /// perform replacement, a boolean indicating whether an unmappable
4296 /// character was replaced with a numeric character reference during the call.
4297 ///
4298 /// The number of bytes "written" is what's logically written. Garbage may be
4299 /// written in the output buffer beyond the point logically written to.
4300 ///
4301 /// In the case of the methods whose name ends with
4302 /// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4303 /// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4304 /// the three cases listed above).
4305 ///
4306 /// In the case of methods whose name does not end with
4307 /// `*_without_replacement`, unmappable characters are automatically replaced
4308 /// with the corresponding numeric character references and unmappable
4309 /// characters do not cause the methods to return early.
4310 ///
4311 /// When encoding from UTF-8 without replacement, the methods are guaranteed
4312 /// not to return indicating that more output space is needed if the length
4313 /// of the output buffer is at least the length returned by
4314 /// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4315 /// UTF-8 with replacement, the length of the output buffer that guarantees the
4316 /// methods not to return indicating that more output space is needed in the
4317 /// absence of unmappable characters is given by
4318 /// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4319 /// UTF-16 without replacement, the methods are guaranteed not to return
4320 /// indicating that more output space is needed if the length of the output
4321 /// buffer is at least the length returned by
4322 /// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4324 /// guarantees the methods not to return indicating that more output space is
4325 /// needed in the absence of unmappable characters is given by
4326 /// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4327 /// When encoding with replacement, applications are not expected to size the
4328 /// buffer for the worst case ahead of time but to resize the buffer if there
4329 /// are unmappable characters. This is why max length queries are only available
4330 /// for the case where there are no unmappable characters.
4331 ///
4332 /// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4333 /// calling from Rust, the type system takes care of this.) When encoding from
4334 /// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4335 /// CHARACTERS. Therefore, in order for astral characters not to turn into a
4336 /// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4337 /// are not split across input buffer boundaries.
4338 ///
4339 /// After an `encode_*` call returns, the output produced so far, taken as a
4340 /// whole from the start of the stream, is guaranteed to consist of a valid
4341 /// byte sequence in the target encoding. (I.e. the code unit sequence for a
4342 /// character is guaranteed not to be split across output buffers. However, due
4343 /// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4344 /// from the start for it to be valid. For other encodings, the validity holds
4345 /// on a per-output buffer basis.)
4346 ///
4347 /// The boolean argument `last` indicates that the end of the stream is reached
4348 /// when all the characters in `src` have been consumed. This argument is needed
4349 /// for ISO-2022-JP and is ignored for other encodings.
4350 ///
4351 /// An `Encoder` object can be used to incrementally encode a byte stream.
4352 ///
4353 /// During the processing of a single stream, the caller must call `encode_*`
4354 /// zero or more times with `last` set to `false` and then call `encode_*` at
4355 /// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4356 /// the processing of the stream has ended. Otherwise, the caller must call
4357 /// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4358 /// as a fatal error).
4359 ///
4360 /// Once the stream has ended, the `Encoder` object must not be used anymore.
4361 /// That is, you need to create another one to process another stream.
4362 ///
4363 /// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4364 /// and the caller does not wish to treat it as a fatal error, the input buffer
4365 /// `src` may not have been completely consumed. In that case, the caller must
4366 /// pass the unconsumed contents of `src` to `encode_*` again upon the next
4367 /// call.
4368 ///
4369 /// [1]: enum.EncoderResult.html
4370 /// [2]: #method.max_buffer_length_from_utf8_without_replacement
4371 /// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4372 /// [4]: #method.max_buffer_length_from_utf16_without_replacement
4373 /// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4374 ///
4375 /// # Infinite loops
4376 ///
4377 /// When converting with a fixed-size output buffer whose size is too small to
4378 /// accommodate one character of output, an infinite loop ensues. When
4379 /// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
4381 pub struct Encoder {
4382 encoding: &'static Encoding,
4383 variant: VariantEncoder,
4384 }
4385
4386 impl Encoder {
new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder4387 fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4388 Encoder {
4389 encoding: enc,
4390 variant: encoder,
4391 }
4392 }
4393
4394 /// The `Encoding` this `Encoder` is for.
4395 #[inline]
encoding(&self) -> &'static Encoding4396 pub fn encoding(&self) -> &'static Encoding {
4397 self.encoding
4398 }
4399
4400 /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
4401 /// ASCII state and `false` otherwise.
4402 #[inline]
has_pending_state(&self) -> bool4403 pub fn has_pending_state(&self) -> bool {
4404 self.variant.has_pending_state()
4405 }
4406
4407 /// Query the worst-case output size when encoding from UTF-8 with
4408 /// replacement.
4409 ///
4410 /// Returns the size of the output buffer in bytes that will not overflow
4411 /// given the current state of the encoder and `byte_length` number of
4412 /// additional input code units if there are no unmappable characters in
4413 /// the input or `None` if `usize` would overflow.
4414 ///
4415 /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4416 pub fn max_buffer_length_from_utf8_if_no_unmappables(
4417 &self,
4418 byte_length: usize,
4419 ) -> Option<usize> {
4420 checked_add(
4421 if self.encoding().can_encode_everything() {
4422 0
4423 } else {
4424 NCR_EXTRA
4425 },
4426 self.max_buffer_length_from_utf8_without_replacement(byte_length),
4427 )
4428 }
4429
4430 /// Query the worst-case output size when encoding from UTF-8 without
4431 /// replacement.
4432 ///
4433 /// Returns the size of the output buffer in bytes that will not overflow
4434 /// given the current state of the encoder and `byte_length` number of
4435 /// additional input code units or `None` if `usize` would overflow.
4436 ///
4437 /// Available via the C wrapper.
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>4438 pub fn max_buffer_length_from_utf8_without_replacement(
4439 &self,
4440 byte_length: usize,
4441 ) -> Option<usize> {
4442 self.variant
4443 .max_buffer_length_from_utf8_without_replacement(byte_length)
4444 }
4445
4446 /// Incrementally encode into byte stream from UTF-8 with unmappable
4447 /// characters replaced with HTML (decimal) numeric character references.
4448 ///
4449 /// See the documentation of the struct for documentation for `encode_*`
4450 /// methods collectively.
4451 ///
4452 /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4453 pub fn encode_from_utf8(
4454 &mut self,
4455 src: &str,
4456 dst: &mut [u8],
4457 last: bool,
4458 ) -> (CoderResult, usize, usize, bool) {
4459 let dst_len = dst.len();
4460 let effective_dst_len = if self.encoding().can_encode_everything() {
4461 dst_len
4462 } else {
4463 if dst_len < NCR_EXTRA {
4464 if src.is_empty() && !(last && self.has_pending_state()) {
4465 return (CoderResult::InputEmpty, 0, 0, false);
4466 }
4467 return (CoderResult::OutputFull, 0, 0, false);
4468 }
4469 dst_len - NCR_EXTRA
4470 };
4471 let mut had_unmappables = false;
4472 let mut total_read = 0usize;
4473 let mut total_written = 0usize;
4474 loop {
4475 let (result, read, written) = self.encode_from_utf8_without_replacement(
4476 &src[total_read..],
4477 &mut dst[total_written..effective_dst_len],
4478 last,
4479 );
4480 total_read += read;
4481 total_written += written;
4482 match result {
4483 EncoderResult::InputEmpty => {
4484 return (
4485 CoderResult::InputEmpty,
4486 total_read,
4487 total_written,
4488 had_unmappables,
4489 );
4490 }
4491 EncoderResult::OutputFull => {
4492 return (
4493 CoderResult::OutputFull,
4494 total_read,
4495 total_written,
4496 had_unmappables,
4497 );
4498 }
4499 EncoderResult::Unmappable(unmappable) => {
4500 had_unmappables = true;
4501 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4502 debug_assert_ne!(self.encoding(), UTF_16BE);
4503 debug_assert_ne!(self.encoding(), UTF_16LE);
4504 // Additionally, Iso2022JpEncoder is responsible for
4505 // transitioning to ASCII when returning with Unmappable.
4506 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4507 if total_written >= effective_dst_len {
4508 if total_read == src.len() && !(last && self.has_pending_state()) {
4509 return (
4510 CoderResult::InputEmpty,
4511 total_read,
4512 total_written,
4513 had_unmappables,
4514 );
4515 }
4516 return (
4517 CoderResult::OutputFull,
4518 total_read,
4519 total_written,
4520 had_unmappables,
4521 );
4522 }
4523 }
4524 }
4525 }
4526 }
4527
4528 /// Incrementally encode into byte stream from UTF-8 with unmappable
4529 /// characters replaced with HTML (decimal) numeric character references.
4530 ///
4531 /// See the documentation of the struct for documentation for `encode_*`
4532 /// methods collectively.
4533 ///
4534 /// Available to Rust only.
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4535 pub fn encode_from_utf8_to_vec(
4536 &mut self,
4537 src: &str,
4538 dst: &mut Vec<u8>,
4539 last: bool,
4540 ) -> (CoderResult, usize, bool) {
4541 unsafe {
4542 let old_len = dst.len();
4543 let capacity = dst.capacity();
4544 dst.set_len(capacity);
4545 let (result, read, written, replaced) =
4546 self.encode_from_utf8(src, &mut dst[old_len..], last);
4547 dst.set_len(old_len + written);
4548 (result, read, replaced)
4549 }
4550 }
4551
4552 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4553 ///
4554 /// See the documentation of the struct for documentation for `encode_*`
4555 /// methods collectively.
4556 ///
4557 /// Available via the C wrapper.
encode_from_utf8_without_replacement( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4558 pub fn encode_from_utf8_without_replacement(
4559 &mut self,
4560 src: &str,
4561 dst: &mut [u8],
4562 last: bool,
4563 ) -> (EncoderResult, usize, usize) {
4564 self.variant.encode_from_utf8_raw(src, dst, last)
4565 }
4566
4567 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4568 ///
4569 /// See the documentation of the struct for documentation for `encode_*`
4570 /// methods collectively.
4571 ///
4572 /// Available to Rust only.
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4573 pub fn encode_from_utf8_to_vec_without_replacement(
4574 &mut self,
4575 src: &str,
4576 dst: &mut Vec<u8>,
4577 last: bool,
4578 ) -> (EncoderResult, usize) {
4579 unsafe {
4580 let old_len = dst.len();
4581 let capacity = dst.capacity();
4582 dst.set_len(capacity);
4583 let (result, read, written) =
4584 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4585 dst.set_len(old_len + written);
4586 (result, read)
4587 }
4588 }
4589
4590 /// Query the worst-case output size when encoding from UTF-16 with
4591 /// replacement.
4592 ///
4593 /// Returns the size of the output buffer in bytes that will not overflow
4594 /// given the current state of the encoder and `u16_length` number of
4595 /// additional input code units if there are no unmappable characters in
4596 /// the input or `None` if `usize` would overflow.
4597 ///
4598 /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4599 pub fn max_buffer_length_from_utf16_if_no_unmappables(
4600 &self,
4601 u16_length: usize,
4602 ) -> Option<usize> {
4603 checked_add(
4604 if self.encoding().can_encode_everything() {
4605 0
4606 } else {
4607 NCR_EXTRA
4608 },
4609 self.max_buffer_length_from_utf16_without_replacement(u16_length),
4610 )
4611 }
4612
4613 /// Query the worst-case output size when encoding from UTF-16 without
4614 /// replacement.
4615 ///
4616 /// Returns the size of the output buffer in bytes that will not overflow
4617 /// given the current state of the encoder and `u16_length` number of
4618 /// additional input code units or `None` if `usize` would overflow.
4619 ///
4620 /// Available via the C wrapper.
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>4621 pub fn max_buffer_length_from_utf16_without_replacement(
4622 &self,
4623 u16_length: usize,
4624 ) -> Option<usize> {
4625 self.variant
4626 .max_buffer_length_from_utf16_without_replacement(u16_length)
4627 }
4628
4629 /// Incrementally encode into byte stream from UTF-16 with unmappable
4630 /// characters replaced with HTML (decimal) numeric character references.
4631 ///
4632 /// See the documentation of the struct for documentation for `encode_*`
4633 /// methods collectively.
4634 ///
4635 /// Available via the C wrapper.
encode_from_utf16( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4636 pub fn encode_from_utf16(
4637 &mut self,
4638 src: &[u16],
4639 dst: &mut [u8],
4640 last: bool,
4641 ) -> (CoderResult, usize, usize, bool) {
4642 let dst_len = dst.len();
4643 let effective_dst_len = if self.encoding().can_encode_everything() {
4644 dst_len
4645 } else {
4646 if dst_len < NCR_EXTRA {
4647 if src.is_empty() && !(last && self.has_pending_state()) {
4648 return (CoderResult::InputEmpty, 0, 0, false);
4649 }
4650 return (CoderResult::OutputFull, 0, 0, false);
4651 }
4652 dst_len - NCR_EXTRA
4653 };
4654 let mut had_unmappables = false;
4655 let mut total_read = 0usize;
4656 let mut total_written = 0usize;
4657 loop {
4658 let (result, read, written) = self.encode_from_utf16_without_replacement(
4659 &src[total_read..],
4660 &mut dst[total_written..effective_dst_len],
4661 last,
4662 );
4663 total_read += read;
4664 total_written += written;
4665 match result {
4666 EncoderResult::InputEmpty => {
4667 return (
4668 CoderResult::InputEmpty,
4669 total_read,
4670 total_written,
4671 had_unmappables,
4672 );
4673 }
4674 EncoderResult::OutputFull => {
4675 return (
4676 CoderResult::OutputFull,
4677 total_read,
4678 total_written,
4679 had_unmappables,
4680 );
4681 }
4682 EncoderResult::Unmappable(unmappable) => {
4683 had_unmappables = true;
4684 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4685 // There are no UTF-16 encoders and even if there were,
4686 // they'd never have unmappables.
4687 debug_assert_ne!(self.encoding(), UTF_16BE);
4688 debug_assert_ne!(self.encoding(), UTF_16LE);
4689 // Additionally, Iso2022JpEncoder is responsible for
4690 // transitioning to ASCII when returning with Unmappable
4691 // from the jis0208 state. That is, when we encode
4692 // ISO-2022-JP and come here, the encoder is in either the
4693 // ASCII or the Roman state. We are allowed to generate any
4694 // printable ASCII excluding \ and ~.
4695 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4696 if total_written >= effective_dst_len {
4697 if total_read == src.len() && !(last && self.has_pending_state()) {
4698 return (
4699 CoderResult::InputEmpty,
4700 total_read,
4701 total_written,
4702 had_unmappables,
4703 );
4704 }
4705 return (
4706 CoderResult::OutputFull,
4707 total_read,
4708 total_written,
4709 had_unmappables,
4710 );
4711 }
4712 }
4713 }
4714 }
4715 }
4716
4717 /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4718 ///
4719 /// See the documentation of the struct for documentation for `encode_*`
4720 /// methods collectively.
4721 ///
4722 /// Available via the C wrapper.
encode_from_utf16_without_replacement( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4723 pub fn encode_from_utf16_without_replacement(
4724 &mut self,
4725 src: &[u16],
4726 dst: &mut [u8],
4727 last: bool,
4728 ) -> (EncoderResult, usize, usize) {
4729 self.variant.encode_from_utf16_raw(src, dst, last)
4730 }
4731 }
4732
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#`, the decimal scalar value of `unmappable` and `;` to the
/// start of `dst` and returns the number of bytes written (between 5 and
/// 10). `dst` must be at least that long.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let mut remaining = unmappable as u32;
    // Count the decimal digits, treating two as the minimum and seven as the
    // maximum.
    // Review the outcome of https://github.com/whatwg/encoding/issues/15
    // to see if the two-digit case is possible
    let mut digits = 2usize;
    let mut threshold = 100u32;
    while digits < 7 && remaining >= threshold {
        digits += 1;
        threshold *= 10;
    }
    // Three more bytes for "&#" and ";".
    let len = digits + 3;
    debug_assert!(remaining >= 10u32);
    debug_assert!(len <= dst.len());
    // Fill from the right: terminator first, then the digits from least
    // significant to most significant.
    dst[len - 1] = b';';
    let mut cursor = len - 2;
    loop {
        dst[cursor] = b'0' + (remaining % 10) as u8;
        cursor -= 1;
        remaining /= 10;
        if remaining == 0 {
            break;
        }
    }
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4771
/// Checks whether `i` lies in the half-open range `start..end` using a
/// single comparison: values below `start` wrap around to large offsets.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4776
/// Checks whether `i` lies in the half-open range `start..end` using a
/// single comparison: values below `start` wrap around to large offsets.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4781
/// Checks whether `i` lies in the closed range `start..=end` using a single
/// comparison: values below `start` wrap around to large offsets.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4786
/// Checks whether `i` lies in the closed range `start..=end` using a single
/// comparison: values below `start` wrap around to large offsets.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4791
/// Checks whether `i` lies in the closed range `start..=end` using a single
/// comparison: values below `start` wrap around to large offsets.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4796
/// Checks whether `i` lies in the closed range `start..=end` using a single
/// comparison: values below `start` wrap around to large offsets.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4801
/// Adds `num` to the value inside `opt`.
///
/// Yields `None` when `opt` is `None` or when the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4810
4811 #[inline(always)]
checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize>4812 fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4813 if let Some(n) = one {
4814 checked_add(n, other)
4815 } else {
4816 None
4817 }
4818 }
4819
/// Multiplies the value inside `opt` by `num`.
///
/// Yields `None` when `opt` is `None` or when the product overflows
/// `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4828
/// Divides the value inside `opt` by `num`.
///
/// Yields `None` when `opt` is `None` or when `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4837
/// Rounds the value inside `opt` up to the next power of two.
///
/// Yields `None` when `opt` is `None` or when the next power of two does
/// not fit in `usize`. The overflow case is reported as `None` rather than
/// panicking (debug builds) or wrapping to zero (release builds), which is
/// what the unchecked `usize::next_power_of_two` would do.
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4842
/// Returns the smaller of two optional values.
///
/// When only one side holds a value, that value is returned; when both are
/// `None`, the result is `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::std::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4855
4856 // ############## TESTS ###############
4857
// Fixture struct that only exists in test builds with the `serde` feature
// enabled. It derives both `Serialize` and `Deserialize`.
// NOTE(review): presumably used by serde round-trip tests elsewhere in this
// file -- confirm against the tests that construct it.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    // A `'static` reference to an `Encoding`, so the derived impls have to
    // handle `&'static Encoding` as a field type.
    enc: &'static Encoding,
}
4865
4866 #[cfg(test)]
4867 mod test_labels_names;
4868
4869 #[cfg(test)]
4870 mod tests {
4871 use super::*;
4872 use std::borrow::Cow;
4873
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4874 fn sniff_to_utf16(
4875 initial_encoding: &'static Encoding,
4876 expected_encoding: &'static Encoding,
4877 bytes: &[u8],
4878 expect: &[u16],
4879 breaks: &[usize],
4880 ) {
4881 let mut decoder = initial_encoding.new_decoder();
4882
4883 let mut dest: Vec<u16> =
4884 Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4885 let capacity = dest.capacity();
4886 dest.resize(capacity, 0u16);
4887
4888 let mut total_written = 0usize;
4889 let mut start = 0usize;
4890 for br in breaks {
4891 let (result, read, written, _) =
4892 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4893 total_written += written;
4894 assert_eq!(read, *br - start);
4895 match result {
4896 CoderResult::InputEmpty => {}
4897 CoderResult::OutputFull => {
4898 unreachable!();
4899 }
4900 }
4901 start = *br;
4902 }
4903 let (result, read, written, _) =
4904 decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4905 total_written += written;
4906 match result {
4907 CoderResult::InputEmpty => {}
4908 CoderResult::OutputFull => {
4909 unreachable!();
4910 }
4911 }
4912 assert_eq!(read, bytes.len() - start);
4913 assert_eq!(total_written, expect.len());
4914 assert_eq!(&dest[..total_written], expect);
4915 assert_eq!(decoder.encoding(), expected_encoding);
4916 }
4917
4918 // Any copyright to the test code below this comment is dedicated to the
4919 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4920
#[test]
fn test_bom_sniffing() {
    // Runs one input through a windows-1252 decoder twice: once in a single
    // piece and once split after the first byte.
    fn check_unsplit_and_split(expected: &'static Encoding, bytes: &[u8], expect: &[u16]) {
        sniff_to_utf16(WINDOWS_1252, expected, bytes, expect, &[]);
        sniff_to_utf16(WINDOWS_1252, expected, bytes, expect, &[1]);
    }

    let ab = [0x0061u16, 0x0062u16];

    // ASCII
    sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\x61\x62", &ab, &[]);

    // UTF-8
    let bom_then_ab = b"\xEF\xBB\xBF\x61\x62";
    let split_points: [&[usize]; 9] = [
        &[],
        &[1],
        &[2],
        &[3],
        &[4],
        &[2, 3],
        &[1, 2],
        &[1, 3],
        &[1, 2, 3, 4],
    ];
    for breaks in split_points.iter() {
        sniff_to_utf16(WINDOWS_1252, UTF_8, bom_then_ab, &ab, breaks);
    }
    sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);

    // Not UTF-8
    check_unsplit_and_split(
        WINDOWS_1252,
        b"\xEF\xBB\x61\x62",
        &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
    );
    check_unsplit_and_split(
        WINDOWS_1252,
        b"\xEF\x61\x62",
        &[0x00EFu16, 0x0061u16, 0x0062u16],
    );
    check_unsplit_and_split(WINDOWS_1252, b"\xEF\xBB", &[0x00EFu16, 0x00BBu16]);
    sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);

    // Not UTF-16
    check_unsplit_and_split(
        WINDOWS_1252,
        b"\xFE\x61\x62",
        &[0x00FEu16, 0x0061u16, 0x0062u16],
    );
    sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
    check_unsplit_and_split(
        WINDOWS_1252,
        b"\xFF\x61\x62",
        &[0x00FFu16, 0x0061u16, 0x0062u16],
    );
    sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);

    // UTF-16
    check_unsplit_and_split(UTF_16BE, b"\xFE\xFF", &[]);
    check_unsplit_and_split(UTF_16LE, b"\xFF\xFE", &[]);
}
5077
#[test]
fn test_output_encoding() {
    // (encoding, expected output encoding) pairs.
    let expectations = [
        (REPLACEMENT, UTF_8),
        (UTF_16BE, UTF_8),
        (UTF_16LE, UTF_8),
        (UTF_8, UTF_8),
        (WINDOWS_1252, WINDOWS_1252),
    ];
    for &(encoding, output) in expectations.iter() {
        assert_eq!(encoding.output_encoding(), output);
    }
    // An encoder created from an encoding must report the same output
    // encoding.
    for &(encoding, output) in expectations.iter() {
        assert_eq!(encoding.new_encoder().encoding(), output);
    }
}
5091
#[test]
fn test_label_resolution() {
    // Lower case, upper case, and a label padded with spaces, tabs, line
    // feeds, form feeds and a carriage return all resolve to UTF-8.
    let utf8_labels: [&[u8]; 3] = [
        b"utf-8",
        b"UTF-8",
        b" \t \n \x0C \n utf-8 \r \n \t \x0C ",
    ];
    for label in utf8_labels.iter() {
        assert_eq!(Encoding::for_label(label), Some(UTF_8));
    }

    // Trailing junk and unknown labels do not resolve.
    let unknown_labels: [&[u8]; 3] = [b"utf-8 _", b"bogus", b"bogusbogusbogusbogus"];
    for label in unknown_labels.iter() {
        assert_eq!(Encoding::for_label(label), None);
    }
}
5104
#[test]
fn test_decode_valid_windows_1257_to_cow() {
    let (decoded, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
    // The result is expected to be an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5117
#[test]
fn test_decode_invalid_windows_1257_to_cow() {
    let (decoded, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
    // The byte 0xA1 is expected to come out as U+FFFD in an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(had_errors);
}
5130
#[test]
fn test_decode_ascii_only_windows_1257_to_cow() {
    let (decoded, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
    // ASCII-only input is expected to be borrowed rather than copied.
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "abc");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5143
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
    let (decoded, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    // The UTF-8 BOM overrides windows-1257; the BOM itself is not part of
    // the expected (borrowed) output.
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, UTF_8);
    assert!(!had_errors);
}
5156
#[test]
fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
    let (decoded, encoding, had_errors) =
        WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    // The UTF-8 BOM overrides windows-1257; the stray 0x80 is expected to
    // come out as U+FFFD in an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, UTF_8);
    assert!(had_errors);
}
5170
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
    let (decoded, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    // The BOM is not part of the expected (borrowed) output.
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, UTF_8);
    assert!(!had_errors);
}
5183
#[test]
fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
    let (decoded, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    // The stray 0x80 is expected to come out as U+FFFD in an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, UTF_8);
    assert!(had_errors);
}
5196
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
    let (decoded, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    // The BOM is not part of the expected (borrowed) output.
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5208
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
    let (decoded, had_errors) =
        WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    // Here the UTF-8 BOM bytes are expected to be decoded as ordinary
    // windows-1257 characters, not treated as a BOM.
    if let Cow::Owned(text) = decoded {
        assert_eq!(
            text,
            "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
        );
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5224
#[test]
fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
    let (decoded, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
    // The result is expected to be an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5236
#[test]
fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
    let (decoded, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
    // The byte 0xA1 is expected to come out as U+FFFD in an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(had_errors);
}
5248
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
    let (decoded, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
    // ASCII-only input is expected to be borrowed rather than copied.
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "abc");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5260
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
    let (decoded, had_errors) =
        UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    // Without BOM handling, the BOM is expected to stay in the (borrowed)
    // output as U+FEFF.
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5273
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
    let (decoded, had_errors) =
        UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    // The BOM is expected to stay as U+FEFF and the stray 0x80 to come out
    // as U+FFFD in an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(had_errors);
}
5286
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
    let (decoded, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
    // The result is expected to be an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5298
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
    let (decoded, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
    // The byte 0xA1 is expected to come out as U+FFFD in an owned string.
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(had_errors);
}
5310
// ASCII-only input decoded as windows-1257 without BOM handling is expected to
// be returned as `Cow::Borrowed` with no errors.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
    let (decoded, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "abc");
    } else {
        unreachable!();
    }
    assert!(!had_errors);
}
5322
// Valid UTF-8 with a leading EF BB BF, decoded with neither BOM handling nor
// replacement: must succeed with a borrowed string that keeps U+FEFF.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    match UTF_8.decode_without_bom_handling_and_without_replacement(input) {
        Some(Cow::Borrowed(text)) => {
            assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}");
        }
        // Either an owned result or a failed decode is a test failure.
        Some(Cow::Owned(_)) | None => unreachable!(),
    }
}
5337
// Malformed UTF-8 (stray 0x80) decoded without replacement must yield `None`.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(input);
    assert!(decoded.is_none());
}
5346
// Valid non-ASCII windows-1257 input decoded without replacement must succeed
// with an owned string.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
        Some(Cow::Owned(text)) => {
            assert_eq!(text, "abc\u{20AC}\u{00E4}");
        }
        // Either a borrowed result or a failed decode is a test failure.
        Some(Cow::Borrowed(_)) | None => unreachable!(),
    }
}
5359
// windows-1257 input containing the byte 0xA1 must make the
// replacement-free decode return `None`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded =
        WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4");
    assert!(decoded.is_none());
}
5366
// ASCII-only input decoded as windows-1257 without replacement must succeed
// and be returned as `Cow::Borrowed`.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
        Some(Cow::Borrowed(text)) => {
            assert_eq!(text, "abc");
        }
        // Either an owned result or a failed decode is a test failure.
        Some(Cow::Owned(_)) | None => unreachable!(),
    }
}
5379
// Encoding an ASCII-only string to windows-1257 must return borrowed bytes,
// report windows-1257 as the output encoding, and report no errors.
#[test]
fn test_encode_ascii_only_windows_1257_to_cow() {
    let (encoded, encoding, had_errors) = WINDOWS_1257.encode("abc");
    if let Cow::Borrowed(bytes) = encoded {
        assert_eq!(bytes, b"abc");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5392
// Encoding a string with non-ASCII characters that the test expects to be
// representable in windows-1257: the bytes must be owned, the output encoding
// must be windows-1257, and no errors may be reported.
#[test]
fn test_encode_valid_windows_1257_to_cow() {
    let (encoded, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
    if let Cow::Owned(bytes) = encoded {
        assert_eq!(bytes, b"abc\x80\xE4");
    } else {
        unreachable!();
    }
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5405
// Feeds a single 0xFF byte twice to a decoder from `UTF_16LE.new_decoder()`,
// first with `last == false` and then with `last == true`. Each time the
// output slice is sized by `max_utf16_buffer_length(1)`, and the call must
// report `InputEmpty`, i.e. the promised size was sufficient.
#[test]
fn test_utf16_space_with_one_bom_byte() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5421
// Feeds a single 0xFF byte twice to a decoder from `UTF_8.new_decoder()`,
// first with `last == false` and then with `last == true`. Each time the
// UTF-16 output slice is sized by `max_utf16_buffer_length(1)`, and the call
// must report `InputEmpty`.
#[test]
fn test_utf8_space_with_one_bom_byte() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5437
// Feeds EF, BB (the first two bytes of the UTF-8 BOM) and then FF, one byte
// per call, to a decoder from `UTF_16LE.new_decoder()`. Only the final call
// sets `last`. Each output slice is sized by `max_utf16_buffer_length(1)`
// and every call must report `InputEmpty`.
#[test]
fn test_utf16_space_with_two_bom_bytes() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    // (input chunk, is this the last chunk?)
    let steps = [
        (&b"\xEF"[..], false),
        (&b"\xBB"[..], false),
        (&b"\xFF"[..], true),
    ];
    for &(src, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(src, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5458
// Feeds EF, BB (the first two bytes of the UTF-8 BOM) and then FF, one byte
// per call, to a decoder from `UTF_8.new_decoder()`. Only the final call sets
// `last`. Each UTF-16 output slice is sized by `max_utf16_buffer_length(1)`
// and every call must report `InputEmpty`.
#[test]
fn test_utf8_space_with_two_bom_bytes() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    // (input chunk, is this the last chunk?)
    let steps = [
        (&b"\xEF"[..], false),
        (&b"\xBB"[..], false),
        (&b"\xFF"[..], true),
    ];
    for &(src, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(src, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5479
// Decodes the two bytes FF FF in a single final call on a decoder from
// `UTF_16LE.new_decoder()`. The output slice is sized by
// `max_utf16_buffer_length(2)` and the call must report `InputEmpty`.
#[test]
fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    let needed = decoder.max_utf16_buffer_length(2).unwrap();
    let out = &mut dst[..needed];
    let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", out, true);
    assert_eq!(result, CoderResult::InputEmpty);
}
5490
// With a fresh ISO-2022-JP encoder, encoding empty input into an 8-byte buffer
// must report `InputEmpty` both mid-stream and at end of stream.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
    let mut dst = [0u8; 8];
    let mut encoder = ISO_2022_JP.new_encoder();
    for &last in [false, true].iter() {
        let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5504
// Encodes U+00A5 with an ISO-2022-JP encoder, then passes empty input with
// only 8 bytes of output space. Mid-stream that must report `InputEmpty`;
// with `last == true` it must report `OutputFull`.
// NOTE(review): presumably ending the stream has to emit an escape back to the
// initial state and 8 bytes is treated as insufficient headroom — confirm
// against the encoder.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
    let mut dst = [0u8; 16];
    let mut encoder = ISO_2022_JP.new_encoder();

    let (after_yen, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
    assert_eq!(after_yen, CoderResult::InputEmpty);

    let (mid_stream, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
    assert_eq!(mid_stream, CoderResult::InputEmpty);

    let (at_end, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
    assert_eq!(at_end, CoderResult::OutputFull);
}
5522
// Exercises output-buffer boundaries of the ISO-2022-JP encoder with UTF-8
// input. Every step uses a brand-new encoder.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf8() {
    let mut dst = [0u8; 18];

    // Runs one encode call on a fresh encoder and returns only its result.
    let encode_fresh = |text: &str, out: &mut [u8], last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        encoder.encode_from_utf8(text, out, last).0
    };

    // U+00A5 followed by U+1F4A9 into 18 bytes: fits mid-stream, but not when
    // the stream also has to be finished.
    assert_eq!(
        encode_fresh("\u{A5}\u{1F4A9}", &mut dst[..], false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh("\u{A5}\u{1F4A9}", &mut dst[..], true),
        CoderResult::OutputFull
    );

    // U+1F4A9 alone into 13 bytes fits whether or not the stream ends.
    assert_eq!(
        encode_fresh("\u{1F4A9}", &mut dst[..13], false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh("\u{1F4A9}", &mut dst[..13], true),
        CoderResult::InputEmpty
    );
}
5548
// UTF-16 counterpart of the ASCII-state check: with a fresh ISO-2022-JP
// encoder, encoding empty input into an 8-byte buffer must report
// `InputEmpty` both mid-stream and at end of stream.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
    let mut dst = [0u8; 8];
    let mut encoder = ISO_2022_JP.new_encoder();
    for &last in [false, true].iter() {
        let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5562
// UTF-16 counterpart of the Roman-state check: after encoding U+00A5, empty
// input with 8 bytes of output space must report `InputEmpty` mid-stream and
// `OutputFull` when `last == true`.
// NOTE(review): presumably ending the stream has to emit an escape back to the
// initial state and 8 bytes is treated as insufficient headroom — confirm
// against the encoder.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
    let mut dst = [0u8; 16];
    let mut encoder = ISO_2022_JP.new_encoder();

    let (after_yen, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
    assert_eq!(after_yen, CoderResult::InputEmpty);

    let (mid_stream, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], false);
    assert_eq!(mid_stream, CoderResult::InputEmpty);

    let (at_end, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], true);
    assert_eq!(at_end, CoderResult::OutputFull);
}
5580
// Exercises output-buffer boundaries of the ISO-2022-JP encoder with UTF-16
// input. Every step uses a brand-new encoder.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf16() {
    let mut dst = [0u8; 18];
    // U+00A5 followed by the surrogate pair for U+1F4A9.
    let yen_then_astral = [0xA5u16, 0xD83Du16, 0xDCA9u16];
    // The surrogate pair for U+1F4A9 on its own.
    let astral_only = [0xD83Du16, 0xDCA9u16];

    // Runs one encode call on a fresh encoder and returns only its result.
    let encode_fresh = |units: &[u16], out: &mut [u8], last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        encoder.encode_from_utf16(units, out, last).0
    };

    // Into 18 bytes: fits mid-stream, but not when the stream also has to be
    // finished.
    assert_eq!(
        encode_fresh(&yen_then_astral, &mut dst[..], false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh(&yen_then_astral, &mut dst[..], true),
        CoderResult::OutputFull
    );

    // Into 13 bytes: fits whether or not the stream ends.
    assert_eq!(
        encode_fresh(&astral_only, &mut dst[..13], false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh(&astral_only, &mut dst[..13], true),
        CoderResult::InputEmpty
    );
}
5609
// Decodes the UTF-16BE bytes D8 00 (a high surrogate) into a 4-byte UTF-8
// buffer. The first, non-final call must consume both bytes and write nothing.
// The second, final call feeds the same bytes again; its result is discarded,
// so the only thing checked there is that the call does not panic.
#[test]
fn test_buffer_end_utf16be() {
    let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
    let mut dest = [0u8; 4];
    let high_surrogate = [0xD8u8, 0x00];

    let first = decoder.decode_to_utf8(&high_surrogate, &mut dest, false);
    assert_eq!(first, (CoderResult::InputEmpty, 2, 0, false));

    let _ = decoder.decode_to_utf8(&high_surrogate, &mut dest, true);
}
5622
// Encodings must be usable as `HashSet` members: membership reflects what was
// inserted, and removal takes effect.
#[test]
fn test_hash() {
    let mut seen: ::std::collections::HashSet<_> =
        [UTF_8, ISO_2022_JP].iter().cloned().collect();

    assert!(seen.contains(UTF_8));
    assert!(seen.contains(ISO_2022_JP));
    assert!(!seen.contains(WINDOWS_1252));

    seen.remove(ISO_2022_JP);
    assert!(!seen.contains(ISO_2022_JP));
}
5634
// Encodes U+3041 followed by U+FFFF from UTF-16 into a 17-byte buffer in one
// final call; the test expects the encoder to report `OutputFull`.
// NOTE(review): going by the test name, U+FFFF is presumably unmappable and
// written as a numeric character reference — confirm.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf16() {
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let input = [0x3041u16, 0xFFFFu16];
    let (result, _, _, _) = encoder.encode_from_utf16(&input, &mut dst[..], true);
    assert_eq!(result, CoderResult::OutputFull);
}
5645
// Encodes U+3041 followed by U+FFFF from UTF-8 into a 17-byte buffer in one
// final call; the test expects the encoder to report `OutputFull`.
// NOTE(review): going by the test name, U+FFFF is presumably unmappable and
// written as a numeric character reference — confirm.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf8() {
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let input = "\u{3041}\u{FFFF}";
    let (result, _, _, _) = encoder.encode_from_utf8(input, &mut dst[..], true);
    assert_eq!(result, CoderResult::OutputFull);
}
5656
// Starts from a decoder obtained via `REPLACEMENT.new_decoder()` and feeds it
// the UTF-8 BOM followed by 'A'. With the output slice sized by
// `max_utf8_buffer_length_without_replacement`, the call must consume all
// input and write exactly the single byte 'A'.
#[test]
fn test_max_length_with_bom_to_utf8() {
    let bom_then_a = b"\xEF\xBB\xBFA";
    let mut decoder = REPLACEMENT.new_decoder();
    let mut output = [0u8; 20];

    let needed = decoder
        .max_utf8_buffer_length_without_replacement(bom_then_a.len())
        .unwrap();
    let out = &mut output[..needed];
    let (result, read, written) = decoder.decode_to_utf8_without_replacement(bom_then_a, out, true);

    assert_eq!(result, DecoderResult::InputEmpty);
    assert_eq!(read, bom_then_a.len());
    assert_eq!(written, 1);
    assert_eq!(output[0], 0x41);
}
5674
// Round-trips a `Demo` value holding an encoding reference through both
// serde_json and bincode and checks it compares equal afterwards.
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
    let original = Demo {
        num: 42,
        name: "foo".into(),
        enc: UTF_8,
    };

    // JSON round trip.
    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    // bincode round trip.
    let bytes = bincode::serialize(&original).unwrap();
    let from_bincode: Demo = bincode::deserialize(bytes.as_slice()).unwrap();
    assert_eq!(from_bincode, original);
}
5693
// `is_single_byte()` must be false for the encodings in the first list and
// true for those in the second.
#[test]
fn test_is_single_byte() {
    let not_single_byte = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        REPLACEMENT,
        SHIFT_JIS,
        UTF_8,
        UTF_16BE,
        UTF_16LE,
        ISO_2022_JP,
    ];
    for &encoding in not_single_byte.iter() {
        // The message names the offending encoding, since the loop hides the line.
        assert!(!encoding.is_single_byte(), "{}", encoding.name());
    }

    let single_byte = [
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        ISO_8859_8_I,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];
    for &encoding in single_byte.iter() {
        assert!(encoding.is_single_byte(), "{}", encoding.name());
    }
}
5738
// Checks `Decoder::latin1_byte_compatible_up_to` for every encoding against
// one fixed buffer: an ASCII 'a' followed by six non-ASCII bytes. The expected
// value is the count the test pins for each encoding, or `None` where the
// method is expected to decline to answer.
#[test]
fn test_latin1_byte_compatible_up_to() {
    let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";

    // Result of the query on a fresh decoder without BOM handling for the
    // named encoding.
    macro_rules! up_to {
        ($encoding:ident) => {
            $encoding
                .new_decoder_without_bom_handling()
                .latin1_byte_compatible_up_to(buffer)
        };
    }

    // Multi-byte and special encodings.
    assert_eq!(up_to!(BIG5), Some(1));
    assert_eq!(up_to!(EUC_JP), Some(1));
    assert_eq!(up_to!(EUC_KR), Some(1));
    assert_eq!(up_to!(GB18030), Some(1));
    assert_eq!(up_to!(GBK), Some(1));
    assert!(up_to!(REPLACEMENT).is_none());
    assert_eq!(up_to!(SHIFT_JIS), Some(1));
    assert_eq!(up_to!(UTF_8), Some(1));
    assert!(up_to!(UTF_16BE).is_none());
    assert!(up_to!(UTF_16LE).is_none());
    assert_eq!(up_to!(ISO_2022_JP), Some(1));

    // Single-byte encodings.
    assert_eq!(up_to!(IBM866), Some(1));
    assert_eq!(up_to!(ISO_8859_2), Some(2));
    assert_eq!(up_to!(ISO_8859_3), Some(2));
    assert_eq!(up_to!(ISO_8859_4), Some(2));
    assert_eq!(up_to!(ISO_8859_5), Some(2));
    assert_eq!(up_to!(ISO_8859_6), Some(2));
    assert_eq!(up_to!(ISO_8859_7), Some(2));
    assert_eq!(up_to!(ISO_8859_8), Some(3));
    assert_eq!(up_to!(ISO_8859_10), Some(2));
    assert_eq!(up_to!(ISO_8859_13), Some(4));
    assert_eq!(up_to!(ISO_8859_14), Some(4));
    assert_eq!(up_to!(ISO_8859_15), Some(6));
    assert_eq!(up_to!(ISO_8859_16), Some(4));
    assert_eq!(up_to!(ISO_8859_8_I), Some(3));
    assert_eq!(up_to!(KOI8_R), Some(1));
    assert_eq!(up_to!(KOI8_U), Some(1));
    assert_eq!(up_to!(MACINTOSH), Some(1));
    assert_eq!(up_to!(WINDOWS_874), Some(2));
    assert_eq!(up_to!(WINDOWS_1250), Some(4));
    assert_eq!(up_to!(WINDOWS_1251), Some(1));
    assert_eq!(up_to!(WINDOWS_1252), Some(5));
    assert_eq!(up_to!(WINDOWS_1253), Some(3));
    assert_eq!(up_to!(WINDOWS_1254), Some(4));
    assert_eq!(up_to!(WINDOWS_1255), Some(3));
    assert_eq!(up_to!(WINDOWS_1256), Some(1));
    assert_eq!(up_to!(WINDOWS_1257), Some(4));
    assert_eq!(up_to!(WINDOWS_1258), Some(4));
    assert_eq!(up_to!(X_MAC_CYRILLIC), Some(1));
    assert_eq!(up_to!(X_USER_DEFINED), Some(1));

    // A decoder from `UTF_8.new_decoder()` that has seen no input yet must
    // answer `None`.
    let untouched = UTF_8.new_decoder();
    assert!(untouched.latin1_byte_compatible_up_to(buffer).is_none());

    // The same kind of decoder, fed input piecewise: `None` after the first
    // BOM byte, `Some(1)` once the whole BOM has been consumed, and `None`
    // again after one more 0xEF byte.
    let mut decoder = UTF_8.new_decoder();
    let mut output = [0u16; 4];

    let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
    assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());

    let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
    assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));

    let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
    assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());
}
6027 }
6028