1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 #![cfg_attr(
11 feature = "cargo-clippy",
12 allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14
15 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17 //! Gecko-oriented means that converting to and from UTF-16 is supported in
18 //! addition to converting to and from UTF-8, that the performance and
19 //! streamability goals are browser-oriented, and that FFI-friendliness is a
20 //! goal.
21 //!
22 //! Additionally, the `mem` module provides functions that are useful for
23 //! applications that need to be able to deal with legacy in-memory
24 //! representations of Unicode.
25 //!
26 //! For expectation setting, please be sure to read the sections
27 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29 //!
30 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31 //! design and internals of the crate.
32 //!
33 //! # Availability
34 //!
35 //! The code is available under the
36 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38 //! See the
39 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40 //! file for details.
41 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43 //!
44 //! # Integration with `std::io`
45 //!
46 //! This crate doesn't implement traits from `std::io`. However, the case of
47 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50 //!
51 //! # Examples
52 //!
53 //! Example programs:
54 //!
55 //! * [Rust](https://github.com/hsivonen/recode_rs)
56 //! * [C](https://github.com/hsivonen/recode_c)
57 //! * [C++](https://github.com/hsivonen/recode_cpp)
58 //!
59 //! Decode using the non-streaming API:
60 //!
61 //! ```
62 //! #[cfg(feature = "alloc")] {
63 //! use encoding_rs::*;
64 //!
65 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
66 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
67 //!
68 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
69 //! assert_eq!(&cow[..], expectation);
70 //! assert_eq!(encoding_used, SHIFT_JIS);
71 //! assert!(!had_errors);
72 //! }
73 //! ```
74 //!
75 //! Decode using the streaming API with minimal `unsafe`:
76 //!
77 //! ```
78 //! use encoding_rs::*;
79 //!
80 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
81 //!
82 //! // Use an array of byte slices to demonstrate content arriving piece by
83 //! // piece from the network.
84 //! let bytes: [&'static [u8]; 4] = [b"\x83",
85 //! b"n\x83\x8D\x81",
86 //! b"[\x81E\x83\x8F\x81[\x83",
87 //! b"\x8B\x83h"];
88 //!
89 //! // Very short output buffer to demonstrate the output buffer getting full.
90 //! // Normally, you'd use something like `[0u8; 2048]`.
91 //! let mut buffer_bytes = [0u8; 8];
92 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
93 //!
94 //! // How many bytes in the buffer currently hold significant data.
95 //! let mut bytes_in_buffer = 0usize;
96 //!
97 //! // Collect the output to a string for demonstration purposes.
98 //! let mut output = String::new();
99 //!
100 //! // The `Decoder`
101 //! let mut decoder = SHIFT_JIS.new_decoder();
102 //!
103 //! // Track whether we see errors.
104 //! let mut total_had_errors = false;
105 //!
106 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
107 //! // use of a fixed-size buffer; normally when the output of an incremental
108 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
109 //! // avoid the intermediate buffer).
110 //! for input in &bytes[..] {
111 //! // The number of bytes already read from current `input` in total.
112 //! let mut total_read_from_current_input = 0usize;
113 //!
114 //! loop {
115 //! let (result, read, written, had_errors) =
116 //! decoder.decode_to_str(&input[total_read_from_current_input..],
117 //! &mut buffer[bytes_in_buffer..],
118 //! false);
119 //! total_read_from_current_input += read;
120 //! bytes_in_buffer += written;
121 //! total_had_errors |= had_errors;
122 //! match result {
123 //! CoderResult::InputEmpty => {
124 //! // We have consumed the current input buffer. Break out of
125 //! // the inner loop to get the next input buffer from the
126 //! // outer loop.
127 //! break;
128 //! },
129 //! CoderResult::OutputFull => {
130 //! // Write the current buffer out and consider the buffer
131 //! // empty.
132 //! output.push_str(&buffer[..bytes_in_buffer]);
133 //! bytes_in_buffer = 0usize;
134 //! continue;
135 //! }
136 //! }
137 //! }
138 //! }
139 //!
140 //! // Process EOF
141 //! loop {
142 //! let (result, _, written, had_errors) =
143 //! decoder.decode_to_str(b"",
144 //! &mut buffer[bytes_in_buffer..],
145 //! true);
146 //! bytes_in_buffer += written;
147 //! total_had_errors |= had_errors;
148 //! // Write the current buffer out and consider the buffer empty.
149 //! // Need to do this here for both `match` arms, because we exit the
150 //! // loop on `CoderResult::InputEmpty`.
151 //! output.push_str(&buffer[..bytes_in_buffer]);
152 //! bytes_in_buffer = 0usize;
153 //! match result {
154 //! CoderResult::InputEmpty => {
155 //! // Done!
156 //! break;
157 //! },
158 //! CoderResult::OutputFull => {
159 //! continue;
160 //! }
161 //! }
162 //! }
163 //!
164 //! assert_eq!(&output[..], expectation);
165 //! assert!(!total_had_errors);
166 //! ```
167 //!
168 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
169 //!
170 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
171 //! __so this crate does not provide encoders for those encodings__!
172 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
173 //! so you get a UTF-8 encoder if you request an encoder for them.
174 //!
175 //! Additionally, the Encoding Standard factors BOM handling into wrapper
176 //! algorithms so that BOM handling isn't part of the definition of the
177 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
178 //! Standard define BOM handling or lack thereof as part of the encoding
179 //! scheme.
180 //!
181 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
182 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
183 //! the Unicode Standard.
184 //!
185 //! When used with the `_with_bom_removal` entry points, the UTF-8
186 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
187 //! Standard.
188 //!
189 //! This crate does not provide a mode that matches the UTF-16 _encoding
190 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
191 //! the entry points without `_bom_` qualifiers is the closest match,
192 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
193 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
194 //! Unicode Standard.
195 //!
196 //! The UTF-32 family of Unicode encoding schemes is not supported
197 //! by this crate. The Encoding Standard doesn't define any UTF-32
198 //! family encodings, since they aren't necessary for consuming Web
199 //! content.
200 //!
201 //! ## ISO-8859-1
202 //!
203 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
204 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
205 //! byte value to the same Unicode scalar value is not available via
206 //! `Encoding` in this crate.
207 //!
208 //! However, the functions whose name starts with `convert` and contains
209 //! `latin1` in the `mem` module support such conversions, which are known as
210 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
211 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
212 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
213 //!
214 //! ## Web / Browser Focus
215 //!
216 //! Both in terms of scope and performance, the focus is on the Web. For scope,
217 //! this means that encoding_rs implements the Encoding Standard fully and
218 //! doesn't implement encodings that are not specified in the Encoding
219 //! Standard. For performance, this means that decoding performance is
220 //! important as well as performance for encoding into UTF-8 or encoding the
221 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
222 //! be encoded into legacy encodings in only two places in the Web platform: in
223 //! the query part of URLs, in which case it's a matter of relatively rare
224 //! error handling, and in form submission, in which case the user action and
225 //! networking tend to hide the performance of the encoder.
226 //!
227 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
228 //! encodings enables smaller code size thanks to the encoder side using the
229 //! decode-optimized data tables without having encode-optimized data tables at
230 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
231 //! multiplication operations.
232 //!
233 //! Additionally, performance is a non-goal for the ASCII-incompatible
234 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
235 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
236 //! of implementation.
237 //!
238 //! Despite the browser focus, the hope is that non-browser applications
239 //! that wish to consume Web content or submit Web forms in a Web-compatible
240 //! way will find encoding_rs useful. While encoding_rs does not try to match
241 //! Windows behavior, many of the encodings are close enough to legacy
242 //! encodings implemented by Windows that applications that need to consume
243 //! data in legacy Windows encodings may find encoding_rs useful. The
244 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
245 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
246 //!
247 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
248 //! to the encodings defined in the Encoding Standard. The
249 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and
250 //! adds UTF-7 decoding for email purposes.
251 //!
252 //! For single-byte DOS encodings beyond the ones supported by the Encoding
253 //! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
254 //!
255 //! # Preparing Text for the Encoders
256 //!
257 //! Normalizing text into Unicode Normalization Form C prior to encoding text
258 //! into a legacy encoding minimizes unmappable characters. Text can be
259 //! normalized to Unicode Normalization Form C using the
260 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
261 //!
262 //! The exception is windows-1258, which after normalizing to Unicode
263 //! Normalization Form C requires tone marks to be decomposed in order to
264 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
265 //! using the [`detone`](https://crates.io/crates/detone) crate.
266 //!
267 //! # Streaming & Non-Streaming; Rust & C/C++
268 //!
269 //! The API in Rust has two modes of operation: streaming and non-streaming.
270 //! The streaming API is the foundation of the implementation and should be
271 //! used when processing data that arrives piecemeal from an i/o stream. The
272 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
273 //! to C callers. The non-streaming part of the API is for Rust callers only and
274 //! is smart about borrowing instead of copying when possible. When
275 //! streamability is not needed, the non-streaming API should be preferred in
276 //! order to avoid copying data when a borrow suffices.
277 //!
278 //! There is no analogous non-streaming C API exposed via FFI, mainly because
279 //! C doesn't have standard types for growable byte buffers and Unicode strings
280 //! that know their length.
281 //!
282 //! The C API (header file generated at `target/include/encoding_rs.h` when
283 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
284 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
285 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
286 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
287 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
288 //! as part of Mozilla [bug 1261841][4].
289 //!
290 //! The `Encoding` type is common to both the streaming and non-streaming
291 //! modes. In the streaming mode, decoding operations are performed with a
292 //! `Decoder` and encoding operations with an `Encoder` object obtained via
293 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
294 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
295 //! and `Encoder` objects are not used at all.
296 //!
297 //! [1]: https://github.com/hsivonen/encoding_c
298 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
299 //! [3]: https://github.com/Microsoft/GSL/
300 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
301 //!
302 //! # Memory management
303 //!
304 //! The streaming mode never performs heap allocations (even the methods
305 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
306 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
307 //! is, the streaming mode uses caller-allocated buffers exclusively.
308 //!
309 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
310 //! perform heap allocations but only to allocate the backing buffer of the
311 //! `Vec<u8>` or the `String`.
312 //!
313 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
314 //! `Drop` cleanup.
315 //!
316 //! # Buffer reading and writing behavior
317 //!
318 //! Based on experience gained with the `java.nio.charset` encoding converter
319 //! API and with the Gecko uconv encoding converter API, the buffer reading
320 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
321 //! fully drained but output buffers are not always fully filled.
322 //!
323 //! When reading from an input buffer, encoding_rs always consumes all input
324 //! up to the next error or to the end of the buffer. In particular, when
325 //! decoding, even if the input buffer ends in the middle of a byte sequence
326 //! for a character, the decoder consumes all input. This has the benefit that
327 //! the caller of the API can always fill the next buffer from the start from
328 //! whatever source the bytes come from and never has to first copy the last
329 //! bytes of the previous buffer to the start of the next buffer. However, when
330 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
331 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
332 //! boundaries falling in the middle of a surrogate pair result in both
333 //! surrogates being treated individually as unpaired surrogates.
334 //!
335 //! Additionally, decoders guarantee that they can be fed even one byte at a
336 //! time and encoders guarantee that they can be fed even one code point at a
337 //! time. This has the benefit of not placing restrictions on the size of
338 //! chunks in which the content arrives, e.g. from the network.
339 //!
340 //! When writing into an output buffer, encoding_rs makes sure that the code
341 //! unit sequence for a character is never split across output buffer
342 //! boundaries. This may result in wasted space at the end of an output buffer,
343 //! but the advantages are that the output side of both decoders and encoders
344 //! is greatly simplified compared to designs that attempt to fill output
345 //! buffers exactly even when that entails splitting a code unit sequence and
346 //! when encoding_rs methods return to the caller, the output produced thus
347 //! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
348 //! the output needs to be considered as a whole, because the latest output
349 //! buffer taken alone might not be valid if the transition away
350 //! from the ASCII state occurred in an earlier output buffer. However, since
351 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
352 //! state as being in error despite the encoder generating a transition to the
353 //! ASCII state at the end, the claim about the partial output taken as a whole
354 //! being valid is true even for ISO-2022-JP.)
355 //!
356 //! # Error Reporting
357 //!
358 //! Based on experience gained with the `java.nio.charset` encoding converter
359 //! API and with the Gecko uconv encoding converter API, the error reporting
360 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
361 //! that leave it up to the caller to extract the erroneous bytes from the
362 //! input stream if the caller wishes to do so but encoder errors provide the
363 //! code point associated with the error without requiring the caller to
364 //! extract it from the input on its own.
365 //!
366 //! On the encoder side, an error is always triggered by the most recently
367 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
368 //! caller. Also, it's very typical for the caller to wish to do something with
369 //! this data: generate a numeric escape for the character. Additionally, the
370 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
371 //! certain cases, so requiring the caller to extract the character from the
372 //! input buffer would require the caller to handle ISO-2022-JP details.
373 //! Furthermore, requiring the caller to extract the character from the input
374 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
375 //! the job of an encoding conversion library.
376 //!
377 //! On the decoder side, errors are triggered in more complex ways. For
378 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
379 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
380 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
381 //! be the ones most recently pushed to the decoder and the error might not even
382 //! be in the current buffer.
383 //!
384 //! Some encoding conversion APIs address the problem by not acknowledging
385 //! trailing bytes of an input buffer as consumed if it's still possible for
386 //! future bytes to cause the trailing bytes to be in error. This way, error
387 //! reporting can always refer to the most recently pushed buffer. This has the
388 //! problem that the caller of the API has to copy the unconsumed trailing
389 //! bytes to the start of the next buffer before being able to fill the rest
390 //! of the next buffer. This is annoying, error-prone and inefficient.
391 //!
392 //! A possible solution would be making the decoder remember recently consumed
393 //! bytes in order to be able to include a copy of the erroneous bytes when
394 //! reporting an error. This has two problems: First, callers are rarely
395 //! interested in the erroneous bytes, so attempts to identify them are most
396 //! often just overhead anyway. Second, the rare applications that are
397 //! interested typically care about the location of the error in the input
398 //! stream.
399 //!
400 //! To keep the API convenient for common uses and the overhead low while making
401 //! it possible to develop applications, such as HTML validators, that care
402 //! about which bytes were in error, encoding_rs reports the length of the
403 //! erroneous sequence and the number of bytes consumed after the erroneous
404 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
405 //! this makes it possible for callers that care about the erroneous bytes to
406 //! locate them.
407 //!
408 //! # No Convenience API for Custom Replacements
409 //!
410 //! The Web Platform and, therefore, the Encoding Standard supports only one
411 //! error recovery mode for decoders and only one error recovery mode for
412 //! encoders. The supported error recovery mode for decoders is emitting the
413 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
414 //! encoders is emitting an HTML decimal numeric character reference for
415 //! unmappable characters.
416 //!
417 //! Since encoding_rs is Web-focused, these are the only error recovery modes
418 //! for which convenient support is provided. Moreover, on the decoder side,
419 //! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
420 //! on error (other than treating errors as fatal). In particular, simply
421 //! ignoring errors is a
422 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
423 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
424 //! callers to ignore errors.
425 //!
426 //! On the encoder side, there are plausible alternatives for HTML decimal
427 //! numeric character references. For example, when outputting CSS, CSS-style
428 //! escapes would seem to make sense. However, instead of facilitating the
429 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
430 //! position that you shouldn't generate output in encodings other than UTF-8,
431 //! except where backward compatibility with interacting with the legacy Web
432 //! requires it. The legacy Web requires it only when parsing the query strings
433 //! of URLs and when submitting forms, and those two both use HTML decimal
434 //! numeric character references.
435 //!
436 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
437 //! numeric character references easy, it does make them _possible_.
438 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
439 //! for unmappable characters, is implemented on top of
440 //! `encode_from_utf8_without_replacement()`. Applications that really, really
441 //! want other replacement schemes for unmappable characters can likewise
442 //! implement them on top of `encode_from_utf8_without_replacement()`.
443 //!
444 //! # No Extensibility by Design
445 //!
446 //! The set of encodings supported by encoding_rs is not extensible by design.
447 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
448 //! rather than `trait`s. encoding_rs takes the design position that all future
449 //! text interchange should be done using UTF-8, which can represent all of
450 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
451 //! Standard and encoding_rs that can represent all of Unicode and that has
452 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
453 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
454 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
455 //! other than being able to consume legacy content.
456 //!
457 //! Considering that UTF-8 can represent all of Unicode and is already supported
458 //! by all Web browsers, introducing a new encoding wouldn't add to the
459 //! expressiveness but would add to compatibility problems. In that sense,
460 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
461 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
462 //! the Web Platform. On the other hand, the set of legacy encodings that must
463 //! be supported for a Web browser to be able to be successful is not going to
464 //! expand. Empirically, the set of encodings specified in the Encoding Standard
465 //! is already sufficient and the set of legacy encodings won't grow
466 //! retroactively.
467 //!
468 //! Since extensibility doesn't make sense considering the Web focus of
469 //! encoding_rs and adding encodings to Web clients would be actively harmful,
470 //! it makes sense to make the set of encodings that encoding_rs supports
471 //! non-extensible and to take the (admittedly small) benefits arising from
472 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
473 //! of time, which enables stack allocation thereof.
474 //!
475 //! This does have downsides for applications that might want to put encoding_rs
476 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
477 //! needed for Web uses. The needs of such applications should not complicate
478 //! encoding_rs itself, though. It is up to those applications to provide a
479 //! framework that delegates the operations with encodings that encoding_rs
480 //! supports to encoding_rs and operations with other encodings to something
481 //! else (as opposed to encoding_rs itself providing an extensibility
482 //! framework).
483 //!
484 //! # Panics
485 //!
486 //! Methods in encoding_rs can panic if the API is used against the requirements
487 //! stated in the documentation, if a state that's supposed to be impossible
488 //! is reached due to an internal bug or on integer overflow. When used
489 //! according to documentation with buffer sizes that stay below integer
490 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
491 //!
492 //! Panics arising from API misuse aren't documented beyond this on individual
493 //! methods.
494 //!
495 //! # At-Risk Parts of the API
496 //!
497 //! The foreseeable source of partially backward-incompatible API change is the
498 //! way the instances of `Encoding` are made available.
499 //!
500 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
501 //! initialized with `static`s of type `&'static Encoding`, the non-reference
502 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
503 //!
504 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
505 //! unique when the constant is used in different crates, the reference-typed
506 //! `static`s for the encoding instances will be changed from `static` to
507 //! `const` and the non-reference-typed `_INIT` instances will be removed.
508 //!
509 //! # Mapping Spec Concepts onto the API
510 //!
511 //! <table>
512 //! <thead>
513 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
514 //! </thead>
515 //! <tbody>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&'static Encoding</code></td><td><code>&'static Encoding</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
522 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
523 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
524 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// … (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
525 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
526 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// …</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
527 //! </tbody>
528 //! </table>
529 //!
530 //! # Compatibility with the rust-encoding API
531 //!
532 //! The crate
533 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
534 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
535 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
536 //!
537 //! # Mapping rust-encoding concepts to encoding_rs concepts
538 //!
539 //! The following table provides a mapping from rust-encoding constructs to
540 //! encoding_rs ones.
541 //!
542 //! <table>
543 //! <thead>
544 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
545 //! </thead>
546 //! <tbody>
547 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&'static encoding_rs::Encoding</code></td></tr>
548 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
549 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
550 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
551 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
552 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
553 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
554 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
555 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
556 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
557 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
558 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
559 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
560 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
561 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
562 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
563 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
564 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
568 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
569 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
570 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
572 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
573 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
574 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
575 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
576 //! </tbody>
577 //! </table>
578 //!
579 //! # Relationship with Windows Code Pages
580 //!
//! Despite the Web and browser focus, the encodings defined by the Encoding
//! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the encodings
//! that have a closely related Windows code page, the number of the closest
//! code page, a column indicating whether Windows maps unassigned code points
//! to the Unicode Private Use Area instead of U+FFFD and a remark number
//! referring to the remarks in the list after the table.
589 //!
590 //! <table>
591 //! <thead>
592 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
593 //! </thead>
594 //! <tbody>
595 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
596 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
597 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
598 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
599 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
600 //! <tr><td>windows-874</td><td>874</td><td>•</td><td></td></tr>
601 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
602 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
603 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
604 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
605 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
606 //! <tr><td>windows-1253</td><td>1253</td><td>•</td><td></td></tr>
607 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
608 //! <tr><td>windows-1255</td><td>1255</td><td>•</td><td></td></tr>
609 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
610 //! <tr><td>windows-1257</td><td>1257</td><td>•</td><td></td></tr>
611 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
612 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
613 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
614 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
615 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
616 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
617 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
618 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
619 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
620 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
621 //! <tr><td>ISO-8859-6</td><td>28596</td><td>•</td><td></td></tr>
622 //! <tr><td>ISO-8859-7</td><td>28597</td><td>•</td><td>3</td></tr>
623 //! <tr><td>ISO-8859-8</td><td>28598</td><td>•</td><td>4</td></tr>
624 //! <tr><td>ISO-8859-13</td><td>28603</td><td>•</td><td></td></tr>
625 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
626 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
627 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
628 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
629 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
630 //! </tbody>
631 //! </table>
632 //!
633 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
634 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
635 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
636 //! which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
637 //! decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
638 //! LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
639 //! instead of U+2019 RIGHT SINGLE QUOTATION MARK.
//! 4. Windows decodes 0xAF to U+203E OVERLINE instead of U+00AF MACRON and
//! 0xFD and 0xFE to PUA instead of U+200E LEFT-TO-RIGHT MARK and U+200F
//! RIGHT-TO-LEFT MARK.
642 //! 5. Remarks from the previous item apply.
643 //!
//! The differences between this crate and Windows in the case of multibyte encodings
//! are not yet fully documented here. The lack of remarks above should not be taken
//! as an indication of a lack of differences.
647 //!
648 //! # Notable Differences from IANA Naming
649 //!
650 //! In some cases, the Encoding Standard specifies the popular unextended encoding
651 //! name where in IANA terms one of the other labels would be more precise considering
652 //! the extensions that the Encoding Standard has unified into the encoding.
653 //!
654 //! <table>
655 //! <thead>
656 //! <tr><th>Encoding</th><th>IANA</th></tr>
657 //! </thead>
658 //! <tbody>
659 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
660 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
661 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
662 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
663 //! </tbody>
664 //! </table>
665 //!
666 //! In other cases where the Encoding Standard unifies unextended and extended
667 //! variants of an encoding, the encoding gets the name of the extended
668 //! variant.
669 //!
670 //! <table>
671 //! <thead>
672 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
673 //! </thead>
674 //! <tbody>
675 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
676 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
677 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
678 //! </tbody>
679 //! </table>
680 //!
681 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
682 //! for discussion about the UTF-16 family.
683
684 #![no_std]
685 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
686
687 #[cfg(feature = "alloc")]
688 #[cfg_attr(test, macro_use)]
689 extern crate alloc;
690
691 extern crate core;
692 #[macro_use]
693 extern crate cfg_if;
694
695 #[cfg(all(
696 feature = "simd-accel",
697 any(
698 target_feature = "sse2",
699 all(target_endian = "little", target_arch = "aarch64"),
700 all(target_endian = "little", target_feature = "neon")
701 )
702 ))]
703 #[macro_use(shuffle)]
704 extern crate packed_simd;
705
706 #[cfg(feature = "serde")]
707 extern crate serde;
708
709 #[cfg(all(test, feature = "serde"))]
710 extern crate bincode;
711 #[cfg(all(test, feature = "serde"))]
712 #[macro_use]
713 extern crate serde_derive;
714 #[cfg(all(test, feature = "serde"))]
715 extern crate serde_json;
716
717 #[macro_use]
718 mod macros;
719
720 #[cfg(all(
721 feature = "simd-accel",
722 any(
723 target_feature = "sse2",
724 all(target_endian = "little", target_arch = "aarch64"),
725 all(target_endian = "little", target_feature = "neon")
726 )
727 ))]
728 mod simd_funcs;
729
730 #[cfg(all(test, feature = "alloc"))]
731 mod testing;
732
733 mod big5;
734 mod euc_jp;
735 mod euc_kr;
736 mod gb18030;
737 mod iso_2022_jp;
738 mod replacement;
739 mod shift_jis;
740 mod single_byte;
741 mod utf_16;
742 mod utf_8;
743 mod x_user_defined;
744
745 mod ascii;
746 mod data;
747 mod handles;
748 mod variant;
749
750 pub mod mem;
751
752 use crate::ascii::ascii_valid_up_to;
753 use crate::ascii::iso_2022_jp_ascii_valid_up_to;
754 use crate::utf_8::utf8_valid_up_to;
755 use crate::variant::*;
756
757 #[cfg(feature = "alloc")]
758 use alloc::borrow::Cow;
759 #[cfg(feature = "alloc")]
760 use alloc::string::String;
761 #[cfg(feature = "alloc")]
762 use alloc::vec::Vec;
763 use core::cmp::Ordering;
764 use core::hash::Hash;
765 use core::hash::Hasher;
766
767 #[cfg(feature = "serde")]
768 use serde::de::Visitor;
769 #[cfg(feature = "serde")]
770 use serde::{Deserialize, Deserializer, Serialize, Serializer};
771
/// Maximum length of a numeric character reference (NCR).
///
/// This has to be the max length of an NCR instead of max
/// minus one, because we can't rely on getting the minus
/// one from the space reserved for the current unmappable,
/// because the ISO-2022-JP encoder can fill up that space
/// with a state transition escape.
const NCR_EXTRA: usize = 10; // "&#1114111;".len(), i.e. the NCR for U+10FFFF
778
779 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
780 // Instead, please regenerate using generate-encoding-data.py
781
/// Length in bytes of the label quoted in the trailing comment.
///
/// NOTE(review): going by the name, this is the longest encoding label the
/// crate recognizes; confirm against the generated label table.
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
783
/// The initializer for the [Big5](static.BIG5.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static BIG5_INIT: Encoding = Encoding {
    name: "Big5",
    variant: VariantEncoding::Big5,
};
800
801 /// The Big5 encoding.
802 ///
803 /// This is Big5 with HKSCS with mappings to more recent Unicode assignments
804 /// instead of the Private Use Area code points that have been used historically.
805 /// It is believed to be able to decode existing Web content in a way that makes
806 /// sense.
807 ///
808 /// To avoid form submissions generating data that Web servers don't understand,
809 /// the encoder doesn't use the HKSCS byte sequences that precede the unextended
810 /// Big5 in the lexical order.
811 ///
812 /// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
813 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
814 ///
815 /// This encoding is designed to be suited for decoding the Windows code page 950
816 /// and its HKSCS patched "951" variant such that the text makes sense, given
817 /// assignments that Unicode has made after those encodings used Private Use
818 /// Area characters.
819 ///
820 /// This will change from `static` to `const` if Rust changes
821 /// to make the referent of `pub const FOO: &'static Encoding`
822 /// unique cross-crate, so don't take the address of this
823 /// `static`.
824 pub static BIG5: &'static Encoding = &BIG5_INIT;
825
/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static EUC_JP_INIT: Encoding = Encoding {
    name: "EUC-JP",
    variant: VariantEncoding::EucJp,
};
842
843 /// The EUC-JP encoding.
844 ///
845 /// This is the legacy Unix encoding for Japanese.
846 ///
847 /// For compatibility with Web servers that don't expect three-byte sequences
848 /// in form submissions, the encoder doesn't generate three-byte sequences.
849 /// That is, the JIS X 0212 support is decode-only.
850 ///
851 /// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
852 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
853 ///
854 /// This encoding roughly matches the Windows code page 20932. There are error
855 /// handling differences and a handful of 2-byte sequences that decode differently.
856 /// Additionall, Windows doesn't support 3-byte sequences.
857 ///
858 /// This will change from `static` to `const` if Rust changes
859 /// to make the referent of `pub const FOO: &'static Encoding`
860 /// unique cross-crate, so don't take the address of this
861 /// `static`.
862 pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
863
/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static EUC_KR_INIT: Encoding = Encoding {
    name: "EUC-KR",
    variant: VariantEncoding::EucKr,
};
880
881 /// The EUC-KR encoding.
882 ///
883 /// This is the Korean encoding for Windows. It extends the Unix legacy encoding
884 /// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
885 /// Classic), with all the characters from the Hangul Syllables block of Unicode.
886 ///
887 /// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
888 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
889 ///
890 /// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
891 /// to U+0080 and some byte sequences that are error per the Encoding Standard to
892 /// the question mark or the Private Use Area.
893 ///
894 /// This will change from `static` to `const` if Rust changes
895 /// to make the referent of `pub const FOO: &'static Encoding`
896 /// unique cross-crate, so don't take the address of this
897 /// `static`.
898 pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
899
/// The initializer for the [GBK](static.GBK.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GBK_INIT: Encoding = Encoding {
    name: "GBK",
    variant: VariantEncoding::Gbk,
};
916
917 /// The GBK encoding.
918 ///
919 /// The decoder for this encoding is the same as the decoder for gb18030.
920 /// The encoder side of this encoding is GBK with Windows code page 936 euro
921 /// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
922 /// Unicode block as well as a handful of ideographs from the CJK Unified
923 /// Ideographs Extension A and CJK Compatibility Ideographs blocks.
924 ///
925 /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
926 /// unified with the gb18030 encoder in the Encoding Standard out of concern
927 /// that servers that expect GBK form submissions might not be able to handle
928 /// the four-byte sequences.
929 ///
930 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
931 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
932 ///
933 /// The encoder of this encoding roughly matches the Windows code page 936.
934 /// The decoder side is a superset.
935 ///
936 /// This will change from `static` to `const` if Rust changes
937 /// to make the referent of `pub const FOO: &'static Encoding`
938 /// unique cross-crate, so don't take the address of this
939 /// `static`.
940 pub static GBK: &'static Encoding = &GBK_INIT;
941
/// The initializer for the [IBM866](static.IBM866.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static IBM866_INIT: Encoding = Encoding {
    name: "IBM866",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
};
958
959 /// The IBM866 encoding.
960 ///
961 /// This the most notable one of the DOS Cyrillic code pages. It has the same
962 /// box drawing characters as code page 437, so it can be used for decoding
963 /// DOS-era ASCII + box drawing data.
964 ///
965 /// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
966 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
967 ///
968 /// This encoding matches the Windows code page 866.
969 ///
970 /// This will change from `static` to `const` if Rust changes
971 /// to make the referent of `pub const FOO: &'static Encoding`
972 /// unique cross-crate, so don't take the address of this
973 /// `static`.
974 pub static IBM866: &'static Encoding = &IBM866_INIT;
975
/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_2022_JP_INIT: Encoding = Encoding {
    name: "ISO-2022-JP",
    variant: VariantEncoding::Iso2022Jp,
};
992
993 /// The ISO-2022-JP encoding.
994 ///
995 /// This the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
996 /// byte range to encode non-Basic Latin characters. It's the only encoding
997 /// supported by this crate whose encoder is stateful.
998 ///
999 /// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
1000 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
1001 ///
1002 /// This encoding roughly matches the Windows code page 50220. Notably, Windows
1003 /// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
1004 /// error handling.
1005 ///
1006 /// This will change from `static` to `const` if Rust changes
1007 /// to make the referent of `pub const FOO: &'static Encoding`
1008 /// unique cross-crate, so don't take the address of this
1009 /// `static`.
1010 pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
1011
/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_10_INIT: Encoding = Encoding {
    name: "ISO-8859-10",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
};
1028
1029 /// The ISO-8859-10 encoding.
1030 ///
1031 /// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1032 /// is also known as Latin 6.
1033 ///
1034 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1035 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1036 ///
1037 /// The Windows code page number for this encoding is 28600, but kernel32.dll
1038 /// does not support this encoding.
1039 ///
1040 /// This will change from `static` to `const` if Rust changes
1041 /// to make the referent of `pub const FOO: &'static Encoding`
1042 /// unique cross-crate, so don't take the address of this
1043 /// `static`.
1044 pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1045
/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_13_INIT: Encoding = Encoding {
    name: "ISO-8859-13",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
};
1062
1063 /// The ISO-8859-13 encoding.
1064 ///
1065 /// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1066 /// is also known as Latin 7.
1067 ///
1068 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1069 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1070 ///
1071 /// This encoding matches the Windows code page 28603, except Windows decodes
1072 /// unassigned code points to the Private Use Area of Unicode.
1073 ///
1074 /// This will change from `static` to `const` if Rust changes
1075 /// to make the referent of `pub const FOO: &'static Encoding`
1076 /// unique cross-crate, so don't take the address of this
1077 /// `static`.
1078 pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1079
/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_14_INIT: Encoding = Encoding {
    name: "ISO-8859-14",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
};
1096
1097 /// The ISO-8859-14 encoding.
1098 ///
1099 /// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1100 /// is also known as Latin 8.
1101 ///
1102 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1103 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1104 ///
1105 /// The Windows code page number for this encoding is 28604, but kernel32.dll
1106 /// does not support this encoding.
1107 ///
1108 /// This will change from `static` to `const` if Rust changes
1109 /// to make the referent of `pub const FOO: &'static Encoding`
1110 /// unique cross-crate, so don't take the address of this
1111 /// `static`.
1112 pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1113
/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_15_INIT: Encoding = Encoding {
    name: "ISO-8859-15",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
};
1130
1131 /// The ISO-8859-15 encoding.
1132 ///
1133 /// This is the revised Western European part of the ISO/IEC 8859 encoding
1134 /// family. This encoding is also known as Latin 9.
1135 ///
1136 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1137 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1138 ///
1139 /// This encoding matches the Windows code page 28605.
1140 ///
1141 /// This will change from `static` to `const` if Rust changes
1142 /// to make the referent of `pub const FOO: &'static Encoding`
1143 /// unique cross-crate, so don't take the address of this
1144 /// `static`.
1145 pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1146
/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
///
/// Take the address of this item only where Rust does not accept the
/// non-`_INIT` form directly, such as in the initializer of another
/// `static`. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static` instead.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_16_INIT: Encoding = Encoding {
    name: "ISO-8859-16",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
};
1163
1164 /// The ISO-8859-16 encoding.
1165 ///
1166 /// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1167 /// family. This encoding is also known as Latin 10.
1168 ///
1169 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1170 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1171 ///
1172 /// The Windows code page number for this encoding is 28606, but kernel32.dll
1173 /// does not support this encoding.
1174 ///
1175 /// This will change from `static` to `const` if Rust changes
1176 /// to make the referent of `pub const FOO: &'static Encoding`
1177 /// unique cross-crate, so don't take the address of this
1178 /// `static`.
1179 pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1180
1181 /// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1182 ///
1183 /// For use only for taking the address of this form when
1184 /// Rust prohibits the use of the non-`_INIT` form directly,
1185 /// such as in initializers of other `static`s. If in doubt,
1186 /// use the corresponding non-`_INIT` reference-typed `static`.
1187 ///
1188 /// This part of the public API will go away if Rust changes
1189 /// to make the referent of `pub const FOO: &'static Encoding`
1190 /// unique cross-crate or if Rust starts allowing static arrays
1191 /// to be initialized with `pub static FOO: &'static Encoding`
1192 /// items.
1193 pub static ISO_8859_2_INIT: Encoding = Encoding {
1194 name: "ISO-8859-2",
1195 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
1196 };
1197
1198 /// The ISO-8859-2 encoding.
1199 ///
1200 /// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1201 ///
1202 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1203 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1204 ///
1205 /// This encoding matches the Windows code page 28592.
1206 ///
1207 /// This will change from `static` to `const` if Rust changes
1208 /// to make the referent of `pub const FOO: &'static Encoding`
1209 /// unique cross-crate, so don't take the address of this
1210 /// `static`.
1211 pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1212
1213 /// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1214 ///
1215 /// For use only for taking the address of this form when
1216 /// Rust prohibits the use of the non-`_INIT` form directly,
1217 /// such as in initializers of other `static`s. If in doubt,
1218 /// use the corresponding non-`_INIT` reference-typed `static`.
1219 ///
1220 /// This part of the public API will go away if Rust changes
1221 /// to make the referent of `pub const FOO: &'static Encoding`
1222 /// unique cross-crate or if Rust starts allowing static arrays
1223 /// to be initialized with `pub static FOO: &'static Encoding`
1224 /// items.
1225 pub static ISO_8859_3_INIT: Encoding = Encoding {
1226 name: "ISO-8859-3",
1227 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
1228 };
1229
1230 /// The ISO-8859-3 encoding.
1231 ///
1232 /// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1233 ///
1234 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1235 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1236 ///
1237 /// This encoding matches the Windows code page 28593.
1238 ///
1239 /// This will change from `static` to `const` if Rust changes
1240 /// to make the referent of `pub const FOO: &'static Encoding`
1241 /// unique cross-crate, so don't take the address of this
1242 /// `static`.
1243 pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1244
1245 /// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1246 ///
1247 /// For use only for taking the address of this form when
1248 /// Rust prohibits the use of the non-`_INIT` form directly,
1249 /// such as in initializers of other `static`s. If in doubt,
1250 /// use the corresponding non-`_INIT` reference-typed `static`.
1251 ///
1252 /// This part of the public API will go away if Rust changes
1253 /// to make the referent of `pub const FOO: &'static Encoding`
1254 /// unique cross-crate or if Rust starts allowing static arrays
1255 /// to be initialized with `pub static FOO: &'static Encoding`
1256 /// items.
1257 pub static ISO_8859_4_INIT: Encoding = Encoding {
1258 name: "ISO-8859-4",
1259 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
1260 };
1261
1262 /// The ISO-8859-4 encoding.
1263 ///
1264 /// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1265 ///
1266 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1267 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1268 ///
1269 /// This encoding matches the Windows code page 28594.
1270 ///
1271 /// This will change from `static` to `const` if Rust changes
1272 /// to make the referent of `pub const FOO: &'static Encoding`
1273 /// unique cross-crate, so don't take the address of this
1274 /// `static`.
1275 pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1276
1277 /// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1278 ///
1279 /// For use only for taking the address of this form when
1280 /// Rust prohibits the use of the non-`_INIT` form directly,
1281 /// such as in initializers of other `static`s. If in doubt,
1282 /// use the corresponding non-`_INIT` reference-typed `static`.
1283 ///
1284 /// This part of the public API will go away if Rust changes
1285 /// to make the referent of `pub const FOO: &'static Encoding`
1286 /// unique cross-crate or if Rust starts allowing static arrays
1287 /// to be initialized with `pub static FOO: &'static Encoding`
1288 /// items.
1289 pub static ISO_8859_5_INIT: Encoding = Encoding {
1290 name: "ISO-8859-5",
1291 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
1292 };
1293
1294 /// The ISO-8859-5 encoding.
1295 ///
1296 /// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1297 ///
1298 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1299 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1300 ///
1301 /// This encoding matches the Windows code page 28595.
1302 ///
1303 /// This will change from `static` to `const` if Rust changes
1304 /// to make the referent of `pub const FOO: &'static Encoding`
1305 /// unique cross-crate, so don't take the address of this
1306 /// `static`.
1307 pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1308
1309 /// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1310 ///
1311 /// For use only for taking the address of this form when
1312 /// Rust prohibits the use of the non-`_INIT` form directly,
1313 /// such as in initializers of other `static`s. If in doubt,
1314 /// use the corresponding non-`_INIT` reference-typed `static`.
1315 ///
1316 /// This part of the public API will go away if Rust changes
1317 /// to make the referent of `pub const FOO: &'static Encoding`
1318 /// unique cross-crate or if Rust starts allowing static arrays
1319 /// to be initialized with `pub static FOO: &'static Encoding`
1320 /// items.
1321 pub static ISO_8859_6_INIT: Encoding = Encoding {
1322 name: "ISO-8859-6",
1323 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
1324 };
1325
1326 /// The ISO-8859-6 encoding.
1327 ///
1328 /// This is the Arabic part of the ISO/IEC 8859 encoding family.
1329 ///
1330 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1331 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1332 ///
1333 /// This encoding matches the Windows code page 28596, except Windows decodes
1334 /// unassigned code points to the Private Use Area of Unicode.
1335 ///
1336 /// This will change from `static` to `const` if Rust changes
1337 /// to make the referent of `pub const FOO: &'static Encoding`
1338 /// unique cross-crate, so don't take the address of this
1339 /// `static`.
1340 pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1341
1342 /// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1343 ///
1344 /// For use only for taking the address of this form when
1345 /// Rust prohibits the use of the non-`_INIT` form directly,
1346 /// such as in initializers of other `static`s. If in doubt,
1347 /// use the corresponding non-`_INIT` reference-typed `static`.
1348 ///
1349 /// This part of the public API will go away if Rust changes
1350 /// to make the referent of `pub const FOO: &'static Encoding`
1351 /// unique cross-crate or if Rust starts allowing static arrays
1352 /// to be initialized with `pub static FOO: &'static Encoding`
1353 /// items.
1354 pub static ISO_8859_7_INIT: Encoding = Encoding {
1355 name: "ISO-8859-7",
1356 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
1357 };
1358
1359 /// The ISO-8859-7 encoding.
1360 ///
1361 /// This is the Greek part of the ISO/IEC 8859 encoding family.
1362 ///
1363 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1364 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1365 ///
1366 /// This encoding roughly matches the Windows code page 28597. Windows decodes
1367 /// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1368 /// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1369 /// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1370 /// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1371 /// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1372 ///
1373 /// This will change from `static` to `const` if Rust changes
1374 /// to make the referent of `pub const FOO: &'static Encoding`
1375 /// unique cross-crate, so don't take the address of this
1376 /// `static`.
1377 pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1378
1379 /// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1380 ///
1381 /// For use only for taking the address of this form when
1382 /// Rust prohibits the use of the non-`_INIT` form directly,
1383 /// such as in initializers of other `static`s. If in doubt,
1384 /// use the corresponding non-`_INIT` reference-typed `static`.
1385 ///
1386 /// This part of the public API will go away if Rust changes
1387 /// to make the referent of `pub const FOO: &'static Encoding`
1388 /// unique cross-crate or if Rust starts allowing static arrays
1389 /// to be initialized with `pub static FOO: &'static Encoding`
1390 /// items.
1391 pub static ISO_8859_8_INIT: Encoding = Encoding {
1392 name: "ISO-8859-8",
1393 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1394 };
1395
1396 /// The ISO-8859-8 encoding.
1397 ///
1398 /// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1399 ///
1400 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1401 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1402 ///
1403 /// This encoding roughly matches the Windows code page 28598. Windows decodes
1404 /// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1405 /// Area instead of LRM and RLM. Windows decodes unassigned code points to
1406 /// the private use area.
1407 ///
1408 /// This will change from `static` to `const` if Rust changes
1409 /// to make the referent of `pub const FOO: &'static Encoding`
1410 /// unique cross-crate, so don't take the address of this
1411 /// `static`.
1412 pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1413
1414 /// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1415 ///
1416 /// For use only for taking the address of this form when
1417 /// Rust prohibits the use of the non-`_INIT` form directly,
1418 /// such as in initializers of other `static`s. If in doubt,
1419 /// use the corresponding non-`_INIT` reference-typed `static`.
1420 ///
1421 /// This part of the public API will go away if Rust changes
1422 /// to make the referent of `pub const FOO: &'static Encoding`
1423 /// unique cross-crate or if Rust starts allowing static arrays
1424 /// to be initialized with `pub static FOO: &'static Encoding`
1425 /// items.
1426 pub static ISO_8859_8_I_INIT: Encoding = Encoding {
1427 name: "ISO-8859-8-I",
1428 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1429 };
1430
1431 /// The ISO-8859-8-I encoding.
1432 ///
1433 /// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1434 ///
1435 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1436 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1437 ///
1438 /// This encoding roughly matches the Windows code page 38598. Windows decodes
1439 /// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1440 /// Area instead of LRM and RLM. Windows decodes unassigned code points to
1441 /// the private use area.
1442 ///
1443 /// This will change from `static` to `const` if Rust changes
1444 /// to make the referent of `pub const FOO: &'static Encoding`
1445 /// unique cross-crate, so don't take the address of this
1446 /// `static`.
1447 pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1448
1449 /// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1450 ///
1451 /// For use only for taking the address of this form when
1452 /// Rust prohibits the use of the non-`_INIT` form directly,
1453 /// such as in initializers of other `static`s. If in doubt,
1454 /// use the corresponding non-`_INIT` reference-typed `static`.
1455 ///
1456 /// This part of the public API will go away if Rust changes
1457 /// to make the referent of `pub const FOO: &'static Encoding`
1458 /// unique cross-crate or if Rust starts allowing static arrays
1459 /// to be initialized with `pub static FOO: &'static Encoding`
1460 /// items.
1461 pub static KOI8_R_INIT: Encoding = Encoding {
1462 name: "KOI8-R",
1463 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
1464 };
1465
1466 /// The KOI8-R encoding.
1467 ///
1468 /// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1469 ///
1470 /// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1471 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1472 ///
1473 /// This encoding matches the Windows code page 20866.
1474 ///
1475 /// This will change from `static` to `const` if Rust changes
1476 /// to make the referent of `pub const FOO: &'static Encoding`
1477 /// unique cross-crate, so don't take the address of this
1478 /// `static`.
1479 pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1480
1481 /// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1482 ///
1483 /// For use only for taking the address of this form when
1484 /// Rust prohibits the use of the non-`_INIT` form directly,
1485 /// such as in initializers of other `static`s. If in doubt,
1486 /// use the corresponding non-`_INIT` reference-typed `static`.
1487 ///
1488 /// This part of the public API will go away if Rust changes
1489 /// to make the referent of `pub const FOO: &'static Encoding`
1490 /// unique cross-crate or if Rust starts allowing static arrays
1491 /// to be initialized with `pub static FOO: &'static Encoding`
1492 /// items.
1493 pub static KOI8_U_INIT: Encoding = Encoding {
1494 name: "KOI8-U",
1495 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
1496 };
1497
1498 /// The KOI8-U encoding.
1499 ///
1500 /// This is an encoding for Ukrainian adapted from KOI8-R.
1501 ///
1502 /// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1503 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1504 ///
1505 /// This encoding matches the Windows code page 21866.
1506 ///
1507 /// This will change from `static` to `const` if Rust changes
1508 /// to make the referent of `pub const FOO: &'static Encoding`
1509 /// unique cross-crate, so don't take the address of this
1510 /// `static`.
1511 pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1512
1513 /// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1514 ///
1515 /// For use only for taking the address of this form when
1516 /// Rust prohibits the use of the non-`_INIT` form directly,
1517 /// such as in initializers of other `static`s. If in doubt,
1518 /// use the corresponding non-`_INIT` reference-typed `static`.
1519 ///
1520 /// This part of the public API will go away if Rust changes
1521 /// to make the referent of `pub const FOO: &'static Encoding`
1522 /// unique cross-crate or if Rust starts allowing static arrays
1523 /// to be initialized with `pub static FOO: &'static Encoding`
1524 /// items.
1525 pub static SHIFT_JIS_INIT: Encoding = Encoding {
1526 name: "Shift_JIS",
1527 variant: VariantEncoding::ShiftJis,
1528 };
1529
1530 /// The Shift_JIS encoding.
1531 ///
1532 /// This is the Japanese encoding for Windows.
1533 ///
1534 /// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1535 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1536 ///
1537 /// This encoding matches the Windows code page 932, except Windows decodes some byte
1538 /// sequences that are error per the Encoding Standard to the question mark or the
1539 /// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1540 ///
1541 /// This will change from `static` to `const` if Rust changes
1542 /// to make the referent of `pub const FOO: &'static Encoding`
1543 /// unique cross-crate, so don't take the address of this
1544 /// `static`.
1545 pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1546
1547 /// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1548 ///
1549 /// For use only for taking the address of this form when
1550 /// Rust prohibits the use of the non-`_INIT` form directly,
1551 /// such as in initializers of other `static`s. If in doubt,
1552 /// use the corresponding non-`_INIT` reference-typed `static`.
1553 ///
1554 /// This part of the public API will go away if Rust changes
1555 /// to make the referent of `pub const FOO: &'static Encoding`
1556 /// unique cross-crate or if Rust starts allowing static arrays
1557 /// to be initialized with `pub static FOO: &'static Encoding`
1558 /// items.
1559 pub static UTF_16BE_INIT: Encoding = Encoding {
1560 name: "UTF-16BE",
1561 variant: VariantEncoding::Utf16Be,
1562 };
1563
1564 /// The UTF-16BE encoding.
1565 ///
1566 /// This decode-only encoding uses 16-bit code units due to Unicode originally
1567 /// having been designed as a 16-bit reportoire. In the absence of a byte order
1568 /// mark the big endian byte order is assumed.
1569 ///
1570 /// There is no corresponding encoder in this crate or in the Encoding
1571 /// Standard. The output encoding of this encoding is UTF-8.
1572 ///
1573 /// This encoding matches the Windows code page 1201.
1574 ///
1575 /// This will change from `static` to `const` if Rust changes
1576 /// to make the referent of `pub const FOO: &'static Encoding`
1577 /// unique cross-crate, so don't take the address of this
1578 /// `static`.
1579 pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1580
1581 /// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1582 ///
1583 /// For use only for taking the address of this form when
1584 /// Rust prohibits the use of the non-`_INIT` form directly,
1585 /// such as in initializers of other `static`s. If in doubt,
1586 /// use the corresponding non-`_INIT` reference-typed `static`.
1587 ///
1588 /// This part of the public API will go away if Rust changes
1589 /// to make the referent of `pub const FOO: &'static Encoding`
1590 /// unique cross-crate or if Rust starts allowing static arrays
1591 /// to be initialized with `pub static FOO: &'static Encoding`
1592 /// items.
1593 pub static UTF_16LE_INIT: Encoding = Encoding {
1594 name: "UTF-16LE",
1595 variant: VariantEncoding::Utf16Le,
1596 };
1597
1598 /// The UTF-16LE encoding.
1599 ///
1600 /// This decode-only encoding uses 16-bit code units due to Unicode originally
1601 /// having been designed as a 16-bit reportoire. In the absence of a byte order
1602 /// mark the little endian byte order is assumed.
1603 ///
1604 /// There is no corresponding encoder in this crate or in the Encoding
1605 /// Standard. The output encoding of this encoding is UTF-8.
1606 ///
1607 /// This encoding matches the Windows code page 1200.
1608 ///
1609 /// This will change from `static` to `const` if Rust changes
1610 /// to make the referent of `pub const FOO: &'static Encoding`
1611 /// unique cross-crate, so don't take the address of this
1612 /// `static`.
1613 pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1614
1615 /// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1616 ///
1617 /// For use only for taking the address of this form when
1618 /// Rust prohibits the use of the non-`_INIT` form directly,
1619 /// such as in initializers of other `static`s. If in doubt,
1620 /// use the corresponding non-`_INIT` reference-typed `static`.
1621 ///
1622 /// This part of the public API will go away if Rust changes
1623 /// to make the referent of `pub const FOO: &'static Encoding`
1624 /// unique cross-crate or if Rust starts allowing static arrays
1625 /// to be initialized with `pub static FOO: &'static Encoding`
1626 /// items.
1627 pub static UTF_8_INIT: Encoding = Encoding {
1628 name: "UTF-8",
1629 variant: VariantEncoding::Utf8,
1630 };
1631
1632 /// The UTF-8 encoding.
1633 ///
1634 /// This is the encoding that should be used for all new development it can
1635 /// represent all of Unicode.
1636 ///
1637 /// This encoding matches the Windows code page 65001, except Windows differs
1638 /// in the number of errors generated for some erroneous byte sequences.
1639 ///
1640 /// This will change from `static` to `const` if Rust changes
1641 /// to make the referent of `pub const FOO: &'static Encoding`
1642 /// unique cross-crate, so don't take the address of this
1643 /// `static`.
1644 pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1645
1646 /// The initializer for the [gb18030](static.GB18030.html) encoding.
1647 ///
1648 /// For use only for taking the address of this form when
1649 /// Rust prohibits the use of the non-`_INIT` form directly,
1650 /// such as in initializers of other `static`s. If in doubt,
1651 /// use the corresponding non-`_INIT` reference-typed `static`.
1652 ///
1653 /// This part of the public API will go away if Rust changes
1654 /// to make the referent of `pub const FOO: &'static Encoding`
1655 /// unique cross-crate or if Rust starts allowing static arrays
1656 /// to be initialized with `pub static FOO: &'static Encoding`
1657 /// items.
1658 pub static GB18030_INIT: Encoding = Encoding {
1659 name: "gb18030",
1660 variant: VariantEncoding::Gb18030,
1661 };
1662
1663 /// The gb18030 encoding.
1664 ///
1665 /// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
1666 /// maps to U+3000 for compatibility with existing Web content. As a result,
1667 /// this encoding can represent all of Unicode except for the private-use
1668 /// character U+E5E5.
1669 ///
1670 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1671 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1672 ///
1673 /// This encoding matches the Windows code page 54936.
1674 ///
1675 /// This will change from `static` to `const` if Rust changes
1676 /// to make the referent of `pub const FOO: &'static Encoding`
1677 /// unique cross-crate, so don't take the address of this
1678 /// `static`.
1679 pub static GB18030: &'static Encoding = &GB18030_INIT;
1680
1681 /// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1682 ///
1683 /// For use only for taking the address of this form when
1684 /// Rust prohibits the use of the non-`_INIT` form directly,
1685 /// such as in initializers of other `static`s. If in doubt,
1686 /// use the corresponding non-`_INIT` reference-typed `static`.
1687 ///
1688 /// This part of the public API will go away if Rust changes
1689 /// to make the referent of `pub const FOO: &'static Encoding`
1690 /// unique cross-crate or if Rust starts allowing static arrays
1691 /// to be initialized with `pub static FOO: &'static Encoding`
1692 /// items.
1693 pub static MACINTOSH_INIT: Encoding = Encoding {
1694 name: "macintosh",
1695 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
1696 };
1697
1698 /// The macintosh encoding.
1699 ///
1700 /// This is the MacRoman encoding from Mac OS Classic.
1701 ///
1702 /// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1703 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1704 ///
1705 /// This encoding matches the Windows code page 10000, except Windows decodes
1706 /// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1707 ///
1708 /// This will change from `static` to `const` if Rust changes
1709 /// to make the referent of `pub const FOO: &'static Encoding`
1710 /// unique cross-crate, so don't take the address of this
1711 /// `static`.
1712 pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1713
1714 /// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1715 ///
1716 /// For use only for taking the address of this form when
1717 /// Rust prohibits the use of the non-`_INIT` form directly,
1718 /// such as in initializers of other `static`s. If in doubt,
1719 /// use the corresponding non-`_INIT` reference-typed `static`.
1720 ///
1721 /// This part of the public API will go away if Rust changes
1722 /// to make the referent of `pub const FOO: &'static Encoding`
1723 /// unique cross-crate or if Rust starts allowing static arrays
1724 /// to be initialized with `pub static FOO: &'static Encoding`
1725 /// items.
1726 pub static REPLACEMENT_INIT: Encoding = Encoding {
1727 name: "replacement",
1728 variant: VariantEncoding::Replacement,
1729 };
1730
1731 /// The replacement encoding.
1732 ///
1733 /// This decode-only encoding decodes all non-zero-length streams to a single
1734 /// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1735 /// ASCII-compatible fallback encoding (typically windows-1252) for some
1736 /// encodings that are no longer supported by the Web Platform and that
1737 /// would be dangerous to treat as ASCII-compatible.
1738 ///
1739 /// There is no corresponding encoder. The output encoding of this encoding
1740 /// is UTF-8.
1741 ///
1742 /// This encoding does not have a Windows code page number.
1743 ///
1744 /// This will change from `static` to `const` if Rust changes
1745 /// to make the referent of `pub const FOO: &'static Encoding`
1746 /// unique cross-crate, so don't take the address of this
1747 /// `static`.
1748 pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1749
1750 /// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1751 ///
1752 /// For use only for taking the address of this form when
1753 /// Rust prohibits the use of the non-`_INIT` form directly,
1754 /// such as in initializers of other `static`s. If in doubt,
1755 /// use the corresponding non-`_INIT` reference-typed `static`.
1756 ///
1757 /// This part of the public API will go away if Rust changes
1758 /// to make the referent of `pub const FOO: &'static Encoding`
1759 /// unique cross-crate or if Rust starts allowing static arrays
1760 /// to be initialized with `pub static FOO: &'static Encoding`
1761 /// items.
1762 pub static WINDOWS_1250_INIT: Encoding = Encoding {
1763 name: "windows-1250",
1764 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
1765 };
1766
1767 /// The windows-1250 encoding.
1768 ///
1769 /// This is the Central European encoding for Windows.
1770 ///
1771 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1772 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1773 ///
1774 /// This encoding matches the Windows code page 1250.
1775 ///
1776 /// This will change from `static` to `const` if Rust changes
1777 /// to make the referent of `pub const FOO: &'static Encoding`
1778 /// unique cross-crate, so don't take the address of this
1779 /// `static`.
1780 pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1781
1782 /// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1783 ///
1784 /// For use only for taking the address of this form when
1785 /// Rust prohibits the use of the non-`_INIT` form directly,
1786 /// such as in initializers of other `static`s. If in doubt,
1787 /// use the corresponding non-`_INIT` reference-typed `static`.
1788 ///
1789 /// This part of the public API will go away if Rust changes
1790 /// to make the referent of `pub const FOO: &'static Encoding`
1791 /// unique cross-crate or if Rust starts allowing static arrays
1792 /// to be initialized with `pub static FOO: &'static Encoding`
1793 /// items.
1794 pub static WINDOWS_1251_INIT: Encoding = Encoding {
1795 name: "windows-1251",
1796 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
1797 };
1798
1799 /// The windows-1251 encoding.
1800 ///
1801 /// This is the Cyrillic encoding for Windows.
1802 ///
1803 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1804 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1805 ///
1806 /// This encoding matches the Windows code page 1251.
1807 ///
1808 /// This will change from `static` to `const` if Rust changes
1809 /// to make the referent of `pub const FOO: &'static Encoding`
1810 /// unique cross-crate, so don't take the address of this
1811 /// `static`.
1812 pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1813
1814 /// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
1815 ///
1816 /// For use only for taking the address of this form when
1817 /// Rust prohibits the use of the non-`_INIT` form directly,
1818 /// such as in initializers of other `static`s. If in doubt,
1819 /// use the corresponding non-`_INIT` reference-typed `static`.
1820 ///
1821 /// This part of the public API will go away if Rust changes
1822 /// to make the referent of `pub const FOO: &'static Encoding`
1823 /// unique cross-crate or if Rust starts allowing static arrays
1824 /// to be initialized with `pub static FOO: &'static Encoding`
1825 /// items.
1826 pub static WINDOWS_1252_INIT: Encoding = Encoding {
1827 name: "windows-1252",
1828 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
1829 };
1830
1831 /// The windows-1252 encoding.
1832 ///
1833 /// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1834 /// which is known as Latin 1.
1835 ///
1836 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1837 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1838 ///
1839 /// This encoding matches the Windows code page 1252.
1840 ///
1841 /// This will change from `static` to `const` if Rust changes
1842 /// to make the referent of `pub const FOO: &'static Encoding`
1843 /// unique cross-crate, so don't take the address of this
1844 /// `static`.
1845 pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1846
1847 /// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1848 ///
1849 /// For use only for taking the address of this form when
1850 /// Rust prohibits the use of the non-`_INIT` form directly,
1851 /// such as in initializers of other `static`s. If in doubt,
1852 /// use the corresponding non-`_INIT` reference-typed `static`.
1853 ///
1854 /// This part of the public API will go away if Rust changes
1855 /// to make the referent of `pub const FOO: &'static Encoding`
1856 /// unique cross-crate or if Rust starts allowing static arrays
1857 /// to be initialized with `pub static FOO: &'static Encoding`
1858 /// items.
1859 pub static WINDOWS_1253_INIT: Encoding = Encoding {
1860 name: "windows-1253",
1861 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1862 };
1863
1864 /// The windows-1253 encoding.
1865 ///
1866 /// This is the Greek encoding for Windows. It is mostly an extension of
1867 /// ISO-8859-7, but U+0386 is mapped to a different byte.
1868 ///
1869 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
1870 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
1871 ///
1872 /// This encoding matches the Windows code page 1253, except Windows decodes
1873 /// unassigned code points to the Private Use Area of Unicode.
1874 ///
1875 /// This will change from `static` to `const` if Rust changes
1876 /// to make the referent of `pub const FOO: &'static Encoding`
1877 /// unique cross-crate, so don't take the address of this
1878 /// `static`.
1879 pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1880
1881 /// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
1882 ///
1883 /// For use only for taking the address of this form when
1884 /// Rust prohibits the use of the non-`_INIT` form directly,
1885 /// such as in initializers of other `static`s. If in doubt,
1886 /// use the corresponding non-`_INIT` reference-typed `static`.
1887 ///
1888 /// This part of the public API will go away if Rust changes
1889 /// to make the referent of `pub const FOO: &'static Encoding`
1890 /// unique cross-crate or if Rust starts allowing static arrays
1891 /// to be initialized with `pub static FOO: &'static Encoding`
1892 /// items.
1893 pub static WINDOWS_1254_INIT: Encoding = Encoding {
1894 name: "windows-1254",
1895 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
1896 };
1897
1898 /// The windows-1254 encoding.
1899 ///
1900 /// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
1901 /// which is known as Latin 5.
1902 ///
1903 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
1904 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
1905 ///
1906 /// This encoding matches the Windows code page 1254.
1907 ///
1908 /// This will change from `static` to `const` if Rust changes
1909 /// to make the referent of `pub const FOO: &'static Encoding`
1910 /// unique cross-crate, so don't take the address of this
1911 /// `static`.
1912 pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1913
1914 /// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
1915 ///
1916 /// For use only for taking the address of this form when
1917 /// Rust prohibits the use of the non-`_INIT` form directly,
1918 /// such as in initializers of other `static`s. If in doubt,
1919 /// use the corresponding non-`_INIT` reference-typed `static`.
1920 ///
1921 /// This part of the public API will go away if Rust changes
1922 /// to make the referent of `pub const FOO: &'static Encoding`
1923 /// unique cross-crate or if Rust starts allowing static arrays
1924 /// to be initialized with `pub static FOO: &'static Encoding`
1925 /// items.
1926 pub static WINDOWS_1255_INIT: Encoding = Encoding {
1927 name: "windows-1255",
1928 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
1929 };
1930
1931 /// The windows-1255 encoding.
1932 ///
1933 /// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
1934 /// except for a currency sign swap.
1935 ///
1936 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
1937 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
1938 ///
1939 /// This encoding matches the Windows code page 1255, except Windows decodes
1940 /// unassigned code points to the Private Use Area of Unicode.
1941 ///
1942 /// This will change from `static` to `const` if Rust changes
1943 /// to make the referent of `pub const FOO: &'static Encoding`
1944 /// unique cross-crate, so don't take the address of this
1945 /// `static`.
1946 pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1947
1948 /// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
1949 ///
1950 /// For use only for taking the address of this form when
1951 /// Rust prohibits the use of the non-`_INIT` form directly,
1952 /// such as in initializers of other `static`s. If in doubt,
1953 /// use the corresponding non-`_INIT` reference-typed `static`.
1954 ///
1955 /// This part of the public API will go away if Rust changes
1956 /// to make the referent of `pub const FOO: &'static Encoding`
1957 /// unique cross-crate or if Rust starts allowing static arrays
1958 /// to be initialized with `pub static FOO: &'static Encoding`
1959 /// items.
1960 pub static WINDOWS_1256_INIT: Encoding = Encoding {
1961 name: "windows-1256",
1962 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
1963 };
1964
1965 /// The windows-1256 encoding.
1966 ///
1967 /// This is the Arabic encoding for Windows.
1968 ///
1969 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
1970 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
1971 ///
1972 /// This encoding matches the Windows code page 1256.
1973 ///
1974 /// This will change from `static` to `const` if Rust changes
1975 /// to make the referent of `pub const FOO: &'static Encoding`
1976 /// unique cross-crate, so don't take the address of this
1977 /// `static`.
1978 pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1979
1980 /// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
1981 ///
1982 /// For use only for taking the address of this form when
1983 /// Rust prohibits the use of the non-`_INIT` form directly,
1984 /// such as in initializers of other `static`s. If in doubt,
1985 /// use the corresponding non-`_INIT` reference-typed `static`.
1986 ///
1987 /// This part of the public API will go away if Rust changes
1988 /// to make the referent of `pub const FOO: &'static Encoding`
1989 /// unique cross-crate or if Rust starts allowing static arrays
1990 /// to be initialized with `pub static FOO: &'static Encoding`
1991 /// items.
1992 pub static WINDOWS_1257_INIT: Encoding = Encoding {
1993 name: "windows-1257",
1994 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
1995 };
1996
1997 /// The windows-1257 encoding.
1998 ///
1999 /// This is the Baltic encoding for Windows.
2000 ///
2001 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
2002 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
2003 ///
2004 /// This encoding matches the Windows code page 1257, except Windows decodes
2005 /// unassigned code points to the Private Use Area of Unicode.
2006 ///
2007 /// This will change from `static` to `const` if Rust changes
2008 /// to make the referent of `pub const FOO: &'static Encoding`
2009 /// unique cross-crate, so don't take the address of this
2010 /// `static`.
2011 pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
2012
2013 /// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
2014 ///
2015 /// For use only for taking the address of this form when
2016 /// Rust prohibits the use of the non-`_INIT` form directly,
2017 /// such as in initializers of other `static`s. If in doubt,
2018 /// use the corresponding non-`_INIT` reference-typed `static`.
2019 ///
2020 /// This part of the public API will go away if Rust changes
2021 /// to make the referent of `pub const FOO: &'static Encoding`
2022 /// unique cross-crate or if Rust starts allowing static arrays
2023 /// to be initialized with `pub static FOO: &'static Encoding`
2024 /// items.
2025 pub static WINDOWS_1258_INIT: Encoding = Encoding {
2026 name: "windows-1258",
2027 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
2028 };
2029
2030 /// The windows-1258 encoding.
2031 ///
2032 /// This is the Vietnamese encoding for Windows.
2033 ///
2034 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
2035 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
2036 ///
2037 /// This encoding matches the Windows code page 1258 when used in the
2038 /// non-normalizing mode. Unlike with the other single-byte encodings, the
2039 /// result of decoding is not necessarily in Normalization Form C. On the
2040 /// other hand, input in the Normalization Form C is not encoded without
2041 /// replacement. In general, it's a bad idea to encode to encodings other
2042 /// than UTF-8, but this encoding is especially hazardous to encode to.
2043 ///
2044 /// This will change from `static` to `const` if Rust changes
2045 /// to make the referent of `pub const FOO: &'static Encoding`
2046 /// unique cross-crate, so don't take the address of this
2047 /// `static`.
2048 pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2049
2050 /// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
2051 ///
2052 /// For use only for taking the address of this form when
2053 /// Rust prohibits the use of the non-`_INIT` form directly,
2054 /// such as in initializers of other `static`s. If in doubt,
2055 /// use the corresponding non-`_INIT` reference-typed `static`.
2056 ///
2057 /// This part of the public API will go away if Rust changes
2058 /// to make the referent of `pub const FOO: &'static Encoding`
2059 /// unique cross-crate or if Rust starts allowing static arrays
2060 /// to be initialized with `pub static FOO: &'static Encoding`
2061 /// items.
2062 pub static WINDOWS_874_INIT: Encoding = Encoding {
2063 name: "windows-874",
2064 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
2065 };
2066
2067 /// The windows-874 encoding.
2068 ///
2069 /// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
2070 ///
2071 /// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
2072 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
2073 ///
2074 /// This encoding matches the Windows code page 874, except Windows decodes
2075 /// unassigned code points to the Private Use Area of Unicode.
2076 ///
2077 /// This will change from `static` to `const` if Rust changes
2078 /// to make the referent of `pub const FOO: &'static Encoding`
2079 /// unique cross-crate, so don't take the address of this
2080 /// `static`.
2081 pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2082
2083 /// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
2084 ///
2085 /// For use only for taking the address of this form when
2086 /// Rust prohibits the use of the non-`_INIT` form directly,
2087 /// such as in initializers of other `static`s. If in doubt,
2088 /// use the corresponding non-`_INIT` reference-typed `static`.
2089 ///
2090 /// This part of the public API will go away if Rust changes
2091 /// to make the referent of `pub const FOO: &'static Encoding`
2092 /// unique cross-crate or if Rust starts allowing static arrays
2093 /// to be initialized with `pub static FOO: &'static Encoding`
2094 /// items.
2095 pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
2096 name: "x-mac-cyrillic",
2097 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
2098 };
2099
2100 /// The x-mac-cyrillic encoding.
2101 ///
2102 /// This is the MacUkrainian encoding from Mac OS Classic.
2103 ///
2104 /// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
2105 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
2106 ///
2107 /// This encoding matches the Windows code page 10017.
2108 ///
2109 /// This will change from `static` to `const` if Rust changes
2110 /// to make the referent of `pub const FOO: &'static Encoding`
2111 /// unique cross-crate, so don't take the address of this
2112 /// `static`.
2113 pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2114
2115 /// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
2116 ///
2117 /// For use only for taking the address of this form when
2118 /// Rust prohibits the use of the non-`_INIT` form directly,
2119 /// such as in initializers of other `static`s. If in doubt,
2120 /// use the corresponding non-`_INIT` reference-typed `static`.
2121 ///
2122 /// This part of the public API will go away if Rust changes
2123 /// to make the referent of `pub const FOO: &'static Encoding`
2124 /// unique cross-crate or if Rust starts allowing static arrays
2125 /// to be initialized with `pub static FOO: &'static Encoding`
2126 /// items.
2127 pub static X_USER_DEFINED_INIT: Encoding = Encoding {
2128 name: "x-user-defined",
2129 variant: VariantEncoding::UserDefined,
2130 };
2131
2132 /// The x-user-defined encoding.
2133 ///
2134 /// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
2135 /// them to the Private Use Area of Unicode. It was used for loading binary
2136 /// data into a JavaScript string using `XMLHttpRequest` before XHR supported
2137 /// the `"arraybuffer"` response type.
2138 ///
2139 /// This encoding does not have a Windows code page number.
2140 ///
2141 /// This will change from `static` to `const` if Rust changes
2142 /// to make the referent of `pub const FOO: &'static Encoding`
2143 /// unique cross-crate, so don't take the address of this
2144 /// `static`.
2145 pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2146
// Every label accepted by `Encoding::for_label()`, in ASCII lower case.
//
// The order is what the binary search in `for_label()` relies on: shorter
// labels come first, and labels of equal length are ordered by comparing
// their bytes from the last byte towards the first. The entry at index `i`
// belongs to the encoding at index `i` of `ENCODINGS_IN_LABEL_SORT`, so the
// two tables must always be changed together.
static LABELS_SORTED: [&'static str; 228] = [
    // 2 bytes
    "l1", "l2", "l3", "l4", "l5", "l6", "l9",
    // 3 bytes
    "866", "mac", "koi", "gbk",
    // 4 bytes
    "big5", "utf8", "koi8", "sjis",
    // 5 bytes
    "ucs-2", "ms932", "cp866", "utf-8", "cp819", "ascii", "x-gbk", "greek",
    // 6 bytes
    "cp1250", "cp1251", "latin1", "gb2312", "cp1252", "latin2", "cp1253", "latin3",
    "cp1254", "latin4", "cp1255", "csbig5", "latin5", "utf-16", "cp1256", "ibm866",
    "latin6", "cp1257", "cp1258", "greek8", "ibm819", "arabic", "visual", "korean",
    "euc-jp", "koi8-r", "koi8_r", "euc-kr", "x-sjis", "koi8-u", "hebrew",
    // 7 bytes
    "tis-620", "gb18030", "ksc5601", "gb_2312", "dos-874", "cn-big5",
    "unicode", "chinese", "logical", "cskoi8r", "cseuckr", "koi8-ru",
    // 8 bytes
    "x-cp1250", "ksc_5601", "x-cp1251", "iso88591", "csgb2312", "x-cp1252",
    "iso88592", "x-cp1253", "iso88593", "ecma-114", "x-cp1254", "iso88594",
    "x-cp1255", "iso88595", "x-x-big5", "x-cp1256", "csibm866", "iso88596",
    "x-cp1257", "iso88597", "asmo-708", "ecma-118", "elot_928", "x-cp1258",
    "iso88598", "iso88599", "cyrillic", "utf-16be", "utf-16le", "us-ascii",
    "ms_kanji", "x-euc-jp",
    // 9 bytes
    "iso885910", "iso8859-1", "iso885911", "iso8859-2", "iso8859-3", "iso885913",
    "iso8859-4", "iso885914", "iso8859-5", "iso885915", "iso8859-6", "iso8859-7",
    "iso8859-8", "iso-ir-58", "iso8859-9", "csunicode", "macintosh", "shift-jis",
    "shift_jis",
    // 10 bytes
    "iso-ir-100", "iso8859-10", "iso-ir-110", "gb_2312-80", "iso-8859-1",
    "iso_8859-1", "iso-ir-101", "iso8859-11", "iso-8859-2", "iso_8859-2",
    "hz-gb-2312", "iso-8859-3", "iso_8859-3", "iso8859-13", "iso-8859-4",
    "iso_8859-4", "iso8859-14", "iso-ir-144", "iso-8859-5", "iso_8859-5",
    "iso8859-15", "iso-8859-6", "iso_8859-6", "iso-ir-126", "iso-8859-7",
    "iso_8859-7", "iso-ir-127", "iso-ir-157", "iso-8859-8", "iso_8859-8",
    "iso-ir-138", "iso-ir-148", "iso-8859-9", "iso_8859-9", "iso-ir-109",
    "iso-ir-149", "big5-hkscs", "csshiftjis",
    // 11 bytes
    "iso-8859-10", "iso-8859-11", "csisolatin1", "csisolatin2", "iso-8859-13",
    "csisolatin3", "iso-8859-14", "windows-874", "csisolatin4", "iso-8859-15",
    "iso_8859-15", "csisolatin5", "iso-8859-16", "csisolatin6", "windows-949",
    "csisolatin9", "csiso88596e", "csiso88598e", "unicodefffe", "unicodefeff",
    "csmacintosh", "csiso88596i", "csiso88598i", "windows-31j", "x-mac-roman",
    "iso-2022-cn", "iso-2022-jp", "csiso2022jp", "iso-2022-kr", "csiso2022kr",
    "replacement",
    // 12 bytes
    "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254",
    "windows-1255", "windows-1256", "windows-1257", "windows-1258", "iso-8859-6-e",
    "iso-8859-8-e", "iso-8859-6-i", "iso-8859-8-i", "sun_eu_greek",
    // 13 bytes
    "csksc56011987", "unicode20utf8", "unicode11utf8",
    // 14 bytes
    "ks_c_5601-1987", "ansi_x3.4-1968", "ks_c_5601-1989", "x-mac-cyrillic",
    "x-user-defined",
    // 15 bytes
    "csiso58gb231280", "iso-10646-ucs-2", "iso_8859-1:1987", "iso_8859-2:1987",
    "iso_8859-6:1987", "iso_8859-7:1987", "iso_8859-3:1988", "iso_8859-4:1988",
    "iso_8859-5:1988", "iso_8859-8:1988", "x-unicode20utf8", "iso_8859-9:1989",
    "csisolatingreek", "x-mac-ukrainian", "iso-2022-cn-ext",
    // 16 bytes
    "csisolatinarabic", "csisolatinhebrew",
    // 17 bytes
    "unicode-1-1-utf-8",
    // 18 bytes
    "csisolatincyrillic",
    // 19 bytes
    "cseucpkdfmtjapanese",
];
2377
2378 static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 228] = [
2379 &WINDOWS_1252_INIT,
2380 &ISO_8859_2_INIT,
2381 &ISO_8859_3_INIT,
2382 &ISO_8859_4_INIT,
2383 &WINDOWS_1254_INIT,
2384 &ISO_8859_10_INIT,
2385 &ISO_8859_15_INIT,
2386 &IBM866_INIT,
2387 &MACINTOSH_INIT,
2388 &KOI8_R_INIT,
2389 &GBK_INIT,
2390 &BIG5_INIT,
2391 &UTF_8_INIT,
2392 &KOI8_R_INIT,
2393 &SHIFT_JIS_INIT,
2394 &UTF_16LE_INIT,
2395 &SHIFT_JIS_INIT,
2396 &IBM866_INIT,
2397 &UTF_8_INIT,
2398 &WINDOWS_1252_INIT,
2399 &WINDOWS_1252_INIT,
2400 &GBK_INIT,
2401 &ISO_8859_7_INIT,
2402 &WINDOWS_1250_INIT,
2403 &WINDOWS_1251_INIT,
2404 &WINDOWS_1252_INIT,
2405 &GBK_INIT,
2406 &WINDOWS_1252_INIT,
2407 &ISO_8859_2_INIT,
2408 &WINDOWS_1253_INIT,
2409 &ISO_8859_3_INIT,
2410 &WINDOWS_1254_INIT,
2411 &ISO_8859_4_INIT,
2412 &WINDOWS_1255_INIT,
2413 &BIG5_INIT,
2414 &WINDOWS_1254_INIT,
2415 &UTF_16LE_INIT,
2416 &WINDOWS_1256_INIT,
2417 &IBM866_INIT,
2418 &ISO_8859_10_INIT,
2419 &WINDOWS_1257_INIT,
2420 &WINDOWS_1258_INIT,
2421 &ISO_8859_7_INIT,
2422 &WINDOWS_1252_INIT,
2423 &ISO_8859_6_INIT,
2424 &ISO_8859_8_INIT,
2425 &EUC_KR_INIT,
2426 &EUC_JP_INIT,
2427 &KOI8_R_INIT,
2428 &KOI8_R_INIT,
2429 &EUC_KR_INIT,
2430 &SHIFT_JIS_INIT,
2431 &KOI8_U_INIT,
2432 &ISO_8859_8_INIT,
2433 &WINDOWS_874_INIT,
2434 &GB18030_INIT,
2435 &EUC_KR_INIT,
2436 &GBK_INIT,
2437 &WINDOWS_874_INIT,
2438 &BIG5_INIT,
2439 &UTF_16LE_INIT,
2440 &GBK_INIT,
2441 &ISO_8859_8_I_INIT,
2442 &KOI8_R_INIT,
2443 &EUC_KR_INIT,
2444 &KOI8_U_INIT,
2445 &WINDOWS_1250_INIT,
2446 &EUC_KR_INIT,
2447 &WINDOWS_1251_INIT,
2448 &WINDOWS_1252_INIT,
2449 &GBK_INIT,
2450 &WINDOWS_1252_INIT,
2451 &ISO_8859_2_INIT,
2452 &WINDOWS_1253_INIT,
2453 &ISO_8859_3_INIT,
2454 &ISO_8859_6_INIT,
2455 &WINDOWS_1254_INIT,
2456 &ISO_8859_4_INIT,
2457 &WINDOWS_1255_INIT,
2458 &ISO_8859_5_INIT,
2459 &BIG5_INIT,
2460 &WINDOWS_1256_INIT,
2461 &IBM866_INIT,
2462 &ISO_8859_6_INIT,
2463 &WINDOWS_1257_INIT,
2464 &ISO_8859_7_INIT,
2465 &ISO_8859_6_INIT,
2466 &ISO_8859_7_INIT,
2467 &ISO_8859_7_INIT,
2468 &WINDOWS_1258_INIT,
2469 &ISO_8859_8_INIT,
2470 &WINDOWS_1254_INIT,
2471 &ISO_8859_5_INIT,
2472 &UTF_16BE_INIT,
2473 &UTF_16LE_INIT,
2474 &WINDOWS_1252_INIT,
2475 &SHIFT_JIS_INIT,
2476 &EUC_JP_INIT,
2477 &ISO_8859_10_INIT,
2478 &WINDOWS_1252_INIT,
2479 &WINDOWS_874_INIT,
2480 &ISO_8859_2_INIT,
2481 &ISO_8859_3_INIT,
2482 &ISO_8859_13_INIT,
2483 &ISO_8859_4_INIT,
2484 &ISO_8859_14_INIT,
2485 &ISO_8859_5_INIT,
2486 &ISO_8859_15_INIT,
2487 &ISO_8859_6_INIT,
2488 &ISO_8859_7_INIT,
2489 &ISO_8859_8_INIT,
2490 &GBK_INIT,
2491 &WINDOWS_1254_INIT,
2492 &UTF_16LE_INIT,
2493 &MACINTOSH_INIT,
2494 &SHIFT_JIS_INIT,
2495 &SHIFT_JIS_INIT,
2496 &WINDOWS_1252_INIT,
2497 &ISO_8859_10_INIT,
2498 &ISO_8859_4_INIT,
2499 &GBK_INIT,
2500 &WINDOWS_1252_INIT,
2501 &WINDOWS_1252_INIT,
2502 &ISO_8859_2_INIT,
2503 &WINDOWS_874_INIT,
2504 &ISO_8859_2_INIT,
2505 &ISO_8859_2_INIT,
2506 &REPLACEMENT_INIT,
2507 &ISO_8859_3_INIT,
2508 &ISO_8859_3_INIT,
2509 &ISO_8859_13_INIT,
2510 &ISO_8859_4_INIT,
2511 &ISO_8859_4_INIT,
2512 &ISO_8859_14_INIT,
2513 &ISO_8859_5_INIT,
2514 &ISO_8859_5_INIT,
2515 &ISO_8859_5_INIT,
2516 &ISO_8859_15_INIT,
2517 &ISO_8859_6_INIT,
2518 &ISO_8859_6_INIT,
2519 &ISO_8859_7_INIT,
2520 &ISO_8859_7_INIT,
2521 &ISO_8859_7_INIT,
2522 &ISO_8859_6_INIT,
2523 &ISO_8859_10_INIT,
2524 &ISO_8859_8_INIT,
2525 &ISO_8859_8_INIT,
2526 &ISO_8859_8_INIT,
2527 &WINDOWS_1254_INIT,
2528 &WINDOWS_1254_INIT,
2529 &WINDOWS_1254_INIT,
2530 &ISO_8859_3_INIT,
2531 &EUC_KR_INIT,
2532 &BIG5_INIT,
2533 &SHIFT_JIS_INIT,
2534 &ISO_8859_10_INIT,
2535 &WINDOWS_874_INIT,
2536 &WINDOWS_1252_INIT,
2537 &ISO_8859_2_INIT,
2538 &ISO_8859_13_INIT,
2539 &ISO_8859_3_INIT,
2540 &ISO_8859_14_INIT,
2541 &WINDOWS_874_INIT,
2542 &ISO_8859_4_INIT,
2543 &ISO_8859_15_INIT,
2544 &ISO_8859_15_INIT,
2545 &WINDOWS_1254_INIT,
2546 &ISO_8859_16_INIT,
2547 &ISO_8859_10_INIT,
2548 &EUC_KR_INIT,
2549 &ISO_8859_15_INIT,
2550 &ISO_8859_6_INIT,
2551 &ISO_8859_8_INIT,
2552 &UTF_16BE_INIT,
2553 &UTF_16LE_INIT,
2554 &MACINTOSH_INIT,
2555 &ISO_8859_6_INIT,
2556 &ISO_8859_8_I_INIT,
2557 &SHIFT_JIS_INIT,
2558 &MACINTOSH_INIT,
2559 &REPLACEMENT_INIT,
2560 &ISO_2022_JP_INIT,
2561 &ISO_2022_JP_INIT,
2562 &REPLACEMENT_INIT,
2563 &REPLACEMENT_INIT,
2564 &REPLACEMENT_INIT,
2565 &WINDOWS_1250_INIT,
2566 &WINDOWS_1251_INIT,
2567 &WINDOWS_1252_INIT,
2568 &WINDOWS_1253_INIT,
2569 &WINDOWS_1254_INIT,
2570 &WINDOWS_1255_INIT,
2571 &WINDOWS_1256_INIT,
2572 &WINDOWS_1257_INIT,
2573 &WINDOWS_1258_INIT,
2574 &ISO_8859_6_INIT,
2575 &ISO_8859_8_INIT,
2576 &ISO_8859_6_INIT,
2577 &ISO_8859_8_I_INIT,
2578 &ISO_8859_7_INIT,
2579 &EUC_KR_INIT,
2580 &UTF_8_INIT,
2581 &UTF_8_INIT,
2582 &EUC_KR_INIT,
2583 &WINDOWS_1252_INIT,
2584 &EUC_KR_INIT,
2585 &X_MAC_CYRILLIC_INIT,
2586 &X_USER_DEFINED_INIT,
2587 &GBK_INIT,
2588 &UTF_16LE_INIT,
2589 &WINDOWS_1252_INIT,
2590 &ISO_8859_2_INIT,
2591 &ISO_8859_6_INIT,
2592 &ISO_8859_7_INIT,
2593 &ISO_8859_3_INIT,
2594 &ISO_8859_4_INIT,
2595 &ISO_8859_5_INIT,
2596 &ISO_8859_8_INIT,
2597 &UTF_8_INIT,
2598 &WINDOWS_1254_INIT,
2599 &ISO_8859_7_INIT,
2600 &X_MAC_CYRILLIC_INIT,
2601 &REPLACEMENT_INIT,
2602 &ISO_8859_6_INIT,
2603 &ISO_8859_8_INIT,
2604 &UTF_8_INIT,
2605 &ISO_8859_5_INIT,
2606 &EUC_JP_INIT,
2607 ];
2608
2609 // END GENERATED CODE
2610
2611 /// An encoding as defined in the [Encoding Standard][1].
2612 ///
2613 /// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
2614 /// and, in most cases, vice versa. Each encoding has a name, an output
2615 /// encoding, and one or more labels.
2616 ///
2617 /// _Labels_ are ASCII-case-insensitive strings that are used to identify an
2618 /// encoding in formats and protocols. The _name_ of the encoding is the
2619 /// preferred label in the case appropriate for returning from the
2620 /// [`characterSet`][2] property of the `Document` DOM interface.
2621 ///
2622 /// The _output encoding_ is the encoding used for form submission and URL
2623 /// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
2624 /// UTF-16LE and UTF-16BE encodings and the encoding itself for other
2625 /// encodings.
2626 ///
2627 /// [1]: https://encoding.spec.whatwg.org/
2628 /// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
2629 ///
2630 /// # Streaming vs. Non-Streaming
2631 ///
2632 /// When you have the entire input in a single buffer, you can use the
2633 /// methods [`decode()`][3], [`decode_with_bom_removal()`][3],
2634 /// [`decode_without_bom_handling()`][5],
2635 /// [`decode_without_bom_handling_and_without_replacement()`][6] and
2636 /// [`encode()`][7]. (These methods are available to Rust callers only and are
2637 /// not available in the C API.) Unlike the rest of the API available to Rust,
2638 /// these methods perform heap allocations. You should the `Decoder` and
2639 /// `Encoder` objects when your input is split into multiple buffers or when
2640 /// you want to control the allocation of the output buffers.
2641 ///
2642 /// [3]: #method.decode
2643 /// [4]: #method.decode_with_bom_removal
2644 /// [5]: #method.decode_without_bom_handling
2645 /// [6]: #method.decode_without_bom_handling_and_without_replacement
2646 /// [7]: #method.encode
2647 ///
2648 /// # Instances
2649 ///
2650 /// All instances of `Encoding` are statically allocated and have the `'static`
2651 /// lifetime. There is precisely one unique `Encoding` instance for each
2652 /// encoding defined in the Encoding Standard.
2653 ///
2654 /// To obtain a reference to a particular encoding whose identity you know at
2655 /// compile time, use a `static` that refers to encoding. There is a `static`
2656 /// for each encoding. The `static`s are named in all caps with hyphens
2657 /// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
2658 /// name). For example, if you know at compile time that you will want to
2659 /// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
2660 /// in C/C++).
2661 ///
2662 /// Additionally, there are non-reference-typed forms ending with `_INIT` to
2663 /// work around the problem that `static`s of the type `&'static Encoding`
2664 /// cannot be used to initialize items of an array whose type is
2665 /// `[&'static Encoding; N]`.
2666 ///
2667 /// If you don't know what encoding you need at compile time and need to
2668 /// dynamically get an encoding by label, use
2669 /// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
2670 ///
2671 /// Instances of `Encoding` can be compared with `==` (in both Rust and in
2672 /// C/C++).
2673 pub struct Encoding {
2674 name: &'static str,
2675 variant: VariantEncoding,
2676 }
2677
2678 impl Encoding {
2679 /// Implements the
2680 /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2681 /// algorithm.
2682 ///
2683 /// If, after ASCII-lowercasing and removing leading and trailing
2684 /// whitespace, the argument matches a label defined in the Encoding
2685 /// Standard, `Some(&'static Encoding)` representing the corresponding
2686 /// encoding is returned. If there is no match, `None` is returned.
2687 ///
2688 /// This is the right method to use if the action upon the method returning
2689 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2690 /// When the action upon the method returning `None` is not to proceed with
2691 /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2692 /// appropriate.
2693 ///
2694 /// The argument is of type `&[u8]` instead of `&str` to save callers
2695 /// that are extracting the label from a non-UTF-8 protocol the trouble
2696 /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2697 /// on it.)
2698 ///
2699 /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2700 pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2701 let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2702 let mut trimmed_pos = 0usize;
2703 let mut iter = label.into_iter();
2704 // before
2705 loop {
2706 match iter.next() {
2707 None => {
2708 return None;
2709 }
2710 Some(byte) => {
2711 // The characters used in labels are:
2712 // a-z (except q, but excluding it below seems excessive)
2713 // 0-9
2714 // . _ - :
2715 match *byte {
2716 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2717 continue;
2718 }
2719 b'A'..=b'Z' => {
2720 trimmed[trimmed_pos] = *byte + 0x20u8;
2721 trimmed_pos = 1usize;
2722 break;
2723 }
2724 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2725 trimmed[trimmed_pos] = *byte;
2726 trimmed_pos = 1usize;
2727 break;
2728 }
2729 _ => {
2730 return None;
2731 }
2732 }
2733 }
2734 }
2735 }
2736 // inside
2737 loop {
2738 match iter.next() {
2739 None => {
2740 break;
2741 }
2742 Some(byte) => {
2743 match *byte {
2744 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2745 break;
2746 }
2747 b'A'..=b'Z' => {
2748 if trimmed_pos == LONGEST_LABEL_LENGTH {
2749 // There's no encoding with a label this long
2750 return None;
2751 }
2752 trimmed[trimmed_pos] = *byte + 0x20u8;
2753 trimmed_pos += 1usize;
2754 continue;
2755 }
2756 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2757 if trimmed_pos == LONGEST_LABEL_LENGTH {
2758 // There's no encoding with a label this long
2759 return None;
2760 }
2761 trimmed[trimmed_pos] = *byte;
2762 trimmed_pos += 1usize;
2763 continue;
2764 }
2765 _ => {
2766 return None;
2767 }
2768 }
2769 }
2770 }
2771 }
2772 // after
2773 loop {
2774 match iter.next() {
2775 None => {
2776 break;
2777 }
2778 Some(byte) => {
2779 match *byte {
2780 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2781 continue;
2782 }
2783 _ => {
2784 // There's no label with space in the middle
2785 return None;
2786 }
2787 }
2788 }
2789 }
2790 }
2791 let candidate = &trimmed[..trimmed_pos];
2792 match LABELS_SORTED.binary_search_by(|probe| {
2793 let bytes = probe.as_bytes();
2794 let c = bytes.len().cmp(&candidate.len());
2795 if c != Ordering::Equal {
2796 return c;
2797 }
2798 let probe_iter = bytes.iter().rev();
2799 let candidate_iter = candidate.iter().rev();
2800 probe_iter.cmp(candidate_iter)
2801 }) {
2802 Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2803 Err(_) => None,
2804 }
2805 }
2806
2807 /// This method behaves the same as `for_label()`, except when `for_label()`
2808 /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2809 ///
2810 /// This method is useful in scenarios where a fatal error is required
2811 /// upon invalid label, because in those cases the caller typically wishes
2812 /// to treat the labels that map to the replacement encoding as fatal
2813 /// errors, too.
2814 ///
2815 /// It is not OK to use this method when the action upon the method returning
2816 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2817 /// case, the `for_label()` method should be used instead in order to avoid
2818 /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2819 ///
2820 /// Available via the C wrapper.
2821 #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2822 pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2823 match Encoding::for_label(label) {
2824 None => None,
2825 Some(encoding) => {
2826 if encoding == REPLACEMENT {
2827 None
2828 } else {
2829 Some(encoding)
2830 }
2831 }
2832 }
2833 }
2834
2835 /// Performs non-incremental BOM sniffing.
2836 ///
2837 /// The argument must either be a buffer representing the entire input
2838 /// stream (non-streaming case) or a buffer representing at least the first
2839 /// three bytes of the input stream (streaming case).
2840 ///
2841 /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2842 /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2843 /// or UTF-16BE BOM or `None` otherwise.
2844 ///
2845 /// Available via the C wrapper.
2846 #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2847 pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2848 if buffer.starts_with(b"\xEF\xBB\xBF") {
2849 Some((UTF_8, 3))
2850 } else if buffer.starts_with(b"\xFF\xFE") {
2851 Some((UTF_16LE, 2))
2852 } else if buffer.starts_with(b"\xFE\xFF") {
2853 Some((UTF_16BE, 2))
2854 } else {
2855 None
2856 }
2857 }
2858
/// Returns the name of this encoding.
///
/// This name is appropriate to return as-is from the DOM
/// `document.characterSet` property.
///
/// The returned string is the `name` field of this `Encoding`; no
/// allocation or conversion takes place.
///
/// Available via the C wrapper.
#[inline]
pub fn name(&'static self) -> &'static str {
    self.name
}
2869
/// Checks whether the _output encoding_ of this encoding can encode every
/// `char`. (Only true if the output encoding is UTF-8.)
///
/// Note that the check is made against `output_encoding()`, not against
/// this encoding itself.
///
/// Available via the C wrapper.
#[inline]
pub fn can_encode_everything(&'static self) -> bool {
    // Per the contract above, UTF-8 is the only output encoding that
    // qualifies, so an identity comparison is sufficient.
    self.output_encoding() == UTF_8
}
2878
2879 /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2880 /// U+0000...U+007F and vice versa.
2881 ///
2882 /// Available via the C wrapper.
2883 #[inline]
is_ascii_compatible(&'static self) -> bool2884 pub fn is_ascii_compatible(&'static self) -> bool {
2885 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2886 }
2887
/// Checks whether this encoding maps one byte to one Basic Multilingual
/// Plane code point (i.e. byte length equals decoded UTF-16 length) and
/// vice versa (for mappable characters).
///
/// `true` iff this encoding is on the list of [Legacy single-byte
/// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
/// in the spec or x-user-defined.
///
/// Available via the C wrapper.
#[inline]
pub fn is_single_byte(&'static self) -> bool {
    // The answer is a property of the encoding's variant, so delegate to it.
    self.variant.is_single_byte()
}
2901
/// Checks whether the bytes 0x00...0x7F map mostly to the characters
/// U+0000...U+007F and vice versa.
///
/// This is `false` for the replacement encoding, UTF-16BE and UTF-16LE and
/// `true` for every other encoding. Unlike `is_ascii_compatible()`, this
/// check lets ISO-2022-JP through.
#[cfg(feature = "alloc")]
#[inline]
fn is_potentially_borrowable(&'static self) -> bool {
    self != REPLACEMENT && self != UTF_16BE && self != UTF_16LE
}
2909
2910 /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2911 /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2912 ///
2913 /// Available via the C wrapper.
2914 #[inline]
output_encoding(&'static self) -> &'static Encoding2915 pub fn output_encoding(&'static self) -> &'static Encoding {
2916 if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2917 UTF_8
2918 } else {
2919 self
2920 }
2921 }
2922
/// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
/// malformed sequences replaced with the REPLACEMENT CHARACTER when the
/// entire input is available as a single buffer (i.e. the end of the
/// buffer marks the end of the stream).
///
/// This method implements the (non-streaming version of) the
/// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
///
/// The returned tuple consists of:
///
/// 1. the decoded text,
/// 2. the encoding that was actually used (which may differ from this
///    encoding thanks to BOM sniffing), and
/// 3. whether there were malformed sequences (that were replaced with the
///    REPLACEMENT CHARACTER).
///
/// _Note:_ It is wrong to use this when the input buffer represents only
/// a segment of the input instead of the whole input. Use `new_decoder()`
/// when decoding segmented input.
///
/// This method performs one or two heap allocations for the backing
/// buffer of the `String` when unable to borrow. (One allocation if there
/// are no errors and potentially another one in the presence of errors.)
/// The first allocation assumes jemalloc and may not be optimal with
/// allocators that do not use power-of-two buckets. A borrow is performed
/// if decoding UTF-8 and the input is valid UTF-8, if decoding an
/// ASCII-compatible encoding and the input is ASCII-only, or when decoding
/// ISO-2022-JP and the input is entirely in the ASCII state without state
/// transitions.
///
/// # Panics
///
/// If the size calculation for a heap-allocated backing buffer overflows
/// `usize`.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
#[inline]
pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
    // A BOM, when present, overrides `self` and is skipped in the input.
    let (encoding, without_bom) = Encoding::for_bom(bytes)
        .map(|(sniffed, bom_length)| (sniffed, &bytes[bom_length..]))
        .unwrap_or((self, bytes));
    let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
    (cow, encoding, had_errors)
}
2968
/// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
/// malformed sequences replaced with the REPLACEMENT CHARACTER when the
/// entire input is available as a single buffer (i.e. the end of the
/// buffer marks the end of the stream).
///
/// Only the BOM of this very encoding is removed, which means that a BOM
/// is only ever removed when this encoding is `UTF_8`, `UTF_16LE` or
/// `UTF_16BE`. A BOM of another encoding is passed on to the decoder as
/// ordinary input.
///
/// When invoked on `UTF_8`, this method implements the (non-streaming
/// version of) the
/// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
/// concept.
///
/// The second item in the returned pair indicates whether there were
/// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
///
/// _Note:_ It is wrong to use this when the input buffer represents only
/// a segment of the input instead of the whole input. Use
/// `new_decoder_with_bom_removal()` when decoding segmented input.
///
/// This method performs one or two heap allocations for the backing
/// buffer of the `String` when unable to borrow. (One allocation if there
/// are no errors and potentially another one in the presence of errors.)
/// The first allocation assumes jemalloc and may not be optimal with
/// allocators that do not use power-of-two buckets. A borrow is performed
/// if decoding UTF-8 and the input is valid UTF-8, if decoding an
/// ASCII-compatible encoding and the input is ASCII-only, or when decoding
/// ISO-2022-JP and the input is entirely in the ASCII state without state
/// transitions.
///
/// # Panics
///
/// If the size calculation for a heap-allocated backing buffer overflows
/// `usize`.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
#[inline]
pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
    // The BOM that belongs to this encoding; empty for encodings that
    // have none, in which case nothing is stripped below.
    let own_bom: &[u8] = if self == UTF_8 {
        b"\xEF\xBB\xBF"
    } else if self == UTF_16LE {
        b"\xFF\xFE"
    } else if self == UTF_16BE {
        b"\xFE\xFF"
    } else {
        b""
    };
    let without_bom = if bytes.starts_with(own_bom) {
        &bytes[own_bom.len()..]
    } else {
        bytes
    };
    self.decode_without_bom_handling(without_bom)
}
3017
/// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
/// with malformed sequences replaced with the REPLACEMENT CHARACTER when
/// the entire input is available as a single buffer (i.e. the end of the
/// buffer marks the end of the stream).
///
/// When invoked on `UTF_8`, this method implements the (non-streaming
/// version of) the
/// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
/// spec concept.
///
/// The second item in the returned pair indicates whether there were
/// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
///
/// _Note:_ It is wrong to use this when the input buffer represents only
/// a segment of the input instead of the whole input. Use
/// `new_decoder_without_bom_handling()` when decoding segmented input.
///
/// This method performs one or two heap allocations for the backing
/// buffer of the `String` when unable to borrow. (One allocation if there
/// are no errors and potentially another one in the presence of errors.)
/// The first allocation assumes jemalloc and may not be optimal with
/// allocators that do not use power-of-two buckets. A borrow is performed
/// if decoding UTF-8 and the input is valid UTF-8, if decoding an
/// ASCII-compatible encoding and the input is ASCII-only, or when decoding
/// ISO-2022-JP and the input is entirely in the ASCII state without state
/// transitions.
///
/// # Panics
///
/// If the size calculation for a heap-allocated backing buffer overflows
/// `usize`.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
    let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
        // Find the longest prefix that can be used as UTF-8 without
        // running the decoder.
        let valid_up_to = if self == UTF_8 {
            utf8_valid_up_to(bytes)
        } else if self == ISO_2022_JP {
            iso_2022_jp_ascii_valid_up_to(bytes)
        } else {
            ascii_valid_up_to(bytes)
        };
        if valid_up_to == bytes.len() {
            // SAFETY: The validator selected above accepted the whole
            // input, so `bytes` can be viewed as UTF-8 as is.
            let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
            return (Cow::Borrowed(str), false);
        }
        let decoder = self.new_decoder_without_bom_handling();

        // Prefer the error-free size rounded up to a power of two, but
        // never ask for more than the worst case with replacements.
        let rounded_without_replacement = checked_next_power_of_two(checked_add(
            valid_up_to,
            decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
        ));
        let with_replacement = checked_add(
            valid_up_to,
            decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
        );
        let mut string = String::with_capacity(
            checked_min(rounded_without_replacement, with_replacement).unwrap(),
        );
        // Copy the already-validated prefix. The capacity requested above
        // includes `valid_up_to`, so this is not expected to reallocate.
        // Going through `push_str` avoids calling `Vec::set_len` over
        // bytes that have not been written yet.
        //
        // SAFETY: `bytes[..valid_up_to]` was accepted by the validator
        // selected above; this is the same guarantee that the borrowing
        // fast path relies on.
        string.push_str(unsafe { core::str::from_utf8_unchecked(&bytes[..valid_up_to]) });
        (decoder, string, valid_up_to)
    } else {
        // No prefix can be borrowed or copied; decode from the start.
        let decoder = self.new_decoder_without_bom_handling();
        let rounded_without_replacement = checked_next_power_of_two(
            decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
        );
        let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
        let string = String::with_capacity(
            checked_min(rounded_without_replacement, with_replacement).unwrap(),
        );
        (decoder, string, 0)
    };

    let mut total_had_errors = false;
    loop {
        let (result, read, had_errors) =
            decoder.decode_to_string(&bytes[total_read..], &mut string, true);
        total_read += read;
        total_had_errors |= had_errors;
        match result {
            CoderResult::InputEmpty => {
                debug_assert_eq!(total_read, bytes.len());
                return (Cow::Owned(string), total_had_errors);
            }
            CoderResult::OutputFull => {
                // Allocate for the worst case. That is, we should come
                // here at most once per invocation of this method.
                let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
                string.reserve(needed.unwrap());
            }
        }
    }
}
3117
/// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
/// _with malformed sequences treated as fatal_ when the entire input is
/// available as a single buffer (i.e. the end of the buffer marks the end
/// of the stream).
///
/// When invoked on `UTF_8`, this method implements the (non-streaming
/// version of) the
/// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
/// spec concept.
///
/// Returns `None` if a malformed sequence was encountered and the result
/// of the decode as `Some(Cow<'a, str>)` otherwise.
///
/// _Note:_ It is wrong to use this when the input buffer represents only
/// a segment of the input instead of the whole input. Use
/// `new_decoder_without_bom_handling()` when decoding segmented input.
///
/// This method performs a single heap allocation for the backing
/// buffer of the `String` when unable to borrow. A borrow is performed if
/// decoding UTF-8 and the input is valid UTF-8, if decoding an
/// ASCII-compatible encoding and the input is ASCII-only, or when decoding
/// ISO-2022-JP and the input is entirely in the ASCII state without state
/// transitions.
///
/// # Panics
///
/// If the size calculation for a heap-allocated backing buffer overflows
/// `usize`.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn decode_without_bom_handling_and_without_replacement<'a>(
    &'static self,
    bytes: &'a [u8],
) -> Option<Cow<'a, str>> {
    if self == UTF_8 {
        // For UTF-8, decoding without replacement is pure validation:
        // either the whole input can be borrowed or the decode fails.
        let valid_up_to = utf8_valid_up_to(bytes);
        if valid_up_to == bytes.len() {
            // SAFETY: `utf8_valid_up_to` accepted the whole input.
            let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
            return Some(Cow::Borrowed(str));
        }
        return None;
    }
    let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
        let valid_up_to = if self == ISO_2022_JP {
            iso_2022_jp_ascii_valid_up_to(bytes)
        } else {
            ascii_valid_up_to(bytes)
        };
        if valid_up_to == bytes.len() {
            // SAFETY: The validator selected above accepted the whole
            // input, so `bytes` can be viewed as UTF-8 as is.
            let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
            return Some(Cow::Borrowed(str));
        }
        let decoder = self.new_decoder_without_bom_handling();
        let mut string = String::with_capacity(
            checked_add(
                valid_up_to,
                decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
            )
            .unwrap(),
        );
        // Copy the already-validated prefix. The capacity requested above
        // includes `valid_up_to`, so this is not expected to reallocate.
        // Going through `push_str` avoids calling `Vec::set_len` over
        // bytes that have not been written yet.
        //
        // SAFETY: `bytes[..valid_up_to]` was accepted by the validator
        // selected above; this is the same guarantee that the borrowing
        // fast path relies on.
        string.push_str(unsafe { core::str::from_utf8_unchecked(&bytes[..valid_up_to]) });
        (decoder, string, &bytes[valid_up_to..])
    } else {
        let decoder = self.new_decoder_without_bom_handling();
        let string = String::with_capacity(
            decoder
                .max_utf8_buffer_length_without_replacement(bytes.len())
                .unwrap(),
        );
        (decoder, string, bytes)
    };
    let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
    match result {
        DecoderResult::InputEmpty => {
            debug_assert_eq!(read, input.len());
            Some(Cow::Owned(string))
        }
        DecoderResult::Malformed(_, _) => None,
        // The buffer was sized with
        // `max_utf8_buffer_length_without_replacement()`, which is
        // documented to be enough for the decoder never to ask for more
        // output space.
        DecoderResult::OutputFull => unreachable!(),
    }
}
3205
/// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
/// replaced with decimal numeric character references when the entire input
/// is available as a single buffer (i.e. the end of the buffer marks the
/// end of the stream).
///
/// This method implements the (non-streaming version of) the
/// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
/// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
/// spec concept, it is slightly more efficient to use
/// <code><var>string</var>.as_bytes()</code> instead of invoking this
/// method on `UTF_8`.
///
/// The second item in the returned tuple is the encoding that was actually
/// used (which may differ from this encoding thanks to some encodings
/// having UTF-8 as their output encoding).
///
/// The third item in the returned tuple indicates whether there were
/// unmappable characters (that were replaced with HTML numeric character
/// references).
///
/// _Note:_ It is wrong to use this when the input buffer represents only
/// a segment of the input instead of the whole input. Use `new_encoder()`
/// when encoding segmented input.
///
/// When encoding to UTF-8 or when encoding an ASCII-only input to an
/// ASCII-compatible encoding, this method returns a borrow of the input
/// without a heap allocation. Otherwise, this method performs a single
/// heap allocation for the backing buffer of the `Vec<u8>` if there are no
/// unmappable characters and potentially multiple heap allocations if
/// there are. These allocations are tuned for jemalloc and may not be
/// optimal when using a different allocator that doesn't use power-of-two
/// buckets.
///
/// # Panics
///
/// If the size calculation for a heap-allocated backing buffer overflows
/// `usize`.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
    let output_encoding = self.output_encoding();
    if output_encoding == UTF_8 {
        // A `&str` already is UTF-8, so there is nothing to convert.
        return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
    }
    debug_assert!(output_encoding.is_potentially_borrowable());
    let bytes = string.as_bytes();
    // Find the longest prefix that encodes to itself.
    let valid_up_to = if output_encoding == ISO_2022_JP {
        iso_2022_jp_ascii_valid_up_to(bytes)
    } else {
        ascii_valid_up_to(bytes)
    };
    if valid_up_to == bytes.len() {
        return (Cow::Borrowed(bytes), output_encoding, false);
    }
    let mut encoder = output_encoding.new_encoder();
    // `checked_next_power_of_two` makes an overflowing size calculation
    // panic in every build profile. The unchecked `next_power_of_two`
    // wraps to zero in release builds, which would silently produce an
    // undersized buffer.
    let capacity = checked_add(
        valid_up_to,
        encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
    )
    .and_then(usize::checked_next_power_of_two)
    .unwrap();
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // Copy the prefix that needs no conversion. `capacity` includes
    // `valid_up_to`, so this is not expected to reallocate.
    vec.extend_from_slice(&bytes[..valid_up_to]);
    let mut total_read = valid_up_to;
    let mut total_had_errors = false;
    loop {
        let (result, read, had_errors) =
            encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
        total_read += read;
        total_had_errors |= had_errors;
        match result {
            CoderResult::InputEmpty => {
                debug_assert_eq!(total_read, string.len());
                return (Cow::Owned(vec), output_encoding, total_had_errors);
            }
            CoderResult::OutputFull => {
                // reserve_exact wants to know how much more on top of current
                // length--not current capacity.
                let needed = encoder
                    .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
                let rounded = checked_add(vec.capacity(), needed)
                    .and_then(usize::checked_next_power_of_two)
                    .unwrap();
                // `rounded` is at least the current capacity, which in turn
                // is at least the current length, so this cannot underflow.
                let additional = rounded - vec.len();
                vec.reserve_exact(additional);
            }
        }
    }
}
3301
new_variant_decoder(&'static self) -> VariantDecoder3302 fn new_variant_decoder(&'static self) -> VariantDecoder {
3303 self.variant.new_variant_decoder()
3304 }
3305
/// Instantiates a new decoder for this encoding with BOM sniffing enabled.
///
/// BOM sniffing may cause the returned decoder to morph into a decoder
/// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
///
/// Available via the C wrapper.
#[inline]
pub fn new_decoder(&'static self) -> Decoder {
    // `BomHandling::Sniff` requests sniffing for the UTF-8, UTF-16BE and
    // UTF-16LE BOMs.
    Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
}
3316
/// Instantiates a new decoder for this encoding with BOM removal.
///
/// If the input starts with bytes that are the BOM for this encoding,
/// those bytes are removed. However, the decoder never morphs into a
/// decoder for another encoding: A BOM for another encoding is treated as
/// (potentially malformed) input to the decoding algorithm for this
/// encoding.
///
/// Available via the C wrapper.
#[inline]
pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
    // `BomHandling::Remove` removes the BOM only if it is the BOM for this
    // encoding.
    Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
}
3330
/// Instantiates a new decoder for this encoding with BOM handling disabled.
///
/// If the input starts with bytes that look like a BOM, those bytes are
/// not treated as a BOM. (Hence, the decoder never morphs into a decoder
/// for another encoding.)
///
/// _Note:_ If the caller has performed BOM sniffing on its own but has not
/// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
/// instead of this method to cause the BOM to be removed.
///
/// Available via the C wrapper.
#[inline]
pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
    // `BomHandling::Off` turns BOM handling off entirely.
    Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
}
3346
/// Instantiates a new encoder for the output encoding of this encoding.
///
/// Because the encoder targets `output_encoding()`, calling this on
/// UTF-16BE, UTF-16LE or the replacement encoding yields an encoder for
/// UTF-8.
///
/// Available via the C wrapper.
#[inline]
pub fn new_encoder(&'static self) -> Encoder {
    // Resolve the output encoding first: both the variant used to build
    // the encoder and the encoding passed to it are the output encoding's.
    let enc = self.output_encoding();
    enc.variant.new_encoder(enc)
}
3355
/// Validates UTF-8.
///
/// Returns the index of the first byte that makes the input malformed as
/// UTF-8 or the length of the slice if the slice is entirely valid.
///
/// This is currently faster than the corresponding standard library
/// functionality. If this implementation gets upstreamed to the standard
/// library, this method may be removed in the future.
///
/// Available via the C wrapper.
pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
    // This is not recursion: an unqualified path inside an `impl` does not
    // resolve to an associated function, so this calls the free function
    // of the same name that is in scope in this module.
    utf8_valid_up_to(bytes)
}
3369
/// Validates ASCII.
///
/// Returns the index of the first byte that makes the input malformed as
/// ASCII or the length of the slice if the slice is entirely valid.
///
/// Available via the C wrapper.
pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
    // This is not recursion: an unqualified path inside an `impl` does not
    // resolve to an associated function, so this calls the free function
    // of the same name that is in scope in this module.
    ascii_valid_up_to(bytes)
}
3379
/// Validates ISO-2022-JP ASCII-state data.
///
/// Returns the index of the first byte that makes the input not
/// representable in the ASCII state of ISO-2022-JP or the length of the
/// slice if the slice is entirely representable in the ASCII state of
/// ISO-2022-JP.
///
/// Available via the C wrapper.
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    // This is not recursion: an unqualified path inside an `impl` does not
    // resolve to an associated function, so this calls the free function
    // of the same name that is in scope in this module.
    iso_2022_jp_ascii_valid_up_to(bytes)
}
3391 }
3392
3393 impl PartialEq for Encoding {
3394 #[inline]
eq(&self, other: &Encoding) -> bool3395 fn eq(&self, other: &Encoding) -> bool {
3396 (self as *const Encoding) == (other as *const Encoding)
3397 }
3398 }
3399
// The `PartialEq` implementation compares addresses, which is a full
// equivalence relation, so the `Eq` marker trait applies.
impl Eq for Encoding {}
3401
/// Test-only ordering of `Encoding` values by their addresses. The order
/// is total, so this simply wraps the result of the `Ord` implementation.
#[cfg(test)]
impl PartialOrd for Encoding {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
3408
/// Test-only total order on `Encoding` values: the addresses of the two
/// values are compared as integers.
#[cfg(test)]
impl Ord for Encoding {
    fn cmp(&self, other: &Self) -> Ordering {
        let own_address = self as *const Encoding as usize;
        let other_address = other as *const Encoding as usize;
        own_address.cmp(&other_address)
    }
}
3415
3416 impl Hash for Encoding {
3417 #[inline]
hash<H: Hasher>(&self, state: &mut H)3418 fn hash<H: Hasher>(&self, state: &mut H) {
3419 (self as *const Encoding).hash(state);
3420 }
3421 }
3422
3423 impl core::fmt::Debug for Encoding {
3424 #[inline]
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result3425 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
3426 write!(f, "Encoding {{ {} }}", self.name)
3427 }
3428 }
3429
/// Serializes an `Encoding` as a string holding the `name` field of the
/// encoding.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        serializer.serialize_str(self.name)
    }
}
3440
/// Serde visitor that produces a `&'static Encoding` from a string holding
/// an encoding label.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3443
/// Resolves a deserialized string to an encoding via
/// `Encoding::for_label()`. A string that is not a known label is
/// reported as a custom deserialization error that quotes the offending
/// string.
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes()).ok_or_else(|| {
            E::custom(alloc::format!(
                "invalid encoding label: {}",
                value
            ))
        })
    }
}
3466
/// Deserializes a `&'static Encoding` from a string by handing an
/// `EncodingVisitor` to the deserializer. The visitor looks the string up
/// with `Encoding::for_label()`, so any label that lookup accepts is
/// accepted here.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_str(EncodingVisitor)
    }
}
3476
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// The bytes mentioned in the `Seen*` states are BOM prefixes: EF BB BF is
/// the UTF-8 BOM, FE FF is the UTF-16BE BOM and FF FE is the UTF-16LE BOM.
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE.
    AtUtf16LeStart,
    /// The decoder has seen EF (the first byte of the UTF-8 BOM).
    SeenUtf8First,
    /// The decoder has seen EF, BB (the first two bytes of the UTF-8 BOM).
    SeenUtf8Second,
    /// The decoder has seen FE (the first byte of the UTF-16BE BOM).
    SeenUtf16BeFirst,
    /// The decoder has seen FF (the first byte of the UTF-16LE BOM).
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen.
    Converting,
    /// EOF has been seen.
    Finished,
}
3505
/// Communicates the BOM handling mode to `Decoder::new()`.
///
/// Each variant corresponds to one of the decoder constructors on
/// `Encoding`.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM (used by `new_decoder_without_bom_handling()`)
    Off,
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM (used by `new_decoder()`)
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding (used by
    /// `new_decoder_with_bom_removal()`)
    Remove,
}
3516
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// There is no variant for malformed or unmappable input, because the
/// operations that return this type perform replacement instead of
/// stopping.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3536
/// Result of a (potentially partial) decode operation without replacement.
///
/// Unlike `CoderResult`, this type has a variant for malformed input,
/// because handling it is left to the caller.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3573
3574 /// A converter that decodes a byte stream into Unicode according to a
3575 /// character encoding in a streaming (incremental) manner.
3576 ///
3577 /// The various `decode_*` methods take an input buffer (`src`) and an output
3578 /// buffer `dst` both of which are caller-allocated. There are variants for
3579 /// both UTF-8 and UTF-16 output buffers.
3580 ///
3581 /// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3582 /// into `dst` until one of the following three things happens:
3583 ///
3584 /// 1. A malformed byte sequence is encountered (`*_without_replacement`
3585 /// variants only).
3586 ///
3587 /// 2. The output buffer has been filled so near capacity that the decoder
3588 /// cannot be sure that processing an additional byte of input wouldn't
3589 /// cause so much output that the output buffer would overflow.
3590 ///
3591 /// 3. All the input bytes have been processed.
3592 ///
/// The `decode_*` method then returns a tuple of a status indicating which one
3594 /// of the three reasons to return happened, how many input bytes were read,
3595 /// how many output code units (`u8` when decoding into UTF-8 and `u16`
3596 /// when decoding to UTF-16) were written (except when decoding into `String`,
3597 /// whose length change indicates this), and in the case of the
3598 /// variants performing replacement, a boolean indicating whether an error was
3599 /// replaced with the REPLACEMENT CHARACTER during the call.
3600 ///
3601 /// The number of bytes "written" is what's logically written. Garbage may be
3602 /// written in the output buffer beyond the point logically written to.
3603 /// Therefore, if you wish to decode into an `&mut str`, you should use the
3604 /// methods that take an `&mut str` argument instead of the ones that take an
3605 /// `&mut [u8]` argument. The former take care of overwriting the trailing
3606 /// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3607 /// latter don't.
3608 ///
3609 /// In the case of the `*_without_replacement` variants, the status is a
3610 /// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3611 /// `InputEmpty` corresponding to the three cases listed above).
3612 ///
3613 /// In the case of methods whose name does not end with
3614 /// `*_without_replacement`, malformed sequences are automatically replaced
3615 /// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3616 /// return early.
3617 ///
3618 /// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3619 /// space. When decoding to UTF-16, the output buffer must have at least two
3620 /// UTF-16 code units (`u16`) of space.
3621 ///
3622 /// When decoding to UTF-8 without replacement, the methods are guaranteed
3623 /// not to return indicating that more output space is needed if the length
3624 /// of the output buffer is at least the length returned by
3625 /// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3626 /// with replacement, the length of the output buffer that guarantees the
3627 /// methods not to return indicating that more output space is needed is given
3628 /// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3629 /// or without replacement, the length of the output buffer that guarantees
3630 /// the methods not to return indicating that more output space is needed is
3631 /// given by [`max_utf16_buffer_length()`][4].
3632 ///
3633 /// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3634 /// and the output after each `decode_*` call is guaranteed to consist of
3635 /// complete characters. (I.e. the code unit sequence for the last character is
3636 /// guaranteed not to be split across output buffers.)
3637 ///
3638 /// The boolean argument `last` indicates that the end of the stream is reached
3639 /// when all the bytes in `src` have been consumed.
3640 ///
3641 /// A `Decoder` object can be used to incrementally decode a byte stream.
3642 ///
3643 /// During the processing of a single stream, the caller must call `decode_*`
3644 /// zero or more times with `last` set to `false` and then call `decode_*` at
3645 /// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3646 /// the processing of the stream has ended. Otherwise, the caller must call
3647 /// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3648 /// a fatal error).
3649 ///
3650 /// Once the stream has ended, the `Decoder` object must not be used anymore.
3651 /// That is, you need to create another one to process another stream.
3652 ///
3653 /// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3654 /// the caller does not wish to treat it as a fatal error, the input buffer
3655 /// `src` may not have been completely consumed. In that case, the caller must
3656 /// pass the unconsumed contents of `src` to `decode_*` again upon the next
3657 /// call.
3658 ///
3659 /// [1]: enum.DecoderResult.html
3660 /// [2]: #method.max_utf8_buffer_length_without_replacement
3661 /// [3]: #method.max_utf8_buffer_length
3662 /// [4]: #method.max_utf16_buffer_length
3663 ///
3664 /// # Infinite loops
3665 ///
3666 /// When converting with a fixed-size output buffer whose size is too small to
3667 /// accommodate one character or (when applicable) one numeric character
3668 /// reference of output, an infinite loop ensues. When converting with a
3669 /// fixed-size output buffer, it generally makes sense to make the buffer
3670 /// fairly large (e.g. couple of kilobytes).
3671 pub struct Decoder {
3672 encoding: &'static Encoding,
3673 variant: VariantDecoder,
3674 life_cycle: DecoderLifeCycle,
3675 }
3676
3677 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3678 fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3679 Decoder {
3680 encoding: enc,
3681 variant: decoder,
3682 life_cycle: match sniffing {
3683 BomHandling::Off => DecoderLifeCycle::Converting,
3684 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3685 BomHandling::Remove => {
3686 if enc == UTF_8 {
3687 DecoderLifeCycle::AtUtf8Start
3688 } else if enc == UTF_16BE {
3689 DecoderLifeCycle::AtUtf16BeStart
3690 } else if enc == UTF_16LE {
3691 DecoderLifeCycle::AtUtf16LeStart
3692 } else {
3693 DecoderLifeCycle::Converting
3694 }
3695 }
3696 },
3697 }
3698 }
3699
/// The `Encoding` this `Decoder` is for.
///
/// BOM sniffing can change the return value of this method during the life
/// of the decoder.
///
/// Returns the value currently held in the decoder's `encoding` field.
///
/// Available via the C wrapper.
#[inline]
pub fn encoding(&self) -> &'static Encoding {
    self.encoding
}
3710
3711 /// Query the worst-case UTF-8 output size _with replacement_.
3712 ///
3713 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3714 /// that will not overflow given the current state of the decoder and
3715 /// `byte_length` number of additional input bytes when decoding with
3716 /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3717 /// sequence or `None` if `usize` would overflow.
3718 ///
3719 /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3720 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3721 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3722 // BOM getting pushed to the underlying decoder.
3723 match self.life_cycle {
3724 DecoderLifeCycle::Converting
3725 | DecoderLifeCycle::AtUtf8Start
3726 | DecoderLifeCycle::AtUtf16LeStart
3727 | DecoderLifeCycle::AtUtf16BeStart => {
3728 return self.variant.max_utf8_buffer_length(byte_length);
3729 }
3730 DecoderLifeCycle::AtStart => {
3731 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3732 if let Some(utf16_bom) = checked_add(
3733 1,
3734 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3735 ) {
3736 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3737 let encoding = self.encoding();
3738 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3739 // No need to consider the internal state of the underlying decoder,
3740 // because it is at start, because no data has reached it yet.
3741 return Some(utf_bom);
3742 } else if let Some(non_bom) =
3743 self.variant.max_utf8_buffer_length(byte_length)
3744 {
3745 return Some(core::cmp::max(utf_bom, non_bom));
3746 }
3747 }
3748 }
3749 }
3750 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3751 // Add two bytes even when only one byte has been seen,
3752 // because the one byte can become a lead byte in multibyte
3753 // decoders, but only after the decoder has been queried
3754 // for max length, so the decoder's own logic for adding
3755 // one for a pending lead cannot work.
3756 if let Some(sum) = byte_length.checked_add(2) {
3757 if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3758 if self.encoding() == UTF_8 {
3759 // No need to consider the internal state of the underlying decoder,
3760 // because it is at start, because no data has reached it yet.
3761 return Some(utf8_bom);
3762 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3763 return Some(core::cmp::max(utf8_bom, non_bom));
3764 }
3765 }
3766 }
3767 }
3768 DecoderLifeCycle::ConvertingWithPendingBB => {
3769 if let Some(sum) = byte_length.checked_add(2) {
3770 return self.variant.max_utf8_buffer_length(sum);
3771 }
3772 }
3773 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3774 // Add two bytes even when only one byte has been seen,
3775 // because the one byte can become a lead byte in multibyte
3776 // decoders, but only after the decoder has been queried
3777 // for max length, so the decoder's own logic for adding
3778 // one for a pending lead cannot work.
3779 if let Some(sum) = byte_length.checked_add(2) {
3780 if let Some(utf16_bom) =
3781 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3782 {
3783 let encoding = self.encoding();
3784 if encoding == UTF_16LE || encoding == UTF_16BE {
3785 // No need to consider the internal state of the underlying decoder,
3786 // because it is at start, because no data has reached it yet.
3787 return Some(utf16_bom);
3788 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3789 return Some(core::cmp::max(utf16_bom, non_bom));
3790 }
3791 }
3792 }
3793 }
3794 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3795 }
3796 None
3797 }
3798
3799 /// Query the worst-case UTF-8 output size _without replacement_.
3800 ///
3801 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3802 /// that will not overflow given the current state of the decoder and
3803 /// `byte_length` number of additional input bytes when decoding without
3804 /// replacement error handling or `None` if `usize` would overflow.
3805 ///
3806 /// Note that this value may be too small for the `_with_replacement` case.
3807 /// Use `max_utf8_buffer_length()` for that case.
3808 ///
3809 /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3810 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3811 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3812 // BOM getting pushed to the underlying decoder.
3813 match self.life_cycle {
3814 DecoderLifeCycle::Converting
3815 | DecoderLifeCycle::AtUtf8Start
3816 | DecoderLifeCycle::AtUtf16LeStart
3817 | DecoderLifeCycle::AtUtf16BeStart => {
3818 return self
3819 .variant
3820 .max_utf8_buffer_length_without_replacement(byte_length);
3821 }
3822 DecoderLifeCycle::AtStart => {
3823 if let Some(utf8_bom) = byte_length.checked_add(3) {
3824 if let Some(utf16_bom) = checked_add(
3825 1,
3826 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3827 ) {
3828 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3829 let encoding = self.encoding();
3830 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3831 // No need to consider the internal state of the underlying decoder,
3832 // because it is at start, because no data has reached it yet.
3833 return Some(utf_bom);
3834 } else if let Some(non_bom) = self
3835 .variant
3836 .max_utf8_buffer_length_without_replacement(byte_length)
3837 {
3838 return Some(core::cmp::max(utf_bom, non_bom));
3839 }
3840 }
3841 }
3842 }
3843 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3844 // Add two bytes even when only one byte has been seen,
3845 // because the one byte can become a lead byte in multibyte
3846 // decoders, but only after the decoder has been queried
3847 // for max length, so the decoder's own logic for adding
3848 // one for a pending lead cannot work.
3849 if let Some(sum) = byte_length.checked_add(2) {
3850 if let Some(utf8_bom) = sum.checked_add(3) {
3851 if self.encoding() == UTF_8 {
3852 // No need to consider the internal state of the underlying decoder,
3853 // because it is at start, because no data has reached it yet.
3854 return Some(utf8_bom);
3855 } else if let Some(non_bom) =
3856 self.variant.max_utf8_buffer_length_without_replacement(sum)
3857 {
3858 return Some(core::cmp::max(utf8_bom, non_bom));
3859 }
3860 }
3861 }
3862 }
3863 DecoderLifeCycle::ConvertingWithPendingBB => {
3864 if let Some(sum) = byte_length.checked_add(2) {
3865 return self.variant.max_utf8_buffer_length_without_replacement(sum);
3866 }
3867 }
3868 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3869 // Add two bytes even when only one byte has been seen,
3870 // because the one byte can become a lead byte in multibyte
3871 // decoders, but only after the decoder has been queried
3872 // for max length, so the decoder's own logic for adding
3873 // one for a pending lead cannot work.
3874 if let Some(sum) = byte_length.checked_add(2) {
3875 if let Some(utf16_bom) =
3876 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3877 {
3878 let encoding = self.encoding();
3879 if encoding == UTF_16LE || encoding == UTF_16BE {
3880 // No need to consider the internal state of the underlying decoder,
3881 // because it is at start, because no data has reached it yet.
3882 return Some(utf16_bom);
3883 } else if let Some(non_bom) =
3884 self.variant.max_utf8_buffer_length_without_replacement(sum)
3885 {
3886 return Some(core::cmp::max(utf16_bom, non_bom));
3887 }
3888 }
3889 }
3890 }
3891 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3892 }
3893 None
3894 }
3895
3896 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3897 /// replaced with the REPLACEMENT CHARACTER.
3898 ///
3899 /// See the documentation of the struct for documentation for `decode_*`
3900 /// methods collectively.
3901 ///
3902 /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3903 pub fn decode_to_utf8(
3904 &mut self,
3905 src: &[u8],
3906 dst: &mut [u8],
3907 last: bool,
3908 ) -> (CoderResult, usize, usize, bool) {
3909 let mut had_errors = false;
3910 let mut total_read = 0usize;
3911 let mut total_written = 0usize;
3912 loop {
3913 let (result, read, written) = self.decode_to_utf8_without_replacement(
3914 &src[total_read..],
3915 &mut dst[total_written..],
3916 last,
3917 );
3918 total_read += read;
3919 total_written += written;
3920 match result {
3921 DecoderResult::InputEmpty => {
3922 return (
3923 CoderResult::InputEmpty,
3924 total_read,
3925 total_written,
3926 had_errors,
3927 );
3928 }
3929 DecoderResult::OutputFull => {
3930 return (
3931 CoderResult::OutputFull,
3932 total_read,
3933 total_written,
3934 had_errors,
3935 );
3936 }
3937 DecoderResult::Malformed(_, _) => {
3938 had_errors = true;
3939 // There should always be space for the U+FFFD, because
3940 // otherwise we'd have gotten OutputFull already.
3941 // XXX: is the above comment actually true for UTF-8 itself?
3942 // TODO: Consider having fewer bound checks here.
3943 dst[total_written] = 0xEFu8;
3944 total_written += 1;
3945 dst[total_written] = 0xBFu8;
3946 total_written += 1;
3947 dst[total_written] = 0xBDu8;
3948 total_written += 1;
3949 }
3950 }
3951 }
3952 }
3953
3954 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3955 /// replaced with the REPLACEMENT CHARACTER with type system signaling
3956 /// of UTF-8 validity.
3957 ///
3958 /// This methods calls `decode_to_utf8` and then zeroes
3959 /// out up to three bytes that aren't logically part of the write in order
3960 /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3961 ///
3962 /// See the documentation of the struct for documentation for `decode_*`
3963 /// methods collectively.
3964 ///
3965 /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3966 pub fn decode_to_str(
3967 &mut self,
3968 src: &[u8],
3969 dst: &mut str,
3970 last: bool,
3971 ) -> (CoderResult, usize, usize, bool) {
3972 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3973 let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3974 let len = bytes.len();
3975 let mut trail = written;
3976 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3977 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3978 // encodings to avoid overwriting here.
3979 if self.encoding != UTF_8 {
3980 let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3981 while trail < max {
3982 bytes[trail] = 0;
3983 trail += 1;
3984 }
3985 }
3986 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3987 bytes[trail] = 0;
3988 trail += 1;
3989 }
3990 (result, read, written, replaced)
3991 }
3992
/// Incrementally decode a byte stream into UTF-8 with malformed sequences
/// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a tuple that contains the `CoderResult`, the
/// number of bytes read and a boolean indicating whether replacements
/// were done. The number of bytes written is signaled via the length of
/// the `String` changing.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn decode_to_string(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (CoderResult, usize, bool) {
    // SAFETY: the length is temporarily raised to the capacity so that the
    // spare capacity can be handed to `decode_to_utf8` as the output slice;
    // afterwards it is set to the old length plus what was logically
    // written, which the struct documentation states is valid UTF-8.
    // NOTE(review): the bytes between the old length and the capacity are
    // not initialized before the slice is formed - confirm this is
    // acceptable.
    unsafe {
        let buffer = dst.as_mut_vec();
        let start = buffer.len();
        let limit = buffer.capacity();
        buffer.set_len(limit);
        let outcome = self.decode_to_utf8(src, &mut buffer[start..], last);
        let (status, consumed, produced, replaced) = outcome;
        buffer.set_len(start + produced);
        (status, consumed, replaced)
    }
}
4029
// Invokes `public_decode_function!`, whose definition is outside this part
// of the file. The arguments are, in order: the doc comment, the identifier
// `decode_to_utf8_without_replacement` (the method the other `decode_*`
// methods of this `impl` call), five further identifiers and the output
// code unit type `u8`.
// NOTE(review): the five further identifiers are presumably the names of
// the helper methods used for the raw conversion and the BOM handling
// stages - confirm against the macro definition.
public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf8_without_replacement,
                        decode_to_utf8_raw,
                        decode_to_utf8_checking_end,
                        decode_to_utf8_after_one_potential_bom_byte,
                        decode_to_utf8_after_two_potential_bom_bytes,
                        decode_to_utf8_checking_end_with_offset,
                        u8);
4046
4047 /// Incrementally decode a byte stream into UTF-8 with type system signaling
4048 /// of UTF-8 validity.
4049 ///
4050 /// This methods calls `decode_to_utf8` and then zeroes out up to three
4051 /// bytes that aren't logically part of the write in order to retain the
4052 /// UTF-8 validity even for the unwritten part of the buffer.
4053 ///
4054 /// See the documentation of the struct for documentation for `decode_*`
4055 /// methods collectively.
4056 ///
4057 /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)4058 pub fn decode_to_str_without_replacement(
4059 &mut self,
4060 src: &[u8],
4061 dst: &mut str,
4062 last: bool,
4063 ) -> (DecoderResult, usize, usize) {
4064 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4065 let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4066 let len = bytes.len();
4067 let mut trail = written;
4068 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4069 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4070 // encodings to avoid overwriting here.
4071 if self.encoding != UTF_8 {
4072 let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4073 while trail < max {
4074 bytes[trail] = 0;
4075 trail += 1;
4076 }
4077 }
4078 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4079 bytes[trail] = 0;
4080 trail += 1;
4081 }
4082 (result, read, written)
4083 }
4084
/// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a pair that contains the `DecoderResult` and the
/// number of bytes read. The number of bytes written is signaled via
/// the length of the `String` changing.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn decode_to_string_without_replacement(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (DecoderResult, usize) {
    // SAFETY: the length is temporarily raised to the capacity so that the
    // spare capacity can be handed to the decoder as the output slice;
    // afterwards it is set to the old length plus what was logically
    // written, which the struct documentation states is valid UTF-8.
    // NOTE(review): the bytes between the old length and the capacity are
    // not initialized before the slice is formed - confirm this is
    // acceptable.
    unsafe {
        let buffer = dst.as_mut_vec();
        let start = buffer.len();
        let limit = buffer.capacity();
        buffer.set_len(limit);
        let outcome = self.decode_to_utf8_without_replacement(src, &mut buffer[start..], last);
        let (status, consumed, produced) = outcome;
        buffer.set_len(start + produced);
        (status, consumed)
    }
}
4119
4120 /// Query the worst-case UTF-16 output size (with or without replacement).
4121 ///
4122 /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4123 /// that will not overflow given the current state of the decoder and
4124 /// `byte_length` number of additional input bytes or `None` if `usize`
4125 /// would overflow.
4126 ///
4127 /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4128 /// return value of this method applies also in the
4129 /// `_without_replacement` case.
4130 ///
4131 /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4132 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4133 // Need to consider a) the decoder morphing due to the BOM and b) a partial
4134 // BOM getting pushed to the underlying decoder.
4135 match self.life_cycle {
4136 DecoderLifeCycle::Converting
4137 | DecoderLifeCycle::AtUtf8Start
4138 | DecoderLifeCycle::AtUtf16LeStart
4139 | DecoderLifeCycle::AtUtf16BeStart => {
4140 return self.variant.max_utf16_buffer_length(byte_length);
4141 }
4142 DecoderLifeCycle::AtStart => {
4143 if let Some(utf8_bom) = byte_length.checked_add(1) {
4144 if let Some(utf16_bom) =
4145 checked_add(1, checked_div(byte_length.checked_add(1), 2))
4146 {
4147 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
4148 let encoding = self.encoding();
4149 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4150 // No need to consider the internal state of the underlying decoder,
4151 // because it is at start, because no data has reached it yet.
4152 return Some(utf_bom);
4153 } else if let Some(non_bom) =
4154 self.variant.max_utf16_buffer_length(byte_length)
4155 {
4156 return Some(core::cmp::max(utf_bom, non_bom));
4157 }
4158 }
4159 }
4160 }
4161 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4162 // Add two bytes even when only one byte has been seen,
4163 // because the one byte can become a lead byte in multibyte
4164 // decoders, but only after the decoder has been queried
4165 // for max length, so the decoder's own logic for adding
4166 // one for a pending lead cannot work.
4167 if let Some(sum) = byte_length.checked_add(2) {
4168 if let Some(utf8_bom) = sum.checked_add(1) {
4169 if self.encoding() == UTF_8 {
4170 // No need to consider the internal state of the underlying decoder,
4171 // because it is at start, because no data has reached it yet.
4172 return Some(utf8_bom);
4173 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4174 return Some(core::cmp::max(utf8_bom, non_bom));
4175 }
4176 }
4177 }
4178 }
4179 DecoderLifeCycle::ConvertingWithPendingBB => {
4180 if let Some(sum) = byte_length.checked_add(2) {
4181 return self.variant.max_utf16_buffer_length(sum);
4182 }
4183 }
4184 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4185 // Add two bytes even when only one byte has been seen,
4186 // because the one byte can become a lead byte in multibyte
4187 // decoders, but only after the decoder has been queried
4188 // for max length, so the decoder's own logic for adding
4189 // one for a pending lead cannot work.
4190 if let Some(sum) = byte_length.checked_add(2) {
4191 if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4192 let encoding = self.encoding();
4193 if encoding == UTF_16LE || encoding == UTF_16BE {
4194 // No need to consider the internal state of the underlying decoder,
4195 // because it is at start, because no data has reached it yet.
4196 return Some(utf16_bom);
4197 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4198 return Some(core::cmp::max(utf16_bom, non_bom));
4199 }
4200 }
4201 }
4202 }
4203 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4204 }
4205 None
4206 }
4207
4208 /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4209 /// replaced with the REPLACEMENT CHARACTER.
4210 ///
4211 /// See the documentation of the struct for documentation for `decode_*`
4212 /// methods collectively.
4213 ///
4214 /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4215 pub fn decode_to_utf16(
4216 &mut self,
4217 src: &[u8],
4218 dst: &mut [u16],
4219 last: bool,
4220 ) -> (CoderResult, usize, usize, bool) {
4221 let mut had_errors = false;
4222 let mut total_read = 0usize;
4223 let mut total_written = 0usize;
4224 loop {
4225 let (result, read, written) = self.decode_to_utf16_without_replacement(
4226 &src[total_read..],
4227 &mut dst[total_written..],
4228 last,
4229 );
4230 total_read += read;
4231 total_written += written;
4232 match result {
4233 DecoderResult::InputEmpty => {
4234 return (
4235 CoderResult::InputEmpty,
4236 total_read,
4237 total_written,
4238 had_errors,
4239 );
4240 }
4241 DecoderResult::OutputFull => {
4242 return (
4243 CoderResult::OutputFull,
4244 total_read,
4245 total_written,
4246 had_errors,
4247 );
4248 }
4249 DecoderResult::Malformed(_, _) => {
4250 had_errors = true;
4251 // There should always be space for the U+FFFD, because
4252 // otherwise we'd have gotten OutputFull already.
4253 dst[total_written] = 0xFFFD;
4254 total_written += 1;
4255 }
4256 }
4257 }
4258 }
4259
// Invokes `public_decode_function!`, whose definition is outside this part
// of the file. The arguments are, in order: the doc comment, the identifier
// `decode_to_utf16_without_replacement` (the method `decode_to_utf16`
// calls), five further identifiers and the output code unit type `u16`.
// NOTE(review): the five further identifiers are presumably the names of
// the helper methods used for the raw conversion and the BOM handling
// stages - confirm against the macro definition.
public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf16_without_replacement,
                        decode_to_utf16_raw,
                        decode_to_utf16_checking_end,
                        decode_to_utf16_after_one_potential_bom_byte,
                        decode_to_utf16_after_two_potential_bom_bytes,
                        decode_to_utf16_checking_end_with_offset,
                        u16);
4276
4277 /// Checks for compatibility with storing Unicode scalar values as unsigned
4278 /// bytes taking into account the state of the decoder.
4279 ///
4280 /// Returns `None` if the decoder is not in a neutral state, including waiting
4281 /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4282 ///
4283 /// Otherwise returns the index of the first byte whose unsigned value doesn't
4284 /// directly correspond to the decoded Unicode scalar value, or the length
4285 /// of the input if all bytes in the input decode directly to scalar values
4286 /// corresponding to the unsigned byte values.
4287 ///
4288 /// Does not change the state of the decoder.
4289 ///
4290 /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4291 /// storage optimizations.
4292 ///
4293 /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4294 pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4295 match self.life_cycle {
4296 DecoderLifeCycle::Converting => {
4297 return self.variant.latin1_byte_compatible_up_to(bytes);
4298 }
4299 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4300 _ => None,
4301 }
4302 }
4303 }
4304
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps the scalar value denoted by the UTF-16 code unit `bmp` in
    /// `EncoderResult::Unmappable`.
    ///
    /// # Panics
    ///
    /// Panics if `bmp` is not a Unicode scalar value on its own (i.e. it is
    /// a surrogate), because `char::from_u32` returns `None` in that case.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let scalar = u32::from(bmp);
        ::core::char::from_u32(scalar)
            .map(EncoderResult::Unmappable)
            .unwrap()
    }
}
4336
4337 /// A converter that encodes a Unicode stream into bytes according to a
4338 /// character encoding in a streaming (incremental) manner.
4339 ///
4340 /// The various `encode_*` methods take an input buffer (`src`) and an output
4341 /// buffer `dst` both of which are caller-allocated. There are variants for
4342 /// both UTF-8 and UTF-16 input buffers.
4343 ///
4344 /// An `encode_*` method encode characters from `src` into bytes characters
4345 /// stored into `dst` until one of the following three things happens:
4346 ///
4347 /// 1. An unmappable character is encountered (`*_without_replacement` variants
4348 /// only).
4349 ///
4350 /// 2. The output buffer has been filled so near capacity that the decoder
4351 /// cannot be sure that processing an additional character of input wouldn't
4352 /// cause so much output that the output buffer would overflow.
4353 ///
4354 /// 3. All the input characters have been processed.
4355 ///
4356 /// The `encode_*` method then returns tuple of a status indicating which one
4357 /// of the three reasons to return happened, how many input code units (`u8`
4358 /// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4359 /// how many output bytes were written (except when encoding into `Vec<u8>`,
4360 /// whose length change indicates this), and in the case of the variants that
4361 /// perform replacement, a boolean indicating whether an unmappable
4362 /// character was replaced with a numeric character reference during the call.
4363 ///
4364 /// The number of bytes "written" is what's logically written. Garbage may be
4365 /// written in the output buffer beyond the point logically written to.
4366 ///
4367 /// In the case of the methods whose name ends with
4368 /// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4369 /// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4370 /// the three cases listed above).
4371 ///
4372 /// In the case of methods whose name does not end with
4373 /// `*_without_replacement`, unmappable characters are automatically replaced
4374 /// with the corresponding numeric character references and unmappable
4375 /// characters do not cause the methods to return early.
4376 ///
4377 /// When encoding from UTF-8 without replacement, the methods are guaranteed
4378 /// not to return indicating that more output space is needed if the length
4379 /// of the output buffer is at least the length returned by
4380 /// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4381 /// UTF-8 with replacement, the length of the output buffer that guarantees the
4382 /// methods not to return indicating that more output space is needed in the
4383 /// absence of unmappable characters is given by
4384 /// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4385 /// UTF-16 without replacement, the methods are guaranteed not to return
4386 /// indicating that more output space is needed if the length of the output
4387 /// buffer is at least the length returned by
4388 /// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4390 /// guarantees the methods not to return indicating that more output space is
4391 /// needed in the absence of unmappable characters is given by
4392 /// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4393 /// When encoding with replacement, applications are not expected to size the
4394 /// buffer for the worst case ahead of time but to resize the buffer if there
4395 /// are unmappable characters. This is why max length queries are only available
4396 /// for the case where there are no unmappable characters.
4397 ///
4398 /// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4399 /// calling from Rust, the type system takes care of this.) When encoding from
4400 /// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4401 /// CHARACTERS. Therefore, in order for astral characters not to turn into a
4402 /// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4403 /// are not split across input buffer boundaries.
4404 ///
4405 /// After an `encode_*` call returns, the output produced so far, taken as a
4406 /// whole from the start of the stream, is guaranteed to consist of a valid
4407 /// byte sequence in the target encoding. (I.e. the code unit sequence for a
4408 /// character is guaranteed not to be split across output buffers. However, due
4409 /// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4410 /// from the start for it to be valid. For other encodings, the validity holds
4411 /// on a per-output buffer basis.)
4412 ///
4413 /// The boolean argument `last` indicates that the end of the stream is reached
4414 /// when all the characters in `src` have been consumed. This argument is needed
4415 /// for ISO-2022-JP and is ignored for other encodings.
4416 ///
/// An `Encoder` object can be used to incrementally encode into a byte stream.
4418 ///
4419 /// During the processing of a single stream, the caller must call `encode_*`
4420 /// zero or more times with `last` set to `false` and then call `encode_*` at
4421 /// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4422 /// the processing of the stream has ended. Otherwise, the caller must call
4423 /// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4424 /// as a fatal error).
4425 ///
4426 /// Once the stream has ended, the `Encoder` object must not be used anymore.
4427 /// That is, you need to create another one to process another stream.
4428 ///
4429 /// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4430 /// and the caller does not wish to treat it as a fatal error, the input buffer
4431 /// `src` may not have been completely consumed. In that case, the caller must
4432 /// pass the unconsumed contents of `src` to `encode_*` again upon the next
4433 /// call.
4434 ///
4435 /// [1]: enum.EncoderResult.html
4436 /// [2]: #method.max_buffer_length_from_utf8_without_replacement
4437 /// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4438 /// [4]: #method.max_buffer_length_from_utf16_without_replacement
4439 /// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4440 ///
4441 /// # Infinite loops
4442 ///
4443 /// When converting with a fixed-size output buffer whose size is too small to
4444 /// accommodate one character of output, an infinite loop ensues. When
4445 /// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    // The encoding reported by `encoding()`.
    encoding: &'static Encoding,
    // Encoding-specific implementation (and state); the `Encoder` methods
    // delegate the actual work to it.
    variant: VariantEncoder,
}
4451
4452 impl Encoder {
new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder4453 fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4454 Encoder {
4455 encoding: enc,
4456 variant: encoder,
4457 }
4458 }
4459
4460 /// The `Encoding` this `Encoder` is for.
4461 #[inline]
encoding(&self) -> &'static Encoding4462 pub fn encoding(&self) -> &'static Encoding {
4463 self.encoding
4464 }
4465
4466 /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
4467 /// ASCII state and `false` otherwise.
4468 #[inline]
has_pending_state(&self) -> bool4469 pub fn has_pending_state(&self) -> bool {
4470 self.variant.has_pending_state()
4471 }
4472
4473 /// Query the worst-case output size when encoding from UTF-8 with
4474 /// replacement.
4475 ///
4476 /// Returns the size of the output buffer in bytes that will not overflow
4477 /// given the current state of the encoder and `byte_length` number of
4478 /// additional input code units if there are no unmappable characters in
4479 /// the input or `None` if `usize` would overflow.
4480 ///
4481 /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4482 pub fn max_buffer_length_from_utf8_if_no_unmappables(
4483 &self,
4484 byte_length: usize,
4485 ) -> Option<usize> {
4486 checked_add(
4487 if self.encoding().can_encode_everything() {
4488 0
4489 } else {
4490 NCR_EXTRA
4491 },
4492 self.max_buffer_length_from_utf8_without_replacement(byte_length),
4493 )
4494 }
4495
4496 /// Query the worst-case output size when encoding from UTF-8 without
4497 /// replacement.
4498 ///
4499 /// Returns the size of the output buffer in bytes that will not overflow
4500 /// given the current state of the encoder and `byte_length` number of
4501 /// additional input code units or `None` if `usize` would overflow.
4502 ///
4503 /// Available via the C wrapper.
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>4504 pub fn max_buffer_length_from_utf8_without_replacement(
4505 &self,
4506 byte_length: usize,
4507 ) -> Option<usize> {
4508 self.variant
4509 .max_buffer_length_from_utf8_without_replacement(byte_length)
4510 }
4511
4512 /// Incrementally encode into byte stream from UTF-8 with unmappable
4513 /// characters replaced with HTML (decimal) numeric character references.
4514 ///
4515 /// See the documentation of the struct for documentation for `encode_*`
4516 /// methods collectively.
4517 ///
4518 /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4519 pub fn encode_from_utf8(
4520 &mut self,
4521 src: &str,
4522 dst: &mut [u8],
4523 last: bool,
4524 ) -> (CoderResult, usize, usize, bool) {
4525 let dst_len = dst.len();
4526 let effective_dst_len = if self.encoding().can_encode_everything() {
4527 dst_len
4528 } else {
4529 if dst_len < NCR_EXTRA {
4530 if src.is_empty() && !(last && self.has_pending_state()) {
4531 return (CoderResult::InputEmpty, 0, 0, false);
4532 }
4533 return (CoderResult::OutputFull, 0, 0, false);
4534 }
4535 dst_len - NCR_EXTRA
4536 };
4537 let mut had_unmappables = false;
4538 let mut total_read = 0usize;
4539 let mut total_written = 0usize;
4540 loop {
4541 let (result, read, written) = self.encode_from_utf8_without_replacement(
4542 &src[total_read..],
4543 &mut dst[total_written..effective_dst_len],
4544 last,
4545 );
4546 total_read += read;
4547 total_written += written;
4548 match result {
4549 EncoderResult::InputEmpty => {
4550 return (
4551 CoderResult::InputEmpty,
4552 total_read,
4553 total_written,
4554 had_unmappables,
4555 );
4556 }
4557 EncoderResult::OutputFull => {
4558 return (
4559 CoderResult::OutputFull,
4560 total_read,
4561 total_written,
4562 had_unmappables,
4563 );
4564 }
4565 EncoderResult::Unmappable(unmappable) => {
4566 had_unmappables = true;
4567 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4568 debug_assert_ne!(self.encoding(), UTF_16BE);
4569 debug_assert_ne!(self.encoding(), UTF_16LE);
4570 // Additionally, Iso2022JpEncoder is responsible for
4571 // transitioning to ASCII when returning with Unmappable.
4572 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4573 if total_written >= effective_dst_len {
4574 if total_read == src.len() && !(last && self.has_pending_state()) {
4575 return (
4576 CoderResult::InputEmpty,
4577 total_read,
4578 total_written,
4579 had_unmappables,
4580 );
4581 }
4582 return (
4583 CoderResult::OutputFull,
4584 total_read,
4585 total_written,
4586 had_unmappables,
4587 );
4588 }
4589 }
4590 }
4591 }
4592 }
4593
4594 /// Incrementally encode into byte stream from UTF-8 with unmappable
4595 /// characters replaced with HTML (decimal) numeric character references.
4596 ///
4597 /// See the documentation of the struct for documentation for `encode_*`
4598 /// methods collectively.
4599 ///
4600 /// Available to Rust only and only with the `alloc` feature enabled (enabled
4601 /// by default).
4602 #[cfg(feature = "alloc")]
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4603 pub fn encode_from_utf8_to_vec(
4604 &mut self,
4605 src: &str,
4606 dst: &mut Vec<u8>,
4607 last: bool,
4608 ) -> (CoderResult, usize, bool) {
4609 unsafe {
4610 let old_len = dst.len();
4611 let capacity = dst.capacity();
4612 dst.set_len(capacity);
4613 let (result, read, written, replaced) =
4614 self.encode_from_utf8(src, &mut dst[old_len..], last);
4615 dst.set_len(old_len + written);
4616 (result, read, replaced)
4617 }
4618 }
4619
4620 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4621 ///
4622 /// See the documentation of the struct for documentation for `encode_*`
4623 /// methods collectively.
4624 ///
4625 /// Available via the C wrapper.
encode_from_utf8_without_replacement( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4626 pub fn encode_from_utf8_without_replacement(
4627 &mut self,
4628 src: &str,
4629 dst: &mut [u8],
4630 last: bool,
4631 ) -> (EncoderResult, usize, usize) {
4632 self.variant.encode_from_utf8_raw(src, dst, last)
4633 }
4634
4635 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4636 ///
4637 /// See the documentation of the struct for documentation for `encode_*`
4638 /// methods collectively.
4639 ///
4640 /// Available to Rust only and only with the `alloc` feature enabled (enabled
4641 /// by default).
4642 #[cfg(feature = "alloc")]
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4643 pub fn encode_from_utf8_to_vec_without_replacement(
4644 &mut self,
4645 src: &str,
4646 dst: &mut Vec<u8>,
4647 last: bool,
4648 ) -> (EncoderResult, usize) {
4649 unsafe {
4650 let old_len = dst.len();
4651 let capacity = dst.capacity();
4652 dst.set_len(capacity);
4653 let (result, read, written) =
4654 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4655 dst.set_len(old_len + written);
4656 (result, read)
4657 }
4658 }
4659
4660 /// Query the worst-case output size when encoding from UTF-16 with
4661 /// replacement.
4662 ///
4663 /// Returns the size of the output buffer in bytes that will not overflow
4664 /// given the current state of the encoder and `u16_length` number of
4665 /// additional input code units if there are no unmappable characters in
4666 /// the input or `None` if `usize` would overflow.
4667 ///
4668 /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4669 pub fn max_buffer_length_from_utf16_if_no_unmappables(
4670 &self,
4671 u16_length: usize,
4672 ) -> Option<usize> {
4673 checked_add(
4674 if self.encoding().can_encode_everything() {
4675 0
4676 } else {
4677 NCR_EXTRA
4678 },
4679 self.max_buffer_length_from_utf16_without_replacement(u16_length),
4680 )
4681 }
4682
4683 /// Query the worst-case output size when encoding from UTF-16 without
4684 /// replacement.
4685 ///
4686 /// Returns the size of the output buffer in bytes that will not overflow
4687 /// given the current state of the encoder and `u16_length` number of
4688 /// additional input code units or `None` if `usize` would overflow.
4689 ///
4690 /// Available via the C wrapper.
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>4691 pub fn max_buffer_length_from_utf16_without_replacement(
4692 &self,
4693 u16_length: usize,
4694 ) -> Option<usize> {
4695 self.variant
4696 .max_buffer_length_from_utf16_without_replacement(u16_length)
4697 }
4698
4699 /// Incrementally encode into byte stream from UTF-16 with unmappable
4700 /// characters replaced with HTML (decimal) numeric character references.
4701 ///
4702 /// See the documentation of the struct for documentation for `encode_*`
4703 /// methods collectively.
4704 ///
4705 /// Available via the C wrapper.
encode_from_utf16( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4706 pub fn encode_from_utf16(
4707 &mut self,
4708 src: &[u16],
4709 dst: &mut [u8],
4710 last: bool,
4711 ) -> (CoderResult, usize, usize, bool) {
4712 let dst_len = dst.len();
4713 let effective_dst_len = if self.encoding().can_encode_everything() {
4714 dst_len
4715 } else {
4716 if dst_len < NCR_EXTRA {
4717 if src.is_empty() && !(last && self.has_pending_state()) {
4718 return (CoderResult::InputEmpty, 0, 0, false);
4719 }
4720 return (CoderResult::OutputFull, 0, 0, false);
4721 }
4722 dst_len - NCR_EXTRA
4723 };
4724 let mut had_unmappables = false;
4725 let mut total_read = 0usize;
4726 let mut total_written = 0usize;
4727 loop {
4728 let (result, read, written) = self.encode_from_utf16_without_replacement(
4729 &src[total_read..],
4730 &mut dst[total_written..effective_dst_len],
4731 last,
4732 );
4733 total_read += read;
4734 total_written += written;
4735 match result {
4736 EncoderResult::InputEmpty => {
4737 return (
4738 CoderResult::InputEmpty,
4739 total_read,
4740 total_written,
4741 had_unmappables,
4742 );
4743 }
4744 EncoderResult::OutputFull => {
4745 return (
4746 CoderResult::OutputFull,
4747 total_read,
4748 total_written,
4749 had_unmappables,
4750 );
4751 }
4752 EncoderResult::Unmappable(unmappable) => {
4753 had_unmappables = true;
4754 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4755 // There are no UTF-16 encoders and even if there were,
4756 // they'd never have unmappables.
4757 debug_assert_ne!(self.encoding(), UTF_16BE);
4758 debug_assert_ne!(self.encoding(), UTF_16LE);
4759 // Additionally, Iso2022JpEncoder is responsible for
4760 // transitioning to ASCII when returning with Unmappable
4761 // from the jis0208 state. That is, when we encode
4762 // ISO-2022-JP and come here, the encoder is in either the
4763 // ASCII or the Roman state. We are allowed to generate any
4764 // printable ASCII excluding \ and ~.
4765 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4766 if total_written >= effective_dst_len {
4767 if total_read == src.len() && !(last && self.has_pending_state()) {
4768 return (
4769 CoderResult::InputEmpty,
4770 total_read,
4771 total_written,
4772 had_unmappables,
4773 );
4774 }
4775 return (
4776 CoderResult::OutputFull,
4777 total_read,
4778 total_written,
4779 had_unmappables,
4780 );
4781 }
4782 }
4783 }
4784 }
4785 }
4786
4787 /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4788 ///
4789 /// See the documentation of the struct for documentation for `encode_*`
4790 /// methods collectively.
4791 ///
4792 /// Available via the C wrapper.
encode_from_utf16_without_replacement( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4793 pub fn encode_from_utf16_without_replacement(
4794 &mut self,
4795 src: &[u16],
4796 dst: &mut [u8],
4797 last: bool,
4798 ) -> (EncoderResult, usize, usize) {
4799 self.variant.encode_from_utf16_raw(src, dst, last)
4800 }
4801 }
4802
/// Writes `unmappable` to the start of `dst` as a decimal HTML numeric
/// character reference (`&#` + digits + `;`) without heap allocation.
///
/// Returns the number of bytes written. The code point is expected to have
/// at least two decimal digits (checked by a debug assertion), and `dst`
/// must be long enough for the whole reference.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let code_point = unmappable as u32;
    // Total length: the decimal digits plus 3 (the length of "&#" and ";").
    let len = match code_point {
        // Review the outcome of https://github.com/whatwg/encoding/issues/15
        // to see if this case is possible
        0..=99 => 5usize,
        100..=999 => 6usize,
        1_000..=9_999 => 7usize,
        10_000..=99_999 => 8usize,
        100_000..=999_999 => 9usize,
        _ => 10usize,
    };
    debug_assert!(code_point >= 10u32);
    debug_assert!(len <= dst.len());

    // Fill from the back: terminator first, then the digits from the least
    // significant one, and finally the fixed "&#" prefix.
    dst[len - 1] = b';';
    let mut remaining = code_point;
    let mut pos = len - 2;
    loop {
        dst[pos] = (remaining % 10) as u8 + b'0';
        pos -= 1;
        if remaining < 10 {
            break;
        }
        remaining /= 10;
    }
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4841
/// Returns `true` if `i` lies in the half-open range `start..end`.
///
/// A single unsigned comparison does the work: values below `start` wrap
/// around to large offsets and fail the check. The width is computed with
/// plain subtraction, which assumes `start <= end`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4846
/// Returns `true` if `i` lies in the half-open range `start..end`.
///
/// A single unsigned comparison does the work: values below `start` wrap
/// around to large offsets and fail the check. The width is computed with
/// plain subtraction, which assumes `start <= end`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4851
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// A single unsigned comparison does the work: values below `start` wrap
/// around to large offsets and fail the check. The span is computed with
/// plain subtraction, which assumes `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4856
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// A single unsigned comparison does the work: values below `start` wrap
/// around to large offsets and fail the check. The span is computed with
/// plain subtraction, which assumes `start <= end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4861
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// A single unsigned comparison does the work: values below `start` wrap
/// around to large offsets and fail the check. The span is computed with
/// plain subtraction, which assumes `start <= end`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4866
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// A single unsigned comparison does the work: values below `start` wrap
/// around to large offsets and fail the check. The span is computed with
/// plain subtraction, which assumes `start <= end`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4871
/// Adds `num` to the value in `opt` with overflow checking.
///
/// Returns `None` if `opt` is `None` or if the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4880
/// Adds two optional values with overflow checking.
///
/// Returns `None` if either operand is `None` or if the sum overflows
/// `usize`.
#[inline(always)]
fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => b.checked_add(a),
        _ => None,
    }
}
4889
/// Multiplies the value in `opt` by `num` with overflow checking.
///
/// Returns `None` if `opt` is `None` or if the product overflows `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4898
/// Divides the value in `opt` by `num` with checking.
///
/// Returns `None` if `opt` is `None` or if `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4907
4908 #[cfg(feature = "alloc")]
/// Rounds the value in `opt` up to the next power of two with overflow
/// checking.
///
/// Returns `None` if `opt` is `None` or if the next power of two does not
/// fit in `usize`. (The unchecked `next_power_of_two()` would instead panic
/// in debug builds and wrap to zero in release builds.)
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4913
4914 #[cfg(feature = "alloc")]
/// Returns the smaller of two optional values, treating `None` as "no
/// bound".
///
/// If only one operand is `Some`, that operand is returned; the result is
/// `None` only when both are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(a.min(b)),
        (Some(a), None) => Some(a),
        (None, remaining) => remaining,
    }
}
4927
4928 // ############## TESTS ###############
4929
// Test-only fixture: compiled only for test builds with the `serde` feature
// enabled. It derives `Serialize`/`Deserialize` and places an
// `&'static Encoding` next to ordinary fields — presumably to exercise the
// crate's serde support for `Encoding`; confirm against the serde tests.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    enc: &'static Encoding,
}
4937
4938 #[cfg(test)]
4939 mod test_labels_names;
4940
4941 #[cfg(all(test, feature = "alloc"))]
4942 mod tests {
4943 use super::*;
4944 use alloc::borrow::Cow;
4945
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4946 fn sniff_to_utf16(
4947 initial_encoding: &'static Encoding,
4948 expected_encoding: &'static Encoding,
4949 bytes: &[u8],
4950 expect: &[u16],
4951 breaks: &[usize],
4952 ) {
4953 let mut decoder = initial_encoding.new_decoder();
4954
4955 let mut dest: Vec<u16> =
4956 Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4957 let capacity = dest.capacity();
4958 dest.resize(capacity, 0u16);
4959
4960 let mut total_written = 0usize;
4961 let mut start = 0usize;
4962 for br in breaks {
4963 let (result, read, written, _) =
4964 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4965 total_written += written;
4966 assert_eq!(read, *br - start);
4967 match result {
4968 CoderResult::InputEmpty => {}
4969 CoderResult::OutputFull => {
4970 unreachable!();
4971 }
4972 }
4973 start = *br;
4974 }
4975 let (result, read, written, _) =
4976 decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4977 total_written += written;
4978 match result {
4979 CoderResult::InputEmpty => {}
4980 CoderResult::OutputFull => {
4981 unreachable!();
4982 }
4983 }
4984 assert_eq!(read, bytes.len() - start);
4985 assert_eq!(total_written, expect.len());
4986 assert_eq!(&dest[..total_written], expect);
4987 assert_eq!(decoder.encoding(), expected_encoding);
4988 }
4989
4990 // Any copyright to the test code below this comment is dedicated to the
4991 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4992
4993 #[test]
test_bom_sniffing()4994 fn test_bom_sniffing() {
4995 // ASCII
4996 sniff_to_utf16(
4997 WINDOWS_1252,
4998 WINDOWS_1252,
4999 b"\x61\x62",
5000 &[0x0061u16, 0x0062u16],
5001 &[],
5002 );
5003 // UTF-8
5004 sniff_to_utf16(
5005 WINDOWS_1252,
5006 UTF_8,
5007 b"\xEF\xBB\xBF\x61\x62",
5008 &[0x0061u16, 0x0062u16],
5009 &[],
5010 );
5011 sniff_to_utf16(
5012 WINDOWS_1252,
5013 UTF_8,
5014 b"\xEF\xBB\xBF\x61\x62",
5015 &[0x0061u16, 0x0062u16],
5016 &[1],
5017 );
5018 sniff_to_utf16(
5019 WINDOWS_1252,
5020 UTF_8,
5021 b"\xEF\xBB\xBF\x61\x62",
5022 &[0x0061u16, 0x0062u16],
5023 &[2],
5024 );
5025 sniff_to_utf16(
5026 WINDOWS_1252,
5027 UTF_8,
5028 b"\xEF\xBB\xBF\x61\x62",
5029 &[0x0061u16, 0x0062u16],
5030 &[3],
5031 );
5032 sniff_to_utf16(
5033 WINDOWS_1252,
5034 UTF_8,
5035 b"\xEF\xBB\xBF\x61\x62",
5036 &[0x0061u16, 0x0062u16],
5037 &[4],
5038 );
5039 sniff_to_utf16(
5040 WINDOWS_1252,
5041 UTF_8,
5042 b"\xEF\xBB\xBF\x61\x62",
5043 &[0x0061u16, 0x0062u16],
5044 &[2, 3],
5045 );
5046 sniff_to_utf16(
5047 WINDOWS_1252,
5048 UTF_8,
5049 b"\xEF\xBB\xBF\x61\x62",
5050 &[0x0061u16, 0x0062u16],
5051 &[1, 2],
5052 );
5053 sniff_to_utf16(
5054 WINDOWS_1252,
5055 UTF_8,
5056 b"\xEF\xBB\xBF\x61\x62",
5057 &[0x0061u16, 0x0062u16],
5058 &[1, 3],
5059 );
5060 sniff_to_utf16(
5061 WINDOWS_1252,
5062 UTF_8,
5063 b"\xEF\xBB\xBF\x61\x62",
5064 &[0x0061u16, 0x0062u16],
5065 &[1, 2, 3, 4],
5066 );
5067 sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
5068 // Not UTF-8
5069 sniff_to_utf16(
5070 WINDOWS_1252,
5071 WINDOWS_1252,
5072 b"\xEF\xBB\x61\x62",
5073 &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5074 &[],
5075 );
5076 sniff_to_utf16(
5077 WINDOWS_1252,
5078 WINDOWS_1252,
5079 b"\xEF\xBB\x61\x62",
5080 &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5081 &[1],
5082 );
5083 sniff_to_utf16(
5084 WINDOWS_1252,
5085 WINDOWS_1252,
5086 b"\xEF\x61\x62",
5087 &[0x00EFu16, 0x0061u16, 0x0062u16],
5088 &[],
5089 );
5090 sniff_to_utf16(
5091 WINDOWS_1252,
5092 WINDOWS_1252,
5093 b"\xEF\x61\x62",
5094 &[0x00EFu16, 0x0061u16, 0x0062u16],
5095 &[1],
5096 );
5097 sniff_to_utf16(
5098 WINDOWS_1252,
5099 WINDOWS_1252,
5100 b"\xEF\xBB",
5101 &[0x00EFu16, 0x00BBu16],
5102 &[],
5103 );
5104 sniff_to_utf16(
5105 WINDOWS_1252,
5106 WINDOWS_1252,
5107 b"\xEF\xBB",
5108 &[0x00EFu16, 0x00BBu16],
5109 &[1],
5110 );
5111 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5112 // Not UTF-16
5113 sniff_to_utf16(
5114 WINDOWS_1252,
5115 WINDOWS_1252,
5116 b"\xFE\x61\x62",
5117 &[0x00FEu16, 0x0061u16, 0x0062u16],
5118 &[],
5119 );
5120 sniff_to_utf16(
5121 WINDOWS_1252,
5122 WINDOWS_1252,
5123 b"\xFE\x61\x62",
5124 &[0x00FEu16, 0x0061u16, 0x0062u16],
5125 &[1],
5126 );
5127 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5128 sniff_to_utf16(
5129 WINDOWS_1252,
5130 WINDOWS_1252,
5131 b"\xFF\x61\x62",
5132 &[0x00FFu16, 0x0061u16, 0x0062u16],
5133 &[],
5134 );
5135 sniff_to_utf16(
5136 WINDOWS_1252,
5137 WINDOWS_1252,
5138 b"\xFF\x61\x62",
5139 &[0x00FFu16, 0x0061u16, 0x0062u16],
5140 &[1],
5141 );
5142 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5143 // UTF-16
5144 sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5145 sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5146 sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5147 sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5148 }
5149
5150 #[test]
test_output_encoding()5151 fn test_output_encoding() {
5152 assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5153 assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5154 assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5155 assert_eq!(UTF_8.output_encoding(), UTF_8);
5156 assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5157 assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5158 assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5159 assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5160 assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5161 assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5162 }
5163
5164 #[test]
test_label_resolution()5165 fn test_label_resolution() {
5166 assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5167 assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5168 assert_eq!(
5169 Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5170 Some(UTF_8)
5171 );
5172 assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5173 assert_eq!(Encoding::for_label(b"bogus"), None);
5174 assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5175 }
5176
5177 #[test]
test_decode_valid_windows_1257_to_cow()5178 fn test_decode_valid_windows_1257_to_cow() {
5179 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5180 match cow {
5181 Cow::Borrowed(_) => unreachable!(),
5182 Cow::Owned(s) => {
5183 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5184 }
5185 }
5186 assert_eq!(encoding, WINDOWS_1257);
5187 assert!(!had_errors);
5188 }
5189
5190 #[test]
test_decode_invalid_windows_1257_to_cow()5191 fn test_decode_invalid_windows_1257_to_cow() {
5192 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5193 match cow {
5194 Cow::Borrowed(_) => unreachable!(),
5195 Cow::Owned(s) => {
5196 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5197 }
5198 }
5199 assert_eq!(encoding, WINDOWS_1257);
5200 assert!(had_errors);
5201 }
5202
5203 #[test]
test_decode_ascii_only_windows_1257_to_cow()5204 fn test_decode_ascii_only_windows_1257_to_cow() {
5205 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5206 match cow {
5207 Cow::Borrowed(s) => {
5208 assert_eq!(s, "abc");
5209 }
5210 Cow::Owned(_) => unreachable!(),
5211 }
5212 assert_eq!(encoding, WINDOWS_1257);
5213 assert!(!had_errors);
5214 }
5215
5216 #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow()5217 fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5218 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5219 match cow {
5220 Cow::Borrowed(s) => {
5221 assert_eq!(s, "\u{20AC}\u{00E4}");
5222 }
5223 Cow::Owned(_) => unreachable!(),
5224 }
5225 assert_eq!(encoding, UTF_8);
5226 assert!(!had_errors);
5227 }
5228
5229 #[test]
test_decode_bomful_invalid_utf8_as_windows_1257_to_cow()5230 fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5231 let (cow, encoding, had_errors) =
5232 WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5233 match cow {
5234 Cow::Borrowed(_) => unreachable!(),
5235 Cow::Owned(s) => {
5236 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5237 }
5238 }
5239 assert_eq!(encoding, UTF_8);
5240 assert!(had_errors);
5241 }
5242
5243 #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow()5244 fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5245 let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5246 match cow {
5247 Cow::Borrowed(s) => {
5248 assert_eq!(s, "\u{20AC}\u{00E4}");
5249 }
5250 Cow::Owned(_) => unreachable!(),
5251 }
5252 assert_eq!(encoding, UTF_8);
5253 assert!(!had_errors);
5254 }
5255
5256 #[test]
test_decode_bomful_invalid_utf8_as_utf_8_to_cow()5257 fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5258 let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5259 match cow {
5260 Cow::Borrowed(_) => unreachable!(),
5261 Cow::Owned(s) => {
5262 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5263 }
5264 }
5265 assert_eq!(encoding, UTF_8);
5266 assert!(had_errors);
5267 }
5268
5269 #[test]
test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal()5270 fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5271 let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5272 match cow {
5273 Cow::Borrowed(s) => {
5274 assert_eq!(s, "\u{20AC}\u{00E4}");
5275 }
5276 Cow::Owned(_) => unreachable!(),
5277 }
5278 assert!(!had_errors);
5279 }
5280
5281 #[test]
test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal()5282 fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5283 let (cow, had_errors) =
5284 WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5285 match cow {
5286 Cow::Borrowed(_) => unreachable!(),
5287 Cow::Owned(s) => {
5288 assert_eq!(
5289 s,
5290 "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5291 );
5292 }
5293 }
5294 assert!(!had_errors);
5295 }
5296
/// Valid windows-1257 input with non-ASCII bytes is expected to decode (with BOM
/// removal) into an owned string and report no errors.
#[test]
fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
    let input = b"abc\x80\xE4";
    let (decoded, malformed) = WINDOWS_1257.decode_with_bom_removal(input);
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!malformed);
}
5308
/// windows-1257 input containing the byte `0xA1` is expected to decode (with BOM
/// removal) into an owned string with U+FFFD at that position, flagging errors.
#[test]
fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
    let input = b"abc\x80\xA1\xE4";
    let (decoded, malformed) = WINDOWS_1257.decode_with_bom_removal(input);
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert!(malformed);
}
5320
/// ASCII-only input decoded as windows-1257 (with BOM removal) is expected to be
/// returned as a borrow of the same text, with no errors.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
    let (decoded, malformed) = WINDOWS_1257.decode_with_bom_removal(b"abc");
    let text = match decoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(text, "abc");
    assert!(!malformed);
}
5332
/// `UTF_8.decode_without_bom_handling` is expected to keep the leading BOM as
/// U+FEFF in the (borrowed) output for valid input, with no errors.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    let (decoded, malformed) = UTF_8.decode_without_bom_handling(input);
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!malformed);
}
5345
/// With a stray `0x80` byte in BOM-prefixed input,
/// `UTF_8.decode_without_bom_handling` is expected to return an owned string that
/// keeps U+FEFF, substitutes U+FFFD for the stray byte, and flags errors.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let (decoded, malformed) = UTF_8.decode_without_bom_handling(input);
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
    assert!(malformed);
}
5358
/// Valid windows-1257 input with non-ASCII bytes is expected to decode (without
/// BOM handling) into an owned string and report no errors.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
    let input = b"abc\x80\xE4";
    let (decoded, malformed) = WINDOWS_1257.decode_without_bom_handling(input);
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!malformed);
}
5370
/// windows-1257 input containing the byte `0xA1` is expected to decode (without
/// BOM handling) into an owned string with U+FFFD at that position, flagging
/// errors.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
    let input = b"abc\x80\xA1\xE4";
    let (decoded, malformed) = WINDOWS_1257.decode_without_bom_handling(input);
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert!(malformed);
}
5382
/// ASCII-only input decoded as windows-1257 (without BOM handling) is expected to
/// be returned as a borrow of the same text, with no errors.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
    let (decoded, malformed) = WINDOWS_1257.decode_without_bom_handling(b"abc");
    let text = match decoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(text, "abc");
    assert!(!malformed);
}
5394
/// For valid BOM-prefixed UTF-8,
/// `decode_without_bom_handling_and_without_replacement` is expected to return
/// `Some` borrowed string that still begins with U+FEFF.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(input);
    match decoded {
        Some(Cow::Borrowed(text)) => assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}"),
        Some(Cow::Owned(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5409
/// With a stray `0x80` byte in the input,
/// `UTF_8.decode_without_bom_handling_and_without_replacement` is expected to
/// return `None`.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(input);
    assert!(decoded.is_none());
}
5418
/// Valid windows-1257 input with non-ASCII bytes is expected to come back as
/// `Some` owned string from
/// `decode_without_bom_handling_and_without_replacement`.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let input = b"abc\x80\xE4";
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(input);
    match decoded {
        Some(Cow::Owned(text)) => assert_eq!(text, "abc\u{20AC}\u{00E4}"),
        Some(Cow::Borrowed(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5431
/// windows-1257 input containing the byte `0xA1` is expected to make
/// `decode_without_bom_handling_and_without_replacement` return `None`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let input = b"abc\x80\xA1\xE4";
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(input);
    assert!(decoded.is_none());
}
5438
/// ASCII-only input is expected to come back as `Some` borrowed string from
/// `WINDOWS_1257.decode_without_bom_handling_and_without_replacement`.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc");
    match decoded {
        Some(Cow::Borrowed(text)) => assert_eq!(text, "abc"),
        Some(Cow::Owned(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5451
/// Encoding an ASCII-only string with `WINDOWS_1257.encode` is expected to borrow
/// the input bytes, report `WINDOWS_1257` as the output encoding and flag no
/// errors.
#[test]
fn test_encode_ascii_only_windows_1257_to_cow() {
    let (encoded, used_encoding, unmappable) = WINDOWS_1257.encode("abc");
    let bytes = match encoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(bytes, b"abc");
    assert_eq!(used_encoding, WINDOWS_1257);
    assert!(!unmappable);
}
5464
/// Encoding a string with non-ASCII characters via `WINDOWS_1257.encode` is
/// expected to yield an owned byte vector, report `WINDOWS_1257` and flag no
/// errors.
#[test]
fn test_encode_valid_windows_1257_to_cow() {
    let (encoded, used_encoding, unmappable) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
    if let Cow::Owned(bytes) = encoded {
        assert_eq!(bytes, b"abc\x80\xE4");
    } else {
        unreachable!();
    }
    assert_eq!(used_encoding, WINDOWS_1257);
    assert!(!unmappable);
}
5477
/// Feeds the single byte `0xFF` to a BOM-sniffing `UTF_16LE` decoder twice (the
/// second time as the last chunk). The destination is sized with
/// `max_utf16_buffer_length(1)` each time, and both calls are expected to report
/// `InputEmpty`.
#[test]
fn test_utf16_space_with_one_bom_byte() {
    let mut dst = [0u16; 12];
    let mut decoder = UTF_16LE.new_decoder();
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5493
/// Feeds the single byte `0xFF` to a BOM-sniffing `UTF_8` decoder twice (the
/// second time as the last chunk). The destination is sized with
/// `max_utf16_buffer_length(1)` each time, and both calls are expected to report
/// `InputEmpty`.
#[test]
fn test_utf8_space_with_one_bom_byte() {
    let mut dst = [0u16; 12];
    let mut decoder = UTF_8.new_decoder();
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5509
/// Feeds `0xEF`, `0xBB` and finally `0xFF` (as the last chunk) one byte at a time
/// to a BOM-sniffing `UTF_16LE` decoder. Each call gets a destination sized with
/// `max_utf16_buffer_length(1)` and is expected to report `InputEmpty`.
#[test]
fn test_utf16_space_with_two_bom_bytes() {
    let mut dst = [0u16; 12];
    let mut decoder = UTF_16LE.new_decoder();
    // (chunk, is-last-chunk) in the order they are fed to the decoder.
    let steps = [(b"\xEF", false), (b"\xBB", false), (b"\xFF", true)];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(&chunk[..], &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5530
/// Feeds `0xEF`, `0xBB` and finally `0xFF` (as the last chunk) one byte at a time
/// to a BOM-sniffing `UTF_8` decoder. Each call gets a destination sized with
/// `max_utf16_buffer_length(1)` and is expected to report `InputEmpty`.
#[test]
fn test_utf8_space_with_two_bom_bytes() {
    let mut dst = [0u16; 12];
    let mut decoder = UTF_8.new_decoder();
    // (chunk, is-last-chunk) in the order they are fed to the decoder.
    let steps = [(b"\xEF", false), (b"\xBB", false), (b"\xFF", true)];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(&chunk[..], &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5551
/// Passes `FF FF` in a single, final call to a BOM-sniffing `UTF_16LE` decoder
/// with a destination sized by `max_utf16_buffer_length(2)`; the call is expected
/// to report `InputEmpty`.
#[test]
fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
    let input = b"\xFF\xFF";
    let mut dst = [0u16; 12];
    let mut decoder = UTF_16LE.new_decoder();
    let needed = decoder.max_utf16_buffer_length(2).unwrap();
    let (result, _, _, _) = decoder.decode_to_utf16(input, &mut dst[..needed], true);
    assert_eq!(result, CoderResult::InputEmpty);
}
5562
/// Encoding empty UTF-8 input with a fresh `ISO_2022_JP` encoder into an 8-byte
/// buffer is expected to report `InputEmpty`, both mid-stream and as the last
/// chunk.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 8];
    for &last in [false, true].iter() {
        let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5576
/// After U+00A5 has been encoded by an `ISO_2022_JP` encoder, finishing the stream
/// with empty input and only 8 bytes of output space is expected to report
/// `OutputFull`, whereas a non-final empty call with the same space reports
/// `InputEmpty`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];

    let (after_yen, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
    assert_eq!(after_yen, CoderResult::InputEmpty);

    let (after_empty, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
    assert_eq!(after_empty, CoderResult::InputEmpty);

    let (after_finish, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
    assert_eq!(after_finish, CoderResult::OutputFull);
}
5594
/// Checks `ISO_2022_JP` `encode_from_utf8` results near the end of the output
/// buffer. U+00A5 followed by U+1F4A9 into 18 bytes is `InputEmpty` mid-stream but
/// `OutputFull` as the last chunk. U+1F4A9 alone into 13 bytes is `InputEmpty`
/// either way.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf8() {
    const CAPACITY: usize = 18;
    let mut dst = [0u8; CAPACITY];
    // Each case uses a fresh encoder; only the output buffer is shared.
    let mut encode = |text: &str, limit: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (result, _, _, _) = encoder.encode_from_utf8(text, &mut dst[..limit], last);
        result
    };

    assert_eq!(
        encode("\u{A5}\u{1F4A9}", CAPACITY, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode("\u{A5}\u{1F4A9}", CAPACITY, true),
        CoderResult::OutputFull
    );
    assert_eq!(encode("\u{1F4A9}", 13, false), CoderResult::InputEmpty);
    assert_eq!(encode("\u{1F4A9}", 13, true), CoderResult::InputEmpty);
}
5620
/// Encoding empty UTF-16 input with a fresh `ISO_2022_JP` encoder into an 8-byte
/// buffer is expected to report `InputEmpty`, both mid-stream and as the last
/// chunk.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
    let nothing = [0u16; 0];
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 8];
    for &last in [false, true].iter() {
        let (result, _, _, _) = encoder.encode_from_utf16(&nothing, &mut dst[..], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5634
/// After the code unit `0x00A5` has been encoded by an `ISO_2022_JP` encoder,
/// finishing the stream with empty input and only 8 bytes of output space is
/// expected to report `OutputFull`, whereas a non-final empty call with the same
/// space reports `InputEmpty`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
    let nothing = [0u16; 0];
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];

    let (after_yen, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
    assert_eq!(after_yen, CoderResult::InputEmpty);

    let (after_empty, _, _, _) = encoder.encode_from_utf16(&nothing, &mut dst[..8], false);
    assert_eq!(after_empty, CoderResult::InputEmpty);

    let (after_finish, _, _, _) = encoder.encode_from_utf16(&nothing, &mut dst[..8], true);
    assert_eq!(after_finish, CoderResult::OutputFull);
}
5652
/// Checks `ISO_2022_JP` `encode_from_utf16` results near the end of the output
/// buffer. `0x00A5` followed by the surrogate pair `D83D DCA9` into 18 bytes is
/// `InputEmpty` mid-stream but `OutputFull` as the last chunk. The surrogate pair
/// alone into 13 bytes is `InputEmpty` either way.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf16() {
    const CAPACITY: usize = 18;
    let yen_then_astral = [0xA5u16, 0xD83Du16, 0xDCA9u16];
    let astral_only = [0xD83Du16, 0xDCA9u16];
    let mut dst = [0u8; CAPACITY];
    // Each case uses a fresh encoder; only the output buffer is shared.
    let mut encode = |units: &[u16], limit: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (result, _, _, _) = encoder.encode_from_utf16(units, &mut dst[..limit], last);
        result
    };

    assert_eq!(
        encode(&yen_then_astral[..], CAPACITY, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode(&yen_then_astral[..], CAPACITY, true),
        CoderResult::OutputFull
    );
    assert_eq!(encode(&astral_only[..], 13, false), CoderResult::InputEmpty);
    assert_eq!(encode(&astral_only[..], 13, true), CoderResult::InputEmpty);
}
5681
/// Feeds the bytes `D8 00` to a `UTF_16BE` decoder (no BOM handling) twice with a
/// 4-byte destination. The first, non-final call must consume both bytes and write
/// nothing. The second, final call is only executed; its result is deliberately
/// not asserted.
#[test]
fn test_buffer_end_utf16be() {
    let unit_bytes = [0xD8u8, 0x00u8];
    let mut dest = [0u8; 4];
    let mut decoder = UTF_16BE.new_decoder_without_bom_handling();

    let first = decoder.decode_to_utf8(&unit_bytes, &mut dest, false);
    assert_eq!(first, (CoderResult::InputEmpty, 2, 0, false));

    let _ = decoder.decode_to_utf8(&unit_bytes, &mut dest, true);
}
5694
/// Stores encodings in a `BTreeSet` and checks membership before and after a
/// removal: inserted encodings are found, one never inserted is not, and a removed
/// one is no longer found.
#[test]
fn test_hash() {
    let mut seen = ::alloc::collections::btree_set::BTreeSet::new();
    for &encoding in [UTF_8, ISO_2022_JP].iter() {
        seen.insert(encoding);
    }
    for &encoding in [UTF_8, ISO_2022_JP].iter() {
        assert!(seen.contains(encoding));
    }
    assert!(!seen.contains(WINDOWS_1252));

    seen.remove(ISO_2022_JP);
    assert!(!seen.contains(ISO_2022_JP));
}
5706
/// Encoding the code units `0x3041`, `0xFFFF` as the final chunk with an
/// `ISO_2022_JP` encoder into a 17-byte buffer is expected to report `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf16() {
    let units = [0x3041u16, 0xFFFFu16];
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let (result, _, _, _) = encoder.encode_from_utf16(&units, &mut dst[..], true);
    assert_eq!(result, CoderResult::OutputFull);
}
5717
/// Encoding U+3041 followed by U+FFFF as the final chunk with an `ISO_2022_JP`
/// encoder into a 17-byte buffer is expected to report `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf8() {
    let text = "\u{3041}\u{FFFF}";
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let (result, _, _, _) = encoder.encode_from_utf8(text, &mut dst[..], true);
    assert_eq!(result, CoderResult::OutputFull);
}
5728
/// Decodes `EF BB BF 41` with a BOM-sniffing `REPLACEMENT` decoder into a buffer
/// sized by `max_utf8_buffer_length_without_replacement`. The whole input must be
/// consumed and exactly one byte, `A`, written.
#[test]
fn test_max_length_with_bom_to_utf8() {
    let input = b"\xEF\xBB\xBFA";
    let mut decoder = REPLACEMENT.new_decoder();
    let mut output = [0u8; 20];

    let needed = decoder
        .max_utf8_buffer_length_without_replacement(input.len())
        .unwrap();
    let (result, read, written) =
        decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);

    assert_eq!(result, DecoderResult::InputEmpty);
    assert_eq!(read, input.len());
    assert_eq!(written, 1);
    assert_eq!(output[0], b'A');
}
5746
/// Round-trips a `Demo` value holding `UTF_8` through `serde_json` and through
/// `bincode`, expecting each deserialized value to equal the original.
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
    let original = Demo {
        num: 42,
        name: "foo".into(),
        enc: UTF_8,
    };

    // JSON round trip.
    let json = serde_json::to_string(&original).unwrap();
    let from_json = serde_json::from_str::<Demo>(&json).unwrap();
    assert_eq!(from_json, original);

    // bincode round trip.
    let bytes = bincode::serialize(&original).unwrap();
    let from_bincode = bincode::deserialize::<Demo>(&bytes[..]).unwrap();
    assert_eq!(from_bincode, original);
}
5765
/// Checks `is_single_byte()` for every encoding listed below: the first group
/// must answer `false`, the second group `true`.
#[test]
fn test_is_single_byte() {
    let not_single_byte = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        REPLACEMENT,
        SHIFT_JIS,
        UTF_8,
        UTF_16BE,
        UTF_16LE,
        ISO_2022_JP,
    ];
    for &encoding in not_single_byte.iter() {
        assert!(!encoding.is_single_byte());
    }

    let single_byte = [
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        ISO_8859_8_I,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];
    for &encoding in single_byte.iter() {
        assert!(encoding.is_single_byte());
    }
}
5810
/// Checks `Decoder::latin1_byte_compatible_up_to` against one fixed buffer.
///
/// First, a decoder without BOM handling is created for every encoding in the
/// table and its answer is compared with the expected value (`None` where no
/// answer is expected). Then BOM-sniffing `UTF_8` decoders are checked: a fresh
/// one answers `None`, as does one that has only seen `EF`. After the full BOM
/// the answer is `Some(1)`, and after a further `EF` byte it is `None` again.
#[test]
fn test_latin1_byte_compatible_up_to() {
    let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";

    // (encoding, expected result), in the order the checks are performed.
    let expectations = [
        (BIG5, Some(1)),
        (EUC_JP, Some(1)),
        (EUC_KR, Some(1)),
        (GB18030, Some(1)),
        (GBK, Some(1)),
        (REPLACEMENT, None),
        (SHIFT_JIS, Some(1)),
        (UTF_8, Some(1)),
        (UTF_16BE, None),
        (UTF_16LE, None),
        (ISO_2022_JP, Some(1)),
        (IBM866, Some(1)),
        (ISO_8859_2, Some(2)),
        (ISO_8859_3, Some(2)),
        (ISO_8859_4, Some(2)),
        (ISO_8859_5, Some(2)),
        (ISO_8859_6, Some(2)),
        (ISO_8859_7, Some(2)),
        (ISO_8859_8, Some(3)),
        (ISO_8859_10, Some(2)),
        (ISO_8859_13, Some(4)),
        (ISO_8859_14, Some(4)),
        (ISO_8859_15, Some(6)),
        (ISO_8859_16, Some(4)),
        (ISO_8859_8_I, Some(3)),
        (KOI8_R, Some(1)),
        (KOI8_U, Some(1)),
        (MACINTOSH, Some(1)),
        (WINDOWS_874, Some(2)),
        (WINDOWS_1250, Some(4)),
        (WINDOWS_1251, Some(1)),
        (WINDOWS_1252, Some(5)),
        (WINDOWS_1253, Some(3)),
        (WINDOWS_1254, Some(4)),
        (WINDOWS_1255, Some(3)),
        (WINDOWS_1256, Some(1)),
        (WINDOWS_1257, Some(4)),
        (WINDOWS_1258, Some(4)),
        (X_MAC_CYRILLIC, Some(1)),
        (X_USER_DEFINED, Some(1)),
    ];
    for &(encoding, expected) in expectations.iter() {
        let observed = encoding
            .new_decoder_without_bom_handling()
            .latin1_byte_compatible_up_to(buffer);
        match expected {
            Some(prefix_len) => assert_eq!(observed.unwrap(), prefix_len),
            None => assert!(observed.is_none()),
        }
    }

    // A decoder that has not yet seen any input while BOM sniffing gives no answer.
    let fresh = UTF_8.new_decoder();
    assert!(fresh.latin1_byte_compatible_up_to(buffer).is_none());

    // Walk a BOM-sniffing decoder through a BOM one piece at a time.
    let mut sniffing = UTF_8.new_decoder();
    let mut scratch = [0u16; 4];

    let _ = sniffing.decode_to_utf16(b"\xEF", &mut scratch, false);
    assert!(sniffing.latin1_byte_compatible_up_to(buffer).is_none());

    let _ = sniffing.decode_to_utf16(b"\xBB\xBF", &mut scratch, false);
    assert_eq!(sniffing.latin1_byte_compatible_up_to(buffer), Some(1));

    let _ = sniffing.decode_to_utf16(b"\xEF", &mut scratch, false);
    assert_eq!(sniffing.latin1_byte_compatible_up_to(buffer), None);
}
6099 }
6100