1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 #![cfg_attr(
11 feature = "cargo-clippy",
12 allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14
15 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17 //! Gecko-oriented means that converting to and from UTF-16 is supported in
18 //! addition to converting to and from UTF-8, that the performance and
19 //! streamability goals are browser-oriented, and that FFI-friendliness is a
20 //! goal.
21 //!
22 //! Additionally, the `mem` module provides functions that are useful for
23 //! applications that need to be able to deal with legacy in-memory
24 //! representations of Unicode.
25 //!
26 //! For expectation setting, please be sure to read the sections
27 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29 //!
30 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31 //! design and internals of the crate.
32 //!
33 //! # Availability
34 //!
35 //! The code is available under the
36 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38 //! See the
39 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40 //! file for details.
41 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43 //!
44 //! # Integration with `std::io`
45 //!
46 //! This crate doesn't implement traits from `std::io`. However, the case of
47 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50 //!
51 //! # Examples
52 //!
53 //! Example programs:
54 //!
55 //! * [Rust](https://github.com/hsivonen/recode_rs)
56 //! * [C](https://github.com/hsivonen/recode_c)
57 //! * [C++](https://github.com/hsivonen/recode_cpp)
58 //!
59 //! Decode using the non-streaming API:
60 //!
61 //! ```
62 //! use encoding_rs::*;
63 //!
64 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
65 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
66 //!
67 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
68 //! assert_eq!(&cow[..], expectation);
69 //! assert_eq!(encoding_used, SHIFT_JIS);
70 //! assert!(!had_errors);
71 //! ```
72 //!
73 //! Decode using the streaming API with minimal `unsafe`:
74 //!
75 //! ```
76 //! use encoding_rs::*;
77 //!
78 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
79 //!
80 //! // Use an array of byte slices to demonstrate content arriving piece by
81 //! // piece from the network.
82 //! let bytes: [&'static [u8]; 4] = [b"\x83",
83 //! b"n\x83\x8D\x81",
84 //! b"[\x81E\x83\x8F\x81[\x83",
85 //! b"\x8B\x83h"];
86 //!
87 //! // Very short output buffer to demonstrate the output buffer getting full.
88 //! // Normally, you'd use something like `[0u8; 2048]`.
89 //! let mut buffer_bytes = [0u8; 8];
90 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
91 //!
92 //! // How many bytes in the buffer currently hold significant data.
93 //! let mut bytes_in_buffer = 0usize;
94 //!
95 //! // Collect the output to a string for demonstration purposes.
96 //! let mut output = String::new();
97 //!
98 //! // The `Decoder`
99 //! let mut decoder = SHIFT_JIS.new_decoder();
100 //!
101 //! // Track whether we see errors.
102 //! let mut total_had_errors = false;
103 //!
104 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
105 //! // use of a fixed-size buffer; normally when the output of an incremental
106 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
107 //! // avoid the intermediate buffer).
108 //! for input in &bytes[..] {
109 //! // The number of bytes already read from current `input` in total.
110 //! let mut total_read_from_current_input = 0usize;
111 //!
112 //! loop {
113 //! let (result, read, written, had_errors) =
114 //! decoder.decode_to_str(&input[total_read_from_current_input..],
115 //! &mut buffer[bytes_in_buffer..],
116 //! false);
117 //! total_read_from_current_input += read;
118 //! bytes_in_buffer += written;
119 //! total_had_errors |= had_errors;
120 //! match result {
121 //! CoderResult::InputEmpty => {
122 //! // We have consumed the current input buffer. Break out of
123 //! // the inner loop to get the next input buffer from the
124 //! // outer loop.
125 //! break;
126 //! },
127 //! CoderResult::OutputFull => {
128 //! // Write the current buffer out and consider the buffer
129 //! // empty.
130 //! output.push_str(&buffer[..bytes_in_buffer]);
131 //! bytes_in_buffer = 0usize;
132 //! continue;
133 //! }
134 //! }
135 //! }
136 //! }
137 //!
138 //! // Process EOF
139 //! loop {
140 //! let (result, _, written, had_errors) =
141 //! decoder.decode_to_str(b"",
142 //! &mut buffer[bytes_in_buffer..],
143 //! true);
144 //! bytes_in_buffer += written;
145 //! total_had_errors |= had_errors;
146 //! // Write the current buffer out and consider the buffer empty.
147 //! // Need to do this here for both `match` arms, because we exit the
148 //! // loop on `CoderResult::InputEmpty`.
149 //! output.push_str(&buffer[..bytes_in_buffer]);
150 //! bytes_in_buffer = 0usize;
151 //! match result {
152 //! CoderResult::InputEmpty => {
153 //! // Done!
154 //! break;
155 //! },
156 //! CoderResult::OutputFull => {
157 //! continue;
158 //! }
159 //! }
160 //! }
161 //!
162 //! assert_eq!(&output[..], expectation);
163 //! assert!(!total_had_errors);
164 //! ```
165 //!
166 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
167 //!
168 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
169 //! __so this crate does not provide encoders for those encodings__!
170 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
171 //! so you get a UTF-8 encoder if you request an encoder for them.
172 //!
173 //! Additionally, the Encoding Standard factors BOM handling into wrapper
174 //! algorithms so that BOM handling isn't part of the definition of the
175 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
176 //! Standard define BOM handling or lack thereof as part of the encoding
177 //! scheme.
178 //!
179 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
180 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
181 //! the Unicode Standard.
182 //!
183 //! When used with the `_with_bom_removal` entry points, the UTF-8
184 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
185 //! Standard.
186 //!
187 //! This crate does not provide a mode that matches the UTF-16 _encoding
188 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
189 //! the entry points without `_bom_` qualifiers is the closest match,
190 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
191 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
192 //! Unicode Standard.
193 //!
194 //! The UTF-32 family of Unicode encoding schemes is not supported
195 //! by this crate. The Encoding Standard doesn't define any UTF-32
196 //! family encodings, since they aren't necessary for consuming Web
197 //! content.
198 //!
199 //! ## ISO-8859-1
200 //!
201 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
202 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
203 //! byte value to the same Unicode scalar value is not available via
204 //! `Encoding` in this crate.
205 //!
206 //! However, the functions whose name starts with `convert` and contains
207 //! `latin1` in the `mem` module support such conversions, which are known as
208 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
209 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
210 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
211 //!
212 //! ## Web / Browser Focus
213 //!
214 //! Both in terms of scope and performance, the focus is on the Web. For scope,
215 //! this means that encoding_rs implements the Encoding Standard fully and
216 //! doesn't implement encodings that are not specified in the Encoding
217 //! Standard. For performance, this means that decoding performance is
218 //! important as well as performance for encoding into UTF-8 or encoding the
219 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
220 //! be encoded into legacy encodings in only two places in the Web platform: in
221 //! the query part of URLs, in which case it's a matter of relatively rare
222 //! error handling, and in form submission, in which case the user action and
223 //! networking tend to hide the performance of the encoder.
224 //!
225 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
226 //! encodings enables smaller code size thanks to the encoder side using the
227 //! decode-optimized data tables without having encode-optimized data tables at
228 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
229 //! multiplication operations.
230 //!
231 //! Additionally, performance is a non-goal for the ASCII-incompatible
232 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
233 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
234 //! of implementation.
235 //!
236 //! Despite the browser focus, the hope is that non-browser applications
237 //! that wish to consume Web content or submit Web forms in a Web-compatible
238 //! way will find encoding_rs useful. While encoding_rs does not try to match
239 //! Windows behavior, many of the encodings are close enough to legacy
240 //! encodings implemented by Windows that applications that need to consume
241 //! data in legacy Windows encodings may find encoding_rs useful. The
242 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
243 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
244 //!
245 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
246 //! to the encodings defined in the Encoding Standard. The
247 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
248 //! UTF-7 decoding for email purposes.
249 //!
250 //! For single-byte DOS encodings beyond the ones supported by the Encoding
251 //! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
252 //!
253 //! # Preparing Text for the Encoders
254 //!
255 //! Normalizing text into Unicode Normalization Form C prior to encoding text
256 //! into a legacy encoding minimizes unmappable characters. Text can be
257 //! normalized to Unicode Normalization Form C using the
258 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
259 //!
260 //! The exception is windows-1258, which after normalizing to Unicode
261 //! Normalization Form C requires tone marks to be decomposed in order to
262 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
263 //! using the [`detone`](https://crates.io/crates/detone) crate.
264 //!
265 //! # Streaming & Non-Streaming; Rust & C/C++
266 //!
267 //! The API in Rust has two modes of operation: streaming and non-streaming.
268 //! The streaming API is the foundation of the implementation and should be
269 //! used when processing data that arrives piecemeal from an i/o stream. The
270 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
271 //! to C callers. The non-streaming part of the API is for Rust callers only and
272 //! is smart about borrowing instead of copying when possible. When
273 //! streamability is not needed, the non-streaming API should be preferred in
274 //! order to avoid copying data when a borrow suffices.
275 //!
276 //! There is no analogous C API exposed via FFI, mainly because C doesn't have
277 //! standard types for growable byte buffers and Unicode strings that know
278 //! their length.
279 //!
280 //! The C API (header file generated at `target/include/encoding_rs.h` when
281 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
282 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
283 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
284 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
285 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
286 //! as part of Mozilla [bug 1261841][4].
287 //!
288 //! The `Encoding` type is common to both the streaming and non-streaming
289 //! modes. In the streaming mode, decoding operations are performed with a
290 //! `Decoder` and encoding operations with an `Encoder` object obtained via
291 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
292 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
293 //! and `Encoder` objects are not used at all.
294 //!
295 //! [1]: https://github.com/hsivonen/encoding_c
296 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
297 //! [3]: https://github.com/Microsoft/GSL/
298 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
299 //!
300 //! # Memory management
301 //!
302 //! The streaming mode never performs heap allocations (even the methods
303 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
304 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
305 //! is, the streaming mode uses caller-allocated buffers exclusively.
306 //!
307 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
308 //! perform heap allocations but only to allocate the backing buffer of the
309 //! `Vec<u8>` or the `String`.
310 //!
311 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
312 //! `Drop` cleanup.
313 //!
314 //! # Buffer reading and writing behavior
315 //!
316 //! Based on experience gained with the `java.nio.charset` encoding converter
317 //! API and with the Gecko uconv encoding converter API, the buffer reading
318 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
319 //! fully drained but output buffers are not always fully filled.
320 //!
321 //! When reading from an input buffer, encoding_rs always consumes all input
322 //! up to the next error or to the end of the buffer. In particular, when
323 //! decoding, even if the input buffer ends in the middle of a byte sequence
324 //! for a character, the decoder consumes all input. This has the benefit that
325 //! the caller of the API can always fill the next buffer from the start from
326 //! whatever source the bytes come from and never has to first copy the last
327 //! bytes of the previous buffer to the start of the next buffer. However, when
328 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
329 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
330 //! boundaries falling in the middle of a surrogate pair result in both
331 //! surrogates being treated individually as unpaired surrogates.
332 //!
333 //! Additionally, decoders guarantee that they can be fed even one byte at a
334 //! time and encoders guarantee that they can be fed even one code point at a
335 //! time. This has the benefit of not placing restrictions on the size of
336 //! chunks in which the content arrives, e.g. from the network.
337 //!
338 //! When writing into an output buffer, encoding_rs makes sure that the code
339 //! unit sequence for a character is never split across output buffer
340 //! boundaries. This may result in wasted space at the end of an output buffer,
341 //! but the advantages are that the output side of both decoders and encoders
342 //! is greatly simplified compared to designs that attempt to fill output
343 //! buffers exactly even when that entails splitting a code unit sequence and
344 //! when encoding_rs methods return to the caller, the output produced thus
345 //! far is always valid taken as whole. (In the case of encoding to ISO-2022-JP,
346 //! the output needs to be considered as a whole, because the latest output
347 //! buffer taken alone might not be valid taken alone if the transition away
348 //! from the ASCII state occurred in an earlier output buffer. However, since
349 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
350 //! state as being in error despite the encoder generating a transition to the
351 //! ASCII state at the end, the claim about the partial output taken as a whole
352 //! being valid is true even for ISO-2022-JP.)
353 //!
354 //! # Error Reporting
355 //!
356 //! Based on experience gained with the `java.nio.charset` encoding converter
357 //! API and with the Gecko uconv encoding converter API, the error reporting
358 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
359 //! that leave it up to the caller to extract the erroneous bytes from the
360 //! input stream if the caller wishes to do so but encoder errors provide the
361 //! code point associated with the error without requiring the caller to
362 //! extract it from the input on its own.
363 //!
364 //! On the encoder side, an error is always triggered by the most recently
365 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
366 //! caller. Also, it's very typical for the caller to wish to do something with
367 //! this data: generate a numeric escape for the character. Additionally, the
368 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
369 //! certain cases, so requiring the caller to extract the character from the
370 //! input buffer would require the caller to handle ISO-2022-JP details.
371 //! Furthermore, requiring the caller to extract the character from the input
372 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
373 //! the job of an encoding conversion library.
374 //!
375 //! On the decoder side, errors are triggered in more complex ways. For
376 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
377 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
378 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
379 //! be the ones most recently pushed to the decoder and the error might not even
380 //! be in the current buffer.
381 //!
382 //! Some encoding conversion APIs address the problem by not acknowledging
383 //! trailing bytes of an input buffer as consumed if it's still possible for
384 //! future bytes to cause the trailing bytes to be in error. This way, error
385 //! reporting can always refer to the most recently pushed buffer. This has the
386 //! problem that the caller of the API has to copy the unconsumed trailing
387 //! bytes to the start of the next buffer before being able to fill the rest
388 //! of the next buffer. This is annoying, error-prone and inefficient.
389 //!
390 //! A possible solution would be making the decoder remember recently consumed
391 //! bytes in order to be able to include a copy of the erroneous bytes when
392 //! reporting an error. This has two problems: First, callers are rarely
393 //! interested in the erroneous bytes, so attempts to identify them are most
394 //! often just overhead anyway. Second, the rare applications that are
395 //! interested typically care about the location of the error in the input
396 //! stream.
397 //!
398 //! To keep the API convenient for common uses and the overhead low while making
399 //! it possible to develop applications, such as HTML validators, that care
400 //! about which bytes were in error, encoding_rs reports the length of the
401 //! erroneous sequence and the number of bytes consumed after the erroneous
402 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
403 //! this makes it possible for callers that care about the erroneous bytes to
404 //! locate them.
405 //!
406 //! # No Convenience API for Custom Replacements
407 //!
408 //! The Web Platform and, therefore, the Encoding Standard supports only one
409 //! error recovery mode for decoders and only one error recovery mode for
410 //! encoders. The supported error recovery mode for decoders is emitting the
411 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
412 //! encoders is emitting an HTML decimal numeric character reference for
413 //! unmappable characters.
414 //!
415 //! Since encoding_rs is Web-focused, these are the only error recovery modes
416 //! for which convenient support is provided. Moreover, on the decoder side,
417 //! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
418 //! on error (other than treating errors as fatal). In particular, simply
419 //! ignoring errors is a
420 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
421 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
422 //! callers to ignore errors.
423 //!
424 //! On the encoder side, there are plausible alternatives for HTML decimal
425 //! numeric character references. For example, when outputting CSS, CSS-style
426 //! escapes would seem to make sense. However, instead of facilitating the
427 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
428 //! position that you shouldn't generate output in encodings other than UTF-8,
429 //! except where backward compatibility with interacting with the legacy Web
430 //! requires it. The legacy Web requires it only when parsing the query strings
431 //! of URLs and when submitting forms, and those two both use HTML decimal
432 //! numeric character references.
433 //!
434 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
435 //! numeric character references easy, it does make them _possible_.
436 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
437 //! for unmappable characters, is implemented on top of
438 //! `encode_from_utf8_without_replacement()`. Applications that really, really
439 //! want other replacement schemes for unmappable characters can likewise
440 //! implement them on top of `encode_from_utf8_without_replacement()`.
441 //!
442 //! # No Extensibility by Design
443 //!
444 //! The set of encodings supported by encoding_rs is not extensible by design.
445 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
446 //! rather than `trait`s. encoding_rs takes the design position that all future
447 //! text interchange should be done using UTF-8, which can represent all of
448 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
449 //! Standard and encoding_rs that can represent all of Unicode and that has
450 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
451 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
452 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
453 //! other than being able to consume legacy content.
454 //!
455 //! Considering that UTF-8 can represent all of Unicode and is already supported
456 //! by all Web browsers, introducing a new encoding wouldn't add to the
457 //! expressiveness but would add to compatibility problems. In that sense,
458 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
459 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
460 //! the Web Platform. On the other hand, the set of legacy encodings that must
461 //! be supported for a Web browser to be able to be successful is not going to
462 //! expand. Empirically, the set of encodings specified in the Encoding Standard
463 //! is already sufficient and the set of legacy encodings won't grow
464 //! retroactively.
465 //!
466 //! Since extensibility doesn't make sense considering the Web focus of
467 //! encoding_rs and adding encodings to Web clients would be actively harmful,
468 //! it makes sense to make the set of encodings that encoding_rs supports
469 //! non-extensible and to take the (admittedly small) benefits arising from
470 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
471 //! of time, which enables stack allocation thereof.
472 //!
473 //! This does have downsides for applications that might want to put encoding_rs
474 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
475 //! needed for Web uses. The needs of such applications should not complicate
476 //! encoding_rs itself, though. It is up to those applications to provide a
477 //! framework that delegates the operations with encodings that encoding_rs
478 //! supports to encoding_rs and operations with other encodings to something
479 //! else (as opposed to encoding_rs itself providing an extensibility
480 //! framework).
481 //!
482 //! # Panics
483 //!
484 //! Methods in encoding_rs can panic if the API is used against the requirements
485 //! stated in the documentation, if a state that's supposed to be impossible
486 //! is reached due to an internal bug or on integer overflow. When used
487 //! according to documentation with buffer sizes that stay below integer
488 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
489 //!
490 //! Panics arising from API misuse aren't documented beyond this on individual
491 //! methods.
492 //!
493 //! # At-Risk Parts of the API
494 //!
495 //! The foreseeable source of partially backward-incompatible API change is the
496 //! way the instances of `Encoding` are made available.
497 //!
498 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
499 //! initialized with `static`s of type `&'static Encoding`, the non-reference
500 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
501 //!
502 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
503 //! unique when the constant is used in different crates, the reference-typed
504 //! `static`s for the encoding instances will be changed from `static` to
505 //! `const` and the non-reference-typed `_INIT` instances will be removed.
506 //!
507 //! # Mapping Spec Concepts onto the API
508 //!
509 //! <table>
510 //! <thead>
511 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
512 //! </thead>
513 //! <tbody>
514 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&'static Encoding</code></td><td><code>&'static Encoding</code></td></tr>
515 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
522 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// … (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
523 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
524 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// …</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
525 //! </tbody>
526 //! </table>
527 //!
528 //! # Compatibility with the rust-encoding API
529 //!
530 //! The crate
531 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
532 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
533 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
534 //!
535 //! # Mapping rust-encoding concepts to encoding_rs concepts
536 //!
537 //! The following table provides a mapping from rust-encoding constructs to
538 //! encoding_rs ones.
539 //!
540 //! <table>
541 //! <thead>
542 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
543 //! </thead>
544 //! <tbody>
545 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&'static encoding_rs::Encoding</code></td></tr>
546 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
547 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
548 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
549 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
550 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
551 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
552 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
553 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
554 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
555 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
556 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
557 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
558 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
559 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
560 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
561 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
562 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
563 //! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
564 //! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
565 //! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
566 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
567 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
568 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
569 //! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
570 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
571 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
572 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
573 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
574 //! </tbody>
575 //! </table>
576 //!
577 //! # Relationship with Windows Code Pages
578 //!
579 //! Despite the Web and browser focus, the encodings defined by the Encoding
580 //! Standard and implemented by this crate may be useful for decoding legacy
581 //! data that uses Windows code pages. The following table names the
582 //! encodings that have a closely related Windows code page, the number of
583 //! the closest code page, a column indicating whether Windows maps
584 //! unassigned code points to the Unicode Private Use Area instead of U+FFFD,
585 //! and a remark number referring to the remarks in the list after the
586 //! table.
587 //!
588 //! <table>
589 //! <thead>
590 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
591 //! </thead>
592 //! <tbody>
593 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
594 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
595 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
596 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
597 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
598 //! <tr><td>windows-874</td><td>874</td><td>•</td><td></td></tr>
599 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
600 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
601 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
602 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
603 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
604 //! <tr><td>windows-1253</td><td>1253</td><td>•</td><td></td></tr>
605 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
606 //! <tr><td>windows-1255</td><td>1255</td><td>•</td><td></td></tr>
607 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
608 //! <tr><td>windows-1257</td><td>1257</td><td>•</td><td></td></tr>
609 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
610 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
611 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
612 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
613 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
614 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
615 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
616 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
617 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
618 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
619 //! <tr><td>ISO-8859-6</td><td>28596</td><td>•</td><td></td></tr>
620 //! <tr><td>ISO-8859-7</td><td>28597</td><td>•</td><td>3</td></tr>
621 //! <tr><td>ISO-8859-8</td><td>28598</td><td>•</td><td>4</td></tr>
622 //! <tr><td>ISO-8859-13</td><td>28603</td><td>•</td><td></td></tr>
623 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
624 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
625 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
626 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
627 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
628 //! </tbody>
629 //! </table>
630 //!
631 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
632 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
633 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
634 //! which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
635 //! decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
636 //! LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
637 //! instead of U+2019 RIGHT SINGLE QUOTATION MARK.
638 //! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to PUA instead
639 //! of LRM and RLM.
640 //! 5. Remarks from the previous item apply.
641 //!
642 //! The differences between this crate and Windows in the case of multibyte encodings
643 //! are not yet fully documented here. The lack of remarks above should not be taken
644 //! as indication of lack of differences.
645 //!
646 //! # Notable Differences from IANA Naming
647 //!
648 //! In some cases, the Encoding Standard specifies the popular unextended encoding
649 //! name where in IANA terms one of the other labels would be more precise considering
650 //! the extensions that the Encoding Standard has unified into the encoding.
651 //!
652 //! <table>
653 //! <thead>
654 //! <tr><th>Encoding</th><th>IANA</th></tr>
655 //! </thead>
656 //! <tbody>
657 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
658 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
659 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
660 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
661 //! </tbody>
662 //! </table>
663 //!
664 //! In other cases where the Encoding Standard unifies unextended and extended
665 //! variants of an encoding, the encoding gets the name of the extended
666 //! variant.
667 //!
668 //! <table>
669 //! <thead>
670 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
671 //! </thead>
672 //! <tbody>
673 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
674 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
675 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
676 //! </tbody>
677 //! </table>
678 //!
679 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
680 //! for discussion about the UTF-16 family.
681
682 #![no_std]
683 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
684
685 #[cfg_attr(test, macro_use)]
686 extern crate alloc;
687 extern crate core;
688 #[macro_use]
689 extern crate cfg_if;
690
691 #[cfg(all(
692 feature = "simd-accel",
693 any(
694 target_feature = "sse2",
695 all(target_endian = "little", target_arch = "aarch64"),
696 all(target_endian = "little", target_feature = "neon")
697 )
698 ))]
699 #[macro_use(shuffle)]
700 extern crate packed_simd;
701
702 #[cfg(feature = "serde")]
703 extern crate serde;
704
705 #[cfg(all(test, feature = "serde"))]
706 extern crate bincode;
707 #[cfg(all(test, feature = "serde"))]
708 #[macro_use]
709 extern crate serde_derive;
710 #[cfg(all(test, feature = "serde"))]
711 extern crate serde_json;
712
713 #[macro_use]
714 mod macros;
715
716 #[cfg(all(
717 feature = "simd-accel",
718 any(
719 target_feature = "sse2",
720 all(target_endian = "little", target_arch = "aarch64"),
721 all(target_endian = "little", target_feature = "neon")
722 )
723 ))]
724 mod simd_funcs;
725
726 #[cfg(test)]
727 mod testing;
728
729 mod big5;
730 mod euc_jp;
731 mod euc_kr;
732 mod gb18030;
733 mod iso_2022_jp;
734 mod replacement;
735 mod shift_jis;
736 mod single_byte;
737 mod utf_16;
738 mod utf_8;
739 mod x_user_defined;
740
741 mod ascii;
742 mod data;
743 mod handles;
744 mod variant;
745
746 pub mod mem;
747
748 use crate::ascii::ascii_valid_up_to;
749 use crate::ascii::iso_2022_jp_ascii_valid_up_to;
750 use crate::utf_8::utf8_valid_up_to;
751 use crate::variant::*;
752
753 use alloc::borrow::Cow;
754 use alloc::string::String;
755 use alloc::vec::Vec;
756 use core::cmp::Ordering;
757 use core::hash::Hash;
758 use core::hash::Hasher;
759
760 #[cfg(feature = "serde")]
761 use serde::de::Visitor;
762 #[cfg(feature = "serde")]
763 use serde::{Deserialize, Deserializer, Serialize, Serializer};
764
/// Extra output space to budget for one numeric character reference (NCR).
///
/// This has to be the full maximum length of an NCR rather than the maximum
/// minus one: the space reserved for the current unmappable character cannot
/// be relied on to supply the "minus one", because the ISO-2022-JP encoder
/// can fill up that space with a state transition escape.
///
/// The longest decimal NCR is the one for U+10FFFF, `&#1114111;`, which is
/// 10 bytes long; the value is derived from that literal so the number and
/// its justification cannot drift apart.
const NCR_EXTRA: usize = "&#1114111;".len();
771
772 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
773 // Instead, please regenerate using generate-encoding-data.py
774
/// Length in bytes of the longest label, `cseucpkdfmtjapanese`.
const LONGEST_LABEL_LENGTH: usize = "cseucpkdfmtjapanese".len();
776
777 /// The initializer for the [Big5](static.BIG5.html) encoding.
778 ///
779 /// For use only for taking the address of this form when
780 /// Rust prohibits the use of the non-`_INIT` form directly,
781 /// such as in initializers of other `static`s. If in doubt,
782 /// use the corresponding non-`_INIT` reference-typed `static`.
783 ///
784 /// This part of the public API will go away if Rust changes
785 /// to make the referent of `pub const FOO: &'static Encoding`
786 /// unique cross-crate or if Rust starts allowing static arrays
787 /// to be initialized with `pub static FOO: &'static Encoding`
788 /// items.
789 pub static BIG5_INIT: Encoding = Encoding {
790 name: "Big5",
791 variant: VariantEncoding::Big5,
792 };
793
794 /// The Big5 encoding.
795 ///
796 /// This is Big5 with HKSCS with mappings to more recent Unicode assignments
797 /// instead of the Private Use Area code points that have been used historically.
798 /// It is believed to be able to decode existing Web content in a way that makes
799 /// sense.
800 ///
801 /// To avoid form submissions generating data that Web servers don't understand,
802 /// the encoder doesn't use the HKSCS byte sequences that precede the unextended
803 /// Big5 in the lexical order.
804 ///
805 /// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
806 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
807 ///
808 /// This encoding is designed to be suited for decoding the Windows code page 950
809 /// and its HKSCS patched "951" variant such that the text makes sense, given
810 /// assignments that Unicode has made after those encodings used Private Use
811 /// Area characters.
812 ///
813 /// This will change from `static` to `const` if Rust changes
814 /// to make the referent of `pub const FOO: &'static Encoding`
815 /// unique cross-crate, so don't take the address of this
816 /// `static`.
817 pub static BIG5: &'static Encoding = &BIG5_INIT;
818
819 /// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
820 ///
821 /// For use only for taking the address of this form when
822 /// Rust prohibits the use of the non-`_INIT` form directly,
823 /// such as in initializers of other `static`s. If in doubt,
824 /// use the corresponding non-`_INIT` reference-typed `static`.
825 ///
826 /// This part of the public API will go away if Rust changes
827 /// to make the referent of `pub const FOO: &'static Encoding`
828 /// unique cross-crate or if Rust starts allowing static arrays
829 /// to be initialized with `pub static FOO: &'static Encoding`
830 /// items.
831 pub static EUC_JP_INIT: Encoding = Encoding {
832 name: "EUC-JP",
833 variant: VariantEncoding::EucJp,
834 };
835
836 /// The EUC-JP encoding.
837 ///
838 /// This is the legacy Unix encoding for Japanese.
839 ///
840 /// For compatibility with Web servers that don't expect three-byte sequences
841 /// in form submissions, the encoder doesn't generate three-byte sequences.
842 /// That is, the JIS X 0212 support is decode-only.
843 ///
844 /// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
845 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
846 ///
847 /// This encoding roughly matches the Windows code page 20932. There are error
848 /// handling differences and a handful of 2-byte sequences that decode differently.
849 /// Additionall, Windows doesn't support 3-byte sequences.
850 ///
851 /// This will change from `static` to `const` if Rust changes
852 /// to make the referent of `pub const FOO: &'static Encoding`
853 /// unique cross-crate, so don't take the address of this
854 /// `static`.
855 pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
856
857 /// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
858 ///
859 /// For use only for taking the address of this form when
860 /// Rust prohibits the use of the non-`_INIT` form directly,
861 /// such as in initializers of other `static`s. If in doubt,
862 /// use the corresponding non-`_INIT` reference-typed `static`.
863 ///
864 /// This part of the public API will go away if Rust changes
865 /// to make the referent of `pub const FOO: &'static Encoding`
866 /// unique cross-crate or if Rust starts allowing static arrays
867 /// to be initialized with `pub static FOO: &'static Encoding`
868 /// items.
869 pub static EUC_KR_INIT: Encoding = Encoding {
870 name: "EUC-KR",
871 variant: VariantEncoding::EucKr,
872 };
873
874 /// The EUC-KR encoding.
875 ///
876 /// This is the Korean encoding for Windows. It extends the Unix legacy encoding
877 /// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
878 /// Classic), with all the characters from the Hangul Syllables block of Unicode.
879 ///
880 /// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
881 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
882 ///
883 /// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
884 /// to U+0080 and some byte sequences that are error per the Encoding Standard to
885 /// the question mark or the Private Use Area.
886 ///
887 /// This will change from `static` to `const` if Rust changes
888 /// to make the referent of `pub const FOO: &'static Encoding`
889 /// unique cross-crate, so don't take the address of this
890 /// `static`.
891 pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
892
893 /// The initializer for the [GBK](static.GBK.html) encoding.
894 ///
895 /// For use only for taking the address of this form when
896 /// Rust prohibits the use of the non-`_INIT` form directly,
897 /// such as in initializers of other `static`s. If in doubt,
898 /// use the corresponding non-`_INIT` reference-typed `static`.
899 ///
900 /// This part of the public API will go away if Rust changes
901 /// to make the referent of `pub const FOO: &'static Encoding`
902 /// unique cross-crate or if Rust starts allowing static arrays
903 /// to be initialized with `pub static FOO: &'static Encoding`
904 /// items.
905 pub static GBK_INIT: Encoding = Encoding {
906 name: "GBK",
907 variant: VariantEncoding::Gbk,
908 };
909
910 /// The GBK encoding.
911 ///
912 /// The decoder for this encoding is the same as the decoder for gb18030.
913 /// The encoder side of this encoding is GBK with Windows code page 936 euro
914 /// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
915 /// Unicode block as well as a handful of ideographs from the CJK Unified
916 /// Ideographs Extension A and CJK Compatibility Ideographs blocks.
917 ///
918 /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
919 /// unified with the gb18030 encoder in the Encoding Standard out of concern
920 /// that servers that expect GBK form submissions might not be able to handle
921 /// the four-byte sequences.
922 ///
923 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
924 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
925 ///
926 /// The encoder of this encoding roughly matches the Windows code page 936.
927 /// The decoder side is a superset.
928 ///
929 /// This will change from `static` to `const` if Rust changes
930 /// to make the referent of `pub const FOO: &'static Encoding`
931 /// unique cross-crate, so don't take the address of this
932 /// `static`.
933 pub static GBK: &'static Encoding = &GBK_INIT;
934
935 /// The initializer for the [IBM866](static.IBM866.html) encoding.
936 ///
937 /// For use only for taking the address of this form when
938 /// Rust prohibits the use of the non-`_INIT` form directly,
939 /// such as in initializers of other `static`s. If in doubt,
940 /// use the corresponding non-`_INIT` reference-typed `static`.
941 ///
942 /// This part of the public API will go away if Rust changes
943 /// to make the referent of `pub const FOO: &'static Encoding`
944 /// unique cross-crate or if Rust starts allowing static arrays
945 /// to be initialized with `pub static FOO: &'static Encoding`
946 /// items.
947 pub static IBM866_INIT: Encoding = Encoding {
948 name: "IBM866",
949 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
950 };
951
952 /// The IBM866 encoding.
953 ///
954 /// This the most notable one of the DOS Cyrillic code pages. It has the same
955 /// box drawing characters as code page 437, so it can be used for decoding
956 /// DOS-era ASCII + box drawing data.
957 ///
958 /// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
959 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
960 ///
961 /// This encoding matches the Windows code page 866.
962 ///
963 /// This will change from `static` to `const` if Rust changes
964 /// to make the referent of `pub const FOO: &'static Encoding`
965 /// unique cross-crate, so don't take the address of this
966 /// `static`.
967 pub static IBM866: &'static Encoding = &IBM866_INIT;
968
969 /// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
970 ///
971 /// For use only for taking the address of this form when
972 /// Rust prohibits the use of the non-`_INIT` form directly,
973 /// such as in initializers of other `static`s. If in doubt,
974 /// use the corresponding non-`_INIT` reference-typed `static`.
975 ///
976 /// This part of the public API will go away if Rust changes
977 /// to make the referent of `pub const FOO: &'static Encoding`
978 /// unique cross-crate or if Rust starts allowing static arrays
979 /// to be initialized with `pub static FOO: &'static Encoding`
980 /// items.
981 pub static ISO_2022_JP_INIT: Encoding = Encoding {
982 name: "ISO-2022-JP",
983 variant: VariantEncoding::Iso2022Jp,
984 };
985
986 /// The ISO-2022-JP encoding.
987 ///
988 /// This the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
989 /// byte range to encode non-Basic Latin characters. It's the only encoding
990 /// supported by this crate whose encoder is stateful.
991 ///
992 /// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
993 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
994 ///
995 /// This encoding roughly matches the Windows code page 50220. Notably, Windows
996 /// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
997 /// error handling.
998 ///
999 /// This will change from `static` to `const` if Rust changes
1000 /// to make the referent of `pub const FOO: &'static Encoding`
1001 /// unique cross-crate, so don't take the address of this
1002 /// `static`.
1003 pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
1004
1005 /// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1006 ///
1007 /// For use only for taking the address of this form when
1008 /// Rust prohibits the use of the non-`_INIT` form directly,
1009 /// such as in initializers of other `static`s. If in doubt,
1010 /// use the corresponding non-`_INIT` reference-typed `static`.
1011 ///
1012 /// This part of the public API will go away if Rust changes
1013 /// to make the referent of `pub const FOO: &'static Encoding`
1014 /// unique cross-crate or if Rust starts allowing static arrays
1015 /// to be initialized with `pub static FOO: &'static Encoding`
1016 /// items.
1017 pub static ISO_8859_10_INIT: Encoding = Encoding {
1018 name: "ISO-8859-10",
1019 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1020 };
1021
1022 /// The ISO-8859-10 encoding.
1023 ///
1024 /// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1025 /// is also known as Latin 6.
1026 ///
1027 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1028 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1029 ///
1030 /// The Windows code page number for this encoding is 28600, but kernel32.dll
1031 /// does not support this encoding.
1032 ///
1033 /// This will change from `static` to `const` if Rust changes
1034 /// to make the referent of `pub const FOO: &'static Encoding`
1035 /// unique cross-crate, so don't take the address of this
1036 /// `static`.
1037 pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1038
1039 /// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1040 ///
1041 /// For use only for taking the address of this form when
1042 /// Rust prohibits the use of the non-`_INIT` form directly,
1043 /// such as in initializers of other `static`s. If in doubt,
1044 /// use the corresponding non-`_INIT` reference-typed `static`.
1045 ///
1046 /// This part of the public API will go away if Rust changes
1047 /// to make the referent of `pub const FOO: &'static Encoding`
1048 /// unique cross-crate or if Rust starts allowing static arrays
1049 /// to be initialized with `pub static FOO: &'static Encoding`
1050 /// items.
1051 pub static ISO_8859_13_INIT: Encoding = Encoding {
1052 name: "ISO-8859-13",
1053 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1054 };
1055
1056 /// The ISO-8859-13 encoding.
1057 ///
1058 /// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1059 /// is also known as Latin 7.
1060 ///
1061 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1062 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1063 ///
1064 /// This encoding matches the Windows code page 28603, except Windows decodes
1065 /// unassigned code points to the Private Use Area of Unicode.
1066 ///
1067 /// This will change from `static` to `const` if Rust changes
1068 /// to make the referent of `pub const FOO: &'static Encoding`
1069 /// unique cross-crate, so don't take the address of this
1070 /// `static`.
1071 pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1072
1073 /// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1074 ///
1075 /// For use only for taking the address of this form when
1076 /// Rust prohibits the use of the non-`_INIT` form directly,
1077 /// such as in initializers of other `static`s. If in doubt,
1078 /// use the corresponding non-`_INIT` reference-typed `static`.
1079 ///
1080 /// This part of the public API will go away if Rust changes
1081 /// to make the referent of `pub const FOO: &'static Encoding`
1082 /// unique cross-crate or if Rust starts allowing static arrays
1083 /// to be initialized with `pub static FOO: &'static Encoding`
1084 /// items.
1085 pub static ISO_8859_14_INIT: Encoding = Encoding {
1086 name: "ISO-8859-14",
1087 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1088 };
1089
1090 /// The ISO-8859-14 encoding.
1091 ///
1092 /// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1093 /// is also known as Latin 8.
1094 ///
1095 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1096 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1097 ///
1098 /// The Windows code page number for this encoding is 28604, but kernel32.dll
1099 /// does not support this encoding.
1100 ///
1101 /// This will change from `static` to `const` if Rust changes
1102 /// to make the referent of `pub const FOO: &'static Encoding`
1103 /// unique cross-crate, so don't take the address of this
1104 /// `static`.
1105 pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1106
1107 /// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1108 ///
1109 /// For use only for taking the address of this form when
1110 /// Rust prohibits the use of the non-`_INIT` form directly,
1111 /// such as in initializers of other `static`s. If in doubt,
1112 /// use the corresponding non-`_INIT` reference-typed `static`.
1113 ///
1114 /// This part of the public API will go away if Rust changes
1115 /// to make the referent of `pub const FOO: &'static Encoding`
1116 /// unique cross-crate or if Rust starts allowing static arrays
1117 /// to be initialized with `pub static FOO: &'static Encoding`
1118 /// items.
1119 pub static ISO_8859_15_INIT: Encoding = Encoding {
1120 name: "ISO-8859-15",
1121 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1122 };
1123
1124 /// The ISO-8859-15 encoding.
1125 ///
1126 /// This is the revised Western European part of the ISO/IEC 8859 encoding
1127 /// family. This encoding is also known as Latin 9.
1128 ///
1129 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1130 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1131 ///
1132 /// This encoding matches the Windows code page 28605.
1133 ///
1134 /// This will change from `static` to `const` if Rust changes
1135 /// to make the referent of `pub const FOO: &'static Encoding`
1136 /// unique cross-crate, so don't take the address of this
1137 /// `static`.
1138 pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1139
1140 /// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1141 ///
1142 /// For use only for taking the address of this form when
1143 /// Rust prohibits the use of the non-`_INIT` form directly,
1144 /// such as in initializers of other `static`s. If in doubt,
1145 /// use the corresponding non-`_INIT` reference-typed `static`.
1146 ///
1147 /// This part of the public API will go away if Rust changes
1148 /// to make the referent of `pub const FOO: &'static Encoding`
1149 /// unique cross-crate or if Rust starts allowing static arrays
1150 /// to be initialized with `pub static FOO: &'static Encoding`
1151 /// items.
1152 pub static ISO_8859_16_INIT: Encoding = Encoding {
1153 name: "ISO-8859-16",
1154 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
1155 };
1156
1157 /// The ISO-8859-16 encoding.
1158 ///
1159 /// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1160 /// family. This encoding is also known as Latin 10.
1161 ///
1162 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1163 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1164 ///
1165 /// The Windows code page number for this encoding is 28606, but kernel32.dll
1166 /// does not support this encoding.
1167 ///
1168 /// This will change from `static` to `const` if Rust changes
1169 /// to make the referent of `pub const FOO: &'static Encoding`
1170 /// unique cross-crate, so don't take the address of this
1171 /// `static`.
1172 pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1173
1174 /// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1175 ///
1176 /// For use only for taking the address of this form when
1177 /// Rust prohibits the use of the non-`_INIT` form directly,
1178 /// such as in initializers of other `static`s. If in doubt,
1179 /// use the corresponding non-`_INIT` reference-typed `static`.
1180 ///
1181 /// This part of the public API will go away if Rust changes
1182 /// to make the referent of `pub const FOO: &'static Encoding`
1183 /// unique cross-crate or if Rust starts allowing static arrays
1184 /// to be initialized with `pub static FOO: &'static Encoding`
1185 /// items.
1186 pub static ISO_8859_2_INIT: Encoding = Encoding {
1187 name: "ISO-8859-2",
1188 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
1189 };
1190
1191 /// The ISO-8859-2 encoding.
1192 ///
1193 /// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1194 ///
1195 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1196 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1197 ///
1198 /// This encoding matches the Windows code page 28592.
1199 ///
1200 /// This will change from `static` to `const` if Rust changes
1201 /// to make the referent of `pub const FOO: &'static Encoding`
1202 /// unique cross-crate, so don't take the address of this
1203 /// `static`.
1204 pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1205
1206 /// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1207 ///
1208 /// For use only for taking the address of this form when
1209 /// Rust prohibits the use of the non-`_INIT` form directly,
1210 /// such as in initializers of other `static`s. If in doubt,
1211 /// use the corresponding non-`_INIT` reference-typed `static`.
1212 ///
1213 /// This part of the public API will go away if Rust changes
1214 /// to make the referent of `pub const FOO: &'static Encoding`
1215 /// unique cross-crate or if Rust starts allowing static arrays
1216 /// to be initialized with `pub static FOO: &'static Encoding`
1217 /// items.
1218 pub static ISO_8859_3_INIT: Encoding = Encoding {
1219 name: "ISO-8859-3",
1220 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
1221 };
1222
1223 /// The ISO-8859-3 encoding.
1224 ///
1225 /// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1226 ///
1227 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1228 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1229 ///
1230 /// This encoding matches the Windows code page 28593.
1231 ///
1232 /// This will change from `static` to `const` if Rust changes
1233 /// to make the referent of `pub const FOO: &'static Encoding`
1234 /// unique cross-crate, so don't take the address of this
1235 /// `static`.
1236 pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1237
1238 /// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1239 ///
1240 /// For use only for taking the address of this form when
1241 /// Rust prohibits the use of the non-`_INIT` form directly,
1242 /// such as in initializers of other `static`s. If in doubt,
1243 /// use the corresponding non-`_INIT` reference-typed `static`.
1244 ///
1245 /// This part of the public API will go away if Rust changes
1246 /// to make the referent of `pub const FOO: &'static Encoding`
1247 /// unique cross-crate or if Rust starts allowing static arrays
1248 /// to be initialized with `pub static FOO: &'static Encoding`
1249 /// items.
1250 pub static ISO_8859_4_INIT: Encoding = Encoding {
1251 name: "ISO-8859-4",
1252 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
1253 };
1254
1255 /// The ISO-8859-4 encoding.
1256 ///
1257 /// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1258 ///
1259 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1260 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1261 ///
1262 /// This encoding matches the Windows code page 28594.
1263 ///
1264 /// This will change from `static` to `const` if Rust changes
1265 /// to make the referent of `pub const FOO: &'static Encoding`
1266 /// unique cross-crate, so don't take the address of this
1267 /// `static`.
1268 pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1269
1270 /// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1271 ///
1272 /// For use only for taking the address of this form when
1273 /// Rust prohibits the use of the non-`_INIT` form directly,
1274 /// such as in initializers of other `static`s. If in doubt,
1275 /// use the corresponding non-`_INIT` reference-typed `static`.
1276 ///
1277 /// This part of the public API will go away if Rust changes
1278 /// to make the referent of `pub const FOO: &'static Encoding`
1279 /// unique cross-crate or if Rust starts allowing static arrays
1280 /// to be initialized with `pub static FOO: &'static Encoding`
1281 /// items.
1282 pub static ISO_8859_5_INIT: Encoding = Encoding {
1283 name: "ISO-8859-5",
1284 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
1285 };
1286
1287 /// The ISO-8859-5 encoding.
1288 ///
1289 /// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1290 ///
1291 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1292 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1293 ///
1294 /// This encoding matches the Windows code page 28595.
1295 ///
1296 /// This will change from `static` to `const` if Rust changes
1297 /// to make the referent of `pub const FOO: &'static Encoding`
1298 /// unique cross-crate, so don't take the address of this
1299 /// `static`.
1300 pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1301
1302 /// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1303 ///
1304 /// For use only for taking the address of this form when
1305 /// Rust prohibits the use of the non-`_INIT` form directly,
1306 /// such as in initializers of other `static`s. If in doubt,
1307 /// use the corresponding non-`_INIT` reference-typed `static`.
1308 ///
1309 /// This part of the public API will go away if Rust changes
1310 /// to make the referent of `pub const FOO: &'static Encoding`
1311 /// unique cross-crate or if Rust starts allowing static arrays
1312 /// to be initialized with `pub static FOO: &'static Encoding`
1313 /// items.
1314 pub static ISO_8859_6_INIT: Encoding = Encoding {
1315 name: "ISO-8859-6",
1316 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
1317 };
1318
1319 /// The ISO-8859-6 encoding.
1320 ///
1321 /// This is the Arabic part of the ISO/IEC 8859 encoding family.
1322 ///
1323 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1324 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1325 ///
1326 /// This encoding matches the Windows code page 28596, except Windows decodes
1327 /// unassigned code points to the Private Use Area of Unicode.
1328 ///
1329 /// This will change from `static` to `const` if Rust changes
1330 /// to make the referent of `pub const FOO: &'static Encoding`
1331 /// unique cross-crate, so don't take the address of this
1332 /// `static`.
1333 pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1334
1335 /// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1336 ///
1337 /// For use only for taking the address of this form when
1338 /// Rust prohibits the use of the non-`_INIT` form directly,
1339 /// such as in initializers of other `static`s. If in doubt,
1340 /// use the corresponding non-`_INIT` reference-typed `static`.
1341 ///
1342 /// This part of the public API will go away if Rust changes
1343 /// to make the referent of `pub const FOO: &'static Encoding`
1344 /// unique cross-crate or if Rust starts allowing static arrays
1345 /// to be initialized with `pub static FOO: &'static Encoding`
1346 /// items.
1347 pub static ISO_8859_7_INIT: Encoding = Encoding {
1348 name: "ISO-8859-7",
1349 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
1350 };
1351
1352 /// The ISO-8859-7 encoding.
1353 ///
1354 /// This is the Greek part of the ISO/IEC 8859 encoding family.
1355 ///
1356 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1357 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1358 ///
1359 /// This encoding roughly matches the Windows code page 28597. Windows decodes
1360 /// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1361 /// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1362 /// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1363 /// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1364 /// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1365 ///
1366 /// This will change from `static` to `const` if Rust changes
1367 /// to make the referent of `pub const FOO: &'static Encoding`
1368 /// unique cross-crate, so don't take the address of this
1369 /// `static`.
1370 pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1371
1372 /// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1373 ///
1374 /// For use only for taking the address of this form when
1375 /// Rust prohibits the use of the non-`_INIT` form directly,
1376 /// such as in initializers of other `static`s. If in doubt,
1377 /// use the corresponding non-`_INIT` reference-typed `static`.
1378 ///
1379 /// This part of the public API will go away if Rust changes
1380 /// to make the referent of `pub const FOO: &'static Encoding`
1381 /// unique cross-crate or if Rust starts allowing static arrays
1382 /// to be initialized with `pub static FOO: &'static Encoding`
1383 /// items.
1384 pub static ISO_8859_8_INIT: Encoding = Encoding {
1385 name: "ISO-8859-8",
1386 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1387 };
1388
1389 /// The ISO-8859-8 encoding.
1390 ///
1391 /// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1392 ///
1393 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1394 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1395 ///
1396 /// This encoding roughly matches the Windows code page 28598. Windows decodes
1397 /// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1398 /// Area instead of LRM and RLM. Windows decodes unassigned code points to
1399 /// the private use area.
1400 ///
1401 /// This will change from `static` to `const` if Rust changes
1402 /// to make the referent of `pub const FOO: &'static Encoding`
1403 /// unique cross-crate, so don't take the address of this
1404 /// `static`.
1405 pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1406
1407 /// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1408 ///
1409 /// For use only for taking the address of this form when
1410 /// Rust prohibits the use of the non-`_INIT` form directly,
1411 /// such as in initializers of other `static`s. If in doubt,
1412 /// use the corresponding non-`_INIT` reference-typed `static`.
1413 ///
1414 /// This part of the public API will go away if Rust changes
1415 /// to make the referent of `pub const FOO: &'static Encoding`
1416 /// unique cross-crate or if Rust starts allowing static arrays
1417 /// to be initialized with `pub static FOO: &'static Encoding`
1418 /// items.
1419 pub static ISO_8859_8_I_INIT: Encoding = Encoding {
1420 name: "ISO-8859-8-I",
1421 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1422 };
1423
1424 /// The ISO-8859-8-I encoding.
1425 ///
1426 /// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1427 ///
1428 /// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1429 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1430 ///
1431 /// This encoding roughly matches the Windows code page 38598. Windows decodes
1432 /// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1433 /// Area instead of LRM and RLM. Windows decodes unassigned code points to
1434 /// the private use area.
1435 ///
1436 /// This will change from `static` to `const` if Rust changes
1437 /// to make the referent of `pub const FOO: &'static Encoding`
1438 /// unique cross-crate, so don't take the address of this
1439 /// `static`.
1440 pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1441
1442 /// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1443 ///
1444 /// For use only for taking the address of this form when
1445 /// Rust prohibits the use of the non-`_INIT` form directly,
1446 /// such as in initializers of other `static`s. If in doubt,
1447 /// use the corresponding non-`_INIT` reference-typed `static`.
1448 ///
1449 /// This part of the public API will go away if Rust changes
1450 /// to make the referent of `pub const FOO: &'static Encoding`
1451 /// unique cross-crate or if Rust starts allowing static arrays
1452 /// to be initialized with `pub static FOO: &'static Encoding`
1453 /// items.
1454 pub static KOI8_R_INIT: Encoding = Encoding {
1455 name: "KOI8-R",
1456 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
1457 };
1458
1459 /// The KOI8-R encoding.
1460 ///
1461 /// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1462 ///
1463 /// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1464 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1465 ///
1466 /// This encoding matches the Windows code page 20866.
1467 ///
1468 /// This will change from `static` to `const` if Rust changes
1469 /// to make the referent of `pub const FOO: &'static Encoding`
1470 /// unique cross-crate, so don't take the address of this
1471 /// `static`.
1472 pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1473
1474 /// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1475 ///
1476 /// For use only for taking the address of this form when
1477 /// Rust prohibits the use of the non-`_INIT` form directly,
1478 /// such as in initializers of other `static`s. If in doubt,
1479 /// use the corresponding non-`_INIT` reference-typed `static`.
1480 ///
1481 /// This part of the public API will go away if Rust changes
1482 /// to make the referent of `pub const FOO: &'static Encoding`
1483 /// unique cross-crate or if Rust starts allowing static arrays
1484 /// to be initialized with `pub static FOO: &'static Encoding`
1485 /// items.
1486 pub static KOI8_U_INIT: Encoding = Encoding {
1487 name: "KOI8-U",
1488 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
1489 };
1490
1491 /// The KOI8-U encoding.
1492 ///
1493 /// This is an encoding for Ukrainian adapted from KOI8-R.
1494 ///
1495 /// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1496 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1497 ///
1498 /// This encoding matches the Windows code page 21866.
1499 ///
1500 /// This will change from `static` to `const` if Rust changes
1501 /// to make the referent of `pub const FOO: &'static Encoding`
1502 /// unique cross-crate, so don't take the address of this
1503 /// `static`.
1504 pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1505
1506 /// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1507 ///
1508 /// For use only for taking the address of this form when
1509 /// Rust prohibits the use of the non-`_INIT` form directly,
1510 /// such as in initializers of other `static`s. If in doubt,
1511 /// use the corresponding non-`_INIT` reference-typed `static`.
1512 ///
1513 /// This part of the public API will go away if Rust changes
1514 /// to make the referent of `pub const FOO: &'static Encoding`
1515 /// unique cross-crate or if Rust starts allowing static arrays
1516 /// to be initialized with `pub static FOO: &'static Encoding`
1517 /// items.
1518 pub static SHIFT_JIS_INIT: Encoding = Encoding {
1519 name: "Shift_JIS",
1520 variant: VariantEncoding::ShiftJis,
1521 };
1522
1523 /// The Shift_JIS encoding.
1524 ///
1525 /// This is the Japanese encoding for Windows.
1526 ///
1527 /// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1528 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1529 ///
1530 /// This encoding matches the Windows code page 932, except Windows decodes some byte
1531 /// sequences that are error per the Encoding Standard to the question mark or the
1532 /// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1533 ///
1534 /// This will change from `static` to `const` if Rust changes
1535 /// to make the referent of `pub const FOO: &'static Encoding`
1536 /// unique cross-crate, so don't take the address of this
1537 /// `static`.
1538 pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1539
1540 /// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1541 ///
1542 /// For use only for taking the address of this form when
1543 /// Rust prohibits the use of the non-`_INIT` form directly,
1544 /// such as in initializers of other `static`s. If in doubt,
1545 /// use the corresponding non-`_INIT` reference-typed `static`.
1546 ///
1547 /// This part of the public API will go away if Rust changes
1548 /// to make the referent of `pub const FOO: &'static Encoding`
1549 /// unique cross-crate or if Rust starts allowing static arrays
1550 /// to be initialized with `pub static FOO: &'static Encoding`
1551 /// items.
1552 pub static UTF_16BE_INIT: Encoding = Encoding {
1553 name: "UTF-16BE",
1554 variant: VariantEncoding::Utf16Be,
1555 };
1556
1557 /// The UTF-16BE encoding.
1558 ///
1559 /// This decode-only encoding uses 16-bit code units due to Unicode originally
1560 /// having been designed as a 16-bit reportoire. In the absence of a byte order
1561 /// mark the big endian byte order is assumed.
1562 ///
1563 /// There is no corresponding encoder in this crate or in the Encoding
1564 /// Standard. The output encoding of this encoding is UTF-8.
1565 ///
1566 /// This encoding matches the Windows code page 1201.
1567 ///
1568 /// This will change from `static` to `const` if Rust changes
1569 /// to make the referent of `pub const FOO: &'static Encoding`
1570 /// unique cross-crate, so don't take the address of this
1571 /// `static`.
1572 pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1573
1574 /// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1575 ///
1576 /// For use only for taking the address of this form when
1577 /// Rust prohibits the use of the non-`_INIT` form directly,
1578 /// such as in initializers of other `static`s. If in doubt,
1579 /// use the corresponding non-`_INIT` reference-typed `static`.
1580 ///
1581 /// This part of the public API will go away if Rust changes
1582 /// to make the referent of `pub const FOO: &'static Encoding`
1583 /// unique cross-crate or if Rust starts allowing static arrays
1584 /// to be initialized with `pub static FOO: &'static Encoding`
1585 /// items.
1586 pub static UTF_16LE_INIT: Encoding = Encoding {
1587 name: "UTF-16LE",
1588 variant: VariantEncoding::Utf16Le,
1589 };
1590
1591 /// The UTF-16LE encoding.
1592 ///
1593 /// This decode-only encoding uses 16-bit code units due to Unicode originally
1594 /// having been designed as a 16-bit reportoire. In the absence of a byte order
1595 /// mark the little endian byte order is assumed.
1596 ///
1597 /// There is no corresponding encoder in this crate or in the Encoding
1598 /// Standard. The output encoding of this encoding is UTF-8.
1599 ///
1600 /// This encoding matches the Windows code page 1200.
1601 ///
1602 /// This will change from `static` to `const` if Rust changes
1603 /// to make the referent of `pub const FOO: &'static Encoding`
1604 /// unique cross-crate, so don't take the address of this
1605 /// `static`.
1606 pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1607
1608 /// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1609 ///
1610 /// For use only for taking the address of this form when
1611 /// Rust prohibits the use of the non-`_INIT` form directly,
1612 /// such as in initializers of other `static`s. If in doubt,
1613 /// use the corresponding non-`_INIT` reference-typed `static`.
1614 ///
1615 /// This part of the public API will go away if Rust changes
1616 /// to make the referent of `pub const FOO: &'static Encoding`
1617 /// unique cross-crate or if Rust starts allowing static arrays
1618 /// to be initialized with `pub static FOO: &'static Encoding`
1619 /// items.
1620 pub static UTF_8_INIT: Encoding = Encoding {
1621 name: "UTF-8",
1622 variant: VariantEncoding::Utf8,
1623 };
1624
1625 /// The UTF-8 encoding.
1626 ///
1627 /// This is the encoding that should be used for all new development it can
1628 /// represent all of Unicode.
1629 ///
1630 /// This encoding matches the Windows code page 65001, except Windows differs
1631 /// in the number of errors generated for some erroneous byte sequences.
1632 ///
1633 /// This will change from `static` to `const` if Rust changes
1634 /// to make the referent of `pub const FOO: &'static Encoding`
1635 /// unique cross-crate, so don't take the address of this
1636 /// `static`.
1637 pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1638
1639 /// The initializer for the [gb18030](static.GB18030.html) encoding.
1640 ///
1641 /// For use only for taking the address of this form when
1642 /// Rust prohibits the use of the non-`_INIT` form directly,
1643 /// such as in initializers of other `static`s. If in doubt,
1644 /// use the corresponding non-`_INIT` reference-typed `static`.
1645 ///
1646 /// This part of the public API will go away if Rust changes
1647 /// to make the referent of `pub const FOO: &'static Encoding`
1648 /// unique cross-crate or if Rust starts allowing static arrays
1649 /// to be initialized with `pub static FOO: &'static Encoding`
1650 /// items.
1651 pub static GB18030_INIT: Encoding = Encoding {
1652 name: "gb18030",
1653 variant: VariantEncoding::Gb18030,
1654 };
1655
1656 /// The gb18030 encoding.
1657 ///
1658 /// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
1659 /// maps to U+3000 for compatibility with existing Web content. As a result,
1660 /// this encoding can represent all of Unicode except for the private-use
1661 /// character U+E5E5.
1662 ///
1663 /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1664 /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1665 ///
1666 /// This encoding matches the Windows code page 54936.
1667 ///
1668 /// This will change from `static` to `const` if Rust changes
1669 /// to make the referent of `pub const FOO: &'static Encoding`
1670 /// unique cross-crate, so don't take the address of this
1671 /// `static`.
1672 pub static GB18030: &'static Encoding = &GB18030_INIT;
1673
1674 /// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1675 ///
1676 /// For use only for taking the address of this form when
1677 /// Rust prohibits the use of the non-`_INIT` form directly,
1678 /// such as in initializers of other `static`s. If in doubt,
1679 /// use the corresponding non-`_INIT` reference-typed `static`.
1680 ///
1681 /// This part of the public API will go away if Rust changes
1682 /// to make the referent of `pub const FOO: &'static Encoding`
1683 /// unique cross-crate or if Rust starts allowing static arrays
1684 /// to be initialized with `pub static FOO: &'static Encoding`
1685 /// items.
1686 pub static MACINTOSH_INIT: Encoding = Encoding {
1687 name: "macintosh",
1688 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
1689 };
1690
1691 /// The macintosh encoding.
1692 ///
1693 /// This is the MacRoman encoding from Mac OS Classic.
1694 ///
1695 /// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1696 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1697 ///
1698 /// This encoding matches the Windows code page 10000, except Windows decodes
1699 /// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1700 ///
1701 /// This will change from `static` to `const` if Rust changes
1702 /// to make the referent of `pub const FOO: &'static Encoding`
1703 /// unique cross-crate, so don't take the address of this
1704 /// `static`.
1705 pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1706
1707 /// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1708 ///
1709 /// For use only for taking the address of this form when
1710 /// Rust prohibits the use of the non-`_INIT` form directly,
1711 /// such as in initializers of other `static`s. If in doubt,
1712 /// use the corresponding non-`_INIT` reference-typed `static`.
1713 ///
1714 /// This part of the public API will go away if Rust changes
1715 /// to make the referent of `pub const FOO: &'static Encoding`
1716 /// unique cross-crate or if Rust starts allowing static arrays
1717 /// to be initialized with `pub static FOO: &'static Encoding`
1718 /// items.
1719 pub static REPLACEMENT_INIT: Encoding = Encoding {
1720 name: "replacement",
1721 variant: VariantEncoding::Replacement,
1722 };
1723
1724 /// The replacement encoding.
1725 ///
1726 /// This decode-only encoding decodes all non-zero-length streams to a single
1727 /// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1728 /// ASCII-compatible fallback encoding (typically windows-1252) for some
1729 /// encodings that are no longer supported by the Web Platform and that
1730 /// would be dangerous to treat as ASCII-compatible.
1731 ///
1732 /// There is no corresponding encoder. The output encoding of this encoding
1733 /// is UTF-8.
1734 ///
1735 /// This encoding does not have a Windows code page number.
1736 ///
1737 /// This will change from `static` to `const` if Rust changes
1738 /// to make the referent of `pub const FOO: &'static Encoding`
1739 /// unique cross-crate, so don't take the address of this
1740 /// `static`.
1741 pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1742
1743 /// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1744 ///
1745 /// For use only for taking the address of this form when
1746 /// Rust prohibits the use of the non-`_INIT` form directly,
1747 /// such as in initializers of other `static`s. If in doubt,
1748 /// use the corresponding non-`_INIT` reference-typed `static`.
1749 ///
1750 /// This part of the public API will go away if Rust changes
1751 /// to make the referent of `pub const FOO: &'static Encoding`
1752 /// unique cross-crate or if Rust starts allowing static arrays
1753 /// to be initialized with `pub static FOO: &'static Encoding`
1754 /// items.
1755 pub static WINDOWS_1250_INIT: Encoding = Encoding {
1756 name: "windows-1250",
1757 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
1758 };
1759
1760 /// The windows-1250 encoding.
1761 ///
1762 /// This is the Central European encoding for Windows.
1763 ///
1764 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1765 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1766 ///
1767 /// This encoding matches the Windows code page 1250.
1768 ///
1769 /// This will change from `static` to `const` if Rust changes
1770 /// to make the referent of `pub const FOO: &'static Encoding`
1771 /// unique cross-crate, so don't take the address of this
1772 /// `static`.
1773 pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1774
1775 /// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1776 ///
1777 /// For use only for taking the address of this form when
1778 /// Rust prohibits the use of the non-`_INIT` form directly,
1779 /// such as in initializers of other `static`s. If in doubt,
1780 /// use the corresponding non-`_INIT` reference-typed `static`.
1781 ///
1782 /// This part of the public API will go away if Rust changes
1783 /// to make the referent of `pub const FOO: &'static Encoding`
1784 /// unique cross-crate or if Rust starts allowing static arrays
1785 /// to be initialized with `pub static FOO: &'static Encoding`
1786 /// items.
1787 pub static WINDOWS_1251_INIT: Encoding = Encoding {
1788 name: "windows-1251",
1789 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
1790 };
1791
1792 /// The windows-1251 encoding.
1793 ///
1794 /// This is the Cyrillic encoding for Windows.
1795 ///
1796 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1797 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1798 ///
1799 /// This encoding matches the Windows code page 1251.
1800 ///
1801 /// This will change from `static` to `const` if Rust changes
1802 /// to make the referent of `pub const FOO: &'static Encoding`
1803 /// unique cross-crate, so don't take the address of this
1804 /// `static`.
1805 pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1806
1807 /// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
1808 ///
1809 /// For use only for taking the address of this form when
1810 /// Rust prohibits the use of the non-`_INIT` form directly,
1811 /// such as in initializers of other `static`s. If in doubt,
1812 /// use the corresponding non-`_INIT` reference-typed `static`.
1813 ///
1814 /// This part of the public API will go away if Rust changes
1815 /// to make the referent of `pub const FOO: &'static Encoding`
1816 /// unique cross-crate or if Rust starts allowing static arrays
1817 /// to be initialized with `pub static FOO: &'static Encoding`
1818 /// items.
1819 pub static WINDOWS_1252_INIT: Encoding = Encoding {
1820 name: "windows-1252",
1821 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
1822 };
1823
1824 /// The windows-1252 encoding.
1825 ///
1826 /// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1827 /// which is known as Latin 1.
1828 ///
1829 /// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1830 /// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1831 ///
1832 /// This encoding matches the Windows code page 1252.
1833 ///
1834 /// This will change from `static` to `const` if Rust changes
1835 /// to make the referent of `pub const FOO: &'static Encoding`
1836 /// unique cross-crate, so don't take the address of this
1837 /// `static`.
1838 pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1839
/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1253_INIT: Encoding = Encoding {
    name: "windows-1253",
    // NOTE(review): the three values after the table presumably describe a run of
    // consecutive code points in it; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
};
1856
/// The windows-1253 encoding.
///
/// This is the Greek encoding for Windows. It is mostly an extension of
/// ISO-8859-7, but U+0386 is mapped to a different byte.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
///
/// This encoding matches the Windows code page 1253, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1873
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    name: "windows-1254",
    // NOTE(review): the three values after the table presumably describe a run of
    // consecutive code points in it; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1890
/// The windows-1254 encoding.
///
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.
///
/// This `static` is a reference to
/// [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1906
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    name: "windows-1255",
    // NOTE(review): the three values after the table presumably describe a run of
    // consecutive code points in it; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1923
/// The windows-1255 encoding.
///
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1940
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    name: "windows-1256",
    // NOTE(review): the three values after the table presumably describe a run of
    // consecutive code points in it; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1957
/// The windows-1256 encoding.
///
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.
///
/// This `static` is a reference to
/// [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1972
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    name: "windows-1257",
    // NOTE(review): the three values after the table presumably describe a run of
    // consecutive code points in it; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
1989
/// The windows-1257 encoding.
///
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
2005
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    name: "windows-1258",
    // NOTE(review): the three values after the table presumably describe a run of
    // consecutive code points in it; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2022
/// The windows-1258 encoding.
///
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in the Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.
///
/// This `static` is a reference to
/// [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2042
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    name: "windows-874",
    // NOTE(review): the three values after the table presumably describe a run of
    // consecutive code points in it; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2059
/// The windows-874 encoding.
///
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2075
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    name: "x-mac-cyrillic",
    // NOTE(review): the three values after the table presumably describe a run of
    // consecutive code points in it; confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2092
/// The x-mac-cyrillic encoding.
///
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.
///
/// This `static` is a reference to
/// [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2107
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    name: "x-user-defined",
    // Unlike the `SingleByte` initializers nearby, this variant carries no
    // table reference or other data.
    variant: VariantEncoding::UserDefined,
};
2124
/// The x-user-defined encoding.
///
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
///
/// This `static` is a reference to
/// [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2139
/// The labels that `Encoding::for_label()` recognizes, in ASCII lower case.
///
/// The order is the one that the binary search in `for_label()` relies on:
/// shorter labels come first and labels of equal length are ordered by
/// comparing their bytes from the last byte towards the first. The encoding
/// for the label at index `i` is `ENCODINGS_IN_LABEL_SORT[i]`, so the two
/// arrays have to be kept in sync.
static LABELS_SORTED: [&'static str; 219] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2361
/// The encoding for each entry of `LABELS_SORTED`, at the same index.
///
/// `Encoding::for_label()` indexes this array with the position that its
/// binary search found in `LABELS_SORTED`, so the two arrays have to be kept
/// in sync. The entries take the address of the `_INIT` forms because the
/// reference-typed `static`s cannot be used to initialize items of an array.
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2583
2584 // END GENERATED CODE
2585
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    /// The name of the encoding; this is what `name()` returns.
    name: &'static str,
    /// The `VariantEncoding` value given in the encoding's `_INIT` initializer
    /// (e.g. `SingleByte(..)` with its table, or `UserDefined`).
    variant: VariantEncoding,
}
2652
2653 impl Encoding {
2654 /// Implements the
2655 /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2656 /// algorithm.
2657 ///
2658 /// If, after ASCII-lowercasing and removing leading and trailing
2659 /// whitespace, the argument matches a label defined in the Encoding
2660 /// Standard, `Some(&'static Encoding)` representing the corresponding
2661 /// encoding is returned. If there is no match, `None` is returned.
2662 ///
2663 /// This is the right method to use if the action upon the method returning
2664 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2665 /// When the action upon the method returning `None` is not to proceed with
2666 /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2667 /// appropriate.
2668 ///
2669 /// The argument is of type `&[u8]` instead of `&str` to save callers
2670 /// that are extracting the label from a non-UTF-8 protocol the trouble
2671 /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2672 /// on it.)
2673 ///
2674 /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2675 pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2676 let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2677 let mut trimmed_pos = 0usize;
2678 let mut iter = label.into_iter();
2679 // before
2680 loop {
2681 match iter.next() {
2682 None => {
2683 return None;
2684 }
2685 Some(byte) => {
2686 // The characters used in labels are:
2687 // a-z (except q, but excluding it below seems excessive)
2688 // 0-9
2689 // . _ - :
2690 match *byte {
2691 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2692 continue;
2693 }
2694 b'A'..=b'Z' => {
2695 trimmed[trimmed_pos] = *byte + 0x20u8;
2696 trimmed_pos = 1usize;
2697 break;
2698 }
2699 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2700 trimmed[trimmed_pos] = *byte;
2701 trimmed_pos = 1usize;
2702 break;
2703 }
2704 _ => {
2705 return None;
2706 }
2707 }
2708 }
2709 }
2710 }
2711 // inside
2712 loop {
2713 match iter.next() {
2714 None => {
2715 break;
2716 }
2717 Some(byte) => {
2718 match *byte {
2719 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2720 break;
2721 }
2722 b'A'..=b'Z' => {
2723 if trimmed_pos == LONGEST_LABEL_LENGTH {
2724 // There's no encoding with a label this long
2725 return None;
2726 }
2727 trimmed[trimmed_pos] = *byte + 0x20u8;
2728 trimmed_pos += 1usize;
2729 continue;
2730 }
2731 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2732 if trimmed_pos == LONGEST_LABEL_LENGTH {
2733 // There's no encoding with a label this long
2734 return None;
2735 }
2736 trimmed[trimmed_pos] = *byte;
2737 trimmed_pos += 1usize;
2738 continue;
2739 }
2740 _ => {
2741 return None;
2742 }
2743 }
2744 }
2745 }
2746 }
2747 // after
2748 loop {
2749 match iter.next() {
2750 None => {
2751 break;
2752 }
2753 Some(byte) => {
2754 match *byte {
2755 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2756 continue;
2757 }
2758 _ => {
2759 // There's no label with space in the middle
2760 return None;
2761 }
2762 }
2763 }
2764 }
2765 }
2766 let candidate = &trimmed[..trimmed_pos];
2767 match LABELS_SORTED.binary_search_by(|probe| {
2768 let bytes = probe.as_bytes();
2769 let c = bytes.len().cmp(&candidate.len());
2770 if c != Ordering::Equal {
2771 return c;
2772 }
2773 let probe_iter = bytes.iter().rev();
2774 let candidate_iter = candidate.iter().rev();
2775 probe_iter.cmp(candidate_iter)
2776 }) {
2777 Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2778 Err(_) => None,
2779 }
2780 }
2781
2782 /// This method behaves the same as `for_label()`, except when `for_label()`
2783 /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2784 ///
2785 /// This method is useful in scenarios where a fatal error is required
2786 /// upon invalid label, because in those cases the caller typically wishes
2787 /// to treat the labels that map to the replacement encoding as fatal
2788 /// errors, too.
2789 ///
2790 /// It is not OK to use this method when the action upon the method returning
2791 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2792 /// case, the `for_label()` method should be used instead in order to avoid
2793 /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2794 ///
2795 /// Available via the C wrapper.
2796 #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2797 pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2798 match Encoding::for_label(label) {
2799 None => None,
2800 Some(encoding) => {
2801 if encoding == REPLACEMENT {
2802 None
2803 } else {
2804 Some(encoding)
2805 }
2806 }
2807 }
2808 }
2809
2810 /// Performs non-incremental BOM sniffing.
2811 ///
2812 /// The argument must either be a buffer representing the entire input
2813 /// stream (non-streaming case) or a buffer representing at least the first
2814 /// three bytes of the input stream (streaming case).
2815 ///
2816 /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2817 /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2818 /// or UTF-16BE BOM or `None` otherwise.
2819 ///
2820 /// Available via the C wrapper.
2821 #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2822 pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2823 if buffer.starts_with(b"\xEF\xBB\xBF") {
2824 Some((UTF_8, 3))
2825 } else if buffer.starts_with(b"\xFF\xFE") {
2826 Some((UTF_16LE, 2))
2827 } else if buffer.starts_with(b"\xFE\xFF") {
2828 Some((UTF_16BE, 2))
2829 } else {
2830 None
2831 }
2832 }
2833
/// Returns the name of this encoding.
///
/// This name is appropriate to return as-is from the DOM
/// `document.characterSet` property.
///
/// Available via the C wrapper.
#[inline]
pub fn name(&'static self) -> &'static str {
    // Plain read of the `name` field stored on the `Encoding`.
    self.name
}
2844
2845 /// Checks whether the _output encoding_ of this encoding can encode every
2846 /// `char`. (Only true if the output encoding is UTF-8.)
2847 ///
2848 /// Available via the C wrapper.
2849 #[inline]
can_encode_everything(&'static self) -> bool2850 pub fn can_encode_everything(&'static self) -> bool {
2851 self.output_encoding() == UTF_8
2852 }
2853
2854 /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2855 /// U+0000...U+007F and vice versa.
2856 ///
2857 /// Available via the C wrapper.
2858 #[inline]
is_ascii_compatible(&'static self) -> bool2859 pub fn is_ascii_compatible(&'static self) -> bool {
2860 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2861 }
2862
/// Checks whether this encoding maps one byte to one Basic Multilingual
/// Plane code point (i.e. byte length equals decoded UTF-16 length) and
/// vice versa (for mappable characters).
///
/// `true` iff this encoding is on the list of [Legacy single-byte
/// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
/// in the spec or x-user-defined.
///
/// Available via the C wrapper.
#[inline]
pub fn is_single_byte(&'static self) -> bool {
    // The answer is provided by this encoding's variant.
    self.variant.is_single_byte()
}
2876
2877 /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2878 /// U+0000...U+007F and vice versa.
2879 #[inline]
is_potentially_borrowable(&'static self) -> bool2880 fn is_potentially_borrowable(&'static self) -> bool {
2881 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2882 }
2883
2884 /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2885 /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2886 ///
2887 /// Available via the C wrapper.
2888 #[inline]
output_encoding(&'static self) -> &'static Encoding2889 pub fn output_encoding(&'static self) -> &'static Encoding {
2890 if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2891 UTF_8
2892 } else {
2893 self
2894 }
2895 }
2896
2897 /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2898 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2899 /// entire input is available as a single buffer (i.e. the end of the
2900 /// buffer marks the end of the stream).
2901 ///
2902 /// This method implements the (non-streaming version of) the
2903 /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2904 ///
2905 /// The second item in the returned tuple is the encoding that was actually
2906 /// used (which may differ from this encoding thanks to BOM sniffing).
2907 ///
2908 /// The third item in the returned tuple indicates whether there were
2909 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2910 ///
2911 /// _Note:_ It is wrong to use this when the input buffer represents only
2912 /// a segment of the input instead of the whole input. Use `new_decoder()`
2913 /// when decoding segmented input.
2914 ///
2915 /// This method performs a one or two heap allocations for the backing
2916 /// buffer of the `String` when unable to borrow. (One allocation if not
2917 /// errors and potentially another one in the presence of errors.) The
2918 /// first allocation assumes jemalloc and may not be optimal with
2919 /// allocators that do not use power-of-two buckets. A borrow is performed
2920 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2921 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2922 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2923 /// transitions.
2924 ///
2925 /// # Panics
2926 ///
2927 /// If the size calculation for a heap-allocated backing buffer overflows
2928 /// `usize`.
2929 ///
2930 /// Available to Rust only.
2931 #[inline]
decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool)2932 pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2933 let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2934 Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2935 None => (self, bytes),
2936 };
2937 let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2938 (cow, encoding, had_errors)
2939 }
2940
2941 /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2942 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2943 /// entire input is available as a single buffer (i.e. the end of the
2944 /// buffer marks the end of the stream).
2945 ///
2946 /// When invoked on `UTF_8`, this method implements the (non-streaming
2947 /// version of) the
2948 /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2949 /// concept.
2950 ///
2951 /// The second item in the returned pair indicates whether there were
2952 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2953 ///
2954 /// _Note:_ It is wrong to use this when the input buffer represents only
2955 /// a segment of the input instead of the whole input. Use
2956 /// `new_decoder_with_bom_removal()` when decoding segmented input.
2957 ///
2958 /// This method performs a one or two heap allocations for the backing
2959 /// buffer of the `String` when unable to borrow. (One allocation if not
2960 /// errors and potentially another one in the presence of errors.) The
2961 /// first allocation assumes jemalloc and may not be optimal with
2962 /// allocators that do not use power-of-two buckets. A borrow is performed
2963 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2964 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2965 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2966 /// transitions.
2967 ///
2968 /// # Panics
2969 ///
2970 /// If the size calculation for a heap-allocated backing buffer overflows
2971 /// `usize`.
2972 ///
2973 /// Available to Rust only.
2974 #[inline]
decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)2975 pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2976 let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2977 &bytes[3..]
2978 } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2979 || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2980 {
2981 &bytes[2..]
2982 } else {
2983 bytes
2984 };
2985 self.decode_without_bom_handling(without_bom)
2986 }
2987
2988 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
2989 /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
2990 /// the entire input is available as a single buffer (i.e. the end of the
2991 /// buffer marks the end of the stream).
2992 ///
2993 /// When invoked on `UTF_8`, this method implements the (non-streaming
2994 /// version of) the
2995 /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
2996 /// spec concept.
2997 ///
2998 /// The second item in the returned pair indicates whether there were
2999 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3000 ///
3001 /// _Note:_ It is wrong to use this when the input buffer represents only
3002 /// a segment of the input instead of the whole input. Use
3003 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3004 ///
3005 /// This method performs a one or two heap allocations for the backing
3006 /// buffer of the `String` when unable to borrow. (One allocation if not
3007 /// errors and potentially another one in the presence of errors.) The
3008 /// first allocation assumes jemalloc and may not be optimal with
3009 /// allocators that do not use power-of-two buckets. A borrow is performed
3010 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3011 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3012 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3013 /// transitions.
3014 ///
3015 /// # Panics
3016 ///
3017 /// If the size calculation for a heap-allocated backing buffer overflows
3018 /// `usize`.
3019 ///
3020 /// Available to Rust only.
decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)3021 pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3022 let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3023 let valid_up_to = if self == UTF_8 {
3024 utf8_valid_up_to(bytes)
3025 } else if self == ISO_2022_JP {
3026 iso_2022_jp_ascii_valid_up_to(bytes)
3027 } else {
3028 ascii_valid_up_to(bytes)
3029 };
3030 if valid_up_to == bytes.len() {
3031 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3032 return (Cow::Borrowed(str), false);
3033 }
3034 let decoder = self.new_decoder_without_bom_handling();
3035
3036 let rounded_without_replacement = checked_next_power_of_two(checked_add(
3037 valid_up_to,
3038 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3039 ));
3040 let with_replacement = checked_add(
3041 valid_up_to,
3042 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3043 );
3044 let mut string = String::with_capacity(
3045 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3046 );
3047 unsafe {
3048 let vec = string.as_mut_vec();
3049 vec.set_len(valid_up_to);
3050 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3051 }
3052 (decoder, string, valid_up_to)
3053 } else {
3054 let decoder = self.new_decoder_without_bom_handling();
3055 let rounded_without_replacement = checked_next_power_of_two(
3056 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3057 );
3058 let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3059 let string = String::with_capacity(
3060 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3061 );
3062 (decoder, string, 0)
3063 };
3064
3065 let mut total_had_errors = false;
3066 loop {
3067 let (result, read, had_errors) =
3068 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3069 total_read += read;
3070 total_had_errors |= had_errors;
3071 match result {
3072 CoderResult::InputEmpty => {
3073 debug_assert_eq!(total_read, bytes.len());
3074 return (Cow::Owned(string), total_had_errors);
3075 }
3076 CoderResult::OutputFull => {
3077 // Allocate for the worst case. That is, we should come
3078 // here at most once per invocation of this method.
3079 let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3080 string.reserve(needed.unwrap());
3081 }
3082 }
3083 }
3084 }
3085
3086 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3087 /// _with malformed sequences treated as fatal_ when the entire input is
3088 /// available as a single buffer (i.e. the end of the buffer marks the end
3089 /// of the stream).
3090 ///
3091 /// When invoked on `UTF_8`, this method implements the (non-streaming
3092 /// version of) the
3093 /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3094 /// spec concept.
3095 ///
3096 /// Returns `None` if a malformed sequence was encountered and the result
3097 /// of the decode as `Some(String)` otherwise.
3098 ///
3099 /// _Note:_ It is wrong to use this when the input buffer represents only
3100 /// a segment of the input instead of the whole input. Use
3101 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3102 ///
3103 /// This method performs a single heap allocation for the backing
3104 /// buffer of the `String` when unable to borrow. A borrow is performed if
3105 /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3106 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3107 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3108 /// transitions.
3109 ///
3110 /// # Panics
3111 ///
3112 /// If the size calculation for a heap-allocated backing buffer overflows
3113 /// `usize`.
3114 ///
3115 /// Available to Rust only.
decode_without_bom_handling_and_without_replacement<'a>( &'static self, bytes: &'a [u8], ) -> Option<Cow<'a, str>>3116 pub fn decode_without_bom_handling_and_without_replacement<'a>(
3117 &'static self,
3118 bytes: &'a [u8],
3119 ) -> Option<Cow<'a, str>> {
3120 if self == UTF_8 {
3121 let valid_up_to = utf8_valid_up_to(bytes);
3122 if valid_up_to == bytes.len() {
3123 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3124 return Some(Cow::Borrowed(str));
3125 }
3126 return None;
3127 }
3128 let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3129 let valid_up_to = if self == ISO_2022_JP {
3130 iso_2022_jp_ascii_valid_up_to(bytes)
3131 } else {
3132 ascii_valid_up_to(bytes)
3133 };
3134 if valid_up_to == bytes.len() {
3135 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3136 return Some(Cow::Borrowed(str));
3137 }
3138 let decoder = self.new_decoder_without_bom_handling();
3139 let mut string = String::with_capacity(
3140 checked_add(
3141 valid_up_to,
3142 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3143 )
3144 .unwrap(),
3145 );
3146 unsafe {
3147 let vec = string.as_mut_vec();
3148 vec.set_len(valid_up_to);
3149 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3150 }
3151 (decoder, string, &bytes[valid_up_to..])
3152 } else {
3153 let decoder = self.new_decoder_without_bom_handling();
3154 let string = String::with_capacity(
3155 decoder
3156 .max_utf8_buffer_length_without_replacement(bytes.len())
3157 .unwrap(),
3158 );
3159 (decoder, string, bytes)
3160 };
3161 let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3162 match result {
3163 DecoderResult::InputEmpty => {
3164 debug_assert_eq!(read, input.len());
3165 Some(Cow::Owned(string))
3166 }
3167 DecoderResult::Malformed(_, _) => None,
3168 DecoderResult::OutputFull => unreachable!(),
3169 }
3170 }
3171
3172 /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3173 /// replaced with decimal numeric character references when the entire input
3174 /// is available as a single buffer (i.e. the end of the buffer marks the
3175 /// end of the stream).
3176 ///
3177 /// This method implements the (non-streaming version of) the
3178 /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3179 /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3180 /// spec concept, it is slightly more efficient to use
3181 /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3182 /// method on `UTF_8`.
3183 ///
3184 /// The second item in the returned tuple is the encoding that was actually
3185 /// used (which may differ from this encoding thanks to some encodings
3186 /// having UTF-8 as their output encoding).
3187 ///
3188 /// The third item in the returned tuple indicates whether there were
3189 /// unmappable characters (that were replaced with HTML numeric character
3190 /// references).
3191 ///
3192 /// _Note:_ It is wrong to use this when the input buffer represents only
3193 /// a segment of the input instead of the whole input. Use `new_encoder()`
3194 /// when encoding segmented output.
3195 ///
3196 /// When encoding to UTF-8 or when encoding an ASCII-only input to a
3197 /// ASCII-compatible encoding, this method returns a borrow of the input
3198 /// without a heap allocation. Otherwise, this method performs a single
3199 /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3200 /// unmappable characters and potentially multiple heap allocations if
3201 /// there are. These allocations are tuned for jemalloc and may not be
3202 /// optimal when using a different allocator that doesn't use power-of-two
3203 /// buckets.
3204 ///
3205 /// # Panics
3206 ///
3207 /// If the size calculation for a heap-allocated backing buffer overflows
3208 /// `usize`.
3209 ///
3210 /// Available to Rust only.
encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool)3211 pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3212 let output_encoding = self.output_encoding();
3213 if output_encoding == UTF_8 {
3214 return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3215 }
3216 debug_assert!(output_encoding.is_potentially_borrowable());
3217 let bytes = string.as_bytes();
3218 let valid_up_to = if output_encoding == ISO_2022_JP {
3219 iso_2022_jp_ascii_valid_up_to(bytes)
3220 } else {
3221 ascii_valid_up_to(bytes)
3222 };
3223 if valid_up_to == bytes.len() {
3224 return (Cow::Borrowed(bytes), output_encoding, false);
3225 }
3226 let mut encoder = output_encoding.new_encoder();
3227 let mut vec: Vec<u8> = Vec::with_capacity(
3228 (checked_add(
3229 valid_up_to,
3230 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3231 ))
3232 .unwrap()
3233 .next_power_of_two(),
3234 );
3235 unsafe {
3236 vec.set_len(valid_up_to);
3237 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3238 }
3239 let mut total_read = valid_up_to;
3240 let mut total_had_errors = false;
3241 loop {
3242 let (result, read, had_errors) =
3243 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3244 total_read += read;
3245 total_had_errors |= had_errors;
3246 match result {
3247 CoderResult::InputEmpty => {
3248 debug_assert_eq!(total_read, string.len());
3249 return (Cow::Owned(vec), output_encoding, total_had_errors);
3250 }
3251 CoderResult::OutputFull => {
3252 // reserve_exact wants to know how much more on top of current
3253 // length--not current capacity.
3254 let needed = encoder
3255 .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3256 let rounded = (checked_add(vec.capacity(), needed))
3257 .unwrap()
3258 .next_power_of_two();
3259 let additional = rounded - vec.len();
3260 vec.reserve_exact(additional);
3261 }
3262 }
3263 }
3264 }
3265
/// Creates the variant-specific decoder for this encoding by delegating to
/// this encoding's `variant`.
fn new_variant_decoder(&'static self) -> VariantDecoder {
    self.variant.new_variant_decoder()
}
3269
3270 /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3271 ///
3272 /// BOM sniffing may cause the returned decoder to morph into a decoder
3273 /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3274 ///
3275 /// Available via the C wrapper.
3276 #[inline]
new_decoder(&'static self) -> Decoder3277 pub fn new_decoder(&'static self) -> Decoder {
3278 Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3279 }
3280
3281 /// Instantiates a new decoder for this encoding with BOM removal.
3282 ///
3283 /// If the input starts with bytes that are the BOM for this encoding,
3284 /// those bytes are removed. However, the decoder never morphs into a
3285 /// decoder for another encoding: A BOM for another encoding is treated as
3286 /// (potentially malformed) input to the decoding algorithm for this
3287 /// encoding.
3288 ///
3289 /// Available via the C wrapper.
3290 #[inline]
new_decoder_with_bom_removal(&'static self) -> Decoder3291 pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3292 Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3293 }
3294
3295 /// Instantiates a new decoder for this encoding with BOM handling disabled.
3296 ///
3297 /// If the input starts with bytes that look like a BOM, those bytes are
3298 /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3299 /// for another encoding.)
3300 ///
3301 /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3302 /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3303 /// instead of this method to cause the BOM to be removed.
3304 ///
3305 /// Available via the C wrapper.
3306 #[inline]
new_decoder_without_bom_handling(&'static self) -> Decoder3307 pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3308 Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3309 }
3310
3311 /// Instantiates a new encoder for the output encoding of this encoding.
3312 ///
3313 /// Available via the C wrapper.
3314 #[inline]
new_encoder(&'static self) -> Encoder3315 pub fn new_encoder(&'static self) -> Encoder {
3316 let enc = self.output_encoding();
3317 enc.variant.new_encoder(enc)
3318 }
3319
/// Validates UTF-8.
///
/// Returns the index of the first byte that makes the input malformed as
/// UTF-8 or the length of the slice if the slice is entirely valid.
///
/// This is currently faster than the corresponding standard library
/// functionality. If this implementation gets upstreamed to the standard
/// library, this method may be removed in the future.
///
/// Available via the C wrapper.
pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
    // Forwards to the free function of the same name that is in scope here
    // (an unqualified call does not resolve to this associated function).
    utf8_valid_up_to(bytes)
}
3333
/// Validates ASCII.
///
/// Returns the index of the first byte that makes the input malformed as
/// ASCII or the length of the slice if the slice is entirely valid.
///
/// Available via the C wrapper.
pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
    // Forwards to the free function of the same name that is in scope here
    // (an unqualified call does not resolve to this associated function).
    ascii_valid_up_to(bytes)
}
3343
/// Validates ISO-2022-JP ASCII-state data.
///
/// Returns the index of the first byte that makes the input not
/// representable in the ASCII state of ISO-2022-JP or the length of the
/// slice if the slice is entirely representable in the ASCII state of
/// ISO-2022-JP.
///
/// Available via the C wrapper.
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    // Forwards to the free function of the same name that is in scope here
    // (an unqualified call does not resolve to this associated function).
    iso_2022_jp_ascii_valid_up_to(bytes)
}
3355 }
3356
3357 impl PartialEq for Encoding {
3358 #[inline]
eq(&self, other: &Encoding) -> bool3359 fn eq(&self, other: &Encoding) -> bool {
3360 (self as *const Encoding) == (other as *const Encoding)
3361 }
3362 }
3363
// Marker impl: the `PartialEq` impl for `Encoding` compares addresses, which
// is a full equivalence relation.
impl Eq for Encoding {}
3365
/// Test-only ordering of `Encoding`s by their address in memory.
#[cfg(test)]
impl PartialOrd for Encoding {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let lhs = self as *const Encoding as usize;
        let rhs = other as *const Encoding as usize;
        Some(lhs.cmp(&rhs))
    }
}
3372
/// Test-only total ordering of `Encoding`s by their address in memory.
#[cfg(test)]
impl Ord for Encoding {
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs = self as *const Encoding as usize;
        let rhs = other as *const Encoding as usize;
        Ord::cmp(&lhs, &rhs)
    }
}
3379
3380 impl Hash for Encoding {
3381 #[inline]
hash<H: Hasher>(&self, state: &mut H)3382 fn hash<H: Hasher>(&self, state: &mut H) {
3383 (self as *const Encoding).hash(state);
3384 }
3385 }
3386
3387 impl core::fmt::Debug for Encoding {
3388 #[inline]
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result3389 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
3390 write!(f, "Encoding {{ {} }}", self.name)
3391 }
3392 }
3393
/// Serializes an `Encoding` as a string holding its name. Only compiled
/// when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // The serialized form is just the `name` field.
        serializer.serialize_str(self.name)
    }
}
3404
/// Serde visitor that resolves a string to a `&'static Encoding` by treating
/// the string as an encoding label. Only compiled when the `serde` feature
/// is enabled.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3407
/// Deserialization of a string into a `&'static Encoding` via label lookup.
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for use in serde error messages.
    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Looks `value` up with `Encoding::for_label()` and reports a custom
    /// error naming the offending label when the lookup fails.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(alloc::format!("invalid encoding label: {}", value)))
    }
}
3427
/// Deserializes a `&'static Encoding` from a string by handing an
/// `EncodingVisitor` to the deserializer. Only compiled when the `serde`
/// feature is enabled.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        // Ask for a string; `EncodingVisitor` turns it into an encoding.
        deserializer.deserialize_str(EncodingVisitor)
    }
}
3437
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// The `At*` variants describe a decoder that has not seen input yet, the
/// `Seen*` variants a decoder that has seen part of a potential BOM, and
/// the remaining variants a decoder that is past the BOM stage.
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE.
    AtUtf16LeStart,
    /// The decoder has seen EF (the first byte of the UTF-8 BOM).
    SeenUtf8First,
    /// The decoder has seen EF, BB (the first two bytes of the UTF-8 BOM).
    SeenUtf8Second,
    /// The decoder has seen FE (the first byte of the UTF-16BE BOM).
    SeenUtf16BeFirst,
    /// The decoder has seen FF (the first byte of the UTF-16LE BOM).
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen.
    Converting,
    /// EOF has been seen.
    Finished,
}
3466
/// Communicates the BOM handling mode to a newly created `Decoder`.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM
    Off,
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding
    Remove,
}
3477
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// Marked `#[must_use]` because the caller has to look at the result to
/// know whether to supply more input or more output space.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3497
/// Result of a (potentially partial) decode operation without replacement.
///
/// Marked `#[must_use]` because the caller has to look at the result to
/// know whether to supply more input, supply more output space or deal
/// with a malformed sequence.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3534
3535 /// A converter that decodes a byte stream into Unicode according to a
3536 /// character encoding in a streaming (incremental) manner.
3537 ///
3538 /// The various `decode_*` methods take an input buffer (`src`) and an output
3539 /// buffer `dst` both of which are caller-allocated. There are variants for
3540 /// both UTF-8 and UTF-16 output buffers.
3541 ///
3542 /// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3543 /// into `dst` until one of the following three things happens:
3544 ///
3545 /// 1. A malformed byte sequence is encountered (`*_without_replacement`
3546 /// variants only).
3547 ///
3548 /// 2. The output buffer has been filled so near capacity that the decoder
3549 /// cannot be sure that processing an additional byte of input wouldn't
3550 /// cause so much output that the output buffer would overflow.
3551 ///
3552 /// 3. All the input bytes have been processed.
3553 ///
/// The `decode_*` method then returns a tuple of a status indicating which one
3555 /// of the three reasons to return happened, how many input bytes were read,
3556 /// how many output code units (`u8` when decoding into UTF-8 and `u16`
3557 /// when decoding to UTF-16) were written (except when decoding into `String`,
3558 /// whose length change indicates this), and in the case of the
3559 /// variants performing replacement, a boolean indicating whether an error was
3560 /// replaced with the REPLACEMENT CHARACTER during the call.
3561 ///
3562 /// The number of bytes "written" is what's logically written. Garbage may be
3563 /// written in the output buffer beyond the point logically written to.
3564 /// Therefore, if you wish to decode into an `&mut str`, you should use the
3565 /// methods that take an `&mut str` argument instead of the ones that take an
3566 /// `&mut [u8]` argument. The former take care of overwriting the trailing
3567 /// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3568 /// latter don't.
3569 ///
3570 /// In the case of the `*_without_replacement` variants, the status is a
3571 /// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3572 /// `InputEmpty` corresponding to the three cases listed above).
3573 ///
3574 /// In the case of methods whose name does not end with
3575 /// `*_without_replacement`, malformed sequences are automatically replaced
3576 /// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3577 /// return early.
3578 ///
3579 /// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3580 /// space. When decoding to UTF-16, the output buffer must have at least two
3581 /// UTF-16 code units (`u16`) of space.
3582 ///
3583 /// When decoding to UTF-8 without replacement, the methods are guaranteed
3584 /// not to return indicating that more output space is needed if the length
3585 /// of the output buffer is at least the length returned by
3586 /// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3587 /// with replacement, the length of the output buffer that guarantees the
3588 /// methods not to return indicating that more output space is needed is given
3589 /// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3590 /// or without replacement, the length of the output buffer that guarantees
3591 /// the methods not to return indicating that more output space is needed is
3592 /// given by [`max_utf16_buffer_length()`][4].
3593 ///
3594 /// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3595 /// and the output after each `decode_*` call is guaranteed to consist of
3596 /// complete characters. (I.e. the code unit sequence for the last character is
3597 /// guaranteed not to be split across output buffers.)
3598 ///
3599 /// The boolean argument `last` indicates that the end of the stream is reached
3600 /// when all the bytes in `src` have been consumed.
3601 ///
3602 /// A `Decoder` object can be used to incrementally decode a byte stream.
3603 ///
3604 /// During the processing of a single stream, the caller must call `decode_*`
3605 /// zero or more times with `last` set to `false` and then call `decode_*` at
3606 /// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3607 /// the processing of the stream has ended. Otherwise, the caller must call
3608 /// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3609 /// a fatal error).
3610 ///
3611 /// Once the stream has ended, the `Decoder` object must not be used anymore.
3612 /// That is, you need to create another one to process another stream.
3613 ///
3614 /// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3615 /// the caller does not wish to treat it as a fatal error, the input buffer
3616 /// `src` may not have been completely consumed. In that case, the caller must
3617 /// pass the unconsumed contents of `src` to `decode_*` again upon the next
3618 /// call.
3619 ///
3620 /// [1]: enum.DecoderResult.html
3621 /// [2]: #method.max_utf8_buffer_length_without_replacement
3622 /// [3]: #method.max_utf8_buffer_length
3623 /// [4]: #method.max_utf16_buffer_length
3624 ///
3625 /// # Infinite loops
3626 ///
3627 /// When converting with a fixed-size output buffer whose size is too small to
3628 /// accommodate one character or (when applicable) one numeric character
3629 /// reference of output, an infinite loop ensues. When converting with a
3630 /// fixed-size output buffer, it generally makes sense to make the buffer
/// fairly large (e.g. a couple of kilobytes).
pub struct Decoder {
    // The encoding this decoder is for; returned by `encoding()` and
    // compared against `UTF_8` / `UTF_16LE` / `UTF_16BE` when sizing output
    // buffers and when cleaning up `&mut str` output.
    encoding: &'static Encoding,
    // The encoding-specific decoder that the buffer length queries and the
    // Latin1 compatibility query delegate to.
    variant: VariantDecoder,
    // Where this decoder is in its life cycle (BOM handling still pending,
    // converting, or finished). Querying a finished decoder panics.
    life_cycle: DecoderLifeCycle,
}
3637
3638 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3639 fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3640 Decoder {
3641 encoding: enc,
3642 variant: decoder,
3643 life_cycle: match sniffing {
3644 BomHandling::Off => DecoderLifeCycle::Converting,
3645 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3646 BomHandling::Remove => {
3647 if enc == UTF_8 {
3648 DecoderLifeCycle::AtUtf8Start
3649 } else if enc == UTF_16BE {
3650 DecoderLifeCycle::AtUtf16BeStart
3651 } else if enc == UTF_16LE {
3652 DecoderLifeCycle::AtUtf16LeStart
3653 } else {
3654 DecoderLifeCycle::Converting
3655 }
3656 }
3657 },
3658 }
3659 }
3660
    /// The `Encoding` this `Decoder` is for.
    ///
    /// Returns the value currently stored in the decoder's `encoding` field.
    ///
    /// BOM sniffing can change the return value of this method during the life
    /// of the decoder.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
3671
3672 /// Query the worst-case UTF-8 output size _with replacement_.
3673 ///
3674 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3675 /// that will not overflow given the current state of the decoder and
3676 /// `byte_length` number of additional input bytes when decoding with
3677 /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3678 /// sequence or `None` if `usize` would overflow.
3679 ///
3680 /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3681 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3682 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3683 // BOM getting pushed to the underlying decoder.
3684 match self.life_cycle {
3685 DecoderLifeCycle::Converting
3686 | DecoderLifeCycle::AtUtf8Start
3687 | DecoderLifeCycle::AtUtf16LeStart
3688 | DecoderLifeCycle::AtUtf16BeStart => {
3689 return self.variant.max_utf8_buffer_length(byte_length);
3690 }
3691 DecoderLifeCycle::AtStart => {
3692 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3693 if let Some(utf16_bom) = checked_add(
3694 1,
3695 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3696 ) {
3697 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3698 let encoding = self.encoding();
3699 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3700 // No need to consider the internal state of the underlying decoder,
3701 // because it is at start, because no data has reached it yet.
3702 return Some(utf_bom);
3703 } else if let Some(non_bom) =
3704 self.variant.max_utf8_buffer_length(byte_length)
3705 {
3706 return Some(core::cmp::max(utf_bom, non_bom));
3707 }
3708 }
3709 }
3710 }
3711 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3712 // Add two bytes even when only one byte has been seen,
3713 // because the one byte can become a lead byte in multibyte
3714 // decoders, but only after the decoder has been queried
3715 // for max length, so the decoder's own logic for adding
3716 // one for a pending lead cannot work.
3717 if let Some(sum) = byte_length.checked_add(2) {
3718 if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3719 if self.encoding() == UTF_8 {
3720 // No need to consider the internal state of the underlying decoder,
3721 // because it is at start, because no data has reached it yet.
3722 return Some(utf8_bom);
3723 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3724 return Some(core::cmp::max(utf8_bom, non_bom));
3725 }
3726 }
3727 }
3728 }
3729 DecoderLifeCycle::ConvertingWithPendingBB => {
3730 if let Some(sum) = byte_length.checked_add(2) {
3731 return self.variant.max_utf8_buffer_length(sum);
3732 }
3733 }
3734 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3735 // Add two bytes even when only one byte has been seen,
3736 // because the one byte can become a lead byte in multibyte
3737 // decoders, but only after the decoder has been queried
3738 // for max length, so the decoder's own logic for adding
3739 // one for a pending lead cannot work.
3740 if let Some(sum) = byte_length.checked_add(2) {
3741 if let Some(utf16_bom) =
3742 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3743 {
3744 let encoding = self.encoding();
3745 if encoding == UTF_16LE || encoding == UTF_16BE {
3746 // No need to consider the internal state of the underlying decoder,
3747 // because it is at start, because no data has reached it yet.
3748 return Some(utf16_bom);
3749 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3750 return Some(core::cmp::max(utf16_bom, non_bom));
3751 }
3752 }
3753 }
3754 }
3755 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3756 }
3757 None
3758 }
3759
3760 /// Query the worst-case UTF-8 output size _without replacement_.
3761 ///
3762 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3763 /// that will not overflow given the current state of the decoder and
3764 /// `byte_length` number of additional input bytes when decoding without
3765 /// replacement error handling or `None` if `usize` would overflow.
3766 ///
3767 /// Note that this value may be too small for the `_with_replacement` case.
3768 /// Use `max_utf8_buffer_length()` for that case.
3769 ///
3770 /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3771 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3772 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3773 // BOM getting pushed to the underlying decoder.
3774 match self.life_cycle {
3775 DecoderLifeCycle::Converting
3776 | DecoderLifeCycle::AtUtf8Start
3777 | DecoderLifeCycle::AtUtf16LeStart
3778 | DecoderLifeCycle::AtUtf16BeStart => {
3779 return self
3780 .variant
3781 .max_utf8_buffer_length_without_replacement(byte_length);
3782 }
3783 DecoderLifeCycle::AtStart => {
3784 if let Some(utf8_bom) = byte_length.checked_add(3) {
3785 if let Some(utf16_bom) = checked_add(
3786 1,
3787 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3788 ) {
3789 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3790 let encoding = self.encoding();
3791 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3792 // No need to consider the internal state of the underlying decoder,
3793 // because it is at start, because no data has reached it yet.
3794 return Some(utf_bom);
3795 } else if let Some(non_bom) = self
3796 .variant
3797 .max_utf8_buffer_length_without_replacement(byte_length)
3798 {
3799 return Some(core::cmp::max(utf_bom, non_bom));
3800 }
3801 }
3802 }
3803 }
3804 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3805 // Add two bytes even when only one byte has been seen,
3806 // because the one byte can become a lead byte in multibyte
3807 // decoders, but only after the decoder has been queried
3808 // for max length, so the decoder's own logic for adding
3809 // one for a pending lead cannot work.
3810 if let Some(sum) = byte_length.checked_add(2) {
3811 if let Some(utf8_bom) = sum.checked_add(3) {
3812 if self.encoding() == UTF_8 {
3813 // No need to consider the internal state of the underlying decoder,
3814 // because it is at start, because no data has reached it yet.
3815 return Some(utf8_bom);
3816 } else if let Some(non_bom) =
3817 self.variant.max_utf8_buffer_length_without_replacement(sum)
3818 {
3819 return Some(core::cmp::max(utf8_bom, non_bom));
3820 }
3821 }
3822 }
3823 }
3824 DecoderLifeCycle::ConvertingWithPendingBB => {
3825 if let Some(sum) = byte_length.checked_add(2) {
3826 return self.variant.max_utf8_buffer_length_without_replacement(sum);
3827 }
3828 }
3829 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3830 // Add two bytes even when only one byte has been seen,
3831 // because the one byte can become a lead byte in multibyte
3832 // decoders, but only after the decoder has been queried
3833 // for max length, so the decoder's own logic for adding
3834 // one for a pending lead cannot work.
3835 if let Some(sum) = byte_length.checked_add(2) {
3836 if let Some(utf16_bom) =
3837 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3838 {
3839 let encoding = self.encoding();
3840 if encoding == UTF_16LE || encoding == UTF_16BE {
3841 // No need to consider the internal state of the underlying decoder,
3842 // because it is at start, because no data has reached it yet.
3843 return Some(utf16_bom);
3844 } else if let Some(non_bom) =
3845 self.variant.max_utf8_buffer_length_without_replacement(sum)
3846 {
3847 return Some(core::cmp::max(utf16_bom, non_bom));
3848 }
3849 }
3850 }
3851 }
3852 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3853 }
3854 None
3855 }
3856
3857 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3858 /// replaced with the REPLACEMENT CHARACTER.
3859 ///
3860 /// See the documentation of the struct for documentation for `decode_*`
3861 /// methods collectively.
3862 ///
3863 /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3864 pub fn decode_to_utf8(
3865 &mut self,
3866 src: &[u8],
3867 dst: &mut [u8],
3868 last: bool,
3869 ) -> (CoderResult, usize, usize, bool) {
3870 let mut had_errors = false;
3871 let mut total_read = 0usize;
3872 let mut total_written = 0usize;
3873 loop {
3874 let (result, read, written) = self.decode_to_utf8_without_replacement(
3875 &src[total_read..],
3876 &mut dst[total_written..],
3877 last,
3878 );
3879 total_read += read;
3880 total_written += written;
3881 match result {
3882 DecoderResult::InputEmpty => {
3883 return (
3884 CoderResult::InputEmpty,
3885 total_read,
3886 total_written,
3887 had_errors,
3888 );
3889 }
3890 DecoderResult::OutputFull => {
3891 return (
3892 CoderResult::OutputFull,
3893 total_read,
3894 total_written,
3895 had_errors,
3896 );
3897 }
3898 DecoderResult::Malformed(_, _) => {
3899 had_errors = true;
3900 // There should always be space for the U+FFFD, because
3901 // otherwise we'd have gotten OutputFull already.
3902 // XXX: is the above comment actually true for UTF-8 itself?
3903 // TODO: Consider having fewer bound checks here.
3904 dst[total_written] = 0xEFu8;
3905 total_written += 1;
3906 dst[total_written] = 0xBFu8;
3907 total_written += 1;
3908 dst[total_written] = 0xBDu8;
3909 total_written += 1;
3910 }
3911 }
3912 }
3913 }
3914
3915 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3916 /// replaced with the REPLACEMENT CHARACTER with type system signaling
3917 /// of UTF-8 validity.
3918 ///
3919 /// This methods calls `decode_to_utf8` and then zeroes
3920 /// out up to three bytes that aren't logically part of the write in order
3921 /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3922 ///
3923 /// See the documentation of the struct for documentation for `decode_*`
3924 /// methods collectively.
3925 ///
3926 /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3927 pub fn decode_to_str(
3928 &mut self,
3929 src: &[u8],
3930 dst: &mut str,
3931 last: bool,
3932 ) -> (CoderResult, usize, usize, bool) {
3933 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3934 let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3935 let len = bytes.len();
3936 let mut trail = written;
3937 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3938 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3939 // encodings to avoid overwriting here.
3940 if self.encoding != UTF_8 {
3941 let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3942 while trail < max {
3943 bytes[trail] = 0;
3944 trail += 1;
3945 }
3946 }
3947 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3948 bytes[trail] = 0;
3949 trail += 1;
3950 }
3951 (result, read, written, replaced)
3952 }
3953
3954 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3955 /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
3956 ///
3957 /// Like the others, this method follows the logic that the output buffer is
3958 /// caller-allocated. This method treats the capacity of the `String` as
3959 /// the output limit. That is, this method guarantees not to cause a
3960 /// reallocation of the backing buffer of `String`.
3961 ///
3962 /// The return value is a tuple that contains the `DecoderResult`, the
3963 /// number of bytes read and a boolean indicating whether replacements
3964 /// were done. The number of bytes written is signaled via the length of
3965 /// the `String` changing.
3966 ///
3967 /// See the documentation of the struct for documentation for `decode_*`
3968 /// methods collectively.
3969 ///
3970 /// Available to Rust only.
decode_to_string( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (CoderResult, usize, bool)3971 pub fn decode_to_string(
3972 &mut self,
3973 src: &[u8],
3974 dst: &mut String,
3975 last: bool,
3976 ) -> (CoderResult, usize, bool) {
3977 unsafe {
3978 let vec = dst.as_mut_vec();
3979 let old_len = vec.len();
3980 let capacity = vec.capacity();
3981 vec.set_len(capacity);
3982 let (result, read, written, replaced) =
3983 self.decode_to_utf8(src, &mut vec[old_len..], last);
3984 vec.set_len(old_len + written);
3985 (result, read, replaced)
3986 }
3987 }
3988
    // Invokes the `public_decode_function!` macro (defined elsewhere in this
    // file) with a doc comment, a list of method names and the output code
    // unit type `u8`. Presumably this expands to the public
    // `decode_to_utf8_without_replacement()` method that the other UTF-8
    // decode methods of this impl call, together with the listed helper
    // methods -- confirm against the macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf8_without_replacement,
                            decode_to_utf8_raw,
                            decode_to_utf8_checking_end,
                            decode_to_utf8_after_one_potential_bom_byte,
                            decode_to_utf8_after_two_potential_bom_bytes,
                            decode_to_utf8_checking_end_with_offset,
                            u8);
4005
4006 /// Incrementally decode a byte stream into UTF-8 with type system signaling
4007 /// of UTF-8 validity.
4008 ///
4009 /// This methods calls `decode_to_utf8` and then zeroes out up to three
4010 /// bytes that aren't logically part of the write in order to retain the
4011 /// UTF-8 validity even for the unwritten part of the buffer.
4012 ///
4013 /// See the documentation of the struct for documentation for `decode_*`
4014 /// methods collectively.
4015 ///
4016 /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)4017 pub fn decode_to_str_without_replacement(
4018 &mut self,
4019 src: &[u8],
4020 dst: &mut str,
4021 last: bool,
4022 ) -> (DecoderResult, usize, usize) {
4023 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4024 let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4025 let len = bytes.len();
4026 let mut trail = written;
4027 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4028 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4029 // encodings to avoid overwriting here.
4030 if self.encoding != UTF_8 {
4031 let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4032 while trail < max {
4033 bytes[trail] = 0;
4034 trail += 1;
4035 }
4036 }
4037 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4038 bytes[trail] = 0;
4039 trail += 1;
4040 }
4041 (result, read, written)
4042 }
4043
4044 /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4045 ///
4046 /// Like the others, this method follows the logic that the output buffer is
4047 /// caller-allocated. This method treats the capacity of the `String` as
4048 /// the output limit. That is, this method guarantees not to cause a
4049 /// reallocation of the backing buffer of `String`.
4050 ///
4051 /// The return value is a pair that contains the `DecoderResult` and the
4052 /// number of bytes read. The number of bytes written is signaled via
4053 /// the length of the `String` changing.
4054 ///
4055 /// See the documentation of the struct for documentation for `decode_*`
4056 /// methods collectively.
4057 ///
4058 /// Available to Rust only.
decode_to_string_without_replacement( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (DecoderResult, usize)4059 pub fn decode_to_string_without_replacement(
4060 &mut self,
4061 src: &[u8],
4062 dst: &mut String,
4063 last: bool,
4064 ) -> (DecoderResult, usize) {
4065 unsafe {
4066 let vec = dst.as_mut_vec();
4067 let old_len = vec.len();
4068 let capacity = vec.capacity();
4069 vec.set_len(capacity);
4070 let (result, read, written) =
4071 self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4072 vec.set_len(old_len + written);
4073 (result, read)
4074 }
4075 }
4076
4077 /// Query the worst-case UTF-16 output size (with or without replacement).
4078 ///
4079 /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4080 /// that will not overflow given the current state of the decoder and
4081 /// `byte_length` number of additional input bytes or `None` if `usize`
4082 /// would overflow.
4083 ///
4084 /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4085 /// return value of this method applies also in the
4086 /// `_without_replacement` case.
4087 ///
4088 /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4089 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4090 // Need to consider a) the decoder morphing due to the BOM and b) a partial
4091 // BOM getting pushed to the underlying decoder.
4092 match self.life_cycle {
4093 DecoderLifeCycle::Converting
4094 | DecoderLifeCycle::AtUtf8Start
4095 | DecoderLifeCycle::AtUtf16LeStart
4096 | DecoderLifeCycle::AtUtf16BeStart => {
4097 return self.variant.max_utf16_buffer_length(byte_length);
4098 }
4099 DecoderLifeCycle::AtStart => {
4100 if let Some(utf8_bom) = byte_length.checked_add(1) {
4101 if let Some(utf16_bom) =
4102 checked_add(1, checked_div(byte_length.checked_add(1), 2))
4103 {
4104 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
4105 let encoding = self.encoding();
4106 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4107 // No need to consider the internal state of the underlying decoder,
4108 // because it is at start, because no data has reached it yet.
4109 return Some(utf_bom);
4110 } else if let Some(non_bom) =
4111 self.variant.max_utf16_buffer_length(byte_length)
4112 {
4113 return Some(core::cmp::max(utf_bom, non_bom));
4114 }
4115 }
4116 }
4117 }
4118 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4119 // Add two bytes even when only one byte has been seen,
4120 // because the one byte can become a lead byte in multibyte
4121 // decoders, but only after the decoder has been queried
4122 // for max length, so the decoder's own logic for adding
4123 // one for a pending lead cannot work.
4124 if let Some(sum) = byte_length.checked_add(2) {
4125 if let Some(utf8_bom) = sum.checked_add(1) {
4126 if self.encoding() == UTF_8 {
4127 // No need to consider the internal state of the underlying decoder,
4128 // because it is at start, because no data has reached it yet.
4129 return Some(utf8_bom);
4130 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4131 return Some(core::cmp::max(utf8_bom, non_bom));
4132 }
4133 }
4134 }
4135 }
4136 DecoderLifeCycle::ConvertingWithPendingBB => {
4137 if let Some(sum) = byte_length.checked_add(2) {
4138 return self.variant.max_utf16_buffer_length(sum);
4139 }
4140 }
4141 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4142 // Add two bytes even when only one byte has been seen,
4143 // because the one byte can become a lead byte in multibyte
4144 // decoders, but only after the decoder has been queried
4145 // for max length, so the decoder's own logic for adding
4146 // one for a pending lead cannot work.
4147 if let Some(sum) = byte_length.checked_add(2) {
4148 if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4149 let encoding = self.encoding();
4150 if encoding == UTF_16LE || encoding == UTF_16BE {
4151 // No need to consider the internal state of the underlying decoder,
4152 // because it is at start, because no data has reached it yet.
4153 return Some(utf16_bom);
4154 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4155 return Some(core::cmp::max(utf16_bom, non_bom));
4156 }
4157 }
4158 }
4159 }
4160 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4161 }
4162 None
4163 }
4164
4165 /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4166 /// replaced with the REPLACEMENT CHARACTER.
4167 ///
4168 /// See the documentation of the struct for documentation for `decode_*`
4169 /// methods collectively.
4170 ///
4171 /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4172 pub fn decode_to_utf16(
4173 &mut self,
4174 src: &[u8],
4175 dst: &mut [u16],
4176 last: bool,
4177 ) -> (CoderResult, usize, usize, bool) {
4178 let mut had_errors = false;
4179 let mut total_read = 0usize;
4180 let mut total_written = 0usize;
4181 loop {
4182 let (result, read, written) = self.decode_to_utf16_without_replacement(
4183 &src[total_read..],
4184 &mut dst[total_written..],
4185 last,
4186 );
4187 total_read += read;
4188 total_written += written;
4189 match result {
4190 DecoderResult::InputEmpty => {
4191 return (
4192 CoderResult::InputEmpty,
4193 total_read,
4194 total_written,
4195 had_errors,
4196 );
4197 }
4198 DecoderResult::OutputFull => {
4199 return (
4200 CoderResult::OutputFull,
4201 total_read,
4202 total_written,
4203 had_errors,
4204 );
4205 }
4206 DecoderResult::Malformed(_, _) => {
4207 had_errors = true;
4208 // There should always be space for the U+FFFD, because
4209 // otherwise we'd have gotten OutputFull already.
4210 dst[total_written] = 0xFFFD;
4211 total_written += 1;
4212 }
4213 }
4214 }
4215 }
4216
    // Invokes the `public_decode_function!` macro (defined elsewhere in this
    // file) with a doc comment, a list of method names and the output code
    // unit type `u16`. Presumably this expands to the public
    // `decode_to_utf16_without_replacement()` method that `decode_to_utf16()`
    // calls, together with the listed helper methods -- confirm against the
    // macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf16_without_replacement,
                            decode_to_utf16_raw,
                            decode_to_utf16_checking_end,
                            decode_to_utf16_after_one_potential_bom_byte,
                            decode_to_utf16_after_two_potential_bom_bytes,
                            decode_to_utf16_checking_end_with_offset,
                            u16);
4233
4234 /// Checks for compatibility with storing Unicode scalar values as unsigned
4235 /// bytes taking into account the state of the decoder.
4236 ///
4237 /// Returns `None` if the decoder is not in a neutral state, including waiting
4238 /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4239 ///
4240 /// Otherwise returns the index of the first byte whose unsigned value doesn't
4241 /// directly correspond to the decoded Unicode scalar value, or the length
4242 /// of the input if all bytes in the input decode directly to scalar values
4243 /// corresponding to the unsigned byte values.
4244 ///
4245 /// Does not change the state of the decoder.
4246 ///
4247 /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4248 /// storage optimizations.
4249 ///
4250 /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4251 pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4252 match self.life_cycle {
4253 DecoderLifeCycle::Converting => {
4254 return self.variant.latin1_byte_compatible_up_to(bytes);
4255 }
4256 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4257 _ => None,
4258 }
4259 }
4260 }
4261
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps the code point given as the UTF-16 code unit `bmp` into
    /// `EncoderResult::Unmappable`.
    ///
    /// # Panics
    ///
    /// Panics if `bmp` is not a Unicode scalar value, i.e. if it is a
    /// surrogate.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let scalar = u32::from(bmp);
        let character = ::core::char::from_u32(scalar).unwrap();
        EncoderResult::Unmappable(character)
    }
}
4293
4294 /// A converter that encodes a Unicode stream into bytes according to a
4295 /// character encoding in a streaming (incremental) manner.
4296 ///
4297 /// The various `encode_*` methods take an input buffer (`src`) and an output
4298 /// buffer `dst` both of which are caller-allocated. There are variants for
4299 /// both UTF-8 and UTF-16 input buffers.
4300 ///
/// An `encode_*` method encodes characters from `src` into bytes stored into
/// `dst` until one of the following three things happens:
4303 ///
4304 /// 1. An unmappable character is encountered (`*_without_replacement` variants
4305 /// only).
4306 ///
/// 2. The output buffer has been filled so near capacity that the encoder
4308 /// cannot be sure that processing an additional character of input wouldn't
4309 /// cause so much output that the output buffer would overflow.
4310 ///
4311 /// 3. All the input characters have been processed.
4312 ///
/// The `encode_*` method then returns a tuple of a status indicating which one
4314 /// of the three reasons to return happened, how many input code units (`u8`
4315 /// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4316 /// how many output bytes were written (except when encoding into `Vec<u8>`,
4317 /// whose length change indicates this), and in the case of the variants that
4318 /// perform replacement, a boolean indicating whether an unmappable
4319 /// character was replaced with a numeric character reference during the call.
4320 ///
4321 /// The number of bytes "written" is what's logically written. Garbage may be
4322 /// written in the output buffer beyond the point logically written to.
4323 ///
4324 /// In the case of the methods whose name ends with
4325 /// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4326 /// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4327 /// the three cases listed above).
4328 ///
4329 /// In the case of methods whose name does not end with
4330 /// `*_without_replacement`, unmappable characters are automatically replaced
4331 /// with the corresponding numeric character references and unmappable
4332 /// characters do not cause the methods to return early.
4333 ///
4334 /// When encoding from UTF-8 without replacement, the methods are guaranteed
4335 /// not to return indicating that more output space is needed if the length
4336 /// of the output buffer is at least the length returned by
4337 /// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4338 /// UTF-8 with replacement, the length of the output buffer that guarantees the
4339 /// methods not to return indicating that more output space is needed in the
4340 /// absence of unmappable characters is given by
4341 /// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4342 /// UTF-16 without replacement, the methods are guaranteed not to return
4343 /// indicating that more output space is needed if the length of the output
4344 /// buffer is at least the length returned by
4345 /// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4347 /// guarantees the methods not to return indicating that more output space is
4348 /// needed in the absence of unmappable characters is given by
4349 /// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4350 /// When encoding with replacement, applications are not expected to size the
4351 /// buffer for the worst case ahead of time but to resize the buffer if there
4352 /// are unmappable characters. This is why max length queries are only available
4353 /// for the case where there are no unmappable characters.
4354 ///
4355 /// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4356 /// calling from Rust, the type system takes care of this.) When encoding from
4357 /// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4358 /// CHARACTERS. Therefore, in order for astral characters not to turn into a
4359 /// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4360 /// are not split across input buffer boundaries.
4361 ///
4362 /// After an `encode_*` call returns, the output produced so far, taken as a
4363 /// whole from the start of the stream, is guaranteed to consist of a valid
4364 /// byte sequence in the target encoding. (I.e. the code unit sequence for a
4365 /// character is guaranteed not to be split across output buffers. However, due
4366 /// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4367 /// from the start for it to be valid. For other encodings, the validity holds
4368 /// on a per-output buffer basis.)
4369 ///
4370 /// The boolean argument `last` indicates that the end of the stream is reached
4371 /// when all the characters in `src` have been consumed. This argument is needed
4372 /// for ISO-2022-JP and is ignored for other encodings.
4373 ///
4374 /// An `Encoder` object can be used to incrementally encode a byte stream.
4375 ///
4376 /// During the processing of a single stream, the caller must call `encode_*`
4377 /// zero or more times with `last` set to `false` and then call `encode_*` at
4378 /// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4379 /// the processing of the stream has ended. Otherwise, the caller must call
4380 /// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4381 /// as a fatal error).
4382 ///
4383 /// Once the stream has ended, the `Encoder` object must not be used anymore.
4384 /// That is, you need to create another one to process another stream.
4385 ///
4386 /// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4387 /// and the caller does not wish to treat it as a fatal error, the input buffer
4388 /// `src` may not have been completely consumed. In that case, the caller must
4389 /// pass the unconsumed contents of `src` to `encode_*` again upon the next
4390 /// call.
4391 ///
4392 /// [1]: enum.EncoderResult.html
4393 /// [2]: #method.max_buffer_length_from_utf8_without_replacement
4394 /// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4395 /// [4]: #method.max_buffer_length_from_utf16_without_replacement
4396 /// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4397 ///
4398 /// # Infinite loops
4399 ///
4400 /// When converting with a fixed-size output buffer whose size is too small to
4401 /// accommodate one character of output, an infinite loop ensues. When
4402 /// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    // The encoding this encoder is for; returned by `encoding()`.
    encoding: &'static Encoding,
    // Encoding-specific implementation that the methods of this struct
    // delegate the actual work to.
    variant: VariantEncoder,
}
4408
4409 impl Encoder {
new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder4410 fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4411 Encoder {
4412 encoding: enc,
4413 variant: encoder,
4414 }
4415 }
4416
4417 /// The `Encoding` this `Encoder` is for.
4418 #[inline]
encoding(&self) -> &'static Encoding4419 pub fn encoding(&self) -> &'static Encoding {
4420 self.encoding
4421 }
4422
4423 /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
4424 /// ASCII state and `false` otherwise.
4425 #[inline]
has_pending_state(&self) -> bool4426 pub fn has_pending_state(&self) -> bool {
4427 self.variant.has_pending_state()
4428 }
4429
4430 /// Query the worst-case output size when encoding from UTF-8 with
4431 /// replacement.
4432 ///
4433 /// Returns the size of the output buffer in bytes that will not overflow
4434 /// given the current state of the encoder and `byte_length` number of
4435 /// additional input code units if there are no unmappable characters in
4436 /// the input or `None` if `usize` would overflow.
4437 ///
4438 /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4439 pub fn max_buffer_length_from_utf8_if_no_unmappables(
4440 &self,
4441 byte_length: usize,
4442 ) -> Option<usize> {
4443 checked_add(
4444 if self.encoding().can_encode_everything() {
4445 0
4446 } else {
4447 NCR_EXTRA
4448 },
4449 self.max_buffer_length_from_utf8_without_replacement(byte_length),
4450 )
4451 }
4452
4453 /// Query the worst-case output size when encoding from UTF-8 without
4454 /// replacement.
4455 ///
4456 /// Returns the size of the output buffer in bytes that will not overflow
4457 /// given the current state of the encoder and `byte_length` number of
4458 /// additional input code units or `None` if `usize` would overflow.
4459 ///
4460 /// Available via the C wrapper.
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>4461 pub fn max_buffer_length_from_utf8_without_replacement(
4462 &self,
4463 byte_length: usize,
4464 ) -> Option<usize> {
4465 self.variant
4466 .max_buffer_length_from_utf8_without_replacement(byte_length)
4467 }
4468
4469 /// Incrementally encode into byte stream from UTF-8 with unmappable
4470 /// characters replaced with HTML (decimal) numeric character references.
4471 ///
4472 /// See the documentation of the struct for documentation for `encode_*`
4473 /// methods collectively.
4474 ///
4475 /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4476 pub fn encode_from_utf8(
4477 &mut self,
4478 src: &str,
4479 dst: &mut [u8],
4480 last: bool,
4481 ) -> (CoderResult, usize, usize, bool) {
4482 let dst_len = dst.len();
4483 let effective_dst_len = if self.encoding().can_encode_everything() {
4484 dst_len
4485 } else {
4486 if dst_len < NCR_EXTRA {
4487 if src.is_empty() && !(last && self.has_pending_state()) {
4488 return (CoderResult::InputEmpty, 0, 0, false);
4489 }
4490 return (CoderResult::OutputFull, 0, 0, false);
4491 }
4492 dst_len - NCR_EXTRA
4493 };
4494 let mut had_unmappables = false;
4495 let mut total_read = 0usize;
4496 let mut total_written = 0usize;
4497 loop {
4498 let (result, read, written) = self.encode_from_utf8_without_replacement(
4499 &src[total_read..],
4500 &mut dst[total_written..effective_dst_len],
4501 last,
4502 );
4503 total_read += read;
4504 total_written += written;
4505 match result {
4506 EncoderResult::InputEmpty => {
4507 return (
4508 CoderResult::InputEmpty,
4509 total_read,
4510 total_written,
4511 had_unmappables,
4512 );
4513 }
4514 EncoderResult::OutputFull => {
4515 return (
4516 CoderResult::OutputFull,
4517 total_read,
4518 total_written,
4519 had_unmappables,
4520 );
4521 }
4522 EncoderResult::Unmappable(unmappable) => {
4523 had_unmappables = true;
4524 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4525 debug_assert_ne!(self.encoding(), UTF_16BE);
4526 debug_assert_ne!(self.encoding(), UTF_16LE);
4527 // Additionally, Iso2022JpEncoder is responsible for
4528 // transitioning to ASCII when returning with Unmappable.
4529 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4530 if total_written >= effective_dst_len {
4531 if total_read == src.len() && !(last && self.has_pending_state()) {
4532 return (
4533 CoderResult::InputEmpty,
4534 total_read,
4535 total_written,
4536 had_unmappables,
4537 );
4538 }
4539 return (
4540 CoderResult::OutputFull,
4541 total_read,
4542 total_written,
4543 had_unmappables,
4544 );
4545 }
4546 }
4547 }
4548 }
4549 }
4550
4551 /// Incrementally encode into byte stream from UTF-8 with unmappable
4552 /// characters replaced with HTML (decimal) numeric character references.
4553 ///
4554 /// See the documentation of the struct for documentation for `encode_*`
4555 /// methods collectively.
4556 ///
4557 /// Available to Rust only.
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4558 pub fn encode_from_utf8_to_vec(
4559 &mut self,
4560 src: &str,
4561 dst: &mut Vec<u8>,
4562 last: bool,
4563 ) -> (CoderResult, usize, bool) {
4564 unsafe {
4565 let old_len = dst.len();
4566 let capacity = dst.capacity();
4567 dst.set_len(capacity);
4568 let (result, read, written, replaced) =
4569 self.encode_from_utf8(src, &mut dst[old_len..], last);
4570 dst.set_len(old_len + written);
4571 (result, read, replaced)
4572 }
4573 }
4574
4575 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4576 ///
4577 /// See the documentation of the struct for documentation for `encode_*`
4578 /// methods collectively.
4579 ///
4580 /// Available via the C wrapper.
encode_from_utf8_without_replacement( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4581 pub fn encode_from_utf8_without_replacement(
4582 &mut self,
4583 src: &str,
4584 dst: &mut [u8],
4585 last: bool,
4586 ) -> (EncoderResult, usize, usize) {
4587 self.variant.encode_from_utf8_raw(src, dst, last)
4588 }
4589
4590 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4591 ///
4592 /// See the documentation of the struct for documentation for `encode_*`
4593 /// methods collectively.
4594 ///
4595 /// Available to Rust only.
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4596 pub fn encode_from_utf8_to_vec_without_replacement(
4597 &mut self,
4598 src: &str,
4599 dst: &mut Vec<u8>,
4600 last: bool,
4601 ) -> (EncoderResult, usize) {
4602 unsafe {
4603 let old_len = dst.len();
4604 let capacity = dst.capacity();
4605 dst.set_len(capacity);
4606 let (result, read, written) =
4607 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4608 dst.set_len(old_len + written);
4609 (result, read)
4610 }
4611 }
4612
4613 /// Query the worst-case output size when encoding from UTF-16 with
4614 /// replacement.
4615 ///
4616 /// Returns the size of the output buffer in bytes that will not overflow
4617 /// given the current state of the encoder and `u16_length` number of
4618 /// additional input code units if there are no unmappable characters in
4619 /// the input or `None` if `usize` would overflow.
4620 ///
4621 /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4622 pub fn max_buffer_length_from_utf16_if_no_unmappables(
4623 &self,
4624 u16_length: usize,
4625 ) -> Option<usize> {
4626 checked_add(
4627 if self.encoding().can_encode_everything() {
4628 0
4629 } else {
4630 NCR_EXTRA
4631 },
4632 self.max_buffer_length_from_utf16_without_replacement(u16_length),
4633 )
4634 }
4635
4636 /// Query the worst-case output size when encoding from UTF-16 without
4637 /// replacement.
4638 ///
4639 /// Returns the size of the output buffer in bytes that will not overflow
4640 /// given the current state of the encoder and `u16_length` number of
4641 /// additional input code units or `None` if `usize` would overflow.
4642 ///
4643 /// Available via the C wrapper.
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>4644 pub fn max_buffer_length_from_utf16_without_replacement(
4645 &self,
4646 u16_length: usize,
4647 ) -> Option<usize> {
4648 self.variant
4649 .max_buffer_length_from_utf16_without_replacement(u16_length)
4650 }
4651
4652 /// Incrementally encode into byte stream from UTF-16 with unmappable
4653 /// characters replaced with HTML (decimal) numeric character references.
4654 ///
4655 /// See the documentation of the struct for documentation for `encode_*`
4656 /// methods collectively.
4657 ///
4658 /// Available via the C wrapper.
encode_from_utf16( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4659 pub fn encode_from_utf16(
4660 &mut self,
4661 src: &[u16],
4662 dst: &mut [u8],
4663 last: bool,
4664 ) -> (CoderResult, usize, usize, bool) {
4665 let dst_len = dst.len();
4666 let effective_dst_len = if self.encoding().can_encode_everything() {
4667 dst_len
4668 } else {
4669 if dst_len < NCR_EXTRA {
4670 if src.is_empty() && !(last && self.has_pending_state()) {
4671 return (CoderResult::InputEmpty, 0, 0, false);
4672 }
4673 return (CoderResult::OutputFull, 0, 0, false);
4674 }
4675 dst_len - NCR_EXTRA
4676 };
4677 let mut had_unmappables = false;
4678 let mut total_read = 0usize;
4679 let mut total_written = 0usize;
4680 loop {
4681 let (result, read, written) = self.encode_from_utf16_without_replacement(
4682 &src[total_read..],
4683 &mut dst[total_written..effective_dst_len],
4684 last,
4685 );
4686 total_read += read;
4687 total_written += written;
4688 match result {
4689 EncoderResult::InputEmpty => {
4690 return (
4691 CoderResult::InputEmpty,
4692 total_read,
4693 total_written,
4694 had_unmappables,
4695 );
4696 }
4697 EncoderResult::OutputFull => {
4698 return (
4699 CoderResult::OutputFull,
4700 total_read,
4701 total_written,
4702 had_unmappables,
4703 );
4704 }
4705 EncoderResult::Unmappable(unmappable) => {
4706 had_unmappables = true;
4707 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4708 // There are no UTF-16 encoders and even if there were,
4709 // they'd never have unmappables.
4710 debug_assert_ne!(self.encoding(), UTF_16BE);
4711 debug_assert_ne!(self.encoding(), UTF_16LE);
4712 // Additionally, Iso2022JpEncoder is responsible for
4713 // transitioning to ASCII when returning with Unmappable
4714 // from the jis0208 state. That is, when we encode
4715 // ISO-2022-JP and come here, the encoder is in either the
4716 // ASCII or the Roman state. We are allowed to generate any
4717 // printable ASCII excluding \ and ~.
4718 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4719 if total_written >= effective_dst_len {
4720 if total_read == src.len() && !(last && self.has_pending_state()) {
4721 return (
4722 CoderResult::InputEmpty,
4723 total_read,
4724 total_written,
4725 had_unmappables,
4726 );
4727 }
4728 return (
4729 CoderResult::OutputFull,
4730 total_read,
4731 total_written,
4732 had_unmappables,
4733 );
4734 }
4735 }
4736 }
4737 }
4738 }
4739
4740 /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4741 ///
4742 /// See the documentation of the struct for documentation for `encode_*`
4743 /// methods collectively.
4744 ///
4745 /// Available via the C wrapper.
encode_from_utf16_without_replacement( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize)4746 pub fn encode_from_utf16_without_replacement(
4747 &mut self,
4748 src: &[u16],
4749 dst: &mut [u8],
4750 last: bool,
4751 ) -> (EncoderResult, usize, usize) {
4752 self.variant.encode_from_utf16_raw(src, dst, last)
4753 }
4754 }
4755
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#`, the decimal scalar value of `unmappable` and `;` to the
/// start of `dst` and returns the number of bytes written (between 4 and
/// 10). Bytes of `dst` past the returned length are left untouched.
///
/// Every scalar value is handled, including the one-digit values below 10
/// that callers are not expected to pass.
///
/// # Panics
///
/// Panics if `dst` is shorter than the character reference.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let mut number = unmappable as u32;
    // len is the number of decimal digits needed to represent unmappable
    // plus 3 (the length of "&#" and ";").
    let len = if number >= 1_000_000u32 {
        10usize
    } else if number >= 100_000u32 {
        9usize
    } else if number >= 10_000u32 {
        8usize
    } else if number >= 1_000u32 {
        7usize
    } else if number >= 100u32 {
        6usize
    } else if number >= 10u32 {
        // Review the outcome of https://github.com/whatwg/encoding/issues/15
        // to see if this case is possible
        5usize
    } else {
        4usize
    };
    debug_assert!(len <= dst.len());
    let ncr = &mut dst[..len];
    ncr[0] = b'&';
    ncr[1] = b'#';
    ncr[len - 1] = b';';
    // Fill the digit slots from the least significant digit backwards.
    for slot in ncr[2..len - 1].iter_mut().rev() {
        *slot = b'0' + (number % 10) as u8;
        number /= 10;
    }
    debug_assert_eq!(number, 0);
    len
}
4794
/// Returns `true` if `start <= i < end`, using a single comparison.
///
/// Values below `start` wrap around to large offsets and are rejected by
/// the same comparison. Expects `start <= end`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4799
/// Returns `true` if `start <= i < end`, using a single comparison.
///
/// Values below `start` wrap around to large offsets and are rejected by
/// the same comparison. Expects `start <= end`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4804
/// Returns `true` if `start <= i <= end`, using a single comparison.
///
/// Values below `start` wrap around to large offsets and are rejected by
/// the same comparison. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4809
/// Returns `true` if `start <= i <= end`, using a single comparison.
///
/// Values below `start` wrap around to large offsets and are rejected by
/// the same comparison. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4814
/// Returns `true` if `start <= i <= end`, using a single comparison.
///
/// Values below `start` wrap around to large offsets and are rejected by
/// the same comparison. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4819
/// Returns `true` if `start <= i <= end`, using a single comparison.
///
/// Values below `start` wrap around to large offsets and are rejected by
/// the same comparison. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4824
/// Adds `num` to the value in `opt`, yielding `None` if `opt` is `None`
/// or the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4833
4834 #[inline(always)]
checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize>4835 fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4836 if let Some(n) = one {
4837 checked_add(n, other)
4838 } else {
4839 None
4840 }
4841 }
4842
/// Multiplies the value in `opt` by `num`, yielding `None` if `opt` is
/// `None` or the product overflows `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4851
/// Divides the value in `opt` by `num`, yielding `None` if `opt` is
/// `None` or `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4860
/// Rounds the value in `opt` up to the next power of two, yielding `None`
/// if `opt` is `None` or the next power of two does not fit in `usize`.
///
/// Uses `usize::checked_next_power_of_two`, because the unchecked
/// `next_power_of_two` panics in debug builds and yields 0 in release
/// builds when the result overflows.
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4865
/// Returns the smaller of two optional values.
///
/// A missing operand is ignored rather than propagated: if only one value
/// is present it is returned, and `None` results only when both are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(a.min(b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4878
4879 // ############## TESTS ###############
4880
// Test-only fixture, compiled only for test builds with the `serde` feature
// enabled. It embeds a `&'static Encoding` between ordinary fields and
// derives `Serialize`/`Deserialize`.
// NOTE(review): presumably used by serde round-trip tests further down the
// file; the uses are not visible here.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    enc: &'static Encoding,
}
4888
4889 #[cfg(test)]
4890 mod test_labels_names;
4891
4892 #[cfg(test)]
4893 mod tests {
4894 use super::*;
4895 use alloc::borrow::Cow;
4896
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4897 fn sniff_to_utf16(
4898 initial_encoding: &'static Encoding,
4899 expected_encoding: &'static Encoding,
4900 bytes: &[u8],
4901 expect: &[u16],
4902 breaks: &[usize],
4903 ) {
4904 let mut decoder = initial_encoding.new_decoder();
4905
4906 let mut dest: Vec<u16> =
4907 Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4908 let capacity = dest.capacity();
4909 dest.resize(capacity, 0u16);
4910
4911 let mut total_written = 0usize;
4912 let mut start = 0usize;
4913 for br in breaks {
4914 let (result, read, written, _) =
4915 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4916 total_written += written;
4917 assert_eq!(read, *br - start);
4918 match result {
4919 CoderResult::InputEmpty => {}
4920 CoderResult::OutputFull => {
4921 unreachable!();
4922 }
4923 }
4924 start = *br;
4925 }
4926 let (result, read, written, _) =
4927 decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4928 total_written += written;
4929 match result {
4930 CoderResult::InputEmpty => {}
4931 CoderResult::OutputFull => {
4932 unreachable!();
4933 }
4934 }
4935 assert_eq!(read, bytes.len() - start);
4936 assert_eq!(total_written, expect.len());
4937 assert_eq!(&dest[..total_written], expect);
4938 assert_eq!(decoder.encoding(), expected_encoding);
4939 }
4940
4941 // Any copyright to the test code below this comment is dedicated to the
4942 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4943
// Checks BOM sniffing from a windows-1252 starting point: which encoding
// the decoder ends up with and what it outputs, for inputs fed whole and
// split at various offsets.
#[test]
fn test_bom_sniffing() {
    // Runs one sniffing case once per chunking in `break_sets`, in order.
    fn sniff_each(
        initial: &'static Encoding,
        expected: &'static Encoding,
        bytes: &[u8],
        expect: &[u16],
        break_sets: &[&[usize]],
    ) {
        for &breaks in break_sets {
            sniff_to_utf16(initial, expected, bytes, expect, breaks);
        }
    }

    // Input fed as a single chunk.
    const WHOLE: &[&[usize]] = &[&[]];
    // Input fed as a single chunk, then again split after the first byte.
    const WHOLE_AND_SPLIT_AT_1: &[&[usize]] = &[&[], &[1]];
    // Chunkings that cut inside and right after the three-byte UTF-8 BOM.
    const UTF8_BOM_SPLITS: &[&[usize]] = &[
        &[],
        &[1],
        &[2],
        &[3],
        &[4],
        &[2, 3],
        &[1, 2],
        &[1, 3],
        &[1, 2, 3, 4],
    ];

    let ab = [0x0061u16, 0x0062u16];

    // ASCII
    sniff_each(WINDOWS_1252, WINDOWS_1252, b"\x61\x62", &ab, WHOLE);

    // UTF-8
    sniff_each(
        WINDOWS_1252,
        UTF_8,
        b"\xEF\xBB\xBF\x61\x62",
        &ab,
        UTF8_BOM_SPLITS,
    );
    sniff_each(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], WHOLE);

    // Not UTF-8
    sniff_each(
        WINDOWS_1252,
        WINDOWS_1252,
        b"\xEF\xBB\x61\x62",
        &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
        WHOLE_AND_SPLIT_AT_1,
    );
    sniff_each(
        WINDOWS_1252,
        WINDOWS_1252,
        b"\xEF\x61\x62",
        &[0x00EFu16, 0x0061u16, 0x0062u16],
        WHOLE_AND_SPLIT_AT_1,
    );
    sniff_each(
        WINDOWS_1252,
        WINDOWS_1252,
        b"\xEF\xBB",
        &[0x00EFu16, 0x00BBu16],
        WHOLE_AND_SPLIT_AT_1,
    );
    sniff_each(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], WHOLE);

    // Not UTF-16
    sniff_each(
        WINDOWS_1252,
        WINDOWS_1252,
        b"\xFE\x61\x62",
        &[0x00FEu16, 0x0061u16, 0x0062u16],
        WHOLE_AND_SPLIT_AT_1,
    );
    sniff_each(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], WHOLE);
    sniff_each(
        WINDOWS_1252,
        WINDOWS_1252,
        b"\xFF\x61\x62",
        &[0x00FFu16, 0x0061u16, 0x0062u16],
        WHOLE_AND_SPLIT_AT_1,
    );
    sniff_each(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], WHOLE);

    // UTF-16
    sniff_each(
        WINDOWS_1252,
        UTF_16BE,
        b"\xFE\xFF",
        &[],
        WHOLE_AND_SPLIT_AT_1,
    );
    sniff_each(
        WINDOWS_1252,
        UTF_16LE,
        b"\xFF\xFE",
        &[],
        WHOLE_AND_SPLIT_AT_1,
    );
}
5100
// Checks the output encoding of several encodings, first through
// `output_encoding()` and then through the encoder that `new_encoder()`
// creates.
#[test]
fn test_output_encoding() {
    // (encoding, expected output encoding)
    let expectations: [(&'static Encoding, &'static Encoding); 5] = [
        (REPLACEMENT, UTF_8),
        (UTF_16BE, UTF_8),
        (UTF_16LE, UTF_8),
        (UTF_8, UTF_8),
        (WINDOWS_1252, WINDOWS_1252),
    ];
    for &(encoding, output) in expectations.iter() {
        assert_eq!(encoding.output_encoding(), output);
    }
    for &(encoding, output) in expectations.iter() {
        assert_eq!(encoding.new_encoder().encoding(), output);
    }
}
5114
// Checks `Encoding::for_label` on differently cased labels, a label
// surrounded by whitespace, and labels that must not resolve.
#[test]
fn test_label_resolution() {
    // Asserts what `label` resolves to.
    fn expect_label(label: &[u8], expected: Option<&'static Encoding>) {
        assert_eq!(Encoding::for_label(label), expected);
    }

    expect_label(b"utf-8", Some(UTF_8));
    expect_label(b"UTF-8", Some(UTF_8));
    expect_label(
        b" \t \n \x0C \n utf-8 \r \n \t \x0C ",
        Some(UTF_8),
    );
    expect_label(b"utf-8 _", None);
    expect_label(b"bogus", None);
    expect_label(b"bogusbogusbogusbogus", None);
}
5127
// Valid non-ASCII windows-1257 input decodes to an owned string without
// errors and the encoding stays windows-1257.
#[test]
fn test_decode_valid_windows_1257_to_cow() {
    let (decoded, used, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "abc\u{20AC}\u{00E4}");
    assert_eq!(used, WINDOWS_1257);
    assert!(!had_errors);
}
5140
// An unmapped windows-1257 byte decodes to U+FFFD in an owned string and
// is reported as an error.
#[test]
fn test_decode_invalid_windows_1257_to_cow() {
    let (decoded, used, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(used, WINDOWS_1257);
    assert!(had_errors);
}
5153
// ASCII-only windows-1257 input is returned as a borrowed string.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow() {
    let (decoded, used, had_errors) = WINDOWS_1257.decode(b"abc");
    let text = match decoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(text, "abc");
    assert_eq!(used, WINDOWS_1257);
    assert!(!had_errors);
}
5166
// A UTF-8 BOM overrides windows-1257: the BOM is dropped, valid UTF-8 is
// borrowed, and UTF-8 is reported as the encoding used.
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
    let (decoded, used, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let text = match decoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(text, "\u{20AC}\u{00E4}");
    assert_eq!(used, UTF_8);
    assert!(!had_errors);
}
5179
// A UTF-8 BOM overrides windows-1257 even when the UTF-8 that follows is
// invalid: the bad byte becomes U+FFFD in an owned string.
#[test]
fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
    let (decoded, used, had_errors) =
        WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(used, UTF_8);
    assert!(had_errors);
}
5193
// Decoding valid UTF-8 with a BOM as UTF-8 drops the BOM and borrows the
// rest of the input.
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
    let (decoded, used, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let text = match decoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(text, "\u{20AC}\u{00E4}");
    assert_eq!(used, UTF_8);
    assert!(!had_errors);
}
5206
// Decoding invalid UTF-8 with a BOM as UTF-8 drops the BOM and yields an
// owned string with U+FFFD for the bad byte.
#[test]
fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
    let (decoded, used, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(used, UTF_8);
    assert!(had_errors);
}
5219
// `decode_with_bom_removal` on UTF-8 strips the UTF-8 BOM and borrows the
// valid remainder.
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
    let (decoded, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let text = match decoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(text, "\u{20AC}\u{00E4}");
    assert!(!had_errors);
}
5231
// `decode_with_bom_removal` on windows-1257 does not treat a UTF-8 BOM
// specially: every byte, BOM included, is decoded as windows-1257.
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
    let (decoded, had_errors) =
        WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(
        text,
        "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
    );
    assert!(!had_errors);
}
5247
// Valid non-ASCII windows-1257 input decodes to an owned string without
// errors through `decode_with_bom_removal`.
#[test]
fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
    let (decoded, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "abc\u{20AC}\u{00E4}");
    assert!(!had_errors);
}
5259
// An unmapped windows-1257 byte becomes U+FFFD and is reported as an
// error through `decode_with_bom_removal`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
    let (decoded, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert!(had_errors);
}
5271
// ASCII-only input is borrowed by `decode_with_bom_removal`.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
    let (decoded, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
    let text = match decoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(text, "abc");
    assert!(!had_errors);
}
5283
// Without BOM handling the UTF-8 BOM is kept as U+FEFF and valid input is
// still borrowed.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
    let (decoded, had_errors) =
        UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let text = match decoded {
        Cow::Borrowed(borrowed) => borrowed,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}");
    assert!(!had_errors);
}
5296
// Without BOM handling the UTF-8 BOM is kept as U+FEFF; the invalid byte
// becomes U+FFFD in an owned string.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
    let (decoded, had_errors) =
        UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    let text = match decoded {
        Cow::Owned(owned) => owned,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(text, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
    assert!(had_errors);
}
5309
// Non-ASCII windows-1257 input decoded without BOM handling: owned output,
// no error flag.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
    let (decoded, replaced) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(!replaced);
}
5321
// The byte 0xA1 is expected to be replaced by U+FFFD and flagged as an error
// when decoding without BOM handling.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
    let (decoded, replaced) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
    if let Cow::Owned(text) = decoded {
        assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
    } else {
        unreachable!();
    }
    assert!(replaced);
}
5333
// Pure-ASCII input decoded without BOM handling is expected to be borrowed
// and error-free.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
    let (decoded, replaced) = WINDOWS_1257.decode_without_bom_handling(b"abc");
    if let Cow::Borrowed(text) = decoded {
        assert_eq!(text, "abc");
    } else {
        unreachable!();
    }
    assert!(!replaced);
}
5345
// Valid UTF-8 with a leading EF BB BF: the non-replacing decode is expected to
// succeed with a borrowed result that still contains U+FEFF.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let input: &[u8] = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    match UTF_8.decode_without_bom_handling_and_without_replacement(input) {
        Some(Cow::Borrowed(text)) => assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}"),
        Some(Cow::Owned(_)) | None => unreachable!(),
    }
}
5360
// A stray 0x80 byte in otherwise valid UTF-8 must make the non-replacing
// decode return `None`.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let input: &[u8] = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(input);
    assert!(decoded.is_none());
}
5369
// Non-ASCII but valid windows-1257 input: the non-replacing decode is expected
// to succeed with an owned string.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
        Some(Cow::Owned(text)) => assert_eq!(text, "abc\u{20AC}\u{00E4}"),
        Some(Cow::Borrowed(_)) | None => unreachable!(),
    }
}
5382
// Input containing the byte 0xA1 must make the non-replacing decode return
// `None`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded =
        WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4");
    assert!(decoded.is_none());
}
5389
// Pure-ASCII input: the non-replacing decode is expected to succeed and to
// borrow from the input.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
        Some(Cow::Borrowed(text)) => assert_eq!(text, "abc"),
        Some(Cow::Owned(_)) | None => unreachable!(),
    }
}
5402
// Encoding a pure-ASCII string is expected to borrow the input bytes, report
// windows-1257 as the output encoding, and flag no errors.
#[test]
fn test_encode_ascii_only_windows_1257_to_cow() {
    let (bytes, output_encoding, unmappable) = WINDOWS_1257.encode("abc");
    if let Cow::Borrowed(slice) = bytes {
        assert_eq!(slice, b"abc");
    } else {
        unreachable!();
    }
    assert_eq!(output_encoding, WINDOWS_1257);
    assert!(!unmappable);
}
5415
// Encoding a string with non-ASCII characters is expected to produce an owned
// buffer, report windows-1257 as the output encoding, and flag no errors.
#[test]
fn test_encode_valid_windows_1257_to_cow() {
    let (bytes, output_encoding, unmappable) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
    if let Cow::Owned(buffer) = bytes {
        assert_eq!(buffer, b"abc\x80\xE4");
    } else {
        unreachable!();
    }
    assert_eq!(output_encoding, WINDOWS_1257);
    assert!(!unmappable);
}
5428
// Feeds a single 0xFF byte to a BOM-sniffing UTF-16LE decoder twice (first as
// a non-final chunk, then as the final one). Each time the output slice is
// sized by `max_utf16_buffer_length(1)` and must be large enough, i.e. the
// call has to end with `InputEmpty`.
#[test]
fn test_utf16_space_with_one_bom_byte() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, ..) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5444
// Feeds a single 0xFF byte to a BOM-sniffing UTF-8 decoder twice (first as a
// non-final chunk, then as the final one). Each time the output slice is sized
// by `max_utf16_buffer_length(1)` and the call has to end with `InputEmpty`.
#[test]
fn test_utf8_space_with_one_bom_byte() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, ..) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5460
// Feeds EF, BB and then a final FF one byte at a time to a BOM-sniffing
// UTF-16LE decoder. Before every call the output slice is sized by
// `max_utf16_buffer_length(1)`; every call has to end with `InputEmpty`.
#[test]
fn test_utf16_space_with_two_bom_bytes() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    // (input chunk, whether it is the last chunk)
    let steps = [(b"\xEF", false), (b"\xBB", false), (b"\xFF", true)];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, ..) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5481
// Feeds EF, BB and then a final FF one byte at a time to a BOM-sniffing UTF-8
// decoder. Before every call the output slice is sized by
// `max_utf16_buffer_length(1)`; every call has to end with `InputEmpty`.
#[test]
fn test_utf8_space_with_two_bom_bytes() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    // (input chunk, whether it is the last chunk)
    let steps = [(b"\xEF", false), (b"\xBB", false), (b"\xFF", true)];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, ..) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5502
// Feeds FF FF as one final chunk to a BOM-sniffing UTF-16LE decoder; the
// buffer sized by `max_utf16_buffer_length(2)` must be sufficient.
#[test]
fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    let needed = decoder.max_utf16_buffer_length(2).unwrap();
    let outcome = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
    assert_eq!(outcome.0, CoderResult::InputEmpty);
}
5513
// A fresh ISO-2022-JP encoder is given empty input twice (non-final, then
// final) with an 8-byte output buffer; both calls must end with `InputEmpty`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
    let mut dst = [0u8; 8];
    let mut encoder = ISO_2022_JP.new_encoder();
    for &last in [false, true].iter() {
        let (result, ..) = encoder.encode_from_utf8("", &mut dst[..], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5527
// After U+00A5 has been encoded, finishing the stream with only 8 bytes of
// output space is expected to report `OutputFull`, whereas a non-final empty
// call with the same space still reports `InputEmpty`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
    let mut dst = [0u8; 16];
    let mut encoder = ISO_2022_JP.new_encoder();

    let after_yen = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
    assert_eq!(after_yen.0, CoderResult::InputEmpty);

    let after_empty = encoder.encode_from_utf8("", &mut dst[..8], false);
    assert_eq!(after_empty.0, CoderResult::InputEmpty);

    let after_finish = encoder.encode_from_utf8("", &mut dst[..8], true);
    assert_eq!(after_finish.0, CoderResult::OutputFull);
}
5545
// Checks the result reported at the end of the output buffer when encoding
// from UTF-8 to ISO-2022-JP. Every case uses a brand-new encoder and writes
// into the first `capacity` bytes of the shared 18-byte buffer.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf8() {
    let mut dst = [0u8; 18];
    let mut encode = |input: &str, capacity: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (result, ..) = encoder.encode_from_utf8(input, &mut dst[..capacity], last);
        result
    };

    assert_eq!(encode("\u{A5}\u{1F4A9}", 18, false), CoderResult::InputEmpty);
    assert_eq!(encode("\u{A5}\u{1F4A9}", 18, true), CoderResult::OutputFull);
    assert_eq!(encode("\u{1F4A9}", 13, false), CoderResult::InputEmpty);
    assert_eq!(encode("\u{1F4A9}", 13, true), CoderResult::InputEmpty);
}
5571
// A fresh ISO-2022-JP encoder is given empty UTF-16 input twice (non-final,
// then final) with an 8-byte output buffer; both calls must end with
// `InputEmpty`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
    let mut dst = [0u8; 8];
    let mut encoder = ISO_2022_JP.new_encoder();
    let nothing = [0u16; 0];
    for &last in [false, true].iter() {
        let (result, ..) = encoder.encode_from_utf16(&nothing, &mut dst[..], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5585
// UTF-16 counterpart of the Roman-state test: after U+00A5 has been encoded,
// finishing the stream with only 8 bytes of output space is expected to report
// `OutputFull`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
    let mut dst = [0u8; 16];
    let mut encoder = ISO_2022_JP.new_encoder();
    let nothing = [0u16; 0];

    let after_yen = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
    assert_eq!(after_yen.0, CoderResult::InputEmpty);

    let after_empty = encoder.encode_from_utf16(&nothing, &mut dst[..8], false);
    assert_eq!(after_empty.0, CoderResult::InputEmpty);

    let after_finish = encoder.encode_from_utf16(&nothing, &mut dst[..8], true);
    assert_eq!(after_finish.0, CoderResult::OutputFull);
}
5603
// Checks the result reported at the end of the output buffer when encoding
// from UTF-16 to ISO-2022-JP. Every case uses a brand-new encoder and writes
// into the first `capacity` bytes of the shared 18-byte buffer.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf16() {
    let mut dst = [0u8; 18];
    let mut encode = |input: &[u16], capacity: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (result, ..) = encoder.encode_from_utf16(input, &mut dst[..capacity], last);
        result
    };

    // U+00A5 followed by the surrogate pair for U+1F4A9.
    let yen_and_astral = [0xA5u16, 0xD83Du16, 0xDCA9u16];
    // The surrogate pair for U+1F4A9 on its own.
    let astral = [0xD83Du16, 0xDCA9u16];

    assert_eq!(encode(&yen_and_astral, 18, false), CoderResult::InputEmpty);
    assert_eq!(encode(&yen_and_astral, 18, true), CoderResult::OutputFull);
    assert_eq!(encode(&astral, 13, false), CoderResult::InputEmpty);
    assert_eq!(encode(&astral, 13, true), CoderResult::InputEmpty);
}
5632
// Feeds the bytes D8 00 to a UTF-16BE decoder as a non-final chunk and expects
// them to be consumed without producing output. The same bytes are then fed
// again as the final chunk; that result is deliberately ignored, so the call
// only has to complete.
#[test]
fn test_buffer_end_utf16be() {
    let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
    let mut dest = [0u8; 4];
    let chunk = [0xD8u8, 0x00];

    let (result, read, written, replaced) = decoder.decode_to_utf8(&chunk, &mut dest, false);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 2);
    assert_eq!(written, 0);
    assert!(!replaced);

    let _ = decoder.decode_to_utf8(&chunk, &mut dest, true);
}
5645
// Encodings can be stored in and removed from a `BTreeSet`, and membership
// queries distinguish between different encodings.
#[test]
fn test_hash() {
    let mut seen = ::alloc::collections::btree_set::BTreeSet::new();
    for &encoding in [UTF_8, ISO_2022_JP].iter() {
        seen.insert(encoding);
    }
    assert!(seen.contains(UTF_8));
    assert!(seen.contains(ISO_2022_JP));
    assert!(!seen.contains(WINDOWS_1252));

    seen.remove(ISO_2022_JP);
    assert!(!seen.contains(ISO_2022_JP));
}
5657
// Encoding U+3041 followed by U+FFFF as the final chunk into a 17-byte buffer
// is expected to report `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf16() {
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let outcome = encoder.encode_from_utf16(&[0x3041u16, 0xFFFFu16], &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5668
// UTF-8 counterpart: encoding U+3041 followed by U+FFFF as the final chunk
// into a 17-byte buffer is expected to report `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf8() {
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let outcome = encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5679
// Decodes EF BB BF followed by 'A' with a BOM-sniffing decoder created from
// the replacement encoding. The output slice is sized by
// `max_utf8_buffer_length_without_replacement`; the whole input must be
// consumed and exactly one byte, 0x41, written.
#[test]
fn test_max_length_with_bom_to_utf8() {
    let input = b"\xEF\xBB\xBFA";
    let mut output = [0u8; 20];
    let mut decoder = REPLACEMENT.new_decoder();

    let needed = decoder
        .max_utf8_buffer_length_without_replacement(input.len())
        .unwrap();
    let (result, read, written) =
        decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);

    assert_eq!(result, DecoderResult::InputEmpty);
    assert_eq!(read, input.len());
    assert_eq!(written, 1);
    assert_eq!(output[0], 0x41);
}
5697
// Round-trips a `Demo` value that holds an encoding through JSON and through
// bincode, expecting an equal value back each time. Only built when the
// `serde` feature is enabled.
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
    let original = Demo {
        num: 42,
        name: "foo".into(),
        enc: UTF_8,
    };

    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    let encoded = bincode::serialize(&original).unwrap();
    let from_bincode: Demo = bincode::deserialize(&encoded[..]).unwrap();
    assert_eq!(from_bincode, original);
}
5716
// Pins the value of `is_single_byte()` for every encoding listed below: the
// first group must report `false`, the second group `true`.
#[test]
fn test_is_single_byte() {
    // Expands to one assertion per listed encoding so that a failure names the
    // offending constant.
    macro_rules! expect_single_byte {
        ($expected:expr, [$($encoding:ident),+ $(,)*]) => {
            $(
                assert_eq!(
                    $encoding.is_single_byte(),
                    $expected,
                    "unexpected is_single_byte() result for {}",
                    stringify!($encoding)
                );
            )+
        };
    }

    expect_single_byte!(
        false,
        [
            BIG5,
            EUC_JP,
            EUC_KR,
            GB18030,
            GBK,
            REPLACEMENT,
            SHIFT_JIS,
            UTF_8,
            UTF_16BE,
            UTF_16LE,
            ISO_2022_JP,
        ]
    );

    expect_single_byte!(
        true,
        [
            IBM866,
            ISO_8859_2,
            ISO_8859_3,
            ISO_8859_4,
            ISO_8859_5,
            ISO_8859_6,
            ISO_8859_7,
            ISO_8859_8,
            ISO_8859_10,
            ISO_8859_13,
            ISO_8859_14,
            ISO_8859_15,
            ISO_8859_16,
            ISO_8859_8_I,
            KOI8_R,
            KOI8_U,
            MACINTOSH,
            WINDOWS_874,
            WINDOWS_1250,
            WINDOWS_1251,
            WINDOWS_1252,
            WINDOWS_1253,
            WINDOWS_1254,
            WINDOWS_1255,
            WINDOWS_1256,
            WINDOWS_1257,
            WINDOWS_1258,
            X_MAC_CYRILLIC,
            X_USER_DEFINED,
        ]
    );
}
5761
// Pins the value returned by `Decoder::latin1_byte_compatible_up_to` for one
// fixed 7-byte buffer across the encodings listed below, and then checks how
// the answer of a BOM-sniffing UTF-8 decoder changes while input is streamed
// into it.
#[test]
fn test_latin1_byte_compatible_up_to() {
    let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";

    // Result for a fresh decoder of `encoding` that performs no BOM handling.
    let prefix = |encoding: &'static Encoding| {
        encoding
            .new_decoder_without_bom_handling()
            .latin1_byte_compatible_up_to(buffer)
    };

    assert_eq!(prefix(BIG5), Some(1));
    assert_eq!(prefix(EUC_JP), Some(1));
    assert_eq!(prefix(EUC_KR), Some(1));
    assert_eq!(prefix(GB18030), Some(1));
    assert_eq!(prefix(GBK), Some(1));
    assert_eq!(prefix(REPLACEMENT), None);
    assert_eq!(prefix(SHIFT_JIS), Some(1));
    assert_eq!(prefix(UTF_8), Some(1));
    assert_eq!(prefix(UTF_16BE), None);
    assert_eq!(prefix(UTF_16LE), None);
    assert_eq!(prefix(ISO_2022_JP), Some(1));

    assert_eq!(prefix(IBM866), Some(1));
    assert_eq!(prefix(ISO_8859_2), Some(2));
    assert_eq!(prefix(ISO_8859_3), Some(2));
    assert_eq!(prefix(ISO_8859_4), Some(2));
    assert_eq!(prefix(ISO_8859_5), Some(2));
    assert_eq!(prefix(ISO_8859_6), Some(2));
    assert_eq!(prefix(ISO_8859_7), Some(2));
    assert_eq!(prefix(ISO_8859_8), Some(3));
    assert_eq!(prefix(ISO_8859_10), Some(2));
    assert_eq!(prefix(ISO_8859_13), Some(4));
    assert_eq!(prefix(ISO_8859_14), Some(4));
    assert_eq!(prefix(ISO_8859_15), Some(6));
    assert_eq!(prefix(ISO_8859_16), Some(4));
    assert_eq!(prefix(ISO_8859_8_I), Some(3));
    assert_eq!(prefix(KOI8_R), Some(1));
    assert_eq!(prefix(KOI8_U), Some(1));
    assert_eq!(prefix(MACINTOSH), Some(1));
    assert_eq!(prefix(WINDOWS_874), Some(2));
    assert_eq!(prefix(WINDOWS_1250), Some(4));
    assert_eq!(prefix(WINDOWS_1251), Some(1));
    assert_eq!(prefix(WINDOWS_1252), Some(5));
    assert_eq!(prefix(WINDOWS_1253), Some(3));
    assert_eq!(prefix(WINDOWS_1254), Some(4));
    assert_eq!(prefix(WINDOWS_1255), Some(3));
    assert_eq!(prefix(WINDOWS_1256), Some(1));
    assert_eq!(prefix(WINDOWS_1257), Some(4));
    assert_eq!(prefix(WINDOWS_1258), Some(4));
    assert_eq!(prefix(X_MAC_CYRILLIC), Some(1));
    assert_eq!(prefix(X_USER_DEFINED), Some(1));

    // A BOM-sniffing UTF-8 decoder that has not seen any input yet is expected
    // to answer `None`.
    let untouched = UTF_8.new_decoder();
    assert!(untouched.latin1_byte_compatible_up_to(buffer).is_none());

    // Stream EF, then BB BF, then EF again and check the answer after each
    // step; the decode results themselves are not inspected.
    let mut decoder = UTF_8.new_decoder();
    let mut output = [0u16; 4];

    let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
    assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());

    let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
    assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));

    let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
    assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
}
6050 }
6051