1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 #![cfg_attr(
11 feature = "cargo-clippy",
12 allow(doc_markdown, inline_always, new_ret_no_self)
13 )]
14 #![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.20")]
15
16 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
17 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
18 //! Gecko-oriented means that converting to and from UTF-16 is supported in
19 //! addition to converting to and from UTF-8, that the performance and
20 //! streamability goals are browser-oriented, and that FFI-friendliness is a
21 //! goal.
22 //!
23 //! Additionally, the `mem` module provides functions that are useful for
24 //! applications that need to be able to deal with legacy in-memory
25 //! representations of Unicode.
26 //!
27 //! For expectation setting, please be sure to read the sections
28 //! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
29 //! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
30 //!
31 //! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
32 //! design and internals of the crate.
33 //!
34 //! # Availability
35 //!
36 //! The code is available under the
37 //! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
38 //! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
39 //! See the
40 //! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
41 //! file for details.
42 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
43 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
44 //!
45 //! # Integration with `std::io`
46 //!
47 //! This crate doesn't implement traits from `std::io`. However, the case of
48 //! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
49 //! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
50 //! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
51 //!
52 //! # Examples
53 //!
54 //! Example programs:
55 //!
56 //! * [Rust](https://github.com/hsivonen/recode_rs)
57 //! * [C](https://github.com/hsivonen/recode_c)
58 //! * [C++](https://github.com/hsivonen/recode_cpp)
59 //!
60 //! Decode using the non-streaming API:
61 //!
62 //! ```
63 //! use encoding_rs::*;
64 //!
65 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
66 //! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
67 //!
68 //! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
69 //! assert_eq!(&cow[..], expectation);
70 //! assert_eq!(encoding_used, SHIFT_JIS);
71 //! assert!(!had_errors);
72 //! ```
73 //!
74 //! Decode using the streaming API with minimal `unsafe`:
75 //!
76 //! ```
77 //! use encoding_rs::*;
78 //!
79 //! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
80 //!
81 //! // Use an array of byte slices to demonstrate content arriving piece by
82 //! // piece from the network.
83 //! let bytes: [&'static [u8]; 4] = [b"\x83",
84 //! b"n\x83\x8D\x81",
85 //! b"[\x81E\x83\x8F\x81[\x83",
86 //! b"\x8B\x83h"];
87 //!
88 //! // Very short output buffer to demonstrate the output buffer getting full.
89 //! // Normally, you'd use something like `[0u8; 2048]`.
90 //! let mut buffer_bytes = [0u8; 8];
91 //! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
92 //!
93 //! // How many bytes in the buffer currently hold significant data.
94 //! let mut bytes_in_buffer = 0usize;
95 //!
96 //! // Collect the output to a string for demonstration purposes.
97 //! let mut output = String::new();
98 //!
99 //! // The `Decoder`
100 //! let mut decoder = SHIFT_JIS.new_decoder();
101 //!
102 //! // Track whether we see errors.
103 //! let mut total_had_errors = false;
104 //!
105 //! // Decode using a fixed-size intermediate buffer (for demonstrating the
106 //! // use of a fixed-size buffer; normally when the output of an incremental
107 //! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
108 //! // avoid the intermediate buffer).
109 //! for input in &bytes[..] {
110 //! // The number of bytes already read from current `input` in total.
111 //! let mut total_read_from_current_input = 0usize;
112 //!
113 //! loop {
114 //! let (result, read, written, had_errors) =
115 //! decoder.decode_to_str(&input[total_read_from_current_input..],
116 //! &mut buffer[bytes_in_buffer..],
117 //! false);
118 //! total_read_from_current_input += read;
119 //! bytes_in_buffer += written;
120 //! total_had_errors |= had_errors;
121 //! match result {
122 //! CoderResult::InputEmpty => {
123 //! // We have consumed the current input buffer. Break out of
124 //! // the inner loop to get the next input buffer from the
125 //! // outer loop.
126 //! break;
127 //! },
128 //! CoderResult::OutputFull => {
129 //! // Write the current buffer out and consider the buffer
130 //! // empty.
131 //! output.push_str(&buffer[..bytes_in_buffer]);
132 //! bytes_in_buffer = 0usize;
133 //! continue;
134 //! }
135 //! }
136 //! }
137 //! }
138 //!
139 //! // Process EOF
140 //! loop {
141 //! let (result, _, written, had_errors) =
142 //! decoder.decode_to_str(b"",
143 //! &mut buffer[bytes_in_buffer..],
144 //! true);
145 //! bytes_in_buffer += written;
146 //! total_had_errors |= had_errors;
147 //! // Write the current buffer out and consider the buffer empty.
148 //! // Need to do this here for both `match` arms, because we exit the
149 //! // loop on `CoderResult::InputEmpty`.
150 //! output.push_str(&buffer[..bytes_in_buffer]);
151 //! bytes_in_buffer = 0usize;
152 //! match result {
153 //! CoderResult::InputEmpty => {
154 //! // Done!
155 //! break;
156 //! },
157 //! CoderResult::OutputFull => {
158 //! continue;
159 //! }
160 //! }
161 //! }
162 //!
163 //! assert_eq!(&output[..], expectation);
164 //! assert!(!total_had_errors);
165 //! ```
166 //!
167 //! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
168 //!
169 //! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
170 //! __so this crate does not provide encoders for those encodings__!
171 //! Along with the replacement encoding, their _output encoding_ is UTF-8,
172 //! so you get a UTF-8 encoder if you request an encoder for them.
173 //!
174 //! Additionally, the Encoding Standard factors BOM handling into wrapper
175 //! algorithms so that BOM handling isn't part of the definition of the
176 //! encodings themselves. The Unicode _encoding schemes_ in the Unicode
177 //! Standard define BOM handling or lack thereof as part of the encoding
178 //! scheme.
179 //!
180 //! When used with the `_without_bom_handling` entry points, the UTF-16LE
181 //! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
182 //! the Unicode Standard.
183 //!
184 //! When used with the `_with_bom_removal` entry points, the UTF-8
185 //! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
186 //! Standard.
187 //!
188 //! This crate does not provide a mode that matches the UTF-16 _encoding
189 //! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
190 //! the entry points without `_bom_` qualifiers is the closest match,
191 //! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
192 //! not part of the behavior of the UTF-16 _encoding scheme_ per the
193 //! Unicode Standard.
194 //!
195 //! The UTF-32 family of Unicode encoding schemes is not supported
196 //! by this crate. The Encoding Standard doesn't define any UTF-32
197 //! family encodings, since they aren't necessary for consuming Web
198 //! content.
199 //!
200 //! ## ISO-8859-1
201 //!
202 //! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
203 //! the Encoding Standard. Therefore, an encoding that maps the unsigned
204 //! byte value to the same Unicode scalar value is not available via
205 //! `Encoding` in this crate.
206 //!
207 //! However, the functions whose name starts with `convert` and contains
208 //! `latin1` in the `mem` module support such conversions, which are known as
209 //! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
210 //! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
211 //! in the [Infra Standard](https://infra.spec.whatwg.org/).
212 //!
213 //! ## Web / Browser Focus
214 //!
215 //! Both in terms of scope and performance, the focus is on the Web. For scope,
216 //! this means that encoding_rs implements the Encoding Standard fully and
217 //! doesn't implement encodings that are not specified in the Encoding
218 //! Standard. For performance, this means that decoding performance is
219 //! important as well as performance for encoding into UTF-8 or encoding the
220 //! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
221 //! be encoded into legacy encodings in only two places in the Web platform: in
222 //! the query part of URLs, in which case it's a matter of relatively rare
223 //! error handling, and in form submission, in which case the user action and
224 //! networking tend to hide the performance of the encoder.
225 //!
226 //! Deemphasizing performance of encoding non-Basic Latin text into legacy
227 //! encodings enables smaller code size thanks to the encoder side using the
228 //! decode-optimized data tables without having encode-optimized data tables at
229 //! all. Even in decoders, smaller lookup table size is preferred over avoiding
230 //! multiplication operations.
231 //!
232 //! Additionally, performance is a non-goal for the ASCII-incompatible
233 //! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
234 //! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
235 //! of implementation.
236 //!
237 //! Despite the browser focus, the hope is that non-browser applications
238 //! that wish to consume Web content or submit Web forms in a Web-compatible
239 //! way will find encoding_rs useful. While encoding_rs does not try to match
240 //! Windows behavior, many of the encodings are close enough to legacy
241 //! encodings implemented by Windows that applications that need to consume
242 //! data in legacy Windows encodings may find encoding_rs useful. The
243 //! [codepage](https://crates.io/crates/codepage) crate maps from Windows
244 //! code page identifiers onto encoding_rs `Encoding`s and vice versa.
245 //!
246 //! For decoding email, UTF-7 support is needed (unfortunately) in addition
247 //! to the encodings defined in the Encoding Standard. The
248 //! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
249 //! UTF-7 decoding for email purposes.
250 //!
251 //! # Preparing Text for the Encoders
252 //!
253 //! Normalizing text into Unicode Normalization Form C prior to encoding text
254 //! into a legacy encoding minimizes unmappable characters. Text can be
255 //! normalized to Unicode Normalization Form C using the
256 //! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
257 //!
258 //! The exception is windows-1258, which after normalizing to Unicode
259 //! Normalization Form C requires tone marks to be decomposed in order to
260 //! minimize unmappable characters. Vietnamese tone marks can be decomposed
261 //! using the [`detone`](https://crates.io/crates/detone) crate.
262 //!
263 //! # Streaming & Non-Streaming; Rust & C/C++
264 //!
265 //! The API in Rust has two modes of operation: streaming and non-streaming.
266 //! The streaming API is the foundation of the implementation and should be
267 //! used when processing data that arrives piecemeal from an i/o stream. The
268 //! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
269 //! to C callers. The non-streaming part of the API is for Rust callers only and
270 //! is smart about borrowing instead of copying when possible. When
271 //! streamability is not needed, the non-streaming API should be preferred in
272 //! order to avoid copying data when a borrow suffices.
273 //!
274 //! There is no analogous C API exposed via FFI, mainly because C doesn't have
275 //! standard types for growable byte buffers and Unicode strings that know
276 //! their length.
277 //!
278 //! The C API (header file generated at `target/include/encoding_rs.h` when
279 //! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
280 //! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
281 //! The C binding comes with a [C++14 wrapper][2] that uses standard library +
282 //! [GSL][3] types and that recreates the non-streaming API in C++ on top of
283 //! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
284 //! as part of Mozilla [bug 1261841][4].
285 //!
286 //! The `Encoding` type is common to both the streaming and non-streaming
287 //! modes. In the streaming mode, decoding operations are performed with a
288 //! `Decoder` and encoding operations with an `Encoder` object obtained via
289 //! `Encoding`. In the non-streaming mode, decoding and encoding operations are
290 //! performed using methods on `Encoding` objects themselves, so the `Decoder`
291 //! and `Encoder` objects are not used at all.
292 //!
293 //! [1]: https://github.com/hsivonen/encoding_c
294 //! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
295 //! [3]: https://github.com/Microsoft/GSL/
296 //! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=1261841
297 //!
298 //! # Memory management
299 //!
300 //! The streaming mode never performs heap allocations (even the methods
301 //! that write into a `Vec<u8>` or a `String` by taking them as arguments do
302 //! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
303 //! is, the streaming mode uses caller-allocated buffers exclusively.
304 //!
305 //! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
306 //! perform heap allocations but only to allocate the backing buffer of the
307 //! `Vec<u8>` or the `String`.
308 //!
309 //! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
310 //! `Drop` cleanup.
311 //!
312 //! # Buffer reading and writing behavior
313 //!
314 //! Based on experience gained with the `java.nio.charset` encoding converter
315 //! API and with the Gecko uconv encoding converter API, the buffer reading
316 //! and writing behaviors of encoding_rs are asymmetric: input buffers are
317 //! fully drained but output buffers are not always fully filled.
318 //!
319 //! When reading from an input buffer, encoding_rs always consumes all input
320 //! up to the next error or to the end of the buffer. In particular, when
321 //! decoding, even if the input buffer ends in the middle of a byte sequence
322 //! for a character, the decoder consumes all input. This has the benefit that
323 //! the caller of the API can always fill the next buffer from the start from
324 //! whatever source the bytes come from and never has to first copy the last
325 //! bytes of the previous buffer to the start of the next buffer. However, when
326 //! encoding, the UTF-8 input buffers have to end at a character boundary, which
327 //! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
328 //! boundaries falling in the middle of a surrogate pair result in both
329 //! surrogates being treated individually as unpaired surrogates.
330 //!
331 //! Additionally, decoders guarantee that they can be fed even one byte at a
332 //! time and encoders guarantee that they can be fed even one code point at a
333 //! time. This has the benefit of not placing restrictions on the size of
334 //! chunks in which the content arrives, e.g. from the network.
335 //!
336 //! When writing into an output buffer, encoding_rs makes sure that the code
337 //! unit sequence for a character is never split across output buffer
338 //! boundaries. This may result in wasted space at the end of an output buffer,
339 //! but the advantages are that the output side of both decoders and encoders
340 //! is greatly simplified compared to designs that attempt to fill output
341 //! buffers exactly even when that entails splitting a code unit sequence and
342 //! when encoding_rs methods return to the caller, the output produced thus
343 //! far is always valid taken as whole. (In the case of encoding to ISO-2022-JP,
344 //! the output needs to be considered as a whole, because the latest output
345 //! buffer taken alone might not be valid if the transition away
346 //! from the ASCII state occurred in an earlier output buffer. However, since
347 //! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
348 //! state as being in error despite the encoder generating a transition to the
349 //! ASCII state at the end, the claim about the partial output taken as a whole
350 //! being valid is true even for ISO-2022-JP.)
351 //!
352 //! # Error Reporting
353 //!
354 //! Based on experience gained with the `java.nio.charset` encoding converter
355 //! API and with the Gecko uconv encoding converter API, the error reporting
356 //! behaviors of encoding_rs are asymmetric: decoder errors include offsets
357 //! that leave it up to the caller to extract the erroneous bytes from the
358 //! input stream if the caller wishes to do so but encoder errors provide the
359 //! code point associated with the error without requiring the caller to
360 //! extract it from the input on its own.
361 //!
362 //! On the encoder side, an error is always triggered by the most recently
363 //! pushed Unicode scalar, which makes it simple to pass the `char` to the
364 //! caller. Also, it's very typical for the caller to wish to do something with
365 //! this data: generate a numeric escape for the character. Additionally, the
366 //! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
367 //! certain cases, so requiring the caller to extract the character from the
368 //! input buffer would require the caller to handle ISO-2022-JP details.
369 //! Furthermore, requiring the caller to extract the character from the input
370 //! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
371 //! the job of an encoding conversion library.
372 //!
373 //! On the decoder side, errors are triggered in more complex ways. For
374 //! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
375 //! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
376 //! the buffer boundary when processing 'A'. Thus, the bytes in error might not
377 //! be the ones most recently pushed to the decoder and the error might not even
378 //! be in the current buffer.
379 //!
380 //! Some encoding conversion APIs address the problem by not acknowledging
381 //! trailing bytes of an input buffer as consumed if it's still possible for
382 //! future bytes to cause the trailing bytes to be in error. This way, error
383 //! reporting can always refer to the most recently pushed buffer. This has the
384 //! problem that the caller of the API has to copy the unconsumed trailing
385 //! bytes to the start of the next buffer before being able to fill the rest
386 //! of the next buffer. This is annoying, error-prone and inefficient.
387 //!
388 //! A possible solution would be making the decoder remember recently consumed
389 //! bytes in order to be able to include a copy of the erroneous bytes when
390 //! reporting an error. This has two problems: First, callers are rarely
391 //! interested in the erroneous bytes, so attempts to identify them are most
392 //! often just overhead anyway. Second, the rare applications that are
393 //! interested typically care about the location of the error in the input
394 //! stream.
395 //!
396 //! To keep the API convenient for common uses and the overhead low while making
397 //! it possible to develop applications, such as HTML validators, that care
398 //! about which bytes were in error, encoding_rs reports the length of the
399 //! erroneous sequence and the number of bytes consumed after the erroneous
400 //! sequence. As long as the caller doesn't discard the 6 most recent bytes,
401 //! this makes it possible for callers that care about the erroneous bytes to
402 //! locate them.
403 //!
404 //! # No Convenience API for Custom Replacements
405 //!
406 //! The Web Platform and, therefore, the Encoding Standard supports only one
407 //! error recovery mode for decoders and only one error recovery mode for
408 //! encoders. The supported error recovery mode for decoders is emitting the
409 //! REPLACEMENT CHARACTER on error. The supported error recovery mode for
410 //! encoders is emitting an HTML decimal numeric character reference for
411 //! unmappable characters.
412 //!
413 //! Since encoding_rs is Web-focused, these are the only error recovery modes
414 //! for which convenient support is provided. Moreover, on the decoder side,
415 //! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
416 //! on error (other than treating errors as fatal). In particular, simply
417 //! ignoring errors is a
418 //! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
419 //! so it would be a bad idea for encoding_rs to provide a mode that encouraged
420 //! callers to ignore errors.
421 //!
422 //! On the encoder side, there are plausible alternatives for HTML decimal
423 //! numeric character references. For example, when outputting CSS, CSS-style
424 //! escapes would seem to make sense. However, instead of facilitating the
425 //! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
426 //! position that you shouldn't generate output in encodings other than UTF-8,
427 //! except where backward compatibility with interacting with the legacy Web
428 //! requires it. The legacy Web requires it only when parsing the query strings
429 //! of URLs and when submitting forms, and those two both use HTML decimal
430 //! numeric character references.
431 //!
432 //! While encoding_rs doesn't make encoder replacements other than HTML decimal
433 //! numeric character references easy, it does make them _possible_.
434 //! `encode_from_utf8()`, which emits HTML decimal numeric character references
435 //! for unmappable characters, is implemented on top of
436 //! `encode_from_utf8_without_replacement()`. Applications that really, really
437 //! want other replacement schemes for unmappable characters can likewise
438 //! implement them on top of `encode_from_utf8_without_replacement()`.
439 //!
440 //! # No Extensibility by Design
441 //!
442 //! The set of encodings supported by encoding_rs is not extensible by design.
443 //! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
444 //! rather than `trait`s. encoding_rs takes the design position that all future
445 //! text interchange should be done using UTF-8, which can represent all of
446 //! Unicode. (It is, in fact, the only encoding supported by the Encoding
447 //! Standard and encoding_rs that can represent all of Unicode and that has
448 //! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
449 //! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
450 //! legacy compatibility and not due to non-UTF-8 encodings having benefits
451 //! other than being able to consume legacy content.
452 //!
453 //! Considering that UTF-8 can represent all of Unicode and is already supported
454 //! by all Web browsers, introducing a new encoding wouldn't add to the
455 //! expressiveness but would add to compatibility problems. In that sense,
456 //! adding new encodings to the Web Platform doesn't make sense, and, in fact,
457 //! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
458 //! the Web Platform. On the other hand, the set of legacy encodings that must
459 //! be supported for a Web browser to be able to be successful is not going to
460 //! expand. Empirically, the set of encodings specified in the Encoding Standard
461 //! is already sufficient and the set of legacy encodings won't grow
462 //! retroactively.
463 //!
464 //! Since extensibility doesn't make sense considering the Web focus of
465 //! encoding_rs and adding encodings to Web clients would be actively harmful,
466 //! it makes sense to make the set of encodings that encoding_rs supports
467 //! non-extensible and to take the (admittedly small) benefits arising from
468 //! that, such as the size of `Decoder` and `Encoder` objects being known ahead
469 //! of time, which enables stack allocation thereof.
470 //!
471 //! This does have downsides for applications that might want to put encoding_rs
472 //! to non-Web uses if those non-Web uses involve legacy encodings that aren't
473 //! needed for Web uses. The needs of such applications should not complicate
474 //! encoding_rs itself, though. It is up to those applications to provide a
475 //! framework that delegates the operations with encodings that encoding_rs
476 //! supports to encoding_rs and operations with other encodings to something
477 //! else (as opposed to encoding_rs itself providing an extensibility
478 //! framework).
479 //!
480 //! # Panics
481 //!
482 //! Methods in encoding_rs can panic if the API is used against the requirements
483 //! stated in the documentation, if a state that's supposed to be impossible
484 //! is reached due to an internal bug or on integer overflow. When used
485 //! according to documentation with buffer sizes that stay below integer
486 //! overflow, in the absence of internal bugs, encoding_rs does not panic.
487 //!
488 //! Panics arising from API misuse aren't documented beyond this on individual
489 //! methods.
490 //!
491 //! # At-Risk Parts of the API
492 //!
493 //! The foreseeable source of partially backward-incompatible API change is the
494 //! way the instances of `Encoding` are made available.
495 //!
496 //! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
497 //! initialized with `static`s of type `&'static Encoding`, the non-reference
498 //! `FOO_INIT` public `Encoding` instances will be removed from the public API.
499 //!
500 //! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
501 //! unique when the constant is used in different crates, the reference-typed
502 //! `static`s for the encoding instances will be changed from `static` to
503 //! `const` and the non-reference-typed `_INIT` instances will be removed.
504 //!
505 //! # Mapping Spec Concepts onto the API
506 //!
507 //! <table>
508 //! <thead>
509 //! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
510 //! </thead>
511 //! <tbody>
512 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&'static Encoding</code></td><td><code>&'static Encoding</code></td></tr>
513 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
514 //! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
515 //! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
516 //! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
517 //! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
518 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
519 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
520 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// … (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
521 //! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
522 //! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// …</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
523 //! </tbody>
524 //! </table>
525 //!
526 //! # Compatibility with the rust-encoding API
527 //!
528 //! The crate
529 //! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
530 //! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
531 //! the API of rust-encoding 0.2.32 on top of encoding_rs.
532 //!
533 //! # Mapping rust-encoding concepts to encoding_rs concepts
534 //!
535 //! The following table provides a mapping from rust-encoding constructs to
536 //! encoding_rs ones.
537 //!
538 //! <table>
539 //! <thead>
540 //! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
541 //! </thead>
542 //! <tbody>
543 //! <tr><td><code>encoding::EncodingRef</code></td><td><code>&'static encoding_rs::Encoding</code></td></tr>
544 //! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
545 //! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
546 //! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
547 //! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
548 //! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
549 //! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
550 //! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
551 //! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
552 //! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
553 //! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
554 //! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
555 //! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
556 //! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
557 //! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
558 //! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
559 //! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
560 //! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
564 //! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
565 //! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
566 //! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
568 //! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
569 //! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
570 //! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
571 //! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
572 //! </tbody>
573 //! </table>
574 //!
575 //! # Relationship with Windows Code Pages
576 //!
577 //! Despite the Web and browser focus, the encodings defined by the Encoding
578 //! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the encodings
//! that have a closely related Windows code page, the number of the closest
//! code page, a column indicating whether Windows maps unassigned code points
//! to the Unicode Private Use Area instead of U+FFFD and a remark number
//! indicating remarks in the list after the table.
585 //!
586 //! <table>
587 //! <thead>
588 //! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
589 //! </thead>
590 //! <tbody>
591 //! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
592 //! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
593 //! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
594 //! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
595 //! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
596 //! <tr><td>windows-874</td><td>874</td><td>•</td><td></td></tr>
597 //! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
598 //! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
599 //! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
600 //! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
601 //! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
602 //! <tr><td>windows-1253</td><td>1253</td><td>•</td><td></td></tr>
603 //! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
604 //! <tr><td>windows-1255</td><td>1255</td><td>•</td><td></td></tr>
605 //! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
606 //! <tr><td>windows-1257</td><td>1257</td><td>•</td><td></td></tr>
607 //! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
608 //! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
609 //! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
610 //! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
611 //! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
612 //! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
613 //! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
614 //! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
615 //! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
616 //! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
617 //! <tr><td>ISO-8859-6</td><td>28596</td><td>•</td><td></td></tr>
618 //! <tr><td>ISO-8859-7</td><td>28597</td><td>•</td><td>3</td></tr>
619 //! <tr><td>ISO-8859-8</td><td>28598</td><td>•</td><td>4</td></tr>
620 //! <tr><td>ISO-8859-13</td><td>28603</td><td>•</td><td></td></tr>
621 //! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
622 //! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
623 //! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
624 //! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
625 //! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
626 //! </tbody>
627 //! </table>
628 //!
629 //! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
630 //! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
631 //! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
632 //! which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
633 //! decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
634 //! LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
635 //! instead of U+2019 RIGHT SINGLE QUOTATION MARK.
//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to PUA instead
//! of LRM and RLM, respectively.
638 //! 5. Remarks from the previous item apply.
639 //!
640 //! The differences between this crate and Windows in the case of multibyte encodings
641 //! are not yet fully documented here. The lack of remarks above should not be taken
642 //! as indication of lack of differences.
643 //!
644 //! # Notable Differences from IANA Naming
645 //!
646 //! In some cases, the Encoding Standard specifies the popular unextended encoding
647 //! name where in IANA terms one of the other labels would be more precise considering
648 //! the extensions that the Encoding Standard has unified into the encoding.
649 //!
650 //! <table>
651 //! <thead>
652 //! <tr><th>Encoding</th><th>IANA</th></tr>
653 //! </thead>
654 //! <tbody>
655 //! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
656 //! <tr><td>EUC-KR</td><td>windows-949</td></tr>
657 //! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
658 //! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
659 //! </tbody>
660 //! </table>
661 //!
662 //! In other cases where the Encoding Standard unifies unextended and extended
663 //! variants of an encoding, the encoding gets the name of the extended
664 //! variant.
665 //!
666 //! <table>
667 //! <thead>
668 //! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
669 //! </thead>
670 //! <tbody>
671 //! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
672 //! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
673 //! <tr><td>TIS-620</td><td>windows-874</td></tr>
674 //! </tbody>
675 //! </table>
676 //!
677 //! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
678 //! for discussion about the UTF-16 family.
679
680 #![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
681
682 #[macro_use]
683 extern crate cfg_if;
684
685 #[cfg(all(
686 feature = "simd-accel",
687 any(
688 target_feature = "sse2",
689 all(target_endian = "little", target_arch = "aarch64"),
690 all(target_endian = "little", target_feature = "neon")
691 )
692 ))]
693 #[macro_use(shuffle)]
694 extern crate packed_simd;
695
696 #[cfg(feature = "serde")]
697 extern crate serde;
698
699 #[cfg(all(test, feature = "serde"))]
700 extern crate bincode;
701 #[cfg(all(test, feature = "serde"))]
702 #[macro_use]
703 extern crate serde_derive;
704 #[cfg(all(test, feature = "serde"))]
705 extern crate serde_json;
706
707 #[macro_use]
708 mod macros;
709
710 #[cfg(all(
711 feature = "simd-accel",
712 any(
713 target_feature = "sse2",
714 all(target_endian = "little", target_arch = "aarch64"),
715 all(target_endian = "little", target_feature = "neon")
716 )
717 ))]
718 mod simd_funcs;
719
720 #[cfg(test)]
721 mod testing;
722
723 mod big5;
724 mod euc_jp;
725 mod euc_kr;
726 mod gb18030;
727 mod iso_2022_jp;
728 mod replacement;
729 mod shift_jis;
730 mod single_byte;
731 mod utf_16;
732 mod utf_8;
733 mod x_user_defined;
734
735 mod ascii;
736 mod data;
737 mod handles;
738 mod variant;
739
740 pub mod mem;
741
742 use ascii::ascii_valid_up_to;
743 use ascii::iso_2022_jp_ascii_valid_up_to;
744 use utf_8::utf8_valid_up_to;
745 use variant::*;
746
747 use std::borrow::Cow;
748 use std::cmp::Ordering;
749 use std::hash::Hash;
750 use std::hash::Hasher;
751
752 #[cfg(feature = "serde")]
753 use serde::de::Visitor;
754 #[cfg(feature = "serde")]
755 use serde::{Deserialize, Deserializer, Serialize, Serializer};
756
/// This has to be the max length of an NCR instead of max
/// minus one, because we can't rely on getting the minus
/// one from the space reserved for the current unmappable,
/// because the ISO-2022-JP encoder can fill up that space
/// with a state transition escape.
const NCR_EXTRA: usize = 10; // `&#1114111;`, the NCR for U+10FFFF
763
764 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
765 // Instead, please regenerate using generate-encoding-data.py
766
/// Length, in bytes, of the longest label; the trailing comment names the
/// label that has this length.
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
768
/// The initializer for the [Big5](static.BIG5.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`BIG5`](static.BIG5.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static BIG5_INIT: Encoding = Encoding {
    name: "Big5",
    variant: VariantEncoding::Big5,
};
785
/// The Big5 encoding.
///
/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
/// instead of the Private Use Area code points that have been used historically.
/// It is believed to be able to decode existing Web content in a way that makes
/// sense.
///
/// To avoid form submissions generating data that Web servers don't understand,
/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
/// Big5 in the lexical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
///
/// This encoding is designed to be suited for decoding the Windows code page 950
/// and its HKSCS patched "951" variant such that the text makes sense, given
/// assignments that Unicode has made after those encodings used Private Use
/// Area characters.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`BIG5_INIT`](static.BIG5_INIT.html).
pub static BIG5: &'static Encoding = &BIG5_INIT;
810
/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`EUC_JP`](static.EUC_JP.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static EUC_JP_INIT: Encoding = Encoding {
    name: "EUC-JP",
    variant: VariantEncoding::EucJp,
};
827
/// The EUC-JP encoding.
///
/// This is the legacy Unix encoding for Japanese.
///
/// For compatibility with Web servers that don't expect three-byte sequences
/// in form submissions, the encoder doesn't generate three-byte sequences.
/// That is, the JIS X 0212 support is decode-only.
///
/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
///
/// This encoding roughly matches the Windows code page 20932. There are error
/// handling differences and a handful of 2-byte sequences that decode differently.
/// Additionally, Windows doesn't support 3-byte sequences.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`EUC_JP_INIT`](static.EUC_JP_INIT.html).
pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
848
/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`EUC_KR`](static.EUC_KR.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static EUC_KR_INIT: Encoding = Encoding {
    name: "EUC-KR",
    variant: VariantEncoding::EucKr,
};
865
/// The EUC-KR encoding.
///
/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
/// Classic), with all the characters from the Hangul Syllables block of Unicode.
///
/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
///
/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
/// the question mark or the Private Use Area.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`EUC_KR_INIT`](static.EUC_KR_INIT.html).
pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
884
/// The initializer for the [GBK](static.GBK.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`GBK`](static.GBK.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GBK_INIT: Encoding = Encoding {
    name: "GBK",
    variant: VariantEncoding::Gbk,
};
901
/// The GBK encoding.
///
/// The decoder for this encoding is the same as the decoder for gb18030.
/// The encoder side of this encoding is GBK with Windows code page 936 euro
/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
/// Unicode block as well as a handful of ideographs from the CJK Unified
/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
///
/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, the GBK encoder
/// wasn't unified with the gb18030 encoder in the Encoding Standard out of
/// concern that servers that expect GBK form submissions might not be able to
/// handle the four-byte sequences.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// The encoder of this encoding roughly matches the Windows code page 936.
/// The decoder side is a superset.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`GBK_INIT`](static.GBK_INIT.html).
pub static GBK: &'static Encoding = &GBK_INIT;
926
/// The initializer for the [IBM866](static.IBM866.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`IBM866`](static.IBM866.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static IBM866_INIT: Encoding = Encoding {
    name: "IBM866",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
};
943
/// The IBM866 encoding.
///
/// This is the most notable one of the DOS Cyrillic code pages. It has the same
/// box drawing characters as code page 437, so it can be used for decoding
/// DOS-era ASCII + box drawing data.
///
/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
///
/// This encoding matches the Windows code page 866.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`IBM866_INIT`](static.IBM866_INIT.html).
pub static IBM866: &'static Encoding = &IBM866_INIT;
960
/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`ISO_2022_JP`](static.ISO_2022_JP.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_2022_JP_INIT: Encoding = Encoding {
    name: "ISO-2022-JP",
    variant: VariantEncoding::Iso2022Jp,
};
977
/// The ISO-2022-JP encoding.
///
/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
/// byte range to encode non-Basic Latin characters. It's the only encoding
/// supported by this crate whose encoder is stateful.
///
/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
///
/// This encoding roughly matches the Windows code page 50220. Notably, Windows
/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
/// error handling.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`ISO_2022_JP_INIT`](static.ISO_2022_JP_INIT.html).
pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
996
/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`ISO_8859_10`](static.ISO_8859_10.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_10_INIT: Encoding = Encoding {
    name: "ISO-8859-10",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
};
1013
/// The ISO-8859-10 encoding.
///
/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 6.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
///
/// The Windows code page number for this encoding is 28600, but kernel32.dll
/// does not support this encoding.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`ISO_8859_10_INIT`](static.ISO_8859_10_INIT.html).
pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1030
/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`ISO_8859_13`](static.ISO_8859_13.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_13_INIT: Encoding = Encoding {
    name: "ISO-8859-13",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
};
1047
/// The ISO-8859-13 encoding.
///
/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 7.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
///
/// This encoding matches the Windows code page 28603, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`ISO_8859_13_INIT`](static.ISO_8859_13_INIT.html).
pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1064
/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`ISO_8859_14`](static.ISO_8859_14.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_14_INIT: Encoding = Encoding {
    name: "ISO-8859-14",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
};
1081
/// The ISO-8859-14 encoding.
///
/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 8.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
///
/// The Windows code page number for this encoding is 28604, but kernel32.dll
/// does not support this encoding.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`ISO_8859_14_INIT`](static.ISO_8859_14_INIT.html).
pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1098
/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`ISO_8859_15`](static.ISO_8859_15.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_15_INIT: Encoding = Encoding {
    name: "ISO-8859-15",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
};
1115
/// The ISO-8859-15 encoding.
///
/// This is the revised Western European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 9.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
///
/// This encoding matches the Windows code page 28605.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`ISO_8859_15_INIT`](static.ISO_8859_15_INIT.html).
pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1131
/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`ISO_8859_16`](static.ISO_8859_16.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_16_INIT: Encoding = Encoding {
    name: "ISO-8859-16",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
};
1148
/// The ISO-8859-16 encoding.
///
/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 10.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
///
/// The Windows code page number for this encoding is 28606, but kernel32.dll
/// does not support this encoding.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`. Where an address is needed, such as in the initializer
/// of another `static`, use [`ISO_8859_16_INIT`](static.ISO_8859_16_INIT.html).
pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1165
/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`,
/// [`ISO_8859_2`](static.ISO_8859_2.html).
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_2_INIT: Encoding = Encoding {
    name: "ISO-8859-2",
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
};
1182
/// The ISO-8859-2 encoding.
///
/// This is the Central European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 2.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
///
/// This encoding matches the Windows code page 28592.
///
/// The referent of this `static` is
/// [`ISO_8859_2_INIT`](static.ISO_8859_2_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1197
/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_3_INIT: Encoding = Encoding {
    name: "ISO-8859-3",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
};
1214
/// The ISO-8859-3 encoding.
///
/// This is the South European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 3.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
///
/// This encoding matches the Windows code page 28593.
///
/// The referent of this `static` is
/// [`ISO_8859_3_INIT`](static.ISO_8859_3_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1229
/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_4_INIT: Encoding = Encoding {
    name: "ISO-8859-4",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
};
1246
/// The ISO-8859-4 encoding.
///
/// This is the North European part of the ISO/IEC 8859 encoding family.
/// This encoding is also known as Latin 4.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
///
/// This encoding matches the Windows code page 28594.
///
/// The referent of this `static` is
/// [`ISO_8859_4_INIT`](static.ISO_8859_4_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1261
/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_5_INIT: Encoding = Encoding {
    name: "ISO-8859-5",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
};
1278
/// The ISO-8859-5 encoding.
///
/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
///
/// This encoding matches the Windows code page 28595.
///
/// The referent of this `static` is
/// [`ISO_8859_5_INIT`](static.ISO_8859_5_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1293
/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_6_INIT: Encoding = Encoding {
    name: "ISO-8859-6",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
};
1310
/// The ISO-8859-6 encoding.
///
/// This is the Arabic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
///
/// This encoding matches the Windows code page 28596, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// The referent of this `static` is
/// [`ISO_8859_6_INIT`](static.ISO_8859_6_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1326
/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_7_INIT: Encoding = Encoding {
    name: "ISO-8859-7",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
};
1343
/// The ISO-8859-7 encoding.
///
/// This is the Greek part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
///
/// This encoding roughly matches the Windows code page 28597. Windows decodes
/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
///
/// The referent of this `static` is
/// [`ISO_8859_7_INIT`](static.ISO_8859_7_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1363
/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_INIT: Encoding = Encoding {
    name: "ISO-8859-8",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1380
/// The ISO-8859-8 encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 28598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the private use area.
///
/// The referent of this `static` is
/// [`ISO_8859_8_INIT`](static.ISO_8859_8_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1398
/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
    name: "ISO-8859-8-I",
    // Uses the same `iso_8859_8` index table (and the same trailing
    // arguments) as `ISO_8859_8_INIT`; only the name differs.
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
1415
/// The ISO-8859-8-I encoding.
///
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 38598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the private use area.
///
/// The referent of this `static` is
/// [`ISO_8859_8_I_INIT`](static.ISO_8859_8_I_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1433
/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_R_INIT: Encoding = Encoding {
    name: "KOI8-R",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
};
1450
/// The KOI8-R encoding.
///
/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
///
/// This encoding matches the Windows code page 20866.
///
/// The referent of this `static` is
/// [`KOI8_R_INIT`](static.KOI8_R_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1465
/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static KOI8_U_INIT: Encoding = Encoding {
    name: "KOI8-U",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
};
1482
/// The KOI8-U encoding.
///
/// This is an encoding for Ukrainian adapted from KOI8-R.
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
///
/// This encoding matches the Windows code page 21866.
///
/// The referent of this `static` is
/// [`KOI8_U_INIT`](static.KOI8_U_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1497
/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static SHIFT_JIS_INIT: Encoding = Encoding {
    name: "Shift_JIS",
    variant: VariantEncoding::ShiftJis,
};
1514
/// The Shift_JIS encoding.
///
/// This is the Japanese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
///
/// This encoding matches the Windows code page 932, except Windows decodes some byte
/// sequences that are errors per the Encoding Standard to the question mark or the
/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
///
/// The referent of this `static` is
/// [`SHIFT_JIS_INIT`](static.SHIFT_JIS_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1531
/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16BE_INIT: Encoding = Encoding {
    name: "UTF-16BE",
    variant: VariantEncoding::Utf16Be,
};
1548
/// The UTF-16BE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the big endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1201.
///
/// The referent of this `static` is
/// [`UTF_16BE_INIT`](static.UTF_16BE_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1565
/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_16LE_INIT: Encoding = Encoding {
    name: "UTF-16LE",
    variant: VariantEncoding::Utf16Le,
};
1582
/// The UTF-16LE encoding.
///
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the little endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1200.
///
/// The referent of this `static` is
/// [`UTF_16LE_INIT`](static.UTF_16LE_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1599
/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static UTF_8_INIT: Encoding = Encoding {
    name: "UTF-8",
    variant: VariantEncoding::Utf8,
};
1616
/// The UTF-8 encoding.
///
/// This is the encoding that should be used for all new development. It can
/// represent all of Unicode.
///
/// This encoding matches the Windows code page 65001, except Windows differs
/// in the number of errors generated for some erroneous byte sequences.
///
/// The referent of this `static` is
/// [`UTF_8_INIT`](static.UTF_8_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1630
/// The initializer for the [gb18030](static.GB18030.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static GB18030_INIT: Encoding = Encoding {
    name: "gb18030",
    variant: VariantEncoding::Gb18030,
};
1647
/// The gb18030 encoding.
///
/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
/// maps to U+3000 for compatibility with existing Web content. As a result,
/// this encoding can represent all of Unicode except for the private-use
/// character U+E5E5.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// This encoding matches the Windows code page 54936.
///
/// The referent of this `static` is
/// [`GB18030_INIT`](static.GB18030_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static GB18030: &'static Encoding = &GB18030_INIT;
1665
/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static MACINTOSH_INIT: Encoding = Encoding {
    name: "macintosh",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
};
1682
/// The macintosh encoding.
///
/// This is the MacRoman encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
///
/// This encoding matches the Windows code page 10000, except Windows decodes
/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
///
/// The referent of this `static` is
/// [`MACINTOSH_INIT`](static.MACINTOSH_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1698
/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static REPLACEMENT_INIT: Encoding = Encoding {
    name: "replacement",
    variant: VariantEncoding::Replacement,
};
1715
/// The replacement encoding.
///
/// This decode-only encoding decodes all non-zero-length streams to a single
/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
/// ASCII-compatible fallback encoding (typically windows-1252) for some
/// encodings that are no longer supported by the Web Platform and that
/// would be dangerous to treat as ASCII-compatible.
///
/// There is no corresponding encoder. The output encoding of this encoding
/// is UTF-8.
///
/// This encoding does not have a Windows code page number.
///
/// The referent of this `static` is
/// [`REPLACEMENT_INIT`](static.REPLACEMENT_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1734
/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1250_INIT: Encoding = Encoding {
    name: "windows-1250",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
};
1751
/// The windows-1250 encoding.
///
/// This is the Central European encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
///
/// This encoding matches the Windows code page 1250.
///
/// The referent of this `static` is
/// [`WINDOWS_1250_INIT`](static.WINDOWS_1250_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1766
/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1251_INIT: Encoding = Encoding {
    name: "windows-1251",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
};
1783
/// The windows-1251 encoding.
///
/// This is the Cyrillic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
///
/// This encoding matches the Windows code page 1251.
///
/// The referent of this `static` is
/// [`WINDOWS_1251_INIT`](static.WINDOWS_1251_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1798
/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1252_INIT: Encoding = Encoding {
    name: "windows-1252",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
};
1815
/// The windows-1252 encoding.
///
/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
/// which is known as Latin 1.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
///
/// This encoding matches the Windows code page 1252.
///
/// The referent of this `static` is
/// [`WINDOWS_1252_INIT`](static.WINDOWS_1252_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1831
/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
///
/// Use this item only to take its address where Rust prohibits
/// the use of the non-`_INIT` form directly, such as in
/// initializers of other `static`s. If in doubt, use the
/// corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1253_INIT: Encoding = Encoding {
    name: "windows-1253",
    // Arguments after the index table are presumably the first code point,
    // table offset and length of a contiguous run in that table; TODO
    // confirm against the definition of `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
};
1848
/// The windows-1253 encoding.
///
/// This is the Greek encoding for Windows. It is mostly an extension of
/// ISO-8859-7, but U+0386 is mapped to a different byte.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
///
/// This encoding matches the Windows code page 1253, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1865
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// This item is the `Encoding` value itself; `WINDOWS_1254` is the
/// `&'static Encoding` that points to it.
///
/// Use this form only to take its address where Rust prohibits the use
/// of the non-`_INIT` form directly, such as in initializers of other
/// `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    name: "windows-1254",
    // Single-byte variant backed by the `windows_1254` table of
    // `data::SINGLE_BYTE_DATA`. NOTE(review): the meaning of the three
    // trailing arguments is defined by `VariantEncoding::SingleByte`,
    // which is not visible in this section.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1882
/// The windows-1254 encoding.
///
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.
///
/// This `static` is a reference to
/// [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1898
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// This item is the `Encoding` value itself; `WINDOWS_1255` is the
/// `&'static Encoding` that points to it.
///
/// Use this form only to take its address where Rust prohibits the use
/// of the non-`_INIT` form directly, such as in initializers of other
/// `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    name: "windows-1255",
    // Single-byte variant backed by the `windows_1255` table of
    // `data::SINGLE_BYTE_DATA`. NOTE(review): the meaning of the three
    // trailing arguments is defined by `VariantEncoding::SingleByte`,
    // which is not visible in this section.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1915
/// The windows-1255 encoding.
///
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1932
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// This item is the `Encoding` value itself; `WINDOWS_1256` is the
/// `&'static Encoding` that points to it.
///
/// Use this form only to take its address where Rust prohibits the use
/// of the non-`_INIT` form directly, such as in initializers of other
/// `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    name: "windows-1256",
    // Single-byte variant backed by the `windows_1256` table of
    // `data::SINGLE_BYTE_DATA`. NOTE(review): the meaning of the three
    // trailing arguments is defined by `VariantEncoding::SingleByte`,
    // which is not visible in this section.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1949
/// The windows-1256 encoding.
///
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.
///
/// This `static` is a reference to
/// [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1964
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// This item is the `Encoding` value itself; `WINDOWS_1257` is the
/// `&'static Encoding` that points to it.
///
/// Use this form only to take its address where Rust prohibits the use
/// of the non-`_INIT` form directly, such as in initializers of other
/// `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    name: "windows-1257",
    // Single-byte variant backed by the `windows_1257` table of
    // `data::SINGLE_BYTE_DATA`. NOTE(review): the meaning of the three
    // trailing arguments is defined by `VariantEncoding::SingleByte`,
    // which is not visible in this section.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
1981
/// The windows-1257 encoding.
///
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
1997
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// This item is the `Encoding` value itself; `WINDOWS_1258` is the
/// `&'static Encoding` that points to it.
///
/// Use this form only to take its address where Rust prohibits the use
/// of the non-`_INIT` form directly, such as in initializers of other
/// `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    name: "windows-1258",
    // Single-byte variant backed by the `windows_1258` table of
    // `data::SINGLE_BYTE_DATA`. NOTE(review): the meaning of the three
    // trailing arguments is defined by `VariantEncoding::SingleByte`,
    // which is not visible in this section.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2014
/// The windows-1258 encoding.
///
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in the Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.
///
/// This `static` is a reference to
/// [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2034
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// This item is the `Encoding` value itself; `WINDOWS_874` is the
/// `&'static Encoding` that points to it.
///
/// Use this form only to take its address where Rust prohibits the use
/// of the non-`_INIT` form directly, such as in initializers of other
/// `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    name: "windows-874",
    // Single-byte variant backed by the `windows_874` table of
    // `data::SINGLE_BYTE_DATA`. NOTE(review): the meaning of the three
    // trailing arguments is defined by `VariantEncoding::SingleByte`,
    // which is not visible in this section.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2051
/// The windows-874 encoding.
///
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// This `static` is a reference to
/// [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2067
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// This item is the `Encoding` value itself; `X_MAC_CYRILLIC` is the
/// `&'static Encoding` that points to it.
///
/// Use this form only to take its address where Rust prohibits the use
/// of the non-`_INIT` form directly, such as in initializers of other
/// `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    name: "x-mac-cyrillic",
    // Single-byte variant backed by the `x_mac_cyrillic` table of
    // `data::SINGLE_BYTE_DATA`. NOTE(review): the meaning of the three
    // trailing arguments is defined by `VariantEncoding::SingleByte`,
    // which is not visible in this section.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2084
/// The x-mac-cyrillic encoding.
///
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.
///
/// This `static` is a reference to
/// [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2099
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// This item is the `Encoding` value itself; `X_USER_DEFINED` is the
/// `&'static Encoding` that points to it.
///
/// Use this form only to take its address where Rust prohibits the use
/// of the non-`_INIT` form directly, such as in initializers of other
/// `static`s. If in doubt, use the corresponding non-`_INIT`
/// reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    name: "x-user-defined",
    // Uses the dedicated `UserDefined` variant, which takes no table or
    // other arguments.
    variant: VariantEncoding::UserDefined,
};
2116
/// The x-user-defined encoding.
///
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
///
/// This `static` is a reference to
/// [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2131
/// The ASCII-lowercase labels that `Encoding::for_label()` recognizes.
///
/// The entries are ordered first by length and, within one length, by
/// comparing their bytes from the last byte towards the first. This is the
/// ordering that the binary search in `for_label()` applies, so the order
/// of the entries must not be changed independently of that comparison.
///
/// The entry at index `i` corresponds to the entry at index `i` of
/// `ENCODINGS_IN_LABEL_SORT`.
static LABELS_SORTED: [&'static str; 219] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2353
/// The encodings that the labels in `LABELS_SORTED` map to.
///
/// This array is parallel to `LABELS_SORTED`: `Encoding::for_label()`
/// indexes it with the position at which the label was found in
/// `LABELS_SORTED`, so the two arrays must be kept in the same order.
///
/// The items are the addresses of the `_INIT` forms of the encodings.
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2575
2576 // END GENERATED CODE
2577
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with
/// hyphens replaced with underscores (and in C/C++ have `_ENCODING` appended
/// to the name). For example, if you know at compile time that you will want
/// to decode using the UTF-8 encoding, use the `UTF_8` `static`
/// (`UTF_8_ENCODING` in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    /// The name of the encoding; this is the string that `name()` returns.
    name: &'static str,
    /// The `VariantEncoding` value that this instance was initialized with
    /// (e.g. `SingleByte(..)` or `UserDefined`).
    variant: VariantEncoding,
}
2644
2645 impl Encoding {
2646 /// Implements the
2647 /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2648 /// algorithm.
2649 ///
2650 /// If, after ASCII-lowercasing and removing leading and trailing
2651 /// whitespace, the argument matches a label defined in the Encoding
2652 /// Standard, `Some(&'static Encoding)` representing the corresponding
2653 /// encoding is returned. If there is no match, `None` is returned.
2654 ///
2655 /// This is the right method to use if the action upon the method returning
2656 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2657 /// When the action upon the method returning `None` is not to proceed with
2658 /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2659 /// appropriate.
2660 ///
2661 /// The argument is of type `&[u8]` instead of `&str` to save callers
2662 /// that are extracting the label from a non-UTF-8 protocol the trouble
2663 /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2664 /// on it.)
2665 ///
2666 /// Available via the C wrapper.
for_label(label: &[u8]) -> Option<&'static Encoding>2667 pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2668 let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2669 let mut trimmed_pos = 0usize;
2670 let mut iter = label.into_iter();
2671 // before
2672 loop {
2673 match iter.next() {
2674 None => {
2675 return None;
2676 }
2677 Some(byte) => {
2678 // The characters used in labels are:
2679 // a-z (except q, but excluding it below seems excessive)
2680 // 0-9
2681 // . _ - :
2682 match *byte {
2683 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2684 continue;
2685 }
2686 b'A'...b'Z' => {
2687 trimmed[trimmed_pos] = *byte + 0x20u8;
2688 trimmed_pos = 1usize;
2689 break;
2690 }
2691 b'a'...b'z' | b'0'...b'9' | b'-' | b'_' | b':' | b'.' => {
2692 trimmed[trimmed_pos] = *byte;
2693 trimmed_pos = 1usize;
2694 break;
2695 }
2696 _ => {
2697 return None;
2698 }
2699 }
2700 }
2701 }
2702 }
2703 // inside
2704 loop {
2705 match iter.next() {
2706 None => {
2707 break;
2708 }
2709 Some(byte) => {
2710 match *byte {
2711 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2712 break;
2713 }
2714 b'A'...b'Z' => {
2715 if trimmed_pos == LONGEST_LABEL_LENGTH {
2716 // There's no encoding with a label this long
2717 return None;
2718 }
2719 trimmed[trimmed_pos] = *byte + 0x20u8;
2720 trimmed_pos += 1usize;
2721 continue;
2722 }
2723 b'a'...b'z' | b'0'...b'9' | b'-' | b'_' | b':' | b'.' => {
2724 if trimmed_pos == LONGEST_LABEL_LENGTH {
2725 // There's no encoding with a label this long
2726 return None;
2727 }
2728 trimmed[trimmed_pos] = *byte;
2729 trimmed_pos += 1usize;
2730 continue;
2731 }
2732 _ => {
2733 return None;
2734 }
2735 }
2736 }
2737 }
2738 }
2739 // after
2740 loop {
2741 match iter.next() {
2742 None => {
2743 break;
2744 }
2745 Some(byte) => {
2746 match *byte {
2747 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2748 continue;
2749 }
2750 _ => {
2751 // There's no label with space in the middle
2752 return None;
2753 }
2754 }
2755 }
2756 }
2757 }
2758 let candidate = &trimmed[..trimmed_pos];
2759 match LABELS_SORTED.binary_search_by(|probe| {
2760 let bytes = probe.as_bytes();
2761 let c = bytes.len().cmp(&candidate.len());
2762 if c != Ordering::Equal {
2763 return c;
2764 }
2765 let probe_iter = bytes.iter().rev();
2766 let candidate_iter = candidate.iter().rev();
2767 probe_iter.cmp(candidate_iter)
2768 }) {
2769 Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2770 Err(_) => None,
2771 }
2772 }
2773
2774 /// This method behaves the same as `for_label()`, except when `for_label()`
2775 /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2776 ///
2777 /// This method is useful in scenarios where a fatal error is required
2778 /// upon invalid label, because in those cases the caller typically wishes
2779 /// to treat the labels that map to the replacement encoding as fatal
2780 /// errors, too.
2781 ///
2782 /// It is not OK to use this method when the action upon the method returning
2783 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2784 /// case, the `for_label()` method should be used instead in order to avoid
2785 /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2786 ///
2787 /// Available via the C wrapper.
2788 #[inline]
for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding>2789 pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2790 match Encoding::for_label(label) {
2791 None => None,
2792 Some(encoding) => {
2793 if encoding == REPLACEMENT {
2794 None
2795 } else {
2796 Some(encoding)
2797 }
2798 }
2799 }
2800 }
2801
2802 /// Performs non-incremental BOM sniffing.
2803 ///
2804 /// The argument must either be a buffer representing the entire input
2805 /// stream (non-streaming case) or a buffer representing at least the first
2806 /// three bytes of the input stream (streaming case).
2807 ///
2808 /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2809 /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2810 /// or UTF-16BE BOM or `None` otherwise.
2811 ///
2812 /// Available via the C wrapper.
2813 #[inline]
for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)>2814 pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2815 if buffer.starts_with(b"\xEF\xBB\xBF") {
2816 Some((UTF_8, 3))
2817 } else if buffer.starts_with(b"\xFF\xFE") {
2818 Some((UTF_16LE, 2))
2819 } else if buffer.starts_with(b"\xFE\xFF") {
2820 Some((UTF_16BE, 2))
2821 } else {
2822 None
2823 }
2824 }
2825
2826 /// Returns the name of this encoding.
2827 ///
2828 /// This name is appropriate to return as-is from the DOM
2829 /// `document.characterSet` property.
2830 ///
2831 /// Available via the C wrapper.
2832 #[inline]
name(&'static self) -> &'static str2833 pub fn name(&'static self) -> &'static str {
2834 self.name
2835 }
2836
2837 /// Checks whether the _output encoding_ of this encoding can encode every
2838 /// `char`. (Only true if the output encoding is UTF-8.)
2839 ///
2840 /// Available via the C wrapper.
2841 #[inline]
can_encode_everything(&'static self) -> bool2842 pub fn can_encode_everything(&'static self) -> bool {
2843 self.output_encoding() == UTF_8
2844 }
2845
2846 /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2847 /// U+0000...U+007F and vice versa.
2848 ///
2849 /// Available via the C wrapper.
2850 #[inline]
is_ascii_compatible(&'static self) -> bool2851 pub fn is_ascii_compatible(&'static self) -> bool {
2852 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2853 }
2854
2855 /// Checks whether this encoding maps one byte to one Basic Multilingual
2856 /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2857 /// vice versa (for mappable characters).
2858 ///
2859 /// `true` iff this encoding is on the list of [Legacy single-byte
2860 /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2861 /// in the spec or x-user-defined.
2862 ///
2863 /// Available via the C wrapper.
2864 #[inline]
is_single_byte(&'static self) -> bool2865 pub fn is_single_byte(&'static self) -> bool {
2866 self.variant.is_single_byte()
2867 }
2868
2869 /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2870 /// U+0000...U+007F and vice versa.
2871 #[inline]
is_potentially_borrowable(&'static self) -> bool2872 fn is_potentially_borrowable(&'static self) -> bool {
2873 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2874 }
2875
2876 /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2877 /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2878 ///
2879 /// Available via the C wrapper.
2880 #[inline]
output_encoding(&'static self) -> &'static Encoding2881 pub fn output_encoding(&'static self) -> &'static Encoding {
2882 if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2883 UTF_8
2884 } else {
2885 self
2886 }
2887 }
2888
2889 /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2890 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2891 /// entire input is available as a single buffer (i.e. the end of the
2892 /// buffer marks the end of the stream).
2893 ///
2894 /// This method implements the (non-streaming version of) the
2895 /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2896 ///
2897 /// The second item in the returned tuple is the encoding that was actually
2898 /// used (which may differ from this encoding thanks to BOM sniffing).
2899 ///
2900 /// The third item in the returned tuple indicates whether there were
2901 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2902 ///
2903 /// _Note:_ It is wrong to use this when the input buffer represents only
2904 /// a segment of the input instead of the whole input. Use `new_decoder()`
2905 /// when decoding segmented input.
2906 ///
/// This method performs one or two heap allocations for the backing
/// buffer of the `String` when unable to borrow. (One allocation if no
/// errors and potentially another one in the presence of errors.) The
2910 /// first allocation assumes jemalloc and may not be optimal with
2911 /// allocators that do not use power-of-two buckets. A borrow is performed
2912 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2913 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2914 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2915 /// transitions.
2916 ///
2917 /// # Panics
2918 ///
2919 /// If the size calculation for a heap-allocated backing buffer overflows
2920 /// `usize`.
2921 ///
2922 /// Available to Rust only.
2923 #[inline]
decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool)2924 pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2925 let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2926 Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2927 None => (self, bytes),
2928 };
2929 let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2930 (cow, encoding, had_errors)
2931 }
2932
2933 /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2934 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2935 /// entire input is available as a single buffer (i.e. the end of the
2936 /// buffer marks the end of the stream).
2937 ///
2938 /// When invoked on `UTF_8`, this method implements the (non-streaming
2939 /// version of) the
2940 /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2941 /// concept.
2942 ///
2943 /// The second item in the returned pair indicates whether there were
2944 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2945 ///
2946 /// _Note:_ It is wrong to use this when the input buffer represents only
2947 /// a segment of the input instead of the whole input. Use
2948 /// `new_decoder_with_bom_removal()` when decoding segmented input.
2949 ///
/// This method performs one or two heap allocations for the backing
/// buffer of the `String` when unable to borrow. (One allocation if no
/// errors and potentially another one in the presence of errors.) The
2953 /// first allocation assumes jemalloc and may not be optimal with
2954 /// allocators that do not use power-of-two buckets. A borrow is performed
2955 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2956 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2957 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2958 /// transitions.
2959 ///
2960 /// # Panics
2961 ///
2962 /// If the size calculation for a heap-allocated backing buffer overflows
2963 /// `usize`.
2964 ///
2965 /// Available to Rust only.
2966 #[inline]
decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)2967 pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2968 let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2969 &bytes[3..]
2970 } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2971 || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2972 {
2973 &bytes[2..]
2974 } else {
2975 bytes
2976 };
2977 self.decode_without_bom_handling(without_bom)
2978 }
2979
2980 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
2981 /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
2982 /// the entire input is available as a single buffer (i.e. the end of the
2983 /// buffer marks the end of the stream).
2984 ///
2985 /// When invoked on `UTF_8`, this method implements the (non-streaming
2986 /// version of) the
2987 /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
2988 /// spec concept.
2989 ///
2990 /// The second item in the returned pair indicates whether there were
2991 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2992 ///
2993 /// _Note:_ It is wrong to use this when the input buffer represents only
2994 /// a segment of the input instead of the whole input. Use
2995 /// `new_decoder_without_bom_handling()` when decoding segmented input.
2996 ///
/// This method performs one or two heap allocations for the backing
/// buffer of the `String` when unable to borrow. (One allocation if no
/// errors and potentially another one in the presence of errors.) The
3000 /// first allocation assumes jemalloc and may not be optimal with
3001 /// allocators that do not use power-of-two buckets. A borrow is performed
3002 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3003 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3004 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3005 /// transitions.
3006 ///
3007 /// # Panics
3008 ///
3009 /// If the size calculation for a heap-allocated backing buffer overflows
3010 /// `usize`.
3011 ///
3012 /// Available to Rust only.
decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool)3013 pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3014 let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3015 let valid_up_to = if self == UTF_8 {
3016 utf8_valid_up_to(bytes)
3017 } else if self == ISO_2022_JP {
3018 iso_2022_jp_ascii_valid_up_to(bytes)
3019 } else {
3020 ascii_valid_up_to(bytes)
3021 };
3022 if valid_up_to == bytes.len() {
3023 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3024 return (Cow::Borrowed(str), false);
3025 }
3026 let decoder = self.new_decoder_without_bom_handling();
3027
3028 let rounded_without_replacement = checked_next_power_of_two(checked_add(
3029 valid_up_to,
3030 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3031 ));
3032 let with_replacement = checked_add(
3033 valid_up_to,
3034 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3035 );
3036 let mut string = String::with_capacity(
3037 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3038 );
3039 unsafe {
3040 let vec = string.as_mut_vec();
3041 vec.set_len(valid_up_to);
3042 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3043 }
3044 (decoder, string, valid_up_to)
3045 } else {
3046 let decoder = self.new_decoder_without_bom_handling();
3047 let rounded_without_replacement = checked_next_power_of_two(
3048 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3049 );
3050 let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3051 let string = String::with_capacity(
3052 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3053 );
3054 (decoder, string, 0)
3055 };
3056
3057 let mut total_had_errors = false;
3058 loop {
3059 let (result, read, had_errors) =
3060 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3061 total_read += read;
3062 total_had_errors |= had_errors;
3063 match result {
3064 CoderResult::InputEmpty => {
3065 debug_assert_eq!(total_read, bytes.len());
3066 return (Cow::Owned(string), total_had_errors);
3067 }
3068 CoderResult::OutputFull => {
3069 // Allocate for the worst case. That is, we should come
3070 // here at most once per invocation of this method.
3071 let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3072 string.reserve(needed.unwrap());
3073 }
3074 }
3075 }
3076 }
3077
3078 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3079 /// _with malformed sequences treated as fatal_ when the entire input is
3080 /// available as a single buffer (i.e. the end of the buffer marks the end
3081 /// of the stream).
3082 ///
3083 /// When invoked on `UTF_8`, this method implements the (non-streaming
3084 /// version of) the
3085 /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3086 /// spec concept.
3087 ///
3088 /// Returns `None` if a malformed sequence was encountered and the result
/// of the decode as `Some(Cow<'a, str>)` otherwise.
3090 ///
3091 /// _Note:_ It is wrong to use this when the input buffer represents only
3092 /// a segment of the input instead of the whole input. Use
3093 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3094 ///
3095 /// This method performs a single heap allocation for the backing
3096 /// buffer of the `String` when unable to borrow. A borrow is performed if
3097 /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3098 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3099 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3100 /// transitions.
3101 ///
3102 /// # Panics
3103 ///
3104 /// If the size calculation for a heap-allocated backing buffer overflows
3105 /// `usize`.
3106 ///
3107 /// Available to Rust only.
decode_without_bom_handling_and_without_replacement<'a>( &'static self, bytes: &'a [u8], ) -> Option<Cow<'a, str>>3108 pub fn decode_without_bom_handling_and_without_replacement<'a>(
3109 &'static self,
3110 bytes: &'a [u8],
3111 ) -> Option<Cow<'a, str>> {
3112 if self == UTF_8 {
3113 let valid_up_to = utf8_valid_up_to(bytes);
3114 if valid_up_to == bytes.len() {
3115 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3116 return Some(Cow::Borrowed(str));
3117 }
3118 return None;
3119 }
3120 let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3121 let valid_up_to = if self == ISO_2022_JP {
3122 iso_2022_jp_ascii_valid_up_to(bytes)
3123 } else {
3124 ascii_valid_up_to(bytes)
3125 };
3126 if valid_up_to == bytes.len() {
3127 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3128 return Some(Cow::Borrowed(str));
3129 }
3130 let decoder = self.new_decoder_without_bom_handling();
3131 let mut string = String::with_capacity(
3132 checked_add(
3133 valid_up_to,
3134 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3135 )
3136 .unwrap(),
3137 );
3138 unsafe {
3139 let vec = string.as_mut_vec();
3140 vec.set_len(valid_up_to);
3141 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3142 }
3143 (decoder, string, &bytes[valid_up_to..])
3144 } else {
3145 let decoder = self.new_decoder_without_bom_handling();
3146 let string = String::with_capacity(
3147 decoder
3148 .max_utf8_buffer_length_without_replacement(bytes.len())
3149 .unwrap(),
3150 );
3151 (decoder, string, bytes)
3152 };
3153 let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3154 match result {
3155 DecoderResult::InputEmpty => {
3156 debug_assert_eq!(read, input.len());
3157 Some(Cow::Owned(string))
3158 }
3159 DecoderResult::Malformed(_, _) => None,
3160 DecoderResult::OutputFull => unreachable!(),
3161 }
3162 }
3163
3164 /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3165 /// replaced with decimal numeric character references when the entire input
3166 /// is available as a single buffer (i.e. the end of the buffer marks the
3167 /// end of the stream).
3168 ///
3169 /// This method implements the (non-streaming version of) the
3170 /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3171 /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3172 /// spec concept, it is slightly more efficient to use
3173 /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3174 /// method on `UTF_8`.
3175 ///
3176 /// The second item in the returned tuple is the encoding that was actually
3177 /// used (which may differ from this encoding thanks to some encodings
3178 /// having UTF-8 as their output encoding).
3179 ///
3180 /// The third item in the returned tuple indicates whether there were
3181 /// unmappable characters (that were replaced with HTML numeric character
3182 /// references).
3183 ///
3184 /// _Note:_ It is wrong to use this when the input buffer represents only
3185 /// a segment of the input instead of the whole input. Use `new_encoder()`
3186 /// when encoding segmented output.
3187 ///
/// When encoding to UTF-8 or when encoding an ASCII-only input to an
3189 /// ASCII-compatible encoding, this method returns a borrow of the input
3190 /// without a heap allocation. Otherwise, this method performs a single
3191 /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3192 /// unmappable characters and potentially multiple heap allocations if
3193 /// there are. These allocations are tuned for jemalloc and may not be
3194 /// optimal when using a different allocator that doesn't use power-of-two
3195 /// buckets.
3196 ///
3197 /// # Panics
3198 ///
3199 /// If the size calculation for a heap-allocated backing buffer overflows
3200 /// `usize`.
3201 ///
3202 /// Available to Rust only.
encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool)3203 pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3204 let output_encoding = self.output_encoding();
3205 if output_encoding == UTF_8 {
3206 return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3207 }
3208 debug_assert!(output_encoding.is_potentially_borrowable());
3209 let bytes = string.as_bytes();
3210 let valid_up_to = if output_encoding == ISO_2022_JP {
3211 iso_2022_jp_ascii_valid_up_to(bytes)
3212 } else {
3213 ascii_valid_up_to(bytes)
3214 };
3215 if valid_up_to == bytes.len() {
3216 return (Cow::Borrowed(bytes), output_encoding, false);
3217 }
3218 let mut encoder = output_encoding.new_encoder();
3219 let mut vec: Vec<u8> = Vec::with_capacity(
3220 (checked_add(
3221 valid_up_to,
3222 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3223 ))
3224 .unwrap()
3225 .next_power_of_two(),
3226 );
3227 unsafe {
3228 vec.set_len(valid_up_to);
3229 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3230 }
3231 let mut total_read = valid_up_to;
3232 let mut total_had_errors = false;
3233 loop {
3234 let (result, read, had_errors) =
3235 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3236 total_read += read;
3237 total_had_errors |= had_errors;
3238 match result {
3239 CoderResult::InputEmpty => {
3240 debug_assert_eq!(total_read, string.len());
3241 return (Cow::Owned(vec), output_encoding, total_had_errors);
3242 }
3243 CoderResult::OutputFull => {
3244 // reserve_exact wants to know how much more on top of current
3245 // length--not current capacity.
3246 let needed = encoder
3247 .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3248 let rounded = (checked_add(vec.capacity(), needed))
3249 .unwrap()
3250 .next_power_of_two();
3251 let additional = rounded - vec.len();
3252 vec.reserve_exact(additional);
3253 }
3254 }
3255 }
3256 }
3257
new_variant_decoder(&'static self) -> VariantDecoder3258 fn new_variant_decoder(&'static self) -> VariantDecoder {
3259 self.variant.new_variant_decoder()
3260 }
3261
3262 /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3263 ///
3264 /// BOM sniffing may cause the returned decoder to morph into a decoder
3265 /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3266 ///
3267 /// Available via the C wrapper.
3268 #[inline]
new_decoder(&'static self) -> Decoder3269 pub fn new_decoder(&'static self) -> Decoder {
3270 Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3271 }
3272
3273 /// Instantiates a new decoder for this encoding with BOM removal.
3274 ///
3275 /// If the input starts with bytes that are the BOM for this encoding,
3276 /// those bytes are removed. However, the decoder never morphs into a
3277 /// decoder for another encoding: A BOM for another encoding is treated as
3278 /// (potentially malformed) input to the decoding algorithm for this
3279 /// encoding.
3280 ///
3281 /// Available via the C wrapper.
3282 #[inline]
new_decoder_with_bom_removal(&'static self) -> Decoder3283 pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3284 Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3285 }
3286
3287 /// Instantiates a new decoder for this encoding with BOM handling disabled.
3288 ///
3289 /// If the input starts with bytes that look like a BOM, those bytes are
3290 /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3291 /// for another encoding.)
3292 ///
3293 /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3294 /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3295 /// instead of this method to cause the BOM to be removed.
3296 ///
3297 /// Available via the C wrapper.
3298 #[inline]
new_decoder_without_bom_handling(&'static self) -> Decoder3299 pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3300 Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3301 }
3302
3303 /// Instantiates a new encoder for the output encoding of this encoding.
3304 ///
3305 /// Available via the C wrapper.
3306 #[inline]
new_encoder(&'static self) -> Encoder3307 pub fn new_encoder(&'static self) -> Encoder {
3308 let enc = self.output_encoding();
3309 enc.variant.new_encoder(enc)
3310 }
3311
3312 /// Validates UTF-8.
3313 ///
3314 /// Returns the index of the first byte that makes the input malformed as
3315 /// UTF-8 or the length of the slice if the slice is entirely valid.
3316 ///
3317 /// This is currently faster than the corresponding standard library
3318 /// functionality. If this implementation gets upstreamed to the standard
3319 /// library, this method may be removed in the future.
3320 ///
3321 /// Available via the C wrapper.
pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
    // Not a recursive call: an unqualified name never resolves to an
    // associated function, so this reaches the free function of the same
    // name that is in scope.
    utf8_valid_up_to(bytes)
}
3325
3326 /// Validates ASCII.
3327 ///
3328 /// Returns the index of the first byte that makes the input malformed as
3329 /// ASCII or the length of the slice if the slice is entirely valid.
3330 ///
3331 /// Available via the C wrapper.
pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
    // Not a recursive call: an unqualified name never resolves to an
    // associated function, so this reaches the free function of the same
    // name that is in scope.
    ascii_valid_up_to(bytes)
}
3335
3336 /// Validates ISO-2022-JP ASCII-state data.
3337 ///
3338 /// Returns the index of the first byte that makes the input not
3339 /// representable in the ASCII state of ISO-2022-JP or the length of the
3340 /// slice if the slice is entirely representable in the ASCII state of
3341 /// ISO-2022-JP.
3342 ///
3343 /// Available via the C wrapper.
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    // Not a recursive call: an unqualified name never resolves to an
    // associated function, so this reaches the free function of the same
    // name that is in scope.
    iso_2022_jp_ascii_valid_up_to(bytes)
}
3347 }
3348
3349 impl PartialEq for Encoding {
3350 #[inline]
eq(&self, other: &Encoding) -> bool3351 fn eq(&self, other: &Encoding) -> bool {
3352 (self as *const Encoding) == (other as *const Encoding)
3353 }
3354 }
3355
3356 impl Eq for Encoding {}
3357
3358 impl Hash for Encoding {
3359 #[inline]
hash<H: Hasher>(&self, state: &mut H)3360 fn hash<H: Hasher>(&self, state: &mut H) {
3361 (self as *const Encoding).hash(state);
3362 }
3363 }
3364
3365 impl std::fmt::Debug for Encoding {
3366 #[inline]
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result3367 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
3368 write!(f, "Encoding {{ {} }}", self.name)
3369 }
3370 }
3371
3372 #[cfg(feature = "serde")]
3373 impl Serialize for Encoding {
3374 #[inline]
serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: Serializer,3375 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
3376 where
3377 S: Serializer,
3378 {
3379 serializer.serialize_str(self.name)
3380 }
3381 }
3382
3383 #[cfg(feature = "serde")]
struct EncodingVisitor;

#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Resolves the string as an encoding label via `Encoding::for_label()`;
    /// an unknown label becomes a custom deserialization error.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format!("invalid encoding label: {}", value)))
    }
}
3405
3406 #[cfg(feature = "serde")]
3407 impl<'de> Deserialize<'de> for &'static Encoding {
deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error> where D: Deserializer<'de>,3408 fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
3409 where
3410 D: Deserializer<'de>,
3411 {
3412 deserializer.deserialize_str(EncodingVisitor)
3413 }
3414 }
3415
/// The phases a decoder passes through, from BOM sniffing via conversion to
/// the end of the stream.
#[derive(Clone, Copy, Debug, PartialEq)]
enum DecoderLifeCycle {
    /// No input has been seen yet.
    AtStart,
    /// No input has been seen yet, but UTF-8 is expected.
    AtUtf8Start,
    /// No input has been seen yet, but UTF-16BE is expected.
    AtUtf16BeStart,
    /// No input has been seen yet, but UTF-16LE is expected.
    AtUtf16LeStart,
    /// The byte EF has been seen.
    SeenUtf8First,
    /// The bytes EF, BB have been seen.
    SeenUtf8Second,
    /// The byte FE has been seen.
    SeenUtf16BeFirst,
    /// The byte FF has been seen.
    SeenUtf16LeFirst,
    /// EF, BB were seen without a following BF, a buffer boundary came right
    /// after BB and the underlying decoder reported EF as an error, so BB
    /// still has to be pushed ahead of the next buffer.
    ConvertingWithPendingBB,
    /// BOM sniffing is over and EOF has not been seen yet.
    Converting,
    /// EOF has been seen.
    Finished,
}
3444
/// Selects how a decoder treats a byte order mark at the start of input.
#[derive(Clone, Copy, Debug)]
enum BomHandling {
    /// The BOM is not handled at all.
    Off,
    /// Sniff for a UTF-8, UTF-16BE or UTF-16LE BOM.
    Sniff,
    /// Remove the BOM, but only if it is the BOM of this encoding.
    Remove,
}
3455
/// Outcome of a (possibly partial) decode or encode call that performs
/// replacement.
#[must_use]
#[derive(Eq, PartialEq, Debug)]
pub enum CoderResult {
    /// All of the input was consumed.
    ///
    /// If the call that returned this had `last` set to `true`, the
    /// conversion is complete. Otherwise, the caller should call a decode or
    /// encode method again with more input.
    InputEmpty,

    /// The output buffer has too little space left for the converter to
    /// produce another unit of output.
    ///
    /// The caller must provide more output space on the next call and
    /// re-push the remaining input to the converter.
    OutputFull,
}
3475
/// Outcome of a (possibly partial) decode call that does not perform
/// replacement.
#[must_use]
#[derive(Eq, PartialEq, Debug)]
pub enum DecoderResult {
    /// All of the input was consumed.
    ///
    /// If the call that returned this had `last` set to `true`, decoding is
    /// complete. Otherwise, the caller should call a decode method again
    /// with more input.
    InputEmpty,

    /// The output buffer has too little space left for the decoder to
    /// produce another unit of output.
    ///
    /// The caller must provide more output space on the next call and
    /// re-push the remaining input to the decoder.
    OutputFull,

    /// A malformed byte sequence was encountered.
    ///
    /// The caller must either treat this as a fatal error or append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer is the length of the malformed byte
    /// sequence. The second wrapped integer is the number of bytes that
    /// were consumed after the malformed sequence. If the second integer is
    /// zero, the last consumed byte is the last byte of the malformed
    /// sequence. Note that the malformed bytes may have been part of an
    /// earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3512
3513 /// A converter that decodes a byte stream into Unicode according to a
3514 /// character encoding in a streaming (incremental) manner.
3515 ///
3516 /// The various `decode_*` methods take an input buffer (`src`) and an output
3517 /// buffer `dst` both of which are caller-allocated. There are variants for
3518 /// both UTF-8 and UTF-16 output buffers.
3519 ///
3520 /// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3521 /// into `dst` until one of the following three things happens:
3522 ///
3523 /// 1. A malformed byte sequence is encountered (`*_without_replacement`
3524 /// variants only).
3525 ///
3526 /// 2. The output buffer has been filled so near capacity that the decoder
3527 /// cannot be sure that processing an additional byte of input wouldn't
3528 /// cause so much output that the output buffer would overflow.
3529 ///
3530 /// 3. All the input bytes have been processed.
3531 ///
/// The `decode_*` method then returns a tuple of a status indicating which one
3533 /// of the three reasons to return happened, how many input bytes were read,
3534 /// how many output code units (`u8` when decoding into UTF-8 and `u16`
3535 /// when decoding to UTF-16) were written (except when decoding into `String`,
3536 /// whose length change indicates this), and in the case of the
3537 /// variants performing replacement, a boolean indicating whether an error was
3538 /// replaced with the REPLACEMENT CHARACTER during the call.
3539 ///
3540 /// The number of bytes "written" is what's logically written. Garbage may be
3541 /// written in the output buffer beyond the point logically written to.
3542 /// Therefore, if you wish to decode into an `&mut str`, you should use the
3543 /// methods that take an `&mut str` argument instead of the ones that take an
3544 /// `&mut [u8]` argument. The former take care of overwriting the trailing
3545 /// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3546 /// latter don't.
3547 ///
3548 /// In the case of the `*_without_replacement` variants, the status is a
3549 /// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3550 /// `InputEmpty` corresponding to the three cases listed above).
3551 ///
3552 /// In the case of methods whose name does not end with
3553 /// `*_without_replacement`, malformed sequences are automatically replaced
3554 /// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3555 /// return early.
3556 ///
3557 /// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3558 /// space. When decoding to UTF-16, the output buffer must have at least two
3559 /// UTF-16 code units (`u16`) of space.
3560 ///
3561 /// When decoding to UTF-8 without replacement, the methods are guaranteed
3562 /// not to return indicating that more output space is needed if the length
3563 /// of the output buffer is at least the length returned by
3564 /// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3565 /// with replacement, the length of the output buffer that guarantees the
3566 /// methods not to return indicating that more output space is needed is given
3567 /// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3568 /// or without replacement, the length of the output buffer that guarantees
3569 /// the methods not to return indicating that more output space is needed is
3570 /// given by [`max_utf16_buffer_length()`][4].
3571 ///
3572 /// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3573 /// and the output after each `decode_*` call is guaranteed to consist of
3574 /// complete characters. (I.e. the code unit sequence for the last character is
3575 /// guaranteed not to be split across output buffers.)
3576 ///
3577 /// The boolean argument `last` indicates that the end of the stream is reached
3578 /// when all the bytes in `src` have been consumed.
3579 ///
3580 /// A `Decoder` object can be used to incrementally decode a byte stream.
3581 ///
3582 /// During the processing of a single stream, the caller must call `decode_*`
3583 /// zero or more times with `last` set to `false` and then call `decode_*` at
3584 /// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3585 /// the processing of the stream has ended. Otherwise, the caller must call
3586 /// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3587 /// a fatal error).
3588 ///
3589 /// Once the stream has ended, the `Decoder` object must not be used anymore.
3590 /// That is, you need to create another one to process another stream.
3591 ///
3592 /// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3593 /// the caller does not wish to treat it as a fatal error, the input buffer
3594 /// `src` may not have been completely consumed. In that case, the caller must
3595 /// pass the unconsumed contents of `src` to `decode_*` again upon the next
3596 /// call.
3597 ///
3598 /// [1]: enum.DecoderResult.html
3599 /// [2]: #method.max_utf8_buffer_length_without_replacement
3600 /// [3]: #method.max_utf8_buffer_length
3601 /// [4]: #method.max_utf16_buffer_length
3602 ///
3603 /// # Infinite loops
3604 ///
3605 /// When converting with a fixed-size output buffer whose size is too small to
3606 /// accommodate one character or (when applicable) one numeric character
3607 /// reference of output, an infinite loop ensues. When converting with a
3608 /// fixed-size output buffer, it generally makes sense to make the buffer
/// fairly large (e.g. a couple of kilobytes).
pub struct Decoder {
    /// The encoding returned by `encoding()`; set from the `enc` argument of
    /// `new()`.
    encoding: &'static Encoding,
    /// The encoding-specific decoder that the buffer length queries and the
    /// Latin1 compatibility query delegate to.
    variant: VariantDecoder,
    /// BOM handling / stream life cycle state. The buffer length queries
    /// branch on it, and `Finished` makes them panic.
    life_cycle: DecoderLifeCycle,
}
3615
3616 impl Decoder {
new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder3617 fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3618 Decoder {
3619 encoding: enc,
3620 variant: decoder,
3621 life_cycle: match sniffing {
3622 BomHandling::Off => DecoderLifeCycle::Converting,
3623 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3624 BomHandling::Remove => {
3625 if enc == UTF_8 {
3626 DecoderLifeCycle::AtUtf8Start
3627 } else if enc == UTF_16BE {
3628 DecoderLifeCycle::AtUtf16BeStart
3629 } else if enc == UTF_16LE {
3630 DecoderLifeCycle::AtUtf16LeStart
3631 } else {
3632 DecoderLifeCycle::Converting
3633 }
3634 }
3635 },
3636 }
3637 }
3638
/// The `Encoding` this `Decoder` is for.
///
/// Returns the current value of the decoder's `encoding` field.
///
/// BOM sniffing can change the return value of this method during the life
/// of the decoder, so callers that care about the effective encoding should
/// query it after decoding rather than caching an earlier answer.
///
/// Available via the C wrapper.
#[inline]
pub fn encoding(&self) -> &'static Encoding {
    self.encoding
}
3649
3650 /// Query the worst-case UTF-8 output size _with replacement_.
3651 ///
3652 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3653 /// that will not overflow given the current state of the decoder and
3654 /// `byte_length` number of additional input bytes when decoding with
3655 /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3656 /// sequence or `None` if `usize` would overflow.
3657 ///
3658 /// Available via the C wrapper.
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>3659 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3660 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3661 // BOM getting pushed to the underlying decoder.
3662 match self.life_cycle {
3663 DecoderLifeCycle::Converting
3664 | DecoderLifeCycle::AtUtf8Start
3665 | DecoderLifeCycle::AtUtf16LeStart
3666 | DecoderLifeCycle::AtUtf16BeStart => {
3667 return self.variant.max_utf8_buffer_length(byte_length);
3668 }
3669 DecoderLifeCycle::AtStart => {
3670 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3671 if let Some(utf16_bom) = checked_add(
3672 1,
3673 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3674 ) {
3675 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3676 let encoding = self.encoding();
3677 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3678 // No need to consider the internal state of the underlying decoder,
3679 // because it is at start, because no data has reached it yet.
3680 return Some(utf_bom);
3681 } else if let Some(non_bom) =
3682 self.variant.max_utf8_buffer_length(byte_length)
3683 {
3684 return Some(std::cmp::max(utf_bom, non_bom));
3685 }
3686 }
3687 }
3688 }
3689 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3690 // Add two bytes even when only one byte has been seen,
3691 // because the one byte can become a lead byte in multibyte
3692 // decoders, but only after the decoder has been queried
3693 // for max length, so the decoder's own logic for adding
3694 // one for a pending lead cannot work.
3695 if let Some(sum) = byte_length.checked_add(2) {
3696 if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3697 if self.encoding() == UTF_8 {
3698 // No need to consider the internal state of the underlying decoder,
3699 // because it is at start, because no data has reached it yet.
3700 return Some(utf8_bom);
3701 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3702 return Some(std::cmp::max(utf8_bom, non_bom));
3703 }
3704 }
3705 }
3706 }
3707 DecoderLifeCycle::ConvertingWithPendingBB => {
3708 if let Some(sum) = byte_length.checked_add(2) {
3709 return self.variant.max_utf8_buffer_length(sum);
3710 }
3711 }
3712 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3713 // Add two bytes even when only one byte has been seen,
3714 // because the one byte can become a lead byte in multibyte
3715 // decoders, but only after the decoder has been queried
3716 // for max length, so the decoder's own logic for adding
3717 // one for a pending lead cannot work.
3718 if let Some(sum) = byte_length.checked_add(2) {
3719 if let Some(utf16_bom) =
3720 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3721 {
3722 let encoding = self.encoding();
3723 if encoding == UTF_16LE || encoding == UTF_16BE {
3724 // No need to consider the internal state of the underlying decoder,
3725 // because it is at start, because no data has reached it yet.
3726 return Some(utf16_bom);
3727 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3728 return Some(std::cmp::max(utf16_bom, non_bom));
3729 }
3730 }
3731 }
3732 }
3733 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3734 }
3735 None
3736 }
3737
3738 /// Query the worst-case UTF-8 output size _without replacement_.
3739 ///
3740 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3741 /// that will not overflow given the current state of the decoder and
3742 /// `byte_length` number of additional input bytes when decoding without
3743 /// replacement error handling or `None` if `usize` would overflow.
3744 ///
3745 /// Note that this value may be too small for the `_with_replacement` case.
3746 /// Use `max_utf8_buffer_length()` for that case.
3747 ///
3748 /// Available via the C wrapper.
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>3749 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3750 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3751 // BOM getting pushed to the underlying decoder.
3752 match self.life_cycle {
3753 DecoderLifeCycle::Converting
3754 | DecoderLifeCycle::AtUtf8Start
3755 | DecoderLifeCycle::AtUtf16LeStart
3756 | DecoderLifeCycle::AtUtf16BeStart => {
3757 return self
3758 .variant
3759 .max_utf8_buffer_length_without_replacement(byte_length);
3760 }
3761 DecoderLifeCycle::AtStart => {
3762 if let Some(utf8_bom) = byte_length.checked_add(3) {
3763 if let Some(utf16_bom) = checked_add(
3764 1,
3765 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3766 ) {
3767 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3768 let encoding = self.encoding();
3769 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3770 // No need to consider the internal state of the underlying decoder,
3771 // because it is at start, because no data has reached it yet.
3772 return Some(utf_bom);
3773 } else if let Some(non_bom) = self
3774 .variant
3775 .max_utf8_buffer_length_without_replacement(byte_length)
3776 {
3777 return Some(std::cmp::max(utf_bom, non_bom));
3778 }
3779 }
3780 }
3781 }
3782 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3783 // Add two bytes even when only one byte has been seen,
3784 // because the one byte can become a lead byte in multibyte
3785 // decoders, but only after the decoder has been queried
3786 // for max length, so the decoder's own logic for adding
3787 // one for a pending lead cannot work.
3788 if let Some(sum) = byte_length.checked_add(2) {
3789 if let Some(utf8_bom) = sum.checked_add(3) {
3790 if self.encoding() == UTF_8 {
3791 // No need to consider the internal state of the underlying decoder,
3792 // because it is at start, because no data has reached it yet.
3793 return Some(utf8_bom);
3794 } else if let Some(non_bom) =
3795 self.variant.max_utf8_buffer_length_without_replacement(sum)
3796 {
3797 return Some(std::cmp::max(utf8_bom, non_bom));
3798 }
3799 }
3800 }
3801 }
3802 DecoderLifeCycle::ConvertingWithPendingBB => {
3803 if let Some(sum) = byte_length.checked_add(2) {
3804 return self.variant.max_utf8_buffer_length_without_replacement(sum);
3805 }
3806 }
3807 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3808 // Add two bytes even when only one byte has been seen,
3809 // because the one byte can become a lead byte in multibyte
3810 // decoders, but only after the decoder has been queried
3811 // for max length, so the decoder's own logic for adding
3812 // one for a pending lead cannot work.
3813 if let Some(sum) = byte_length.checked_add(2) {
3814 if let Some(utf16_bom) =
3815 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3816 {
3817 let encoding = self.encoding();
3818 if encoding == UTF_16LE || encoding == UTF_16BE {
3819 // No need to consider the internal state of the underlying decoder,
3820 // because it is at start, because no data has reached it yet.
3821 return Some(utf16_bom);
3822 } else if let Some(non_bom) =
3823 self.variant.max_utf8_buffer_length_without_replacement(sum)
3824 {
3825 return Some(std::cmp::max(utf16_bom, non_bom));
3826 }
3827 }
3828 }
3829 }
3830 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3831 }
3832 None
3833 }
3834
3835 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3836 /// replaced with the REPLACEMENT CHARACTER.
3837 ///
3838 /// See the documentation of the struct for documentation for `decode_*`
3839 /// methods collectively.
3840 ///
3841 /// Available via the C wrapper.
decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)3842 pub fn decode_to_utf8(
3843 &mut self,
3844 src: &[u8],
3845 dst: &mut [u8],
3846 last: bool,
3847 ) -> (CoderResult, usize, usize, bool) {
3848 let mut had_errors = false;
3849 let mut total_read = 0usize;
3850 let mut total_written = 0usize;
3851 loop {
3852 let (result, read, written) = self.decode_to_utf8_without_replacement(
3853 &src[total_read..],
3854 &mut dst[total_written..],
3855 last,
3856 );
3857 total_read += read;
3858 total_written += written;
3859 match result {
3860 DecoderResult::InputEmpty => {
3861 return (
3862 CoderResult::InputEmpty,
3863 total_read,
3864 total_written,
3865 had_errors,
3866 );
3867 }
3868 DecoderResult::OutputFull => {
3869 return (
3870 CoderResult::OutputFull,
3871 total_read,
3872 total_written,
3873 had_errors,
3874 );
3875 }
3876 DecoderResult::Malformed(_, _) => {
3877 had_errors = true;
3878 // There should always be space for the U+FFFD, because
3879 // otherwise we'd have gotten OutputFull already.
3880 // XXX: is the above comment actually true for UTF-8 itself?
3881 // TODO: Consider having fewer bound checks here.
3882 dst[total_written] = 0xEFu8;
3883 total_written += 1;
3884 dst[total_written] = 0xBFu8;
3885 total_written += 1;
3886 dst[total_written] = 0xBDu8;
3887 total_written += 1;
3888 }
3889 }
3890 }
3891 }
3892
3893 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3894 /// replaced with the REPLACEMENT CHARACTER with type system signaling
3895 /// of UTF-8 validity.
3896 ///
3897 /// This methods calls `decode_to_utf8` and then zeroes
3898 /// out up to three bytes that aren't logically part of the write in order
3899 /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3900 ///
3901 /// See the documentation of the struct for documentation for `decode_*`
3902 /// methods collectively.
3903 ///
3904 /// Available to Rust only.
decode_to_str( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (CoderResult, usize, usize, bool)3905 pub fn decode_to_str(
3906 &mut self,
3907 src: &[u8],
3908 dst: &mut str,
3909 last: bool,
3910 ) -> (CoderResult, usize, usize, bool) {
3911 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3912 let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3913 let len = bytes.len();
3914 let mut trail = written;
3915 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3916 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3917 // encodings to avoid overwriting here.
3918 if self.encoding != UTF_8 {
3919 let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3920 while trail < max {
3921 bytes[trail] = 0;
3922 trail += 1;
3923 }
3924 }
3925 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3926 bytes[trail] = 0;
3927 trail += 1;
3928 }
3929 (result, read, written, replaced)
3930 }
3931
3932 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3933 /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
3934 ///
3935 /// Like the others, this method follows the logic that the output buffer is
3936 /// caller-allocated. This method treats the capacity of the `String` as
3937 /// the output limit. That is, this method guarantees not to cause a
3938 /// reallocation of the backing buffer of `String`.
3939 ///
3940 /// The return value is a tuple that contains the `DecoderResult`, the
3941 /// number of bytes read and a boolean indicating whether replacements
3942 /// were done. The number of bytes written is signaled via the length of
3943 /// the `String` changing.
3944 ///
3945 /// See the documentation of the struct for documentation for `decode_*`
3946 /// methods collectively.
3947 ///
3948 /// Available to Rust only.
decode_to_string( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (CoderResult, usize, bool)3949 pub fn decode_to_string(
3950 &mut self,
3951 src: &[u8],
3952 dst: &mut String,
3953 last: bool,
3954 ) -> (CoderResult, usize, bool) {
3955 unsafe {
3956 let vec = dst.as_mut_vec();
3957 let old_len = vec.len();
3958 let capacity = vec.capacity();
3959 vec.set_len(capacity);
3960 let (result, read, written, replaced) =
3961 self.decode_to_utf8(src, &mut vec[old_len..], last);
3962 vec.set_len(old_len + written);
3963 (result, read, replaced)
3964 }
3965 }
3966
// Invocation of `public_decode_function!` for UTF-8 output. The arguments
// are the doc comment, the method name `decode_to_utf8_without_replacement`
// (called by `decode_to_utf8` above as
// `(src, dst, last) -> (DecoderResult, usize, usize)`), five further method
// names and the output code unit type `u8`.
// NOTE(review): the macro definition is outside this view; presumably the
// first name is the public method being defined and the remaining names are
// helpers it dispatches to -- confirm against the macro.
public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf8_without_replacement,
                        decode_to_utf8_raw,
                        decode_to_utf8_checking_end,
                        decode_to_utf8_after_one_potential_bom_byte,
                        decode_to_utf8_after_two_potential_bom_bytes,
                        decode_to_utf8_checking_end_with_offset,
                        u8);
3983
3984 /// Incrementally decode a byte stream into UTF-8 with type system signaling
3985 /// of UTF-8 validity.
3986 ///
3987 /// This methods calls `decode_to_utf8` and then zeroes out up to three
3988 /// bytes that aren't logically part of the write in order to retain the
3989 /// UTF-8 validity even for the unwritten part of the buffer.
3990 ///
3991 /// See the documentation of the struct for documentation for `decode_*`
3992 /// methods collectively.
3993 ///
3994 /// Available to Rust only.
decode_to_str_without_replacement( &mut self, src: &[u8], dst: &mut str, last: bool, ) -> (DecoderResult, usize, usize)3995 pub fn decode_to_str_without_replacement(
3996 &mut self,
3997 src: &[u8],
3998 dst: &mut str,
3999 last: bool,
4000 ) -> (DecoderResult, usize, usize) {
4001 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4002 let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4003 let len = bytes.len();
4004 let mut trail = written;
4005 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4006 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4007 // encodings to avoid overwriting here.
4008 if self.encoding != UTF_8 {
4009 let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4010 while trail < max {
4011 bytes[trail] = 0;
4012 trail += 1;
4013 }
4014 }
4015 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4016 bytes[trail] = 0;
4017 trail += 1;
4018 }
4019 (result, read, written)
4020 }
4021
4022 /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4023 ///
4024 /// Like the others, this method follows the logic that the output buffer is
4025 /// caller-allocated. This method treats the capacity of the `String` as
4026 /// the output limit. That is, this method guarantees not to cause a
4027 /// reallocation of the backing buffer of `String`.
4028 ///
4029 /// The return value is a pair that contains the `DecoderResult` and the
4030 /// number of bytes read. The number of bytes written is signaled via
4031 /// the length of the `String` changing.
4032 ///
4033 /// See the documentation of the struct for documentation for `decode_*`
4034 /// methods collectively.
4035 ///
4036 /// Available to Rust only.
decode_to_string_without_replacement( &mut self, src: &[u8], dst: &mut String, last: bool, ) -> (DecoderResult, usize)4037 pub fn decode_to_string_without_replacement(
4038 &mut self,
4039 src: &[u8],
4040 dst: &mut String,
4041 last: bool,
4042 ) -> (DecoderResult, usize) {
4043 unsafe {
4044 let vec = dst.as_mut_vec();
4045 let old_len = vec.len();
4046 let capacity = vec.capacity();
4047 vec.set_len(capacity);
4048 let (result, read, written) =
4049 self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4050 vec.set_len(old_len + written);
4051 (result, read)
4052 }
4053 }
4054
4055 /// Query the worst-case UTF-16 output size (with or without replacement).
4056 ///
4057 /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4058 /// that will not overflow given the current state of the decoder and
4059 /// `byte_length` number of additional input bytes or `None` if `usize`
4060 /// would overflow.
4061 ///
4062 /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4063 /// return value of this method applies also in the
4064 /// `_without_replacement` case.
4065 ///
4066 /// Available via the C wrapper.
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>4067 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4068 // Need to consider a) the decoder morphing due to the BOM and b) a partial
4069 // BOM getting pushed to the underlying decoder.
4070 match self.life_cycle {
4071 DecoderLifeCycle::Converting
4072 | DecoderLifeCycle::AtUtf8Start
4073 | DecoderLifeCycle::AtUtf16LeStart
4074 | DecoderLifeCycle::AtUtf16BeStart => {
4075 return self.variant.max_utf16_buffer_length(byte_length);
4076 }
4077 DecoderLifeCycle::AtStart => {
4078 if let Some(utf8_bom) = byte_length.checked_add(1) {
4079 if let Some(utf16_bom) =
4080 checked_add(1, checked_div(byte_length.checked_add(1), 2))
4081 {
4082 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
4083 let encoding = self.encoding();
4084 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4085 // No need to consider the internal state of the underlying decoder,
4086 // because it is at start, because no data has reached it yet.
4087 return Some(utf_bom);
4088 } else if let Some(non_bom) =
4089 self.variant.max_utf16_buffer_length(byte_length)
4090 {
4091 return Some(std::cmp::max(utf_bom, non_bom));
4092 }
4093 }
4094 }
4095 }
4096 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4097 // Add two bytes even when only one byte has been seen,
4098 // because the one byte can become a lead byte in multibyte
4099 // decoders, but only after the decoder has been queried
4100 // for max length, so the decoder's own logic for adding
4101 // one for a pending lead cannot work.
4102 if let Some(sum) = byte_length.checked_add(2) {
4103 if let Some(utf8_bom) = sum.checked_add(1) {
4104 if self.encoding() == UTF_8 {
4105 // No need to consider the internal state of the underlying decoder,
4106 // because it is at start, because no data has reached it yet.
4107 return Some(utf8_bom);
4108 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4109 return Some(std::cmp::max(utf8_bom, non_bom));
4110 }
4111 }
4112 }
4113 }
4114 DecoderLifeCycle::ConvertingWithPendingBB => {
4115 if let Some(sum) = byte_length.checked_add(2) {
4116 return self.variant.max_utf16_buffer_length(sum);
4117 }
4118 }
4119 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4120 // Add two bytes even when only one byte has been seen,
4121 // because the one byte can become a lead byte in multibyte
4122 // decoders, but only after the decoder has been queried
4123 // for max length, so the decoder's own logic for adding
4124 // one for a pending lead cannot work.
4125 if let Some(sum) = byte_length.checked_add(2) {
4126 if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4127 let encoding = self.encoding();
4128 if encoding == UTF_16LE || encoding == UTF_16BE {
4129 // No need to consider the internal state of the underlying decoder,
4130 // because it is at start, because no data has reached it yet.
4131 return Some(utf16_bom);
4132 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4133 return Some(std::cmp::max(utf16_bom, non_bom));
4134 }
4135 }
4136 }
4137 }
4138 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4139 }
4140 None
4141 }
4142
4143 /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4144 /// replaced with the REPLACEMENT CHARACTER.
4145 ///
4146 /// See the documentation of the struct for documentation for `decode_*`
4147 /// methods collectively.
4148 ///
4149 /// Available via the C wrapper.
decode_to_utf16( &mut self, src: &[u8], dst: &mut [u16], last: bool, ) -> (CoderResult, usize, usize, bool)4150 pub fn decode_to_utf16(
4151 &mut self,
4152 src: &[u8],
4153 dst: &mut [u16],
4154 last: bool,
4155 ) -> (CoderResult, usize, usize, bool) {
4156 let mut had_errors = false;
4157 let mut total_read = 0usize;
4158 let mut total_written = 0usize;
4159 loop {
4160 let (result, read, written) = self.decode_to_utf16_without_replacement(
4161 &src[total_read..],
4162 &mut dst[total_written..],
4163 last,
4164 );
4165 total_read += read;
4166 total_written += written;
4167 match result {
4168 DecoderResult::InputEmpty => {
4169 return (
4170 CoderResult::InputEmpty,
4171 total_read,
4172 total_written,
4173 had_errors,
4174 );
4175 }
4176 DecoderResult::OutputFull => {
4177 return (
4178 CoderResult::OutputFull,
4179 total_read,
4180 total_written,
4181 had_errors,
4182 );
4183 }
4184 DecoderResult::Malformed(_, _) => {
4185 had_errors = true;
4186 // There should always be space for the U+FFFD, because
4187 // otherwise we'd have gotten OutputFull already.
4188 dst[total_written] = 0xFFFD;
4189 total_written += 1;
4190 }
4191 }
4192 }
4193 }
4194
// Invocation of `public_decode_function!` for UTF-16 output. The arguments
// are the doc comment, the method name `decode_to_utf16_without_replacement`
// (called by `decode_to_utf16` above as
// `(src, dst, last) -> (DecoderResult, usize, usize)`), five further method
// names and the output code unit type `u16`.
// NOTE(review): the macro definition is outside this view; presumably the
// first name is the public method being defined and the remaining names are
// helpers it dispatches to -- confirm against the macro.
public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf16_without_replacement,
                        decode_to_utf16_raw,
                        decode_to_utf16_checking_end,
                        decode_to_utf16_after_one_potential_bom_byte,
                        decode_to_utf16_after_two_potential_bom_bytes,
                        decode_to_utf16_checking_end_with_offset,
                        u16);
4211
4212 /// Checks for compatibility with storing Unicode scalar values as unsigned
4213 /// bytes taking into account the state of the decoder.
4214 ///
4215 /// Returns `None` if the decoder is not in a neutral state, including waiting
4216 /// for the BOM or if the encoding is never Latin-byte-compatible.
4217 ///
4218 /// Otherwise returns the index of the first byte whose unsigned value doesn't
4219 /// directly correspond to the decoded Unicode scalar value, or the length
4220 /// of the input if all bytes in the input decode directly to scalar values
4221 /// corresponding to the unsigned byte values.
4222 ///
4223 /// Does not change the state of the decoder.
4224 ///
4225 /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4226 /// storage optimizations.
4227 ///
4228 /// Available via the C wrapper.
latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize>4229 pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4230 match self.life_cycle {
4231 DecoderLifeCycle::Converting => {
4232 return self.variant.latin1_byte_compatible_up_to(bytes);
4233 }
4234 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4235 _ => None,
4236 }
4237 }
4238 }
4239
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps a Basic Multilingual Plane code unit in `Unmappable`.
    ///
    /// # Panics
    ///
    /// Panics if `bmp` is not a Unicode scalar value, i.e. if it is a
    /// surrogate (U+D800 to U+DFFF).
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let code_point = u32::from(bmp);
        let scalar = ::std::char::from_u32(code_point).unwrap();
        EncoderResult::Unmappable(scalar)
    }
}
4271
4272 /// A converter that encodes a Unicode stream into bytes according to a
4273 /// character encoding in a streaming (incremental) manner.
4274 ///
4275 /// The various `encode_*` methods take an input buffer (`src`) and an output
4276 /// buffer `dst` both of which are caller-allocated. There are variants for
4277 /// both UTF-8 and UTF-16 input buffers.
4278 ///
/// An `encode_*` method encodes characters from `src` into bytes
/// stored into `dst` until one of the following three things happens:
4281 ///
4282 /// 1. An unmappable character is encountered (`*_without_replacement` variants
4283 /// only).
4284 ///
/// 2. The output buffer has been filled so near capacity that the encoder
/// cannot be sure that processing an additional character of input wouldn't
/// cause so much output that the output buffer would overflow.
4288 ///
4289 /// 3. All the input characters have been processed.
4290 ///
/// The `encode_*` method then returns a tuple of a status indicating which one
4292 /// of the three reasons to return happened, how many input code units (`u8`
4293 /// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4294 /// how many output bytes were written (except when encoding into `Vec<u8>`,
4295 /// whose length change indicates this), and in the case of the variants that
4296 /// perform replacement, a boolean indicating whether an unmappable
4297 /// character was replaced with a numeric character reference during the call.
4298 ///
4299 /// The number of bytes "written" is what's logically written. Garbage may be
4300 /// written in the output buffer beyond the point logically written to.
4301 ///
4302 /// In the case of the methods whose name ends with
4303 /// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4304 /// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4305 /// the three cases listed above).
4306 ///
4307 /// In the case of methods whose name does not end with
4308 /// `*_without_replacement`, unmappable characters are automatically replaced
4309 /// with the corresponding numeric character references and unmappable
4310 /// characters do not cause the methods to return early.
4311 ///
4312 /// When encoding from UTF-8 without replacement, the methods are guaranteed
4313 /// not to return indicating that more output space is needed if the length
4314 /// of the output buffer is at least the length returned by
4315 /// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4316 /// UTF-8 with replacement, the length of the output buffer that guarantees the
4317 /// methods not to return indicating that more output space is needed in the
4318 /// absence of unmappable characters is given by
4319 /// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4320 /// UTF-16 without replacement, the methods are guaranteed not to return
4321 /// indicating that more output space is needed if the length of the output
4322 /// buffer is at least the length returned by
4323 /// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4325 /// guarantees the methods not to return indicating that more output space is
4326 /// needed in the absence of unmappable characters is given by
4327 /// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4328 /// When encoding with replacement, applications are not expected to size the
4329 /// buffer for the worst case ahead of time but to resize the buffer if there
4330 /// are unmappable characters. This is why max length queries are only available
4331 /// for the case where there are no unmappable characters.
4332 ///
4333 /// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4334 /// calling from Rust, the type system takes care of this.) When encoding from
4335 /// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4336 /// CHARACTERS. Therefore, in order for astral characters not to turn into a
4337 /// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4338 /// are not split across input buffer boundaries.
4339 ///
4340 /// After an `encode_*` call returns, the output produced so far, taken as a
4341 /// whole from the start of the stream, is guaranteed to consist of a valid
4342 /// byte sequence in the target encoding. (I.e. the code unit sequence for a
4343 /// character is guaranteed not to be split across output buffers. However, due
4344 /// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4345 /// from the start for it to be valid. For other encodings, the validity holds
4346 /// on a per-output buffer basis.)
4347 ///
4348 /// The boolean argument `last` indicates that the end of the stream is reached
4349 /// when all the characters in `src` have been consumed. This argument is needed
4350 /// for ISO-2022-JP and is ignored for other encodings.
4351 ///
/// An `Encoder` object can be used to incrementally encode text into a byte stream.
4353 ///
4354 /// During the processing of a single stream, the caller must call `encode_*`
4355 /// zero or more times with `last` set to `false` and then call `encode_*` at
4356 /// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4357 /// the processing of the stream has ended. Otherwise, the caller must call
4358 /// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4359 /// as a fatal error).
4360 ///
4361 /// Once the stream has ended, the `Encoder` object must not be used anymore.
4362 /// That is, you need to create another one to process another stream.
4363 ///
4364 /// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4365 /// and the caller does not wish to treat it as a fatal error, the input buffer
4366 /// `src` may not have been completely consumed. In that case, the caller must
4367 /// pass the unconsumed contents of `src` to `encode_*` again upon the next
4368 /// call.
4369 ///
4370 /// [1]: enum.EncoderResult.html
4371 /// [2]: #method.max_buffer_length_from_utf8_without_replacement
4372 /// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4373 /// [4]: #method.max_buffer_length_from_utf16_without_replacement
4374 /// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4375 ///
4376 /// # Infinite loops
4377 ///
4378 /// When converting with a fixed-size output buffer whose size is too small to
4379 /// accommodate one character of output, an infinite loop ensues. When
4380 /// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    // The encoding this encoder is for; handed back by `encoding()`.
    encoding: &'static Encoding,
    // The encoding-specific encoder implementation that the methods of this
    // type delegate the actual conversion and length queries to.
    variant: VariantEncoder,
}
4386
4387 impl Encoder {
/// Builds an `Encoder` that pairs the encoding `enc` with the
/// encoding-specific implementation `encoder`.
///
/// This constructor has no `pub` qualifier, so it is not part of the public
/// API. NOTE(review): presumably invoked by the per-encoding encoder
/// constructors elsewhere in the crate -- confirm.
fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
    Encoder {
        encoding: enc,
        variant: encoder,
    }
}
4394
/// The `Encoding` this `Encoder` is for.
///
/// Returns the `&'static Encoding` that was stored in this encoder when it
/// was constructed.
#[inline]
pub fn encoding(&self) -> &'static Encoding {
    self.encoding
}
4400
/// Returns `true` if this is an ISO-2022-JP encoder that's not in the
/// ASCII state and `false` otherwise.
///
/// The answer is obtained from the encoding-specific variant encoder.
#[inline]
pub fn has_pending_state(&self) -> bool {
    self.variant.has_pending_state()
}
4407
4408 /// Query the worst-case output size when encoding from UTF-8 with
4409 /// replacement.
4410 ///
4411 /// Returns the size of the output buffer in bytes that will not overflow
4412 /// given the current state of the encoder and `byte_length` number of
4413 /// additional input code units if there are no unmappable characters in
4414 /// the input or `None` if `usize` would overflow.
4415 ///
4416 /// Available via the C wrapper.
max_buffer_length_from_utf8_if_no_unmappables( &self, byte_length: usize, ) -> Option<usize>4417 pub fn max_buffer_length_from_utf8_if_no_unmappables(
4418 &self,
4419 byte_length: usize,
4420 ) -> Option<usize> {
4421 checked_add(
4422 if self.encoding().can_encode_everything() {
4423 0
4424 } else {
4425 NCR_EXTRA
4426 },
4427 self.max_buffer_length_from_utf8_without_replacement(byte_length),
4428 )
4429 }
4430
/// Query the worst-case output size when encoding from UTF-8 without
/// replacement.
///
/// Returns the size of the output buffer in bytes that will not overflow
/// given the current state of the encoder and `byte_length` number of
/// additional input code units or `None` if `usize` would overflow.
///
/// The computation is delegated to the encoding-specific variant encoder.
///
/// Available via the C wrapper.
pub fn max_buffer_length_from_utf8_without_replacement(
    &self,
    byte_length: usize,
) -> Option<usize> {
    self.variant
        .max_buffer_length_from_utf8_without_replacement(byte_length)
}
4446
4447 /// Incrementally encode into byte stream from UTF-8 with unmappable
4448 /// characters replaced with HTML (decimal) numeric character references.
4449 ///
4450 /// See the documentation of the struct for documentation for `encode_*`
4451 /// methods collectively.
4452 ///
4453 /// Available via the C wrapper.
encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4454 pub fn encode_from_utf8(
4455 &mut self,
4456 src: &str,
4457 dst: &mut [u8],
4458 last: bool,
4459 ) -> (CoderResult, usize, usize, bool) {
4460 let dst_len = dst.len();
4461 let effective_dst_len = if self.encoding().can_encode_everything() {
4462 dst_len
4463 } else {
4464 if dst_len < NCR_EXTRA {
4465 if src.is_empty() && !(last && self.has_pending_state()) {
4466 return (CoderResult::InputEmpty, 0, 0, false);
4467 }
4468 return (CoderResult::OutputFull, 0, 0, false);
4469 }
4470 dst_len - NCR_EXTRA
4471 };
4472 let mut had_unmappables = false;
4473 let mut total_read = 0usize;
4474 let mut total_written = 0usize;
4475 loop {
4476 let (result, read, written) = self.encode_from_utf8_without_replacement(
4477 &src[total_read..],
4478 &mut dst[total_written..effective_dst_len],
4479 last,
4480 );
4481 total_read += read;
4482 total_written += written;
4483 match result {
4484 EncoderResult::InputEmpty => {
4485 return (
4486 CoderResult::InputEmpty,
4487 total_read,
4488 total_written,
4489 had_unmappables,
4490 );
4491 }
4492 EncoderResult::OutputFull => {
4493 return (
4494 CoderResult::OutputFull,
4495 total_read,
4496 total_written,
4497 had_unmappables,
4498 );
4499 }
4500 EncoderResult::Unmappable(unmappable) => {
4501 had_unmappables = true;
4502 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4503 debug_assert_ne!(self.encoding(), UTF_16BE);
4504 debug_assert_ne!(self.encoding(), UTF_16LE);
4505 // Additionally, Iso2022JpEncoder is responsible for
4506 // transitioning to ASCII when returning with Unmappable.
4507 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4508 if total_written >= effective_dst_len {
4509 if total_read == src.len() && !(last && self.has_pending_state()) {
4510 return (
4511 CoderResult::InputEmpty,
4512 total_read,
4513 total_written,
4514 had_unmappables,
4515 );
4516 }
4517 return (
4518 CoderResult::OutputFull,
4519 total_read,
4520 total_written,
4521 had_unmappables,
4522 );
4523 }
4524 }
4525 }
4526 }
4527 }
4528
4529 /// Incrementally encode into byte stream from UTF-8 with unmappable
4530 /// characters replaced with HTML (decimal) numeric character references.
4531 ///
4532 /// See the documentation of the struct for documentation for `encode_*`
4533 /// methods collectively.
4534 ///
4535 /// Available to Rust only.
encode_from_utf8_to_vec( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (CoderResult, usize, bool)4536 pub fn encode_from_utf8_to_vec(
4537 &mut self,
4538 src: &str,
4539 dst: &mut Vec<u8>,
4540 last: bool,
4541 ) -> (CoderResult, usize, bool) {
4542 unsafe {
4543 let old_len = dst.len();
4544 let capacity = dst.capacity();
4545 dst.set_len(capacity);
4546 let (result, read, written, replaced) =
4547 self.encode_from_utf8(src, &mut dst[old_len..], last);
4548 dst.set_len(old_len + written);
4549 (result, read, replaced)
4550 }
4551 }
4552
/// Incrementally encode into byte stream from UTF-8 _without replacement_.
///
/// The work is delegated to the encoding-specific variant encoder. The
/// returned tuple is the `EncoderResult` followed by two counts; callers
/// in this file treat them as "amount of `src` read" and "amount of `dst`
/// written", in that order.
///
/// See the documentation of the struct for documentation for `encode_*`
/// methods collectively.
///
/// Available via the C wrapper.
pub fn encode_from_utf8_without_replacement(
    &mut self,
    src: &str,
    dst: &mut [u8],
    last: bool,
) -> (EncoderResult, usize, usize) {
    self.variant.encode_from_utf8_raw(src, dst, last)
}
4567
4568 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4569 ///
4570 /// See the documentation of the struct for documentation for `encode_*`
4571 /// methods collectively.
4572 ///
4573 /// Available to Rust only.
encode_from_utf8_to_vec_without_replacement( &mut self, src: &str, dst: &mut Vec<u8>, last: bool, ) -> (EncoderResult, usize)4574 pub fn encode_from_utf8_to_vec_without_replacement(
4575 &mut self,
4576 src: &str,
4577 dst: &mut Vec<u8>,
4578 last: bool,
4579 ) -> (EncoderResult, usize) {
4580 unsafe {
4581 let old_len = dst.len();
4582 let capacity = dst.capacity();
4583 dst.set_len(capacity);
4584 let (result, read, written) =
4585 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4586 dst.set_len(old_len + written);
4587 (result, read)
4588 }
4589 }
4590
4591 /// Query the worst-case output size when encoding from UTF-16 with
4592 /// replacement.
4593 ///
4594 /// Returns the size of the output buffer in bytes that will not overflow
4595 /// given the current state of the encoder and `u16_length` number of
4596 /// additional input code units if there are no unmappable characters in
4597 /// the input or `None` if `usize` would overflow.
4598 ///
4599 /// Available via the C wrapper.
max_buffer_length_from_utf16_if_no_unmappables( &self, u16_length: usize, ) -> Option<usize>4600 pub fn max_buffer_length_from_utf16_if_no_unmappables(
4601 &self,
4602 u16_length: usize,
4603 ) -> Option<usize> {
4604 checked_add(
4605 if self.encoding().can_encode_everything() {
4606 0
4607 } else {
4608 NCR_EXTRA
4609 },
4610 self.max_buffer_length_from_utf16_without_replacement(u16_length),
4611 )
4612 }
4613
/// Query the worst-case output size when encoding from UTF-16 without
/// replacement.
///
/// Returns the size of the output buffer in bytes that will not overflow
/// given the current state of the encoder and `u16_length` number of
/// additional input code units or `None` if `usize` would overflow.
///
/// The computation is delegated to the encoding-specific variant encoder.
///
/// Available via the C wrapper.
pub fn max_buffer_length_from_utf16_without_replacement(
    &self,
    u16_length: usize,
) -> Option<usize> {
    self.variant
        .max_buffer_length_from_utf16_without_replacement(u16_length)
}
4629
4630 /// Incrementally encode into byte stream from UTF-16 with unmappable
4631 /// characters replaced with HTML (decimal) numeric character references.
4632 ///
4633 /// See the documentation of the struct for documentation for `encode_*`
4634 /// methods collectively.
4635 ///
4636 /// Available via the C wrapper.
encode_from_utf16( &mut self, src: &[u16], dst: &mut [u8], last: bool, ) -> (CoderResult, usize, usize, bool)4637 pub fn encode_from_utf16(
4638 &mut self,
4639 src: &[u16],
4640 dst: &mut [u8],
4641 last: bool,
4642 ) -> (CoderResult, usize, usize, bool) {
4643 let dst_len = dst.len();
4644 let effective_dst_len = if self.encoding().can_encode_everything() {
4645 dst_len
4646 } else {
4647 if dst_len < NCR_EXTRA {
4648 if src.is_empty() && !(last && self.has_pending_state()) {
4649 return (CoderResult::InputEmpty, 0, 0, false);
4650 }
4651 return (CoderResult::OutputFull, 0, 0, false);
4652 }
4653 dst_len - NCR_EXTRA
4654 };
4655 let mut had_unmappables = false;
4656 let mut total_read = 0usize;
4657 let mut total_written = 0usize;
4658 loop {
4659 let (result, read, written) = self.encode_from_utf16_without_replacement(
4660 &src[total_read..],
4661 &mut dst[total_written..effective_dst_len],
4662 last,
4663 );
4664 total_read += read;
4665 total_written += written;
4666 match result {
4667 EncoderResult::InputEmpty => {
4668 return (
4669 CoderResult::InputEmpty,
4670 total_read,
4671 total_written,
4672 had_unmappables,
4673 );
4674 }
4675 EncoderResult::OutputFull => {
4676 return (
4677 CoderResult::OutputFull,
4678 total_read,
4679 total_written,
4680 had_unmappables,
4681 );
4682 }
4683 EncoderResult::Unmappable(unmappable) => {
4684 had_unmappables = true;
4685 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4686 // There are no UTF-16 encoders and even if there were,
4687 // they'd never have unmappables.
4688 debug_assert_ne!(self.encoding(), UTF_16BE);
4689 debug_assert_ne!(self.encoding(), UTF_16LE);
4690 // Additionally, Iso2022JpEncoder is responsible for
4691 // transitioning to ASCII when returning with Unmappable
4692 // from the jis0208 state. That is, when we encode
4693 // ISO-2022-JP and come here, the encoder is in either the
4694 // ASCII or the Roman state. We are allowed to generate any
4695 // printable ASCII excluding \ and ~.
4696 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4697 if total_written >= effective_dst_len {
4698 if total_read == src.len() && !(last && self.has_pending_state()) {
4699 return (
4700 CoderResult::InputEmpty,
4701 total_read,
4702 total_written,
4703 had_unmappables,
4704 );
4705 }
4706 return (
4707 CoderResult::OutputFull,
4708 total_read,
4709 total_written,
4710 had_unmappables,
4711 );
4712 }
4713 }
4714 }
4715 }
4716 }
4717
/// Incrementally encode into byte stream from UTF-16 _without replacement_.
///
/// The work is delegated to the encoding-specific variant encoder. The
/// returned tuple is the `EncoderResult` followed by two counts; callers
/// in this file treat them as "amount of `src` read" and "amount of `dst`
/// written", in that order.
///
/// See the documentation of the struct for documentation for `encode_*`
/// methods collectively.
///
/// Available via the C wrapper.
pub fn encode_from_utf16_without_replacement(
    &mut self,
    src: &[u16],
    dst: &mut [u8],
    last: bool,
) -> (EncoderResult, usize, usize) {
    self.variant.encode_from_utf16_raw(src, dst, last)
}
4732 }
4733
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#`, the decimal value of the code point of `unmappable`, and `;`
/// to the start of `dst` and returns the number of bytes written (the number
/// of decimal digits plus 3).
///
/// The length is derived from the actual digit count, so every code point,
/// including one with a single decimal digit, yields a fully written
/// reference of exactly the returned length.
///
/// Panics if `dst` is shorter than the reference to be written.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let mut number = unmappable as u32;
    // Count the decimal digits of the code point (at least one).
    let mut digits = 1usize;
    let mut rest = number / 10;
    while rest != 0 {
        digits += 1;
        rest /= 10;
    }
    // Review the outcome of https://github.com/whatwg/encoding/issues/15
    // to see if code points below 100 can actually reach this function.
    // 3 is the length of "&#" and ";".
    let len = digits + 3;
    debug_assert!(len <= dst.len());
    // Write the terminator first so that a too-short `dst` panics before
    // any byte has been modified.
    dst[len - 1] = b';';
    // Fill the digit positions from the least significant digit leftwards.
    for slot in dst[2..len - 1].iter_mut().rev() {
        *slot = (number % 10) as u8 + b'0';
        number /= 10;
    }
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4772
/// Tests `start <= i < end` for `u16` with a single comparison.
///
/// Values below `start` wrap around to large offsets, so one unsigned
/// comparison against the width of the range covers both bounds. Expects
/// `start <= end`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4777
/// Tests `start <= i < end` for `u32` with a single comparison.
///
/// Values below `start` wrap around to large offsets, so one unsigned
/// comparison against the width of the range covers both bounds. Expects
/// `start <= end`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4782
/// Tests `start <= i <= end` for `u8` with a single comparison.
///
/// Values below `start` wrap around to large offsets, so one unsigned
/// comparison against the width of the range covers both bounds. Expects
/// `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4787
/// Tests `start <= i <= end` for `u16` with a single comparison.
///
/// Values below `start` wrap around to large offsets, so one unsigned
/// comparison against the width of the range covers both bounds. Expects
/// `start <= end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4792
/// Tests `start <= i <= end` for `u32` with a single comparison.
///
/// Values below `start` wrap around to large offsets, so one unsigned
/// comparison against the width of the range covers both bounds. Expects
/// `start <= end`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4797
/// Tests `start <= i <= end` for `usize` with a single comparison.
///
/// Values below `start` wrap around to large offsets, so one unsigned
/// comparison against the width of the range covers both bounds. Expects
/// `start <= end`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4802
/// Adds `num` to the value inside `opt` with overflow checking.
///
/// Returns `None` if `opt` is `None` or if the sum would overflow `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4811
4812 #[inline(always)]
checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize>4813 fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4814 if let Some(n) = one {
4815 checked_add(n, other)
4816 } else {
4817 None
4818 }
4819 }
4820
/// Multiplies the value inside `opt` by `num` with overflow checking.
///
/// Returns `None` if `opt` is `None` or if the product would overflow
/// `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4829
/// Divides the value inside `opt` by `num`.
///
/// Returns `None` if `opt` is `None` or if `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4838
/// Rounds the value inside `opt` up to the next power of two with overflow
/// checking.
///
/// Returns `None` if `opt` is `None` or if the next power of two does not
/// fit in `usize`. Using the checked variant avoids a panic in debug builds
/// and a wrap-around to `0` in release builds for such values.
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4843
/// Returns the smaller of two optional values, treating `None` as "no
/// value" rather than as a failure.
///
/// If only one operand is `Some`, that operand is returned; if both are
/// `None`, the result is `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::std::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4856
4857 // ############## TESTS ###############
4858
// Struct that is only compiled for test builds with the `serde` feature
// enabled. It derives `Serialize` and `Deserialize` and holds a
// `&'static Encoding` next to plain data fields.
// NOTE(review): presumably the fixture for serde round-trip tests that live
// outside this part of the file -- confirm.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    // The encoding reference carried through (de)serialization.
    enc: &'static Encoding,
}
4866
4867 #[cfg(test)]
4868 mod test_labels_names;
4869
4870 #[cfg(test)]
4871 mod tests {
4872 use super::*;
4873 use std::borrow::Cow;
4874
sniff_to_utf16( initial_encoding: &'static Encoding, expected_encoding: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize], )4875 fn sniff_to_utf16(
4876 initial_encoding: &'static Encoding,
4877 expected_encoding: &'static Encoding,
4878 bytes: &[u8],
4879 expect: &[u16],
4880 breaks: &[usize],
4881 ) {
4882 let mut decoder = initial_encoding.new_decoder();
4883
4884 let mut dest: Vec<u16> =
4885 Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4886 let capacity = dest.capacity();
4887 dest.resize(capacity, 0u16);
4888
4889 let mut total_written = 0usize;
4890 let mut start = 0usize;
4891 for br in breaks {
4892 let (result, read, written, _) =
4893 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4894 total_written += written;
4895 assert_eq!(read, *br - start);
4896 match result {
4897 CoderResult::InputEmpty => {}
4898 CoderResult::OutputFull => {
4899 unreachable!();
4900 }
4901 }
4902 start = *br;
4903 }
4904 let (result, read, written, _) =
4905 decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4906 total_written += written;
4907 match result {
4908 CoderResult::InputEmpty => {}
4909 CoderResult::OutputFull => {
4910 unreachable!();
4911 }
4912 }
4913 assert_eq!(read, bytes.len() - start);
4914 assert_eq!(total_written, expect.len());
4915 assert_eq!(&dest[..total_written], expect);
4916 assert_eq!(decoder.encoding(), expected_encoding);
4917 }
4918
4919 // Any copyright to the test code below this comment is dedicated to the
4920 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4921
/// Checks BOM sniffing of a windows-1252 decoder for inputs with and
/// without UTF-8 / UTF-16 BOMs, split at various buffer boundaries.
#[test]
fn test_bom_sniffing() {
    // Runs `sniff_to_utf16` once per break set, in the order given, always
    // starting from a windows-1252 decoder.
    let check = |expected_encoding: &'static Encoding,
                 bytes: &[u8],
                 expect: &[u16],
                 break_sets: &[&[usize]]| {
        for &breaks in break_sets {
            sniff_to_utf16(WINDOWS_1252, expected_encoding, bytes, expect, breaks);
        }
    };
    let unsplit: &[&[usize]] = &[&[]];
    let unsplit_then_split_at_1: &[&[usize]] = &[&[], &[1]];

    // ASCII
    check(WINDOWS_1252, b"\x61\x62", &[0x0061u16, 0x0062u16], unsplit);
    // UTF-8
    check(
        UTF_8,
        b"\xEF\xBB\xBF\x61\x62",
        &[0x0061u16, 0x0062u16],
        &[
            &[],
            &[1],
            &[2],
            &[3],
            &[4],
            &[2, 3],
            &[1, 2],
            &[1, 3],
            &[1, 2, 3, 4],
        ],
    );
    check(UTF_8, b"\xEF\xBB\xBF", &[], unsplit);
    // Not UTF-8
    check(
        WINDOWS_1252,
        b"\xEF\xBB\x61\x62",
        &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
        unsplit_then_split_at_1,
    );
    check(
        WINDOWS_1252,
        b"\xEF\x61\x62",
        &[0x00EFu16, 0x0061u16, 0x0062u16],
        unsplit_then_split_at_1,
    );
    check(
        WINDOWS_1252,
        b"\xEF\xBB",
        &[0x00EFu16, 0x00BBu16],
        unsplit_then_split_at_1,
    );
    check(WINDOWS_1252, b"\xEF", &[0x00EFu16], unsplit);
    // Not UTF-16
    check(
        WINDOWS_1252,
        b"\xFE\x61\x62",
        &[0x00FEu16, 0x0061u16, 0x0062u16],
        unsplit_then_split_at_1,
    );
    check(WINDOWS_1252, b"\xFE", &[0x00FEu16], unsplit);
    check(
        WINDOWS_1252,
        b"\xFF\x61\x62",
        &[0x00FFu16, 0x0061u16, 0x0062u16],
        unsplit_then_split_at_1,
    );
    check(WINDOWS_1252, b"\xFF", &[0x00FFu16], unsplit);
    // UTF-16
    check(UTF_16BE, b"\xFE\xFF", &[], unsplit_then_split_at_1);
    check(UTF_16LE, b"\xFF\xFE", &[], unsplit_then_split_at_1);
}
5078
/// Checks that `output_encoding()` and the encoding reported by a freshly
/// created encoder agree on the expected output encoding.
#[test]
fn test_output_encoding() {
    // (encoding, expected output encoding)
    let expectations: [(&'static Encoding, &'static Encoding); 5] = [
        (REPLACEMENT, UTF_8),
        (UTF_16BE, UTF_8),
        (UTF_16LE, UTF_8),
        (UTF_8, UTF_8),
        (WINDOWS_1252, WINDOWS_1252),
    ];
    for &(encoding, output) in expectations.iter() {
        assert_eq!(encoding.output_encoding(), output);
    }
    for &(encoding, output) in expectations.iter() {
        assert_eq!(encoding.new_encoder().encoding(), output);
    }
}
5092
/// Checks `Encoding::for_label` for case variation, surrounding whitespace,
/// and labels that must not resolve.
#[test]
fn test_label_resolution() {
    let resolves_to = |label: &[u8], expected: Option<&'static Encoding>| {
        assert_eq!(Encoding::for_label(label), expected);
    };
    resolves_to(b"utf-8", Some(UTF_8));
    resolves_to(b"UTF-8", Some(UTF_8));
    resolves_to(
        b" \t \n \x0C \n utf-8 \r \n \t \x0C ",
        Some(UTF_8),
    );
    resolves_to(b"utf-8 _", None);
    resolves_to(b"bogus", None);
    resolves_to(b"bogusbogusbogusbogus", None);
}
5105
/// Valid non-ASCII windows-1257 input decodes to an owned string without
/// errors and without changing the encoding.
#[test]
fn test_decode_valid_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "abc\u{20AC}\u{00E4}");
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5118
/// An unmapped windows-1257 byte decodes to U+FFFD in an owned string and
/// sets the error flag.
#[test]
fn test_decode_invalid_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(encoding, WINDOWS_1257);
    assert!(had_errors);
}
5131
/// ASCII-only input decoded as windows-1257 is returned borrowed.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
    let decoded = match cow {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(decoded, "abc");
    assert_eq!(encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5144
/// A UTF-8 BOM makes `decode` on windows-1257 switch to UTF-8; the valid
/// remainder is returned borrowed with the BOM removed.
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
    let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let decoded = match cow {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(decoded, "\u{20AC}\u{00E4}");
    assert_eq!(encoding, UTF_8);
    assert!(!had_errors);
}
5157
/// A UTF-8 BOM makes `decode` on windows-1257 switch to UTF-8; an invalid
/// byte in the remainder becomes U+FFFD in an owned string.
#[test]
fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
    let (cow, encoding, had_errors) =
        WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(encoding, UTF_8);
    assert!(had_errors);
}
5171
/// Valid UTF-8 with a BOM decoded as UTF-8 is returned borrowed with the
/// BOM removed.
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
    let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let decoded = match cow {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(decoded, "\u{20AC}\u{00E4}");
    assert_eq!(encoding, UTF_8);
    assert!(!had_errors);
}
5184
/// Invalid UTF-8 with a BOM decoded as UTF-8 yields an owned string with
/// U+FFFD in place of the bad byte and the BOM removed.
#[test]
fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
    let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(encoding, UTF_8);
    assert!(had_errors);
}
5197
/// `decode_with_bom_removal` on UTF-8 strips the UTF-8 BOM and returns the
/// valid remainder borrowed.
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
    let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let decoded = match cow {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(decoded, "\u{20AC}\u{00E4}");
    assert!(!had_errors);
}
5209
/// `decode_with_bom_removal` on windows-1257 does not treat the UTF-8 BOM
/// specially: every byte is decoded as windows-1257.
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
    let (cow, had_errors) =
        WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(
        decoded,
        "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
    );
    assert!(!had_errors);
}
5225
/// Valid non-ASCII windows-1257 input decodes to an owned string without
/// errors via `decode_with_bom_removal`.
#[test]
fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
    let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "abc\u{20AC}\u{00E4}");
    assert!(!had_errors);
}
5237
/// An unmapped windows-1257 byte becomes U+FFFD and sets the error flag via
/// `decode_with_bom_removal`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
    let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert!(had_errors);
}
5249
/// ASCII-only input is returned borrowed by `decode_with_bom_removal` on
/// windows-1257.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
    let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
    let decoded = match cow {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(decoded, "abc");
    assert!(!had_errors);
}
5261
/// Without BOM handling, the UTF-8 BOM is kept as U+FEFF and valid input is
/// returned borrowed.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
    let (cow, had_errors) =
        UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let decoded = match cow {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(decoded, "\u{FEFF}\u{20AC}\u{00E4}");
    assert!(!had_errors);
}
5274
/// Without BOM handling, the UTF-8 BOM is kept as U+FEFF and an invalid
/// byte becomes U+FFFD in an owned string.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
    let (cow, had_errors) =
        UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
    assert!(had_errors);
}
5287
/// Valid non-ASCII windows-1257 input decodes to an owned string without
/// errors via `decode_without_bom_handling`.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
    let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "abc\u{20AC}\u{00E4}");
    assert!(!had_errors);
}
5299
/// An unmapped windows-1257 byte becomes U+FFFD and sets the error flag via
/// `decode_without_bom_handling`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
    let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
    let decoded = match cow {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(decoded, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert!(had_errors);
}
5311
/// ASCII-only bytes decoded as windows-1257 with
/// `decode_without_bom_handling` must be returned borrowed, without errors.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
    let (decoded, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
    match decoded {
        Cow::Owned(_) => unreachable!(),
        Cow::Borrowed(text) => assert_eq!(text, "abc"),
    }
    assert!(!had_errors);
}
5323
/// Valid UTF-8 starting with EF BB BF, decoded without BOM handling and
/// without replacement, must produce `Some` borrowed string that still
/// starts with U+FEFF.
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(input);
    match decoded {
        Some(Cow::Borrowed(text)) => assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}"),
        // Neither an owned result nor a failure is acceptable here.
        Some(Cow::Owned(_)) | None => unreachable!(),
    }
}
5338
/// UTF-8 input containing a stray 0x80 byte must make the
/// no-BOM-handling, no-replacement decode return `None`.
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(input);
    assert!(decoded.is_none());
}
5347
/// Non-ASCII windows-1257 bytes, decoded without BOM handling and without
/// replacement, must produce `Some` owned string.
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4");
    match decoded {
        Some(Cow::Owned(text)) => assert_eq!(text, "abc\u{20AC}\u{00E4}"),
        Some(Cow::Borrowed(_)) | None => unreachable!(),
    }
}
5360
/// Input containing the byte 0xA1 must make the windows-1257
/// no-BOM-handling, no-replacement decode return `None`.
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded =
        WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4");
    assert!(decoded.is_none());
}
5367
/// ASCII-only bytes, decoded as windows-1257 without BOM handling and
/// without replacement, must produce `Some` borrowed string.
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc");
    match decoded {
        Some(Cow::Borrowed(text)) => assert_eq!(text, "abc"),
        Some(Cow::Owned(_)) | None => unreachable!(),
    }
}
5380
/// Encoding an ASCII-only string to windows-1257 must return borrowed bytes,
/// report windows-1257 as the output encoding, and flag no errors.
#[test]
fn test_encode_ascii_only_windows_1257_to_cow() {
    let (bytes, output_encoding, had_errors) = WINDOWS_1257.encode("abc");
    if let Cow::Borrowed(raw) = bytes {
        assert_eq!(raw, b"abc");
    } else {
        unreachable!();
    }
    assert_eq!(output_encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5393
/// Encoding a string with non-ASCII characters to windows-1257 must return
/// owned bytes, report windows-1257 as the output encoding, and flag no
/// errors.
#[test]
fn test_encode_valid_windows_1257_to_cow() {
    let (bytes, output_encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
    match bytes {
        Cow::Owned(raw) => assert_eq!(raw, b"abc\x80\xE4"),
        Cow::Borrowed(_) => unreachable!(),
    }
    assert_eq!(output_encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5406
/// Feeds the single byte 0xFF to a UTF-16LE decoder twice (first with
/// `last == false`, then with `last == true`), each time giving it exactly
/// the output space `max_utf16_buffer_length(1)` asks for, and checks that
/// the decoder never runs out of room.
#[test]
fn test_utf16_space_with_one_bom_byte() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5422
/// Feeds the single byte 0xFF to a UTF-8 decoder twice (first with
/// `last == false`, then with `last == true`), each time giving it exactly
/// the output space `max_utf16_buffer_length(1)` asks for, and checks that
/// the decoder never runs out of room.
#[test]
fn test_utf8_space_with_one_bom_byte() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5438
/// Feeds 0xEF, 0xBB and finally 0xFF (with `last == true`) one byte at a
/// time to a UTF-16LE decoder. Before every call the output slice is sized
/// with `max_utf16_buffer_length(1)`; that must always be enough.
#[test]
fn test_utf16_space_with_two_bom_bytes() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    // (input chunk, value of `last`)
    let steps = [
        (&b"\xEF"[..], false),
        (&b"\xBB"[..], false),
        (&b"\xFF"[..], true),
    ];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5459
/// Feeds 0xEF, 0xBB and finally 0xFF (with `last == true`) one byte at a
/// time to a UTF-8 decoder. Before every call the output slice is sized
/// with `max_utf16_buffer_length(1)`; that must always be enough.
#[test]
fn test_utf8_space_with_two_bom_bytes() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    // (input chunk, value of `last`)
    let steps = [
        (&b"\xEF"[..], false),
        (&b"\xBB"[..], false),
        (&b"\xFF"[..], true),
    ];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5480
/// Hands the two bytes FF FF to a UTF-16LE decoder in a single final call;
/// the space reported by `max_utf16_buffer_length(2)` must suffice.
#[test]
fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
    let mut dst = [0u16; 12];
    let mut decoder = UTF_16LE.new_decoder();
    let needed = decoder.max_utf16_buffer_length(2).unwrap();
    let outcome = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
    assert_eq!(outcome.0, CoderResult::InputEmpty);
}
5491
/// An ISO-2022-JP encoder that has only ever seen empty UTF-8 input must
/// report `InputEmpty` with an 8-byte output buffer, both mid-stream and at
/// the end of the stream.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 8];
    for &last in [false, true].iter() {
        let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5505
/// After an ISO-2022-JP encoder has consumed U+00A5, an empty non-final
/// call with 8 bytes of output space succeeds, but the empty final call with
/// the same 8 bytes must report `OutputFull`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];

    // Full 16-byte buffer for the actual character.
    let (after_char, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
    assert_eq!(after_char, CoderResult::InputEmpty);

    // Only 8 bytes from here on.
    let (mid_stream, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
    assert_eq!(mid_stream, CoderResult::InputEmpty);

    let (at_end, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
    assert_eq!(at_end, CoderResult::OutputFull);
}
5523
/// Exercises output-buffer boundaries of the ISO-2022-JP encoder with UTF-8
/// input. Every case starts from a fresh encoder:
/// U+00A5 followed by U+1F4A9 fits into 18 bytes mid-stream but not as the
/// final chunk, while U+1F4A9 alone fits into 13 bytes in both situations.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf8() {
    let mut dst = [0u8; 18];
    // Runs one encode call on a brand-new encoder, limiting the output to
    // the first `capacity` bytes of `dst`.
    let mut encode_fresh = |src: &str, capacity: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (result, _, _, _) = encoder.encode_from_utf8(src, &mut dst[..capacity], last);
        result
    };

    assert_eq!(
        encode_fresh("\u{A5}\u{1F4A9}", 18, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh("\u{A5}\u{1F4A9}", 18, true),
        CoderResult::OutputFull
    );
    assert_eq!(encode_fresh("\u{1F4A9}", 13, false), CoderResult::InputEmpty);
    assert_eq!(encode_fresh("\u{1F4A9}", 13, true), CoderResult::InputEmpty);
}
5549
/// An ISO-2022-JP encoder that has only ever seen empty UTF-16 input must
/// report `InputEmpty` with an 8-byte output buffer, both mid-stream and at
/// the end of the stream.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 8];
    let nothing = [0u16; 0];
    for &last in [false, true].iter() {
        let (result, _, _, _) = encoder.encode_from_utf16(&nothing, &mut dst[..], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5563
/// After an ISO-2022-JP encoder has consumed the UTF-16 code unit 0x00A5,
/// an empty non-final call with 8 bytes of output space succeeds, but the
/// empty final call with the same 8 bytes must report `OutputFull`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];
    let nothing = [0u16; 0];

    // Full 16-byte buffer for the actual character.
    let (after_char, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
    assert_eq!(after_char, CoderResult::InputEmpty);

    // Only 8 bytes from here on.
    let (mid_stream, _, _, _) = encoder.encode_from_utf16(&nothing, &mut dst[..8], false);
    assert_eq!(mid_stream, CoderResult::InputEmpty);

    let (at_end, _, _, _) = encoder.encode_from_utf16(&nothing, &mut dst[..8], true);
    assert_eq!(at_end, CoderResult::OutputFull);
}
5581
/// Exercises output-buffer boundaries of the ISO-2022-JP encoder with UTF-16
/// input. Every case starts from a fresh encoder:
/// 0x00A5 followed by the surrogate pair D83D DCA9 fits into 18 bytes
/// mid-stream but not as the final chunk, while the surrogate pair alone
/// fits into 13 bytes in both situations.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf16() {
    let mut dst = [0u8; 18];
    let with_prefix = [0xA5u16, 0xD83Du16, 0xDCA9u16];
    let pair_only = [0xD83Du16, 0xDCA9u16];
    // Runs one encode call on a brand-new encoder, limiting the output to
    // the first `capacity` bytes of `dst`.
    let mut encode_fresh = |src: &[u16], capacity: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (result, _, _, _) = encoder.encode_from_utf16(src, &mut dst[..capacity], last);
        result
    };

    assert_eq!(
        encode_fresh(&with_prefix[..], 18, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh(&with_prefix[..], 18, true),
        CoderResult::OutputFull
    );
    assert_eq!(
        encode_fresh(&pair_only[..], 13, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh(&pair_only[..], 13, true),
        CoderResult::InputEmpty
    );
}
5610
/// Encodings must be usable as `HashSet` members: inserted ones are found,
/// ones never inserted are not, and removal takes effect.
#[test]
fn test_hash() {
    let mut seen = ::std::collections::HashSet::new();
    for &encoding in [UTF_8, ISO_2022_JP].iter() {
        seen.insert(encoding);
    }

    for &encoding in [UTF_8, ISO_2022_JP].iter() {
        assert!(seen.contains(encoding));
    }
    assert!(!seen.contains(WINDOWS_1252));

    seen.remove(ISO_2022_JP);
    assert!(!seen.contains(ISO_2022_JP));
}
5622
/// Encoding the UTF-16 code units 0x3041 0xFFFF to ISO-2022-JP as the final
/// chunk must not fit into a 17-byte buffer (`OutputFull` expected).
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf16() {
    let src = [0x3041u16, 0xFFFFu16];
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let outcome = encoder.encode_from_utf16(&src, &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5633
/// Encoding U+3041 U+FFFF from UTF-8 to ISO-2022-JP as the final chunk must
/// not fit into a 17-byte buffer (`OutputFull` expected).
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf8() {
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let outcome = encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5644
/// A decoder created with `REPLACEMENT.new_decoder()` is fed EF BB BF
/// followed by `A` in one final call. With an output slice sized by
/// `max_utf8_buffer_length_without_replacement`, all input must be consumed
/// and exactly the single byte `A` must be written.
#[test]
fn test_max_length_with_bom_to_utf8() {
    let input = b"\xEF\xBB\xBFA";
    let mut decoder = REPLACEMENT.new_decoder();
    let mut output = [0u8; 20];

    let needed = decoder
        .max_utf8_buffer_length_without_replacement(input.len())
        .unwrap();
    let (result, read, written) =
        decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);

    assert_eq!(result, DecoderResult::InputEmpty);
    assert_eq!(read, input.len());
    assert_eq!(written, 1);
    assert_eq!(output[0], b'A');
}
5662
/// Round-trips a `Demo` value holding an encoding through both JSON and
/// bincode and checks that each round trip yields an equal value.
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
    let original = Demo {
        num: 42,
        name: "foo".into(),
        enc: UTF_8,
    };

    // JSON round trip.
    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    // bincode round trip.
    let packed = bincode::serialize(&original).unwrap();
    let from_bincode: Demo = bincode::deserialize(&packed[..]).unwrap();
    assert_eq!(from_bincode, original);
}
5681
/// Checks `is_single_byte()` for every encoding constant listed below: the
/// first table must report `false`, the second must report `true`.
///
/// The expectations are kept in two tables instead of one assertion per
/// line, and a failing assertion names the offending encoding.
#[test]
fn test_is_single_byte() {
    let not_single_byte = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        REPLACEMENT,
        SHIFT_JIS,
        UTF_8,
        UTF_16BE,
        UTF_16LE,
        ISO_2022_JP,
    ];
    for &encoding in not_single_byte.iter() {
        assert!(
            !encoding.is_single_byte(),
            "{} must not be reported as single-byte",
            encoding.name()
        );
    }

    let single_byte = [
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        ISO_8859_8_I,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];
    for &encoding in single_byte.iter() {
        assert!(
            encoding.is_single_byte(),
            "{} must be reported as single-byte",
            encoding.name()
        );
    }
}
5726
/// Checks `Decoder::latin1_byte_compatible_up_to` against one fixed 7-byte
/// buffer for every encoding constant.
///
/// The first part uses decoders created with
/// `new_decoder_without_bom_handling()` and compares the returned
/// `Option<usize>` with a per-encoding expectation (`None` for REPLACEMENT,
/// UTF-16BE and UTF-16LE). The expectations live in a single table so that
/// adding or adjusting an encoding is a one-line change, and a failure names
/// the encoding concerned.
///
/// The second part uses `UTF_8.new_decoder()` and checks how the answer
/// changes as the bytes EF, BB BF and EF are fed to the decoder.
#[test]
fn test_latin1_byte_compatible_up_to() {
    let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";

    // (encoding, expected result for a decoder without BOM handling)
    let expectations = [
        (BIG5, Some(1usize)),
        (EUC_JP, Some(1)),
        (EUC_KR, Some(1)),
        (GB18030, Some(1)),
        (GBK, Some(1)),
        (REPLACEMENT, None),
        (SHIFT_JIS, Some(1)),
        (UTF_8, Some(1)),
        (UTF_16BE, None),
        (UTF_16LE, None),
        (ISO_2022_JP, Some(1)),
        (IBM866, Some(1)),
        (ISO_8859_2, Some(2)),
        (ISO_8859_3, Some(2)),
        (ISO_8859_4, Some(2)),
        (ISO_8859_5, Some(2)),
        (ISO_8859_6, Some(2)),
        (ISO_8859_7, Some(2)),
        (ISO_8859_8, Some(3)),
        (ISO_8859_10, Some(2)),
        (ISO_8859_13, Some(4)),
        (ISO_8859_14, Some(4)),
        (ISO_8859_15, Some(6)),
        (ISO_8859_16, Some(4)),
        (ISO_8859_8_I, Some(3)),
        (KOI8_R, Some(1)),
        (KOI8_U, Some(1)),
        (MACINTOSH, Some(1)),
        (WINDOWS_874, Some(2)),
        (WINDOWS_1250, Some(4)),
        (WINDOWS_1251, Some(1)),
        (WINDOWS_1252, Some(5)),
        (WINDOWS_1253, Some(3)),
        (WINDOWS_1254, Some(4)),
        (WINDOWS_1255, Some(3)),
        (WINDOWS_1256, Some(1)),
        (WINDOWS_1257, Some(4)),
        (WINDOWS_1258, Some(4)),
        (X_MAC_CYRILLIC, Some(1)),
        (X_USER_DEFINED, Some(1)),
    ];
    for &(encoding, expected) in expectations.iter() {
        assert_eq!(
            encoding
                .new_decoder_without_bom_handling()
                .latin1_byte_compatible_up_to(buffer),
            expected,
            "unexpected latin1_byte_compatible_up_to result for {}",
            encoding.name()
        );
    }

    // A fresh decoder from `new_decoder()` gives no answer.
    assert!(UTF_8
        .new_decoder()
        .latin1_byte_compatible_up_to(buffer)
        .is_none());

    // Still no answer after EF alone; an answer once BB BF has followed;
    // and no answer again after a further EF.
    let mut decoder = UTF_8.new_decoder();
    let mut output = [0u16; 4];
    let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
    assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());
    let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
    assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));
    let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
    assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
}
6015 }
6016