1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! Functions for converting between different in-RAM representations of text
11 //! and for quickly checking if the Unicode Bidirectional Algorithm can be
12 //! avoided.
13 //!
14 //! By using slices for output, the functions here seek to enable by-register
15 //! (ALU register or SIMD register as available) operations in order to
16 //! outperform iterator-based conversions available in the Rust standard
17 //! library.
18 //!
19 //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20 //! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21 //! in-memory encoding is sometimes used as a storage optimization of text
22 //! when UTF-16 indexing and length semantics are exposed.
23 //!
//! The FFI bindings for this module are in the
25 //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26 
27 use alloc::borrow::Cow;
28 use alloc::string::String;
29 use alloc::vec::Vec;
30 
31 use super::in_inclusive_range16;
32 use super::in_inclusive_range32;
33 use super::in_inclusive_range8;
34 use super::in_range16;
35 use super::in_range32;
36 use super::DecoderResult;
37 use crate::ascii::*;
38 use crate::utf_8::*;
39 
// `debug_assert!` that is skipped when the crate is built with `cfg(fuzzing)`.
//
// Accepts exactly the arguments `debug_assert!` accepts and forwards them
// unchanged.
macro_rules! non_fuzz_debug_assert {
    ($($arg:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($arg)*);
        }
    };
}
43 
44 cfg_if! {
45     if #[cfg(feature = "simd-accel")] {
46         use ::core::intrinsics::likely;
47         use ::core::intrinsics::unlikely;
48     } else {
49         #[inline(always)]
50         // Unsafe to match the intrinsic, which is needlessly unsafe.
51         unsafe fn likely(b: bool) -> bool {
52             b
53         }
54         #[inline(always)]
55         // Unsafe to match the intrinsic, which is needlessly unsafe.
56         unsafe fn unlikely(b: bool) -> bool {
57             b
58         }
59     }
60 }
61 
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The enum is fieldless and `#[repr(C)]` with explicit discriminants, so it
/// is cheap to copy; it derives `Clone` and `Copy` so that a classification
/// result can be stored and compared without being moved out.
#[must_use]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
77 
// Word-sized mask with the high byte of every 16-bit lane set (0xFF00
// repeated). `!0usize / 0xFFFF` yields 0x0001 in each 16-bit lane for any
// pointer width that is a multiple of 16 bits, and multiplying by 0xFF00
// replicates that pattern without overflowing.
#[allow(dead_code)]
const LATIN1_MASK: usize = !0usize / 0xFFFF * 0xFF00;
81 
// Generates `fn $name(buffer: &[$unit]) -> bool`, a check that reads the
// buffer one `usize` at a time where it can.
//
// Parameters:
// * `$name`  - name of the generated function.
// * `$unit`  - element type of the input slice.
// * `$bound` - scalar limit; a unit (or OR of units) `>= $bound` fails.
// * `$mask`  - identifier of a `usize` constant ANDed with word-sized reads
//              and with the final accumulator; any common bit fails.
//
// The generated function returns `true` when none of these checks fail.
//
// NOTE(review): `ALU_ALIGNMENT`, `ALU_ALIGNMENT_MASK` and the mask constants
// passed as `$mask` are mostly defined outside this chunk. The code reads as
// if `ALU_ALIGNMENT == size_of::<usize>()`, `ALU_ALIGNMENT_MASK ==
// ALU_ALIGNMENT - 1`, and `$mask` covers, in every `$unit`-sized lane, the
// bits a unit `>= $bound` would have set - confirm at their definitions.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index (in units) of the next unit not yet examined.
            let mut offset = 0usize;
            // Bitwise OR of everything examined that has not been tested yet.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only try word-sized reads if the buffer holds at least
            // `ALU_ALIGNMENT / unit_size` units.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units between `src` and the next address whose
                // low bits under `ALU_ALIGNMENT_MASK` are zero (0 if `src`
                // is already there).
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Take the word path only if a whole stride still fits after
                // the unaligned prefix.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // OR the unaligned prefix together unit by unit and
                        // test it once after the loop.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest `offset` from which one more stride can be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Largest `offset` from which four more strides can
                        // be read.
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        // Unrolled loop: four word reads are ORed and tested
                        // against `$mask` per iteration, so a failure returns
                        // without reading further. The bounds were checked
                        // just above on entry and are rechecked at the bottom
                        // of each iteration.
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides, one word at a time. These are
                    // only accumulated; the test happens at the very end.
                    while offset <= len_minus_stride {
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Leftover units (or the whole buffer when the word path was not
            // taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
154 
// Generates `fn $name(buffer: &[$unit]) -> bool`, a check that reads the
// buffer one SIMD vector at a time where it can.
//
// Parameters:
// * `$name`    - name of the generated function.
// * `$unit`    - element type of the input slice.
// * `$splat`   - initial value of the vector accumulator (the invocations
//                visible in this file pass an all-zero splat).
// * `$simd_ty` - vector type that aligned strides are loaded as.
// * `$bound`   - scalar limit; a unit (or OR of units) `>= $bound` fails.
// * `$func`    - predicate applied to an ORed vector; `false` fails.
//
// The generated function returns `true` when none of these checks fail.
//
// NOTE(review): `SIMD_STRIDE_SIZE` and the `$func` predicates come from
// outside this chunk. The code reads as if `SIMD_STRIDE_SIZE` is the byte
// size of `$simd_ty` and a multiple of `SIMD_ALIGNMENT` - confirm at their
// definitions.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index (in units) of the next unit not yet examined.
            let mut offset = 0usize;
            // Bitwise OR of the units examined by the scalar paths.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only try vector reads if the buffer holds at least one stride.
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units between `src` and the next address whose
                // low bits under `SIMD_ALIGNMENT_MASK` are zero (0 if `src`
                // is already there).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Take the vector path only if a whole stride still fits
                // after the unaligned prefix.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // OR the unaligned prefix together unit by unit and
                        // test it once after the loop.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest `offset` from which one more stride can be read.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Largest `offset` from which four more strides can
                        // be read.
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        // Unrolled loop: four vector loads are ORed and
                        // passed to `$func` per iteration, so a failure
                        // returns without reading further. The bounds were
                        // checked just above on entry and are rechecked at
                        // the bottom of each iteration.
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides, one vector at a time, ORed
                    // into a vector accumulator that is tested once after
                    // the loop.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Leftover units (or the whole buffer when the vector path was
            // not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
232 
// Selects, at compile time, the implementations behind `is_ascii_impl`,
// `is_basic_latin_impl`, `is_utf16_latin1_impl` and `utf16_valid_up_to_impl`:
// SIMD versions when `simd-accel` is enabled on a supported target, otherwise
// word-at-a-time / scalar versions.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Byte alignment required before vector loads are attempted.
        const SIMD_ALIGNMENT: usize = 16;

        // `SIMD_ALIGNMENT - 1`; masks out the misaligned low address bits.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the number of leading code units of `buffer` that were
        // accepted: the index of the first unpaired surrogate reported by
        // `utf16_valid_up_to_alu`, or `buffer.len()` if none is found.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            // Each pass of the outer loop (re)aligns `offset` and then runs
            // aligned strides until the input runs out or a surrogate pair
            // straddling a stride boundary forces realignment.
            'outer: loop {
                // Units from `offset` to the next `SIMD_ALIGNMENT` boundary.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                                        SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: leave for the scalar tail if a whole
                    // stride no longer fits.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    // The prefix, one extra unit and a whole stride must all
                    // fit; otherwise leave for the scalar tail.
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    // Scan the unaligned prefix plus one unit past the
                    // boundary, so that a high surrogate in the last prefix
                    // position can be matched with its low surrogate.
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        // Unpaired surrogate inside the prefix.
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit was consumed as the low half of a
                        // pair: skip it and recompute the alignment.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                // Largest `offset` from which one more stride can be read.
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    // NOTE(review): `contains_surrogates` is defined outside
                    // this chunk; presumably true when any lane of the vector
                    // is a surrogate code unit - confirm.
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // No unit after this stride to peek at; let the
                            // scalar tail handle the stride.
                            break 'outer;
                        }
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        // Re-scan the stride plus one following unit with the
                        // scalar checker.
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            // Unpaired surrogate inside the stride.
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // A pair straddles the stride boundary: skip the
                            // low half and realign.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar scan of whatever was not covered by aligned strides.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Scalar-only variant: returns the first component of
        // `utf16_valid_up_to_alu` for the whole buffer.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
318 
/// Scalar scan for the longest well-formed UTF-16 prefix of `buffer`.
///
/// The first return value is the index of the first unpaired surrogate, or
/// `buffer.len()` if there is none. The second return value is true iff the
/// last code unit of the slice was reached and turned out to be a low
/// surrogate that is part of a valid pair.
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let mut pos = 0usize;
    // Whether the most recently consumed code unit was the low half of a pair.
    let mut ended_on_pair = false;
    while let Some(&unit) = buffer.get(pos) {
        match unit {
            // High surrogate: valid only when immediately followed by a low
            // surrogate, in which case both units are consumed together.
            0xD800..=0xDBFF => match buffer.get(pos + 1) {
                Some(&trail) if trail >= 0xDC00 && trail <= 0xDFFF => {
                    pos += 2;
                    ended_on_pair = true;
                }
                _ => return (pos, false),
            },
            // Low surrogate without a preceding high surrogate.
            0xDC00..=0xDFFF => return (pos, false),
            // Not a surrogate.
            _ => {
                pos += 1;
                ended_on_pair = false;
            }
        }
    }
    (pos, ended_on_pair)
}
364 
365 cfg_if! {
366     if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
367         #[inline(always)]
368         fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
369             let mut offset = 0usize;
370             let bytes = buffer.as_bytes();
371             let len = bytes.len();
372             if len >= SIMD_STRIDE_SIZE {
373                 let src = bytes.as_ptr();
374                 let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
375                                            SIMD_ALIGNMENT_MASK;
376                 if until_alignment + SIMD_STRIDE_SIZE <= len {
377                     while until_alignment != 0 {
378                         if bytes[offset] > 0xC3 {
379                             return Some(offset);
380                         }
381                         offset += 1;
382                         until_alignment -= 1;
383                     }
384                     let len_minus_stride = len - SIMD_STRIDE_SIZE;
385                     loop {
386                         if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
387                             // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
388                             while bytes[offset] & 0xC0 == 0x80 {
389                                 offset += 1;
390                             }
391                             return Some(offset);
392                         }
393                         offset += SIMD_STRIDE_SIZE;
394                         if offset > len_minus_stride {
395                             break;
396                         }
397                     }
398                 }
399             }
400             for i in offset..len {
401                 if bytes[i] > 0xC3 {
402                     return Some(i);
403                 }
404             }
405             None
406         }
407     } else {
408         #[inline(always)]
409         fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
410             let mut bytes = buffer.as_bytes();
411             let mut total = 0;
412             loop {
413                 if let Some((byte, offset)) = validate_ascii(bytes) {
414                     total += offset;
415                     if byte > 0xC3 {
416                         return Some(total);
417                     }
418                     bytes = &bytes[offset + 2..];
419                     total += 2;
420                 } else {
421                     return None;
422                 }
423             }
424         }
425     }
426 }
427 
428 #[inline(always)]
is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize>429 fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
430     let mut bytes = buffer;
431     let mut total = 0;
432     loop {
433         if let Some((byte, offset)) = validate_ascii(bytes) {
434             total += offset;
435             if in_inclusive_range8(byte, 0xC2, 0xC3) {
436                 let next = offset + 1;
437                 if next == bytes.len() {
438                     return Some(total);
439                 }
440                 if bytes[next] & 0xC0 != 0x80 {
441                     return Some(total);
442                 }
443                 bytes = &bytes[offset + 2..];
444                 total += 2;
445             } else {
446                 return Some(total);
447             }
448         } else {
449             return None;
450         }
451     }
452 }
453 
454 cfg_if! {
455     if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
456         #[inline(always)]
457         fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
458             let mut offset = 0usize;
459             let len = buffer.len();
460             if len >= SIMD_STRIDE_SIZE / 2 {
461                 let src = buffer.as_ptr();
462                 let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
463                                            SIMD_ALIGNMENT_MASK) / 2;
464                 if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
465                     while until_alignment != 0 {
466                         if is_utf16_code_unit_bidi(buffer[offset]) {
467                             return true;
468                         }
469                         offset += 1;
470                         until_alignment -= 1;
471                     }
472                     let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
473                     loop {
474                         if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
475                             return true;
476                         }
477                         offset += SIMD_STRIDE_SIZE / 2;
478                         if offset > len_minus_stride {
479                             break;
480                         }
481                     }
482                 }
483             }
484             for &u in &buffer[offset..] {
485                 if is_utf16_code_unit_bidi(u) {
486                     return true;
487                 }
488             }
489             false
490         }
491     } else {
492         #[inline(always)]
493         fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
494             for &u in buffer {
495                 if is_utf16_code_unit_bidi(u) {
496                     return true;
497                 }
498             }
499             false
500         }
501     }
502 }
503 
504 cfg_if! {
505     if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
506         #[inline(always)]
507         fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
508             let mut offset = 0usize;
509             let len = buffer.len();
510             if len >= SIMD_STRIDE_SIZE / 2 {
511                 let src = buffer.as_ptr();
512                 let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
513                                            SIMD_ALIGNMENT_MASK) / 2;
514                 if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
515                     while until_alignment != 0 {
516                         if buffer[offset] > 0xFF {
517                             // This transition isn't optimal, since the aligment is recomputing
518                             // but not tweaking further today.
519                             if is_utf16_bidi_impl(&buffer[offset..]) {
520                                 return Latin1Bidi::Bidi;
521                             }
522                             return Latin1Bidi::LeftToRight;
523                         }
524                         offset += 1;
525                         until_alignment -= 1;
526                     }
527                     let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
528                     loop {
529                         let mut s = unsafe { *(src.add(offset) as *const u16x8) };
530                         if !simd_is_latin1(s) {
531                             loop {
532                                 if is_u16x8_bidi(s) {
533                                     return Latin1Bidi::Bidi;
534                                 }
535                                 offset += SIMD_STRIDE_SIZE / 2;
536                                 if offset > len_minus_stride {
537                                     for &u in &buffer[offset..] {
538                                         if is_utf16_code_unit_bidi(u) {
539                                             return Latin1Bidi::Bidi;
540                                         }
541                                     }
542                                     return Latin1Bidi::LeftToRight;
543                                 }
544                                 s = unsafe { *(src.add(offset) as *const u16x8) };
545                             }
546                         }
547                         offset += SIMD_STRIDE_SIZE / 2;
548                         if offset > len_minus_stride {
549                             break;
550                         }
551                     }
552                 }
553             }
554             let mut iter = (&buffer[offset..]).iter();
555             loop {
556                 if let Some(&u) = iter.next() {
557                     if u > 0xFF {
558                         let mut inner_u = u;
559                         loop {
560                             if is_utf16_code_unit_bidi(inner_u) {
561                                 return Latin1Bidi::Bidi;
562                             }
563                             if let Some(&code_unit) = iter.next() {
564                                 inner_u = code_unit;
565                             } else {
566                                 return Latin1Bidi::LeftToRight;
567                             }
568                         }
569                     }
570                 } else {
571                     return Latin1Bidi::Latin1;
572                 }
573             }
574         }
575     } else {
576         #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
577         #[inline(always)]
578         fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
579             let mut offset = 0usize;
580             let len = buffer.len();
581             if len >= ALU_ALIGNMENT / 2 {
582                 let src = buffer.as_ptr();
583                 let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
584                                            ALU_ALIGNMENT_MASK) / 2;
585                 if until_alignment + ALU_ALIGNMENT / 2 <= len {
586                     while until_alignment != 0 {
587                         if buffer[offset] > 0xFF {
588                             if is_utf16_bidi_impl(&buffer[offset..]) {
589                                 return Latin1Bidi::Bidi;
590                             }
591                             return Latin1Bidi::LeftToRight;
592                         }
593                         offset += 1;
594                         until_alignment -= 1;
595                     }
596                     let len_minus_stride = len - ALU_ALIGNMENT / 2;
597                     loop {
598                         if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
599                             if is_utf16_bidi_impl(&buffer[offset..]) {
600                                 return Latin1Bidi::Bidi;
601                             }
602                             return Latin1Bidi::LeftToRight;
603                         }
604                         offset += ALU_ALIGNMENT / 2;
605                         if offset > len_minus_stride {
606                             break;
607                         }
608                     }
609                 }
610             }
611             let mut iter = (&buffer[offset..]).iter();
612             loop {
613                 if let Some(&u) = iter.next() {
614                     if u > 0xFF {
615                         let mut inner_u = u;
616                         loop {
617                             if is_utf16_code_unit_bidi(inner_u) {
618                                 return Latin1Bidi::Bidi;
619                             }
620                             if let Some(&code_unit) = iter.next() {
621                                 inner_u = code_unit;
622                             } else {
623                                 return Latin1Bidi::LeftToRight;
624                             }
625                         }
626                     }
627                 } else {
628                     return Latin1Bidi::Latin1;
629                 }
630             }
631         }
632     }
633 }
634 
/// Checks whether the buffer is all-ASCII.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
///
/// Delegates to `is_ascii_impl`, which is chosen at compile time between a
/// SIMD and a word-at-a-time implementation.
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
642 
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
///
/// Delegates to `is_basic_latin_impl`, which is chosen at compile time
/// between a SIMD and a word-at-a-time implementation.
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
651 
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
///
/// Returns `true` iff `is_utf8_latin1_impl` reports no offending byte.
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    is_utf8_latin1_impl(buffer).is_none()
}
660 
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
///
/// Returns `true` iff `is_str_latin1_impl` reports no offending byte.
pub fn is_str_latin1(buffer: &str) -> bool {
    is_str_latin1_impl(buffer).is_none()
}
669 
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
///
/// Delegates to `is_utf16_latin1_impl`, which is chosen at compile time
/// between a SIMD and a word-at-a-time implementation.
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
678 
679 /// Checks whether a potentially-invalid UTF-8 buffer contains code points
680 /// that trigger right-to-left processing.
681 ///
682 /// The check is done on a Unicode block basis without regard to assigned
683 /// vs. unassigned code points in the block. Hebrew presentation forms in
684 /// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
686 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
687 /// for. Control characters that are technically bidi controls but do not
688 /// cause right-to-left behavior without the presence of right-to-left
689 /// characters or right-to-left controls are not checked for. As a special
690 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
691 ///
692 /// Returns `true` if the input is invalid UTF-8 or the input contains an
693 /// RTL character. Returns `false` if the input is valid UTF-8 and contains
694 /// no RTL characters.
695 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
696 #[inline]
is_utf8_bidi(buffer: &[u8]) -> bool697 pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
698     // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
699     // than UTF-8 validation followed by `is_str_bidi()` for German,
700     // Russian and Japanese. However, this is considerably slower for Thai.
701     // Chances are that the compiler makes some branch predictions that are
702     // unfortunate for Thai. Not spending the time to manually optimize
703     // further at this time, since it's unclear if this variant even has
704     // use cases. However, this is worth revisiting once Rust gets the
705     // ability to annotate relative priorities of match arms.
706 
707     // U+058F: D6 8F
708     // U+0590: D6 90
709     // U+08FF: E0 A3 BF
710     // U+0900: E0 A4 80
711     //
712     // U+200F: E2 80 8F
713     // U+202B: E2 80 AB
714     // U+202E: E2 80 AE
715     // U+2067: E2 81 A7
716     //
717     // U+FB1C: EF AC 9C
718     // U+FB1D: EF AC 9D
719     // U+FDFF: EF B7 BF
720     // U+FE00: EF B8 80
721     //
722     // U+FE6F: EF B9 AF
723     // U+FE70: EF B9 B0
724     // U+FEFE: EF BB BE
725     // U+FEFF: EF BB BF
726     //
727     // U+107FF: F0 90 9F BF
728     // U+10800: F0 90 A0 80
729     // U+10FFF: F0 90 BF BF
730     // U+11000: F0 91 80 80
731     //
732     // U+1E7FF: F0 9E 9F BF
733     // U+1E800: F0 9E A0 80
734     // U+1EFFF: F0 9E BF BF
735     // U+1F000: F0 9F 80 80
736     let mut src = buffer;
737     'outer: loop {
738         if let Some((mut byte, mut read)) = validate_ascii(src) {
739             // Check for the longest sequence to avoid checking twice for the
740             // multi-byte sequences.
741             if read + 4 <= src.len() {
742                 'inner: loop {
743                     // At this point, `byte` is not included in `read`.
744                     match byte {
745                         0..=0x7F => {
746                             // ASCII: go back to SIMD.
747                             read += 1;
748                             src = &src[read..];
749                             continue 'outer;
750                         }
751                         0xC2..=0xD5 => {
752                             // Two-byte
753                             let second = unsafe { *(src.get_unchecked(read + 1)) };
754                             if !in_inclusive_range8(second, 0x80, 0xBF) {
755                                 return true;
756                             }
757                             read += 2;
758                         }
759                         0xD6 => {
760                             // Two-byte
761                             let second = unsafe { *(src.get_unchecked(read + 1)) };
762                             if !in_inclusive_range8(second, 0x80, 0xBF) {
763                                 return true;
764                             }
765                             // XXX consider folding the above and below checks
766                             if second > 0x8F {
767                                 return true;
768                             }
769                             read += 2;
770                         }
771                         // two-byte starting with 0xD7 and above is bidi
772                         0xE1 | 0xE3..=0xEC | 0xEE => {
773                             // Three-byte normal
774                             let second = unsafe { *(src.get_unchecked(read + 1)) };
775                             let third = unsafe { *(src.get_unchecked(read + 2)) };
776                             if ((UTF8_DATA.table[usize::from(second)]
777                                 & unsafe {
778                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
779                                 })
780                                 | (third >> 6))
781                                 != 2
782                             {
783                                 return true;
784                             }
785                             read += 3;
786                         }
787                         0xE2 => {
788                             // Three-byte normal, potentially bidi
789                             let second = unsafe { *(src.get_unchecked(read + 1)) };
790                             let third = unsafe { *(src.get_unchecked(read + 2)) };
791                             if ((UTF8_DATA.table[usize::from(second)]
792                                 & unsafe {
793                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
794                                 })
795                                 | (third >> 6))
796                                 != 2
797                             {
798                                 return true;
799                             }
800                             if second == 0x80 {
801                                 if third == 0x8F || third == 0xAB || third == 0xAE {
802                                     return true;
803                                 }
804                             } else if second == 0x81 {
805                                 if third == 0xA7 {
806                                     return true;
807                                 }
808                             }
809                             read += 3;
810                         }
811                         0xEF => {
812                             // Three-byte normal, potentially bidi
813                             let second = unsafe { *(src.get_unchecked(read + 1)) };
814                             let third = unsafe { *(src.get_unchecked(read + 2)) };
815                             if ((UTF8_DATA.table[usize::from(second)]
816                                 & unsafe {
817                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
818                                 })
819                                 | (third >> 6))
820                                 != 2
821                             {
822                                 return true;
823                             }
824                             if in_inclusive_range8(second, 0xAC, 0xB7) {
825                                 if second == 0xAC {
826                                     if third > 0x9C {
827                                         return true;
828                                     }
829                                 } else {
830                                     return true;
831                                 }
832                             } else if in_inclusive_range8(second, 0xB9, 0xBB) {
833                                 if second == 0xB9 {
834                                     if third > 0xAF {
835                                         return true;
836                                     }
837                                 } else if second == 0xBB {
838                                     if third != 0xBF {
839                                         return true;
840                                     }
841                                 } else {
842                                     return true;
843                                 }
844                             }
845                             read += 3;
846                         }
847                         0xE0 => {
848                             // Three-byte special lower bound, potentially bidi
849                             let second = unsafe { *(src.get_unchecked(read + 1)) };
850                             let third = unsafe { *(src.get_unchecked(read + 2)) };
851                             if ((UTF8_DATA.table[usize::from(second)]
852                                 & unsafe {
853                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
854                                 })
855                                 | (third >> 6))
856                                 != 2
857                             {
858                                 return true;
859                             }
860                             // XXX can this be folded into the above validity check
861                             if second < 0xA4 {
862                                 return true;
863                             }
864                             read += 3;
865                         }
866                         0xED => {
867                             // Three-byte special upper bound
868                             let second = unsafe { *(src.get_unchecked(read + 1)) };
869                             let third = unsafe { *(src.get_unchecked(read + 2)) };
870                             if ((UTF8_DATA.table[usize::from(second)]
871                                 & unsafe {
872                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
873                                 })
874                                 | (third >> 6))
875                                 != 2
876                             {
877                                 return true;
878                             }
879                             read += 3;
880                         }
881                         0xF1..=0xF4 => {
882                             // Four-byte normal
883                             let second = unsafe { *(src.get_unchecked(read + 1)) };
884                             let third = unsafe { *(src.get_unchecked(read + 2)) };
885                             let fourth = unsafe { *(src.get_unchecked(read + 3)) };
886                             if (u16::from(
887                                 UTF8_DATA.table[usize::from(second)]
888                                     & unsafe {
889                                         *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
890                                     },
891                             ) | u16::from(third >> 6)
892                                 | (u16::from(fourth & 0xC0) << 2))
893                                 != 0x202
894                             {
895                                 return true;
896                             }
897                             read += 4;
898                         }
899                         0xF0 => {
900                             // Four-byte special lower bound, potentially bidi
901                             let second = unsafe { *(src.get_unchecked(read + 1)) };
902                             let third = unsafe { *(src.get_unchecked(read + 2)) };
903                             let fourth = unsafe { *(src.get_unchecked(read + 3)) };
904                             if (u16::from(
905                                 UTF8_DATA.table[usize::from(second)]
906                                     & unsafe {
907                                         *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
908                                     },
909                             ) | u16::from(third >> 6)
910                                 | (u16::from(fourth & 0xC0) << 2))
911                                 != 0x202
912                             {
913                                 return true;
914                             }
915                             if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
916                                 let third = src[read + 2];
917                                 if third >= 0xA0 {
918                                     return true;
919                                 }
920                             }
921                             read += 4;
922                         }
923                         _ => {
924                             // Invalid lead or bidi-only lead
925                             return true;
926                         }
927                     }
928                     if read + 4 > src.len() {
929                         if read == src.len() {
930                             return false;
931                         }
932                         byte = src[read];
933                         break 'inner;
934                     }
935                     byte = src[read];
936                     continue 'inner;
937                 }
938             }
939             // We can't have a complete 4-byte sequence, but we could still have
940             // a complete shorter sequence.
941 
942             // At this point, `byte` is not included in `read`.
943             match byte {
944                 0..=0x7F => {
945                     // ASCII: go back to SIMD.
946                     read += 1;
947                     src = &src[read..];
948                     continue 'outer;
949                 }
950                 0xC2..=0xD5 => {
951                     // Two-byte
952                     let new_read = read + 2;
953                     if new_read > src.len() {
954                         return true;
955                     }
956                     let second = unsafe { *(src.get_unchecked(read + 1)) };
957                     if !in_inclusive_range8(second, 0x80, 0xBF) {
958                         return true;
959                     }
960                     read = new_read;
961                     // We need to deal with the case where we came here with 3 bytes
962                     // left, so we need to take a look at the last one.
963                     src = &src[read..];
964                     continue 'outer;
965                 }
966                 0xD6 => {
967                     // Two-byte, potentially bidi
968                     let new_read = read + 2;
969                     if new_read > src.len() {
970                         return true;
971                     }
972                     let second = unsafe { *(src.get_unchecked(read + 1)) };
973                     if !in_inclusive_range8(second, 0x80, 0xBF) {
974                         return true;
975                     }
976                     // XXX consider folding the above and below checks
977                     if second > 0x8F {
978                         return true;
979                     }
980                     read = new_read;
981                     // We need to deal with the case where we came here with 3 bytes
982                     // left, so we need to take a look at the last one.
983                     src = &src[read..];
984                     continue 'outer;
985                 }
986                 // two-byte starting with 0xD7 and above is bidi
987                 0xE1 | 0xE3..=0xEC | 0xEE => {
988                     // Three-byte normal
989                     let new_read = read + 3;
990                     if new_read > src.len() {
991                         return true;
992                     }
993                     let second = unsafe { *(src.get_unchecked(read + 1)) };
994                     let third = unsafe { *(src.get_unchecked(read + 2)) };
995                     if ((UTF8_DATA.table[usize::from(second)]
996                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
997                         | (third >> 6))
998                         != 2
999                     {
1000                         return true;
1001                     }
1002                 }
1003                 0xE2 => {
1004                     // Three-byte normal, potentially bidi
1005                     let new_read = read + 3;
1006                     if new_read > src.len() {
1007                         return true;
1008                     }
1009                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1010                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1011                     if ((UTF8_DATA.table[usize::from(second)]
1012                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1013                         | (third >> 6))
1014                         != 2
1015                     {
1016                         return true;
1017                     }
1018                     if second == 0x80 {
1019                         if third == 0x8F || third == 0xAB || third == 0xAE {
1020                             return true;
1021                         }
1022                     } else if second == 0x81 {
1023                         if third == 0xA7 {
1024                             return true;
1025                         }
1026                     }
1027                 }
1028                 0xEF => {
1029                     // Three-byte normal, potentially bidi
1030                     let new_read = read + 3;
1031                     if new_read > src.len() {
1032                         return true;
1033                     }
1034                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1035                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1036                     if ((UTF8_DATA.table[usize::from(second)]
1037                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1038                         | (third >> 6))
1039                         != 2
1040                     {
1041                         return true;
1042                     }
1043                     if in_inclusive_range8(second, 0xAC, 0xB7) {
1044                         if second == 0xAC {
1045                             if third > 0x9C {
1046                                 return true;
1047                             }
1048                         } else {
1049                             return true;
1050                         }
1051                     } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1052                         if second == 0xB9 {
1053                             if third > 0xAF {
1054                                 return true;
1055                             }
1056                         } else if second == 0xBB {
1057                             if third != 0xBF {
1058                                 return true;
1059                             }
1060                         } else {
1061                             return true;
1062                         }
1063                     }
1064                 }
1065                 0xE0 => {
1066                     // Three-byte special lower bound, potentially bidi
1067                     let new_read = read + 3;
1068                     if new_read > src.len() {
1069                         return true;
1070                     }
1071                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1072                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1073                     if ((UTF8_DATA.table[usize::from(second)]
1074                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1075                         | (third >> 6))
1076                         != 2
1077                     {
1078                         return true;
1079                     }
1080                     // XXX can this be folded into the above validity check
1081                     if second < 0xA4 {
1082                         return true;
1083                     }
1084                 }
1085                 0xED => {
1086                     // Three-byte special upper bound
1087                     let new_read = read + 3;
1088                     if new_read > src.len() {
1089                         return true;
1090                     }
1091                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1092                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1093                     if ((UTF8_DATA.table[usize::from(second)]
1094                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1095                         | (third >> 6))
1096                         != 2
1097                     {
1098                         return true;
1099                     }
1100                 }
1101                 _ => {
1102                     // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1103                     return true;
1104                 }
1105             }
1106             return false;
1107         } else {
1108             return false;
1109         }
1110     }
1111 }
1112 
1113 /// Checks whether a valid UTF-8 buffer contains code points that trigger
1114 /// right-to-left processing.
1115 ///
1116 /// The check is done on a Unicode block basis without regard to assigned
1117 /// vs. unassigned code points in the block. Hebrew presentation forms in
1118 /// the Alphabetic Presentation Forms block are treated as if they formed
1119 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1120 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1121 /// for. Control characters that are technically bidi controls but do not
1122 /// cause right-to-left behavior without the presence of right-to-left
1123 /// characters or right-to-left controls are not checked for. As a special
1124 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1125 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1126 #[inline]
is_str_bidi(buffer: &str) -> bool1127 pub fn is_str_bidi(buffer: &str) -> bool {
1128     // U+058F: D6 8F
1129     // U+0590: D6 90
1130     // U+08FF: E0 A3 BF
1131     // U+0900: E0 A4 80
1132     //
1133     // U+200F: E2 80 8F
1134     // U+202B: E2 80 AB
1135     // U+202E: E2 80 AE
1136     // U+2067: E2 81 A7
1137     //
1138     // U+FB1C: EF AC 9C
1139     // U+FB1D: EF AC 9D
1140     // U+FDFF: EF B7 BF
1141     // U+FE00: EF B8 80
1142     //
1143     // U+FE6F: EF B9 AF
1144     // U+FE70: EF B9 B0
1145     // U+FEFE: EF BB BE
1146     // U+FEFF: EF BB BF
1147     //
1148     // U+107FF: F0 90 9F BF
1149     // U+10800: F0 90 A0 80
1150     // U+10FFF: F0 90 BF BF
1151     // U+11000: F0 91 80 80
1152     //
1153     // U+1E7FF: F0 9E 9F BF
1154     // U+1E800: F0 9E A0 80
1155     // U+1EFFF: F0 9E BF BF
1156     // U+1F000: F0 9F 80 80
1157     let mut bytes = buffer.as_bytes();
1158     'outer: loop {
1159         // TODO: Instead of just validating ASCII using SIMD, use SIMD
1160         // to check for non-ASCII lead bytes, too, to quickly conclude
1161         // that the vector consist entirely of CJK and below-Hebrew
1162         // code points.
1163         // Unfortunately, scripts above Arabic but below CJK share
1164         // lead bytes with RTL.
1165         if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1166             'inner: loop {
1167                 // At this point, `byte` is not included in `read`.
1168                 if byte < 0xE0 {
1169                     if byte >= 0x80 {
1170                         // Two-byte
1171                         // Adding `unlikely` here improved throughput on
1172                         // Russian plain text by 33%!
1173                         if unsafe { unlikely(byte >= 0xD6) } {
1174                             if byte == 0xD6 {
1175                                 let second = bytes[read + 1];
1176                                 if second > 0x8F {
1177                                     return true;
1178                                 }
1179                             } else {
1180                                 return true;
1181                             }
1182                         }
1183                         read += 2;
1184                     } else {
1185                         // ASCII: write and go back to SIMD.
1186                         read += 1;
1187                         // Intuitively, we should go back to the outer loop only
1188                         // if byte is 0x30 or above, so as to avoid trashing on
1189                         // ASCII space, comma and period in non-Latin context.
1190                         // However, the extra branch seems to cost more than it's
1191                         // worth.
1192                         bytes = &bytes[read..];
1193                         continue 'outer;
1194                     }
1195                 } else if byte < 0xF0 {
1196                     // Three-byte
1197                     if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
1198                         let second = bytes[read + 1];
1199                         if byte == 0xE0 {
1200                             if second < 0xA4 {
1201                                 return true;
1202                             }
1203                         } else if byte == 0xE2 {
1204                             let third = bytes[read + 2];
1205                             if second == 0x80 {
1206                                 if third == 0x8F || third == 0xAB || third == 0xAE {
1207                                     return true;
1208                                 }
1209                             } else if second == 0x81 {
1210                                 if third == 0xA7 {
1211                                     return true;
1212                                 }
1213                             }
1214                         } else {
1215                             debug_assert_eq!(byte, 0xEF);
1216                             if in_inclusive_range8(second, 0xAC, 0xB7) {
1217                                 if second == 0xAC {
1218                                     let third = bytes[read + 2];
1219                                     if third > 0x9C {
1220                                         return true;
1221                                     }
1222                                 } else {
1223                                     return true;
1224                                 }
1225                             } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1226                                 if second == 0xB9 {
1227                                     let third = bytes[read + 2];
1228                                     if third > 0xAF {
1229                                         return true;
1230                                     }
1231                                 } else if second == 0xBB {
1232                                     let third = bytes[read + 2];
1233                                     if third != 0xBF {
1234                                         return true;
1235                                     }
1236                                 } else {
1237                                     return true;
1238                                 }
1239                             }
1240                         }
1241                     }
1242                     read += 3;
1243                 } else {
1244                     // Four-byte
1245                     let second = bytes[read + 1];
1246                     if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
1247                         let third = bytes[read + 2];
1248                         if third >= 0xA0 {
1249                             return true;
1250                         }
1251                     }
1252                     read += 4;
1253                 }
1254                 // The comparison is always < or == and never >, but including
1255                 // > here to let the compiler assume that < is true if this
1256                 // comparison is false.
1257                 if read >= bytes.len() {
1258                     return false;
1259                 }
1260                 byte = bytes[read];
1261                 continue 'inner;
1262             }
1263         } else {
1264             return false;
1265         }
1266     }
1267 }
1268 
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    // Thin public entry point; the result comes straight from
    // `is_utf16_bidi_impl`.
    is_utf16_bidi_impl(buffer)
}
1289 
/// Checks whether a scalar value triggers right-to-left processing.
///
/// Classification works per Unicode block and ignores whether a code point
/// in the block is assigned. The Hebrew presentation forms inside the
/// Alphabetic Presentation Forms block are handled as though they were a
/// block of their own (i.e. they are treated as right-to-left). The four
/// RIGHT-TO-LEFT FOO controls in General Punctuation are detected as well.
/// Bidi controls that cannot cause right-to-left behavior unless
/// right-to-left characters or right-to-left controls are also present are
/// not detected. As a special case, U+FEFF is excluded from Arabic
/// Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    match u32::from(c) {
        // Controls:
        // Every control with RIGHT-TO-LEFT in its name in
        // https://www.unicode.org/charts/PDF/U2000.pdf
        // U+200F RLM, U+202B RLE, U+202E RLO, U+2067 RLI
        0x200F | 0x202B | 0x202E | 0x2067 => true,

        // BMP RTL:
        // https://www.unicode.org/roadmaps/bmp/
        // Hebrew up to and including Arabic Extended-A
        0x0590..=0x08FF => true,
        // Hebrew presentation forms and Arabic Presentation Forms A
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms B (excl. BOM at U+FEFF)
        0xFE70..=0xFEFE => true,

        // Supplementary RTL:
        // https://www.unicode.org/roadmaps/smp/
        // Lead surrogate U+D802 or U+D803
        0x10800..=0x10FFF => true,
        // Lead surrogate U+D83A or U+D83B
        0x1E800..=0x1EFFF => true,

        // Everything else: below Hebrew, between the blocks above, or
        // above the second astral RTL range. (Emoji is up there.)
        _ => false,
    }
}
1357 
1358 /// Checks whether a UTF-16 code unit triggers right-to-left processing.
1359 ///
1360 /// The check is done on a Unicode block basis without regard to assigned
1361 /// vs. unassigned code points in the block. Hebrew presentation forms in
1362 /// the Alphabetic Presentation Forms block are treated as if they formed
1363 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1364 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1365 /// for. Control characters that are technically bidi controls but do not
1366 /// cause right-to-left behavior without the presence of right-to-left
1367 /// characters or right-to-left controls are not checked for. As a special
1368 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1369 ///
1370 /// Since supplementary-plane right-to-left blocks are identifiable from the
1371 /// high surrogate without examining the low surrogate, this function returns
1372 /// `true` for such high surrogates making the function suitable for handling
1373 /// supplementary-plane text without decoding surrogate pairs to scalar
1374 /// values. Obviously, such high surrogates are then reported as right-to-left
1375 /// even if actually unpaired.
1376 #[inline(always)]
is_utf16_code_unit_bidi(u: u16) -> bool1377 pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
1378     if u < 0x0590 {
1379         // Below Hebrew
1380         return false;
1381     }
1382     if in_range16(u, 0x0900, 0xD802) {
1383         // Above Arabic Extended-A and below first RTL surrogate
1384         if in_inclusive_range16(u, 0x200F, 0x2067) {
1385             // In the range that contains the RTL controls
1386             return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
1387         }
1388         return false;
1389     }
1390     if in_range16(u, 0xD83C, 0xFB1D) {
1391         // Between astral RTL high surrogates and Hebrew presentation forms
1392         // (Emoji is here)
1393         return false;
1394     }
1395     if in_range16(u, 0xD804, 0xD83A) {
1396         // Between RTL high surragates
1397         return false;
1398     }
1399     if u > 0xFEFE {
1400         // Above Arabic Presentation Forms (excl. BOM)
1401         return false;
1402     }
1403     if in_range16(u, 0xFE00, 0xFE70) {
1404         // Between Arabic Presentations Forms
1405         return false;
1406     }
1407     true
1408 }
1409 
1410 /// Checks whether a potentially invalid UTF-8 buffer contains code points
1411 /// that trigger right-to-left processing or is all-Latin1.
1412 ///
1413 /// Possibly more efficient than performing the checks separately.
1414 ///
1415 /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1416 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1417 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi1418 pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1419     if let Some(offset) = is_utf8_latin1_impl(buffer) {
1420         if is_utf8_bidi(&buffer[offset..]) {
1421             Latin1Bidi::Bidi
1422         } else {
1423             Latin1Bidi::LeftToRight
1424         }
1425     } else {
1426         Latin1Bidi::Latin1
1427     }
1428 }
1429 
1430 /// Checks whether a valid UTF-8 buffer contains code points
1431 /// that trigger right-to-left processing or is all-Latin1.
1432 ///
1433 /// Possibly more efficient than performing the checks separately.
1434 ///
1435 /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1436 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1437 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi1438 pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1439     // The transition from the latin1 check to the bidi check isn't
1440     // optimal but not tweaking it to perfection today.
1441     if let Some(offset) = is_str_latin1_impl(buffer) {
1442         if is_str_bidi(&buffer[offset..]) {
1443             Latin1Bidi::Bidi
1444         } else {
1445             Latin1Bidi::LeftToRight
1446         }
1447     } else {
1448         Latin1Bidi::Latin1
1449     }
1450 }
1451 
1452 /// Checks whether a potentially invalid UTF-16 buffer contains code points
1453 /// that trigger right-to-left processing or is all-Latin1.
1454 ///
1455 /// Possibly more efficient than performing the checks separately.
1456 ///
1457 /// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
1458 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
1459 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi1460 pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
1461     check_utf16_for_latin1_and_bidi_impl(buffer)
1462 }
1463 
1464 /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1465 /// with the REPLACEMENT CHARACTER.
1466 ///
1467 /// The length of the destination buffer must be at least the length of the
1468 /// source buffer _plus one_.
1469 ///
1470 /// Returns the number of `u16`s written.
1471 ///
1472 /// # Panics
1473 ///
1474 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize1475 pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1476     // TODO: Can the requirement for dst to be at least one unit longer
1477     // be eliminated?
1478     assert!(dst.len() > src.len());
1479     let mut decoder = Utf8Decoder::new_inner();
1480     let mut total_read = 0usize;
1481     let mut total_written = 0usize;
1482     loop {
1483         let (result, read, written) =
1484             decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1485         total_read += read;
1486         total_written += written;
1487         match result {
1488             DecoderResult::InputEmpty => {
1489                 return total_written;
1490             }
1491             DecoderResult::OutputFull => {
1492                 unreachable!("The assert at the top of the function should have caught this.");
1493             }
1494             DecoderResult::Malformed(_, _) => {
1495                 // There should always be space for the U+FFFD, because
1496                 // otherwise we'd have gotten OutputFull already.
1497                 dst[total_written] = 0xFFFD;
1498                 total_written += 1;
1499             }
1500         }
1501     }
1502 }
1503 
1504 /// Converts valid UTF-8 to valid UTF-16.
1505 ///
1506 /// The length of the destination buffer must be at least the length of the
1507 /// source buffer.
1508 ///
1509 /// Returns the number of `u16`s written.
1510 ///
1511 /// # Panics
1512 ///
1513 /// Panics if the destination buffer is shorter than stated above.
convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize1514 pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
1515     assert!(
1516         dst.len() >= src.len(),
1517         "Destination must not be shorter than the source."
1518     );
1519     let bytes = src.as_bytes();
1520     let mut read = 0;
1521     let mut written = 0;
1522     'outer: loop {
1523         let mut byte = {
1524             let src_remaining = &bytes[read..];
1525             let dst_remaining = &mut dst[written..];
1526             let length = src_remaining.len();
1527             match unsafe {
1528                 ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
1529             } {
1530                 None => {
1531                     written += length;
1532                     return written;
1533                 }
1534                 Some((non_ascii, consumed)) => {
1535                     read += consumed;
1536                     written += consumed;
1537                     non_ascii
1538                 }
1539             }
1540         };
1541         'inner: loop {
1542             // At this point, `byte` is not included in `read`.
1543             if byte < 0xE0 {
1544                 if byte >= 0x80 {
1545                     // Two-byte
1546                     let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1547                     let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
1548                     unsafe { *(dst.get_unchecked_mut(written)) = point };
1549                     read += 2;
1550                     written += 1;
1551                 } else {
1552                     // ASCII: write and go back to SIMD.
1553                     unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
1554                     read += 1;
1555                     written += 1;
1556                     // Intuitively, we should go back to the outer loop only
1557                     // if byte is 0x30 or above, so as to avoid trashing on
1558                     // ASCII space, comma and period in non-Latin context.
1559                     // However, the extra branch seems to cost more than it's
1560                     // worth.
1561                     continue 'outer;
1562                 }
1563             } else if byte < 0xF0 {
1564                 // Three-byte
1565                 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1566                 let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1567                 let point = ((u16::from(byte) & 0xF) << 12)
1568                     | ((u16::from(second) & 0x3F) << 6)
1569                     | (u16::from(third) & 0x3F);
1570                 unsafe { *(dst.get_unchecked_mut(written)) = point };
1571                 read += 3;
1572                 written += 1;
1573             } else {
1574                 // Four-byte
1575                 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1576                 let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1577                 let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
1578                 let point = ((u32::from(byte) & 0x7) << 18)
1579                     | ((u32::from(second) & 0x3F) << 12)
1580                     | ((u32::from(third) & 0x3F) << 6)
1581                     | (u32::from(fourth) & 0x3F);
1582                 unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
1583                 unsafe {
1584                     *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
1585                 };
1586                 read += 4;
1587                 written += 2;
1588             }
1589             // The comparison is always < or == and never >, but including
1590             // > here to let the compiler assume that < is true if this
1591             // comparison is false.
1592             if read >= src.len() {
1593                 return written;
1594             }
1595             byte = bytes[read];
1596             continue 'inner;
1597         }
1598     }
1599 }
1600 
1601 /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1602 ///
1603 /// The length of the destination buffer must be at least the length of the
1604 /// source buffer.
1605 ///
1606 /// Returns the number of `u16`s written or `None` if the input was invalid.
1607 ///
1608 /// When the input was invalid, some output may have been written.
1609 ///
1610 /// # Panics
1611 ///
1612 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize>1613 pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1614     assert!(
1615         dst.len() >= src.len(),
1616         "Destination must not be shorter than the source."
1617     );
1618     let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1619     if read == src.len() {
1620         return Some(written);
1621     }
1622     None
1623 }
1624 
1625 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1626 /// with the REPLACEMENT CHARACTER with potentially insufficient output
1627 /// space.
1628 ///
1629 /// Returns the number of code units read and the number of bytes written.
1630 ///
1631 /// Guarantees that the bytes in the destination beyond the number of
1632 /// bytes claimed as written by the second item of the return tuple
1633 /// are left unmodified.
1634 ///
1635 /// Not all code units are read if there isn't enough output space.
1636 ///
1637 /// Note  that this method isn't designed for general streamability but for
1638 /// not allocating memory for the worst case up front. Specifically,
1639 /// if the input starts with or ends with an unpaired surrogate, those are
1640 /// replaced with the REPLACEMENT CHARACTER.
1641 ///
1642 /// Matches the semantics of `TextEncoder.encodeInto()` from the
1643 /// Encoding Standard.
1644 ///
1645 /// # Safety
1646 ///
1647 /// If you want to convert into a `&mut str`, use
1648 /// `convert_utf16_to_str_partial()` instead of using this function
1649 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1650 #[inline(always)]
convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize)1651 pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1652     // The two functions called below are marked `inline(never)` to make
1653     // transitions from the hot part (first function) into the cold part
1654     // (second function) go through a return and another call to discouge
1655     // the CPU from speculating from the hot code into the cold code.
1656     // Letting the transitions be mere intra-function jumps, even to
1657     // basic blocks out-of-lined to the end of the function would wipe
1658     // away a quarter of Arabic encode performance on Haswell!
1659     let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
1660     if unsafe { likely(read == src.len()) } {
1661         return (read, written);
1662     }
1663     let (tail_read, tail_written) =
1664         convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1665     (read + tail_read, written + tail_written)
1666 }
1667 
1668 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1669 /// with the REPLACEMENT CHARACTER.
1670 ///
1671 /// The length of the destination buffer must be at least the length of the
1672 /// source buffer times three.
1673 ///
1674 /// Returns the number of bytes written.
1675 ///
1676 /// # Panics
1677 ///
1678 /// Panics if the destination buffer is shorter than stated above.
1679 ///
1680 /// # Safety
1681 ///
1682 /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1683 /// instead of using this function together with the `unsafe` method
1684 /// `as_bytes_mut()` on `&mut str`.
1685 #[inline(always)]
convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize1686 pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1687     assert!(dst.len() >= src.len() * 3);
1688     let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1689     debug_assert_eq!(read, src.len());
1690     written
1691 }
1692 
1693 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1694 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1695 /// signaled using the Rust type system with potentially insufficient output
1696 /// space.
1697 ///
1698 /// Returns the number of code units read and the number of bytes written.
1699 ///
1700 /// Not all code units are read if there isn't enough output space.
1701 ///
1702 /// Note  that this method isn't designed for general streamability but for
1703 /// not allocating memory for the worst case up front. Specifically,
1704 /// if the input starts with or ends with an unpaired surrogate, those are
1705 /// replaced with the REPLACEMENT CHARACTER.
convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize)1706 pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1707     let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1708     let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1709     let len = bytes.len();
1710     let mut trail = written;
1711     while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1712         bytes[trail] = 0;
1713         trail += 1;
1714     }
1715     (read, written)
1716 }
1717 
1718 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1719 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1720 /// signaled using the Rust type system.
1721 ///
1722 /// The length of the destination buffer must be at least the length of the
1723 /// source buffer times three.
1724 ///
1725 /// Returns the number of bytes written.
1726 ///
1727 /// # Panics
1728 ///
1729 /// Panics if the destination buffer is shorter than stated above.
1730 #[inline(always)]
convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize1731 pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1732     assert!(dst.len() >= src.len() * 3);
1733     let (read, written) = convert_utf16_to_str_partial(src, dst);
1734     debug_assert_eq!(read, src.len());
1735     written
1736 }
1737 
1738 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1739 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1740 ///
1741 /// The length of the destination buffer must be at least the length of the
1742 /// source buffer.
1743 ///
1744 /// The number of `u16`s written equals the length of the source buffer.
1745 ///
1746 /// # Panics
1747 ///
1748 /// Panics if the destination buffer is shorter than stated above.
convert_latin1_to_utf16(src: &[u8], dst: &mut [u16])1749 pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1750     assert!(
1751         dst.len() >= src.len(),
1752         "Destination must not be shorter than the source."
1753     );
1754     // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1755     // instructions and this code, but, yet, the autovectorized version is
1756     // faster.
1757     unsafe {
1758         unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1759     }
1760 }
1761 
1762 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1763 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1764 /// output space.
1765 ///
1766 /// Returns the number of bytes read and the number of bytes written.
1767 ///
1768 /// If the output isn't large enough, not all input is consumed.
1769 ///
1770 /// # Safety
1771 ///
1772 /// If you want to convert into a `&mut str`, use
1773 /// `convert_utf16_to_str_partial()` instead of using this function
1774 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize)1775 pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1776     let src_len = src.len();
1777     let src_ptr = src.as_ptr();
1778     let dst_ptr = dst.as_mut_ptr();
1779     let dst_len = dst.len();
1780     let mut total_read = 0usize;
1781     let mut total_written = 0usize;
1782     loop {
1783         // src can't advance more than dst
1784         let src_left = src_len - total_read;
1785         let dst_left = dst_len - total_written;
1786         let min_left = ::core::cmp::min(src_left, dst_left);
1787         if let Some((non_ascii, consumed)) = unsafe {
1788             ascii_to_ascii(
1789                 src_ptr.add(total_read),
1790                 dst_ptr.add(total_written),
1791                 min_left,
1792             )
1793         } {
1794             total_read += consumed;
1795             total_written += consumed;
1796             if total_written.checked_add(2).unwrap() > dst_len {
1797                 return (total_read, total_written);
1798             }
1799 
1800             total_read += 1; // consume `non_ascii`
1801 
1802             dst[total_written] = (non_ascii >> 6) | 0xC0;
1803             total_written += 1;
1804             dst[total_written] = (non_ascii & 0x3F) | 0x80;
1805             total_written += 1;
1806             continue;
1807         }
1808         return (total_read + min_left, total_written + min_left);
1809     }
1810 }
1811 
1812 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1813 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1814 ///
1815 /// The length of the destination buffer must be at least the length of the
1816 /// source buffer times two.
1817 ///
1818 /// Returns the number of bytes written.
1819 ///
1820 /// # Panics
1821 ///
1822 /// Panics if the destination buffer is shorter than stated above.
1823 ///
1824 /// # Safety
1825 ///
1826 /// Note that this function may write garbage beyond the number of bytes
1827 /// indicated by the return value, so using a `&mut str` interpreted as
1828 /// `&mut [u8]` as the destination is not safe. If you want to convert into
1829 /// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1830 #[inline]
convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize1831 pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1832     assert!(
1833         dst.len() >= src.len() * 2,
1834         "Destination must not be shorter than the source times two."
1835     );
1836     let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1837     debug_assert_eq!(read, src.len());
1838     written
1839 }
1840 
1841 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1842 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1843 /// output is signaled using the Rust type system with potentially insufficient
1844 /// output space.
1845 ///
1846 /// Returns the number of bytes read and the number of bytes written.
1847 ///
1848 /// If the output isn't large enough, not all input is consumed.
1849 #[inline]
convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize)1850 pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1851     let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1852     let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1853     let len = bytes.len();
1854     let mut trail = written;
1855     let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE);
1856     while trail < max {
1857         bytes[trail] = 0;
1858         trail += 1;
1859     }
1860     while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1861         bytes[trail] = 0;
1862         trail += 1;
1863     }
1864     (read, written)
1865 }
1866 
1867 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1868 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1869 /// output is signaled using the Rust type system.
1870 ///
1871 /// The length of the destination buffer must be at least the length of the
1872 /// source buffer times two.
1873 ///
1874 /// Returns the number of bytes written.
1875 ///
1876 /// # Panics
1877 ///
1878 /// Panics if the destination buffer is shorter than stated above.
1879 #[inline]
convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize1880 pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1881     assert!(
1882         dst.len() >= src.len() * 2,
1883         "Destination must not be shorter than the source times two."
1884     );
1885     let (read, written) = convert_latin1_to_str_partial(src, dst);
1886     debug_assert_eq!(read, src.len());
1887     written
1888 }
1889 
1890 /// If the input is valid UTF-8 representing only Unicode code points from
1891 /// U+0000 to U+00FF, inclusive, converts the input into output that
1892 /// represents the value of each code point as the unsigned byte value of
1893 /// each output byte.
1894 ///
1895 /// If the input does not fulfill the condition stated above, this function
1896 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1897 /// does something that is memory-safe without any promises about any
1898 /// properties of the output. In particular, callers shouldn't assume the
1899 /// output to be the same across crate versions or CPU architectures and
1900 /// should not assume that non-ASCII input can't map to ASCII output.
1901 ///
1902 /// The length of the destination buffer must be at least the length of the
1903 /// source buffer.
1904 ///
1905 /// Returns the number of bytes written.
1906 ///
1907 /// # Panics
1908 ///
1909 /// Panics if the destination buffer is shorter than stated above.
1910 ///
1911 /// If debug assertions are enabled (and not fuzzing) and the input is
1912 /// not in the range U+0000 to U+00FF, inclusive.
convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize1913 pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1914     assert!(
1915         dst.len() >= src.len(),
1916         "Destination must not be shorter than the source."
1917     );
1918     non_fuzz_debug_assert!(is_utf8_latin1(src));
1919     let src_len = src.len();
1920     let src_ptr = src.as_ptr();
1921     let dst_ptr = dst.as_mut_ptr();
1922     let mut total_read = 0usize;
1923     let mut total_written = 0usize;
1924     loop {
1925         // dst can't advance more than src
1926         let src_left = src_len - total_read;
1927         if let Some((non_ascii, consumed)) = unsafe {
1928             ascii_to_ascii(
1929                 src_ptr.add(total_read),
1930                 dst_ptr.add(total_written),
1931                 src_left,
1932             )
1933         } {
1934             total_read += consumed + 1;
1935             total_written += consumed;
1936 
1937             if total_read == src_len {
1938                 return total_written;
1939             }
1940 
1941             let trail = src[total_read];
1942             total_read += 1;
1943 
1944             dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1945             total_written += 1;
1946             continue;
1947         }
1948         return total_written + src_left;
1949     }
1950 }
1951 
1952 /// If the input is valid UTF-16 representing only Unicode code points from
1953 /// U+0000 to U+00FF, inclusive, converts the input into output that
1954 /// represents the value of each code point as the unsigned byte value of
1955 /// each output byte.
1956 ///
1957 /// If the input does not fulfill the condition stated above, does something
1958 /// that is memory-safe without any promises about any properties of the
1959 /// output and will probably assert in debug builds in future versions.
1960 /// In particular, callers shouldn't assume the output to be the same across
1961 /// crate versions or CPU architectures and should not assume that non-ASCII
1962 /// input can't map to ASCII output.
1963 ///
1964 /// The length of the destination buffer must be at least the length of the
1965 /// source buffer.
1966 ///
1967 /// The number of bytes written equals the length of the source buffer.
1968 ///
1969 /// # Panics
1970 ///
1971 /// Panics if the destination buffer is shorter than stated above.
1972 ///
1973 /// (Probably in future versions if debug assertions are enabled (and not
1974 /// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8])1975 pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1976     assert!(
1977         dst.len() >= src.len(),
1978         "Destination must not be shorter than the source."
1979     );
1980     // non_fuzz_debug_assert!(is_utf16_latin1(src));
1981     unsafe {
1982         pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1983     }
1984 }
1985 
1986 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1987 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1988 ///
1989 /// Borrows if input is ASCII-only. Performs a single heap allocation
1990 /// otherwise.
decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str>1991 pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
1992     let up_to = ascii_valid_up_to(bytes);
1993     // >= makes later things optimize better than ==
1994     if up_to >= bytes.len() {
1995         debug_assert_eq!(up_to, bytes.len());
1996         let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
1997         return Cow::Borrowed(s);
1998     }
1999     let (head, tail) = bytes.split_at(up_to);
2000     let capacity = head.len() + tail.len() * 2;
2001     let mut vec = Vec::with_capacity(capacity);
2002     unsafe {
2003         vec.set_len(capacity);
2004     }
2005     (&mut vec[..up_to]).copy_from_slice(head);
2006     let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
2007     vec.truncate(up_to + written);
2008     Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
2009 }
2010 
2011 /// If the input is valid UTF-8 representing only Unicode code points from
2012 /// U+0000 to U+00FF, inclusive, converts the input into output that
2013 /// represents the value of each code point as the unsigned byte value of
2014 /// each output byte.
2015 ///
2016 /// If the input does not fulfill the condition stated above, this function
2017 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
2018 /// does something that is memory-safe without any promises about any
2019 /// properties of the output. In particular, callers shouldn't assume the
2020 /// output to be the same across crate versions or CPU architectures and
2021 /// should not assume that non-ASCII input can't map to ASCII output.
2022 ///
2023 /// Borrows if input is ASCII-only. Performs a single heap allocation
2024 /// otherwise.
encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]>2025 pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
2026     let bytes = string.as_bytes();
2027     let up_to = ascii_valid_up_to(bytes);
2028     // >= makes later things optimize better than ==
2029     if up_to >= bytes.len() {
2030         debug_assert_eq!(up_to, bytes.len());
2031         return Cow::Borrowed(bytes);
2032     }
2033     let (head, tail) = bytes.split_at(up_to);
2034     let capacity = bytes.len();
2035     let mut vec = Vec::with_capacity(capacity);
2036     unsafe {
2037         vec.set_len(capacity);
2038     }
2039     (&mut vec[..up_to]).copy_from_slice(head);
2040     let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
2041     vec.truncate(up_to + written);
2042     Cow::Owned(vec)
2043 }
2044 
2045 /// Returns the index of the first unpaired surrogate or, if the input is
2046 /// valid UTF-16 in its entirety, the length of the input.
utf16_valid_up_to(buffer: &[u16]) -> usize2047 pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
2048     utf16_valid_up_to_impl(buffer)
2049 }
2050 
2051 /// Returns the index of first byte that starts an invalid byte
2052 /// sequence or a non-Latin1 byte sequence, or the length of the
2053 /// string if there are neither.
utf8_latin1_up_to(buffer: &[u8]) -> usize2054 pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2055     is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2056 }
2057 
2058 /// Returns the index of first byte that starts a non-Latin1 byte
2059 /// sequence, or the length of the string if there are none.
str_latin1_up_to(buffer: &str) -> usize2060 pub fn str_latin1_up_to(buffer: &str) -> usize {
2061     is_str_latin1_impl(buffer).unwrap_or(buffer.len())
2062 }
2063 
2064 /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2065 #[inline]
ensure_utf16_validity(buffer: &mut [u16])2066 pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2067     let mut offset = 0;
2068     loop {
2069         offset += utf16_valid_up_to(&buffer[offset..]);
2070         if offset == buffer.len() {
2071             return;
2072         }
2073         buffer[offset] = 0xFFFD;
2074         offset += 1;
2075     }
2076 }
2077 
2078 /// Copies ASCII from source to destination up to the first non-ASCII byte
2079 /// (or the end of the input if it is ASCII in its entirety).
2080 ///
2081 /// The length of the destination buffer must be at least the length of the
2082 /// source buffer.
2083 ///
2084 /// Returns the number of bytes written.
2085 ///
2086 /// # Panics
2087 ///
2088 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize2089 pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2090     assert!(
2091         dst.len() >= src.len(),
2092         "Destination must not be shorter than the source."
2093     );
2094     if let Some((_, consumed)) =
2095         unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2096     {
2097         consumed
2098     } else {
2099         src.len()
2100     }
2101 }
2102 
2103 /// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2104 /// the first non-ASCII byte (or the end of the input if it is ASCII in its
2105 /// entirety).
2106 ///
2107 /// The length of the destination buffer must be at least the length of the
2108 /// source buffer.
2109 ///
2110 /// Returns the number of `u16`s written.
2111 ///
2112 /// # Panics
2113 ///
2114 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize2115 pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2116     assert!(
2117         dst.len() >= src.len(),
2118         "Destination must not be shorter than the source."
2119     );
2120     if let Some((_, consumed)) =
2121         unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2122     {
2123         consumed
2124     } else {
2125         src.len()
2126     }
2127 }
2128 
2129 /// Copies Basic Latin from source to destination narrowing it to ASCII up to
2130 /// the first non-Basic Latin code unit (or the end of the input if it is
2131 /// Basic Latin in its entirety).
2132 ///
2133 /// The length of the destination buffer must be at least the length of the
2134 /// source buffer.
2135 ///
2136 /// Returns the number of bytes written.
2137 ///
2138 /// # Panics
2139 ///
2140 /// Panics if the destination buffer is shorter than stated above.
copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize2141 pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2142     assert!(
2143         dst.len() >= src.len(),
2144         "Destination must not be shorter than the source."
2145     );
2146     if let Some((_, consumed)) =
2147         unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2148     {
2149         consumed
2150     } else {
2151         src.len()
2152     }
2153 }
2154 
2155 // Any copyright to the test code below this comment is dedicated to the
2156 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2157 
2158 #[cfg(test)]
2159 mod tests {
2160     use super::*;
2161 
2162     #[test]
test_is_ascii_success()2163     fn test_is_ascii_success() {
2164         let mut src: Vec<u8> = Vec::with_capacity(128);
2165         src.resize(128, 0);
2166         for i in 0..src.len() {
2167             src[i] = i as u8;
2168         }
2169         for i in 0..src.len() {
2170             assert!(is_ascii(&src[i..]));
2171         }
2172     }
2173 
2174     #[test]
test_is_ascii_fail()2175     fn test_is_ascii_fail() {
2176         let mut src: Vec<u8> = Vec::with_capacity(128);
2177         src.resize(128, 0);
2178         for i in 0..src.len() {
2179             src[i] = i as u8;
2180         }
2181         for i in 0..src.len() {
2182             let tail = &mut src[i..];
2183             for j in 0..tail.len() {
2184                 tail[j] = 0xA0;
2185                 assert!(!is_ascii(tail));
2186             }
2187         }
2188     }
2189 
2190     #[test]
test_is_basic_latin_success()2191     fn test_is_basic_latin_success() {
2192         let mut src: Vec<u16> = Vec::with_capacity(128);
2193         src.resize(128, 0);
2194         for i in 0..src.len() {
2195             src[i] = i as u16;
2196         }
2197         for i in 0..src.len() {
2198             assert!(is_basic_latin(&src[i..]));
2199         }
2200     }
2201 
2202     #[test]
test_is_basic_latin_fail()2203     fn test_is_basic_latin_fail() {
2204         let mut src: Vec<u16> = Vec::with_capacity(128);
2205         src.resize(128, 0);
2206         for i in 0..src.len() {
2207             src[i] = i as u16;
2208         }
2209         for i in 0..src.len() {
2210             let tail = &mut src[i..];
2211             for j in 0..tail.len() {
2212                 tail[j] = 0xA0;
2213                 assert!(!is_basic_latin(tail));
2214             }
2215         }
2216     }
2217 
2218     #[test]
test_is_utf16_latin1_success()2219     fn test_is_utf16_latin1_success() {
2220         let mut src: Vec<u16> = Vec::with_capacity(256);
2221         src.resize(256, 0);
2222         for i in 0..src.len() {
2223             src[i] = i as u16;
2224         }
2225         for i in 0..src.len() {
2226             assert!(is_utf16_latin1(&src[i..]));
2227             assert_eq!(
2228                 check_utf16_for_latin1_and_bidi(&src[i..]),
2229                 Latin1Bidi::Latin1
2230             );
2231         }
2232     }
2233 
2234     #[test]
test_is_utf16_latin1_fail()2235     fn test_is_utf16_latin1_fail() {
2236         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2237         let mut src: Vec<u16> = Vec::with_capacity(len);
2238         src.resize(len, 0);
2239         for i in 0..src.len() {
2240             src[i] = i as u16;
2241         }
2242         for i in 0..src.len() {
2243             let tail = &mut src[i..];
2244             for j in 0..tail.len() {
2245                 tail[j] = 0x100 + j as u16;
2246                 assert!(!is_utf16_latin1(tail));
2247                 assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
2248             }
2249         }
2250     }
2251 
2252     #[test]
test_is_str_latin1_success()2253     fn test_is_str_latin1_success() {
2254         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2255         let mut src: Vec<u16> = Vec::with_capacity(len);
2256         src.resize(len, 0);
2257         for i in 0..src.len() {
2258             src[i] = i as u16;
2259         }
2260         for i in 0..src.len() {
2261             let s = String::from_utf16(&src[i..]).unwrap();
2262             assert!(is_str_latin1(&s[..]));
2263             assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2264         }
2265     }
2266 
2267     #[test]
test_is_str_latin1_fail()2268     fn test_is_str_latin1_fail() {
2269         let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2270         let mut src: Vec<u16> = Vec::with_capacity(len);
2271         src.resize(len, 0);
2272         for i in 0..src.len() {
2273             src[i] = i as u16;
2274         }
2275         for i in 0..src.len() {
2276             let tail = &mut src[i..];
2277             for j in 0..tail.len() {
2278                 tail[j] = 0x100 + j as u16;
2279                 let s = String::from_utf16(tail).unwrap();
2280                 assert!(!is_str_latin1(&s[..]));
2281                 assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2282             }
2283         }
2284     }
2285 
2286     #[test]
test_is_utf8_latin1_success()2287     fn test_is_utf8_latin1_success() {
2288         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2289         let mut src: Vec<u16> = Vec::with_capacity(len);
2290         src.resize(len, 0);
2291         for i in 0..src.len() {
2292             src[i] = i as u16;
2293         }
2294         for i in 0..src.len() {
2295             let s = String::from_utf16(&src[i..]).unwrap();
2296             assert!(is_utf8_latin1(s.as_bytes()));
2297             assert_eq!(
2298                 check_utf8_for_latin1_and_bidi(s.as_bytes()),
2299                 Latin1Bidi::Latin1
2300             );
2301         }
2302     }
2303 
2304     #[test]
test_is_utf8_latin1_fail()2305     fn test_is_utf8_latin1_fail() {
2306         let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2307         let mut src: Vec<u16> = Vec::with_capacity(len);
2308         src.resize(len, 0);
2309         for i in 0..src.len() {
2310             src[i] = i as u16;
2311         }
2312         for i in 0..src.len() {
2313             let tail = &mut src[i..];
2314             for j in 0..tail.len() {
2315                 tail[j] = 0x100 + j as u16;
2316                 let s = String::from_utf16(tail).unwrap();
2317                 assert!(!is_utf8_latin1(s.as_bytes()));
2318                 assert_ne!(
2319                     check_utf8_for_latin1_and_bidi(s.as_bytes()),
2320                     Latin1Bidi::Latin1
2321                 );
2322             }
2323         }
2324     }
2325 
2326     #[test]
test_is_utf8_latin1_invalid()2327     fn test_is_utf8_latin1_invalid() {
2328         assert!(!is_utf8_latin1(b"\xC3"));
2329         assert!(!is_utf8_latin1(b"a\xC3"));
2330         assert!(!is_utf8_latin1(b"\xFF"));
2331         assert!(!is_utf8_latin1(b"a\xFF"));
2332         assert!(!is_utf8_latin1(b"\xC3\xFF"));
2333         assert!(!is_utf8_latin1(b"a\xC3\xFF"));
2334     }
2335 
2336     #[test]
test_convert_utf8_to_utf16()2337     fn test_convert_utf8_to_utf16() {
2338         let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2339         let mut dst: Vec<u16> = Vec::with_capacity(src.len() + 1);
2340         dst.resize(src.len() + 1, 0);
2341         let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
2342         dst.truncate(len);
2343         let reference: Vec<u16> = src.encode_utf16().collect();
2344         assert_eq!(dst, reference);
2345     }
2346 
2347     #[test]
test_convert_str_to_utf16()2348     fn test_convert_str_to_utf16() {
2349         let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2350         let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2351         dst.resize(src.len(), 0);
2352         let len = convert_str_to_utf16(src, &mut dst[..]);
2353         dst.truncate(len);
2354         let reference: Vec<u16> = src.encode_utf16().collect();
2355         assert_eq!(dst, reference);
2356     }
2357 
2358     #[test]
test_convert_utf16_to_utf8_partial()2359     fn test_convert_utf16_to_utf8_partial() {
2360         let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2361         let src: Vec<u16> = reference.encode_utf16().collect();
2362         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2363         dst.resize(src.len() * 3 + 1, 0);
2364         let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
2365         let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
2366         dst.truncate(len);
2367         assert_eq!(dst, reference.as_bytes());
2368     }
2369 
2370     #[test]
test_convert_utf16_to_utf8()2371     fn test_convert_utf16_to_utf8() {
2372         let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2373         let src: Vec<u16> = reference.encode_utf16().collect();
2374         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2375         dst.resize(src.len() * 3 + 1, 0);
2376         let len = convert_utf16_to_utf8(&src[..], &mut dst[..]);
2377         dst.truncate(len);
2378         assert_eq!(dst, reference.as_bytes());
2379     }
2380 
2381     #[test]
test_convert_latin1_to_utf16()2382     fn test_convert_latin1_to_utf16() {
2383         let mut src: Vec<u8> = Vec::with_capacity(256);
2384         src.resize(256, 0);
2385         let mut reference: Vec<u16> = Vec::with_capacity(256);
2386         reference.resize(256, 0);
2387         for i in 0..256 {
2388             src[i] = i as u8;
2389             reference[i] = i as u16;
2390         }
2391         let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2392         dst.resize(src.len(), 0);
2393         convert_latin1_to_utf16(&src[..], &mut dst[..]);
2394         assert_eq!(dst, reference);
2395     }
2396 
2397     #[test]
test_convert_latin1_to_utf8_partial()2398     fn test_convert_latin1_to_utf8_partial() {
2399         let mut dst = [0u8, 2];
2400         let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
2401         assert_eq!(read, 1);
2402         assert_eq!(written, 1);
2403     }
2404 
2405     #[test]
test_convert_latin1_to_utf8()2406     fn test_convert_latin1_to_utf8() {
2407         let mut src: Vec<u8> = Vec::with_capacity(256);
2408         src.resize(256, 0);
2409         let mut reference: Vec<u16> = Vec::with_capacity(256);
2410         reference.resize(256, 0);
2411         for i in 0..256 {
2412             src[i] = i as u8;
2413             reference[i] = i as u16;
2414         }
2415         let s = String::from_utf16(&reference[..]).unwrap();
2416         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 2);
2417         dst.resize(src.len() * 2, 0);
2418         let len = convert_latin1_to_utf8(&src[..], &mut dst[..]);
2419         dst.truncate(len);
2420         assert_eq!(&dst[..], s.as_bytes());
2421     }
2422 
2423     #[test]
test_convert_utf8_to_latin1_lossy()2424     fn test_convert_utf8_to_latin1_lossy() {
2425         let mut reference: Vec<u8> = Vec::with_capacity(256);
2426         reference.resize(256, 0);
2427         let mut src16: Vec<u16> = Vec::with_capacity(256);
2428         src16.resize(256, 0);
2429         for i in 0..256 {
2430             src16[i] = i as u16;
2431             reference[i] = i as u8;
2432         }
2433         let src = String::from_utf16(&src16[..]).unwrap();
2434         let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2435         dst.resize(src.len(), 0);
2436         let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
2437         dst.truncate(len);
2438         assert_eq!(dst, reference);
2439     }
2440 
2441     #[cfg(all(debug_assertions, not(fuzzing)))]
2442     #[test]
2443     #[should_panic]
test_convert_utf8_to_latin1_lossy_panics()2444     fn test_convert_utf8_to_latin1_lossy_panics() {
2445         let mut dst = [0u8; 16];
2446         let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]);
2447     }
2448 
2449     #[test]
test_convert_utf16_to_latin1_lossy()2450     fn test_convert_utf16_to_latin1_lossy() {
2451         let mut src: Vec<u16> = Vec::with_capacity(256);
2452         src.resize(256, 0);
2453         let mut reference: Vec<u8> = Vec::with_capacity(256);
2454         reference.resize(256, 0);
2455         for i in 0..256 {
2456             src[i] = i as u16;
2457             reference[i] = i as u8;
2458         }
2459         let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2460         dst.resize(src.len(), 0);
2461         convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
2462         assert_eq!(dst, reference);
2463     }
2464 
2465     #[test]
2466     // #[should_panic]
test_convert_utf16_to_latin1_lossy_panics()2467     fn test_convert_utf16_to_latin1_lossy_panics() {
2468         let mut dst = [0u8; 16];
2469         let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]);
2470     }
2471 
2472     #[test]
test_utf16_valid_up_to()2473     fn test_utf16_valid_up_to() {
2474         let valid = vec![
2475             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16,
2476             0xD83Du16, 0xDCA9u16, 0x00B6u16,
2477         ];
2478         assert_eq!(utf16_valid_up_to(&valid[..]), 16);
2479         let lone_high = vec![
2480             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2481             0x2603u16, 0xD83Du16, 0x00B6u16,
2482         ];
2483         assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);
2484         let lone_low = vec![
2485             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2486             0x2603u16, 0xDCA9u16, 0x00B6u16,
2487         ];
2488         assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);
2489         let lone_high_at_end = vec![
2490             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2491             0x2603u16, 0x00B6u16, 0xD83Du16,
2492         ];
2493         assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
2494     }
2495 
2496     #[test]
test_ensure_utf16_validity()2497     fn test_ensure_utf16_validity() {
2498         let mut src = vec![
2499             0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2500             0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2501             0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2502         ];
2503         let reference = vec![
2504             0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2505             0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2506             0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2507         ];
2508         ensure_utf16_validity(&mut src[..]);
2509         assert_eq!(src, reference);
2510     }
2511 
2512     #[test]
test_is_char_bidi()2513     fn test_is_char_bidi() {
2514         assert!(!is_char_bidi('a'));
2515         assert!(!is_char_bidi('\u{03B1}'));
2516         assert!(!is_char_bidi('\u{3041}'));
2517         assert!(!is_char_bidi('\u{1F4A9}'));
2518         assert!(!is_char_bidi('\u{FE00}'));
2519         assert!(!is_char_bidi('\u{202C}'));
2520         assert!(!is_char_bidi('\u{FEFF}'));
2521         assert!(is_char_bidi('\u{0590}'));
2522         assert!(is_char_bidi('\u{08FF}'));
2523         assert!(is_char_bidi('\u{061C}'));
2524         assert!(is_char_bidi('\u{FB50}'));
2525         assert!(is_char_bidi('\u{FDFF}'));
2526         assert!(is_char_bidi('\u{FE70}'));
2527         assert!(is_char_bidi('\u{FEFE}'));
2528         assert!(is_char_bidi('\u{200F}'));
2529         assert!(is_char_bidi('\u{202B}'));
2530         assert!(is_char_bidi('\u{202E}'));
2531         assert!(is_char_bidi('\u{2067}'));
2532         assert!(is_char_bidi('\u{10800}'));
2533         assert!(is_char_bidi('\u{10FFF}'));
2534         assert!(is_char_bidi('\u{1E800}'));
2535         assert!(is_char_bidi('\u{1EFFF}'));
2536     }
2537 
2538     #[test]
test_is_utf16_code_unit_bidi()2539     fn test_is_utf16_code_unit_bidi() {
2540         assert!(!is_utf16_code_unit_bidi(0x0062));
2541         assert!(!is_utf16_code_unit_bidi(0x03B1));
2542         assert!(!is_utf16_code_unit_bidi(0x3041));
2543         assert!(!is_utf16_code_unit_bidi(0xD801));
2544         assert!(!is_utf16_code_unit_bidi(0xFE00));
2545         assert!(!is_utf16_code_unit_bidi(0x202C));
2546         assert!(!is_utf16_code_unit_bidi(0xFEFF));
2547         assert!(is_utf16_code_unit_bidi(0x0590));
2548         assert!(is_utf16_code_unit_bidi(0x08FF));
2549         assert!(is_utf16_code_unit_bidi(0x061C));
2550         assert!(is_utf16_code_unit_bidi(0xFB1D));
2551         assert!(is_utf16_code_unit_bidi(0xFB50));
2552         assert!(is_utf16_code_unit_bidi(0xFDFF));
2553         assert!(is_utf16_code_unit_bidi(0xFE70));
2554         assert!(is_utf16_code_unit_bidi(0xFEFE));
2555         assert!(is_utf16_code_unit_bidi(0x200F));
2556         assert!(is_utf16_code_unit_bidi(0x202B));
2557         assert!(is_utf16_code_unit_bidi(0x202E));
2558         assert!(is_utf16_code_unit_bidi(0x2067));
2559         assert!(is_utf16_code_unit_bidi(0xD802));
2560         assert!(is_utf16_code_unit_bidi(0xD803));
2561         assert!(is_utf16_code_unit_bidi(0xD83A));
2562         assert!(is_utf16_code_unit_bidi(0xD83B));
2563     }
2564 
2565     #[test]
test_is_str_bidi()2566     fn test_is_str_bidi() {
2567         assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop"));
2568         assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"));
2569         assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"));
2570         assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"));
2571         assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"));
2572         assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"));
2573         assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"));
2574         assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"));
2575         assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"));
2576         assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"));
2577         assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"));
2578         assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"));
2579         assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"));
2580         assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"));
2581         assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"));
2582         assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"));
2583         assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"));
2584         assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"));
2585         assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"));
2586         assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"));
2587         assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"));
2588         assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"));
2589     }
2590 
2591     #[test]
test_is_utf8_bidi()2592     fn test_is_utf8_bidi() {
2593         assert!(!is_utf8_bidi(
2594             "abcdefghijklmnopaabcdefghijklmnop".as_bytes()
2595         ));
2596         assert!(!is_utf8_bidi(
2597             "abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()
2598         ));
2599         assert!(!is_utf8_bidi(
2600             "abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()
2601         ));
2602         assert!(!is_utf8_bidi(
2603             "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()
2604         ));
2605         assert!(!is_utf8_bidi(
2606             "abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()
2607         ));
2608         assert!(!is_utf8_bidi(
2609             "abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()
2610         ));
2611         assert!(!is_utf8_bidi(
2612             "abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()
2613         ));
2614         assert!(is_utf8_bidi(
2615             "abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()
2616         ));
2617         assert!(is_utf8_bidi(
2618             "abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()
2619         ));
2620         assert!(is_utf8_bidi(
2621             "abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()
2622         ));
2623         assert!(is_utf8_bidi(
2624             "abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()
2625         ));
2626         assert!(is_utf8_bidi(
2627             "abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()
2628         ));
2629         assert!(is_utf8_bidi(
2630             "abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()
2631         ));
2632         assert!(is_utf8_bidi(
2633             "abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()
2634         ));
2635         assert!(is_utf8_bidi(
2636             "abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()
2637         ));
2638         assert!(is_utf8_bidi(
2639             "abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()
2640         ));
2641         assert!(is_utf8_bidi(
2642             "abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()
2643         ));
2644         assert!(is_utf8_bidi(
2645             "abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()
2646         ));
2647         assert!(is_utf8_bidi(
2648             "abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()
2649         ));
2650         assert!(is_utf8_bidi(
2651             "abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()
2652         ));
2653         assert!(is_utf8_bidi(
2654             "abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()
2655         ));
2656         assert!(is_utf8_bidi(
2657             "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()
2658         ));
2659     }
2660 
2661     #[test]
test_is_utf16_bidi()2662     fn test_is_utf16_bidi() {
2663         assert!(!is_utf16_bidi(&[
2664             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66,
2665             0x67, 0x68, 0x69,
2666         ]));
2667         assert!(!is_utf16_bidi(&[
2668             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66,
2669             0x67, 0x68, 0x69,
2670         ]));
2671         assert!(!is_utf16_bidi(&[
2672             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66,
2673             0x67, 0x68, 0x69,
2674         ]));
2675         assert!(!is_utf16_bidi(&[
2676             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66,
2677             0x67, 0x68, 0x69,
2678         ]));
2679         assert!(!is_utf16_bidi(&[
2680             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66,
2681             0x67, 0x68, 0x69,
2682         ]));
2683         assert!(!is_utf16_bidi(&[
2684             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66,
2685             0x67, 0x68, 0x69,
2686         ]));
2687         assert!(!is_utf16_bidi(&[
2688             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2689             0x67, 0x68, 0x69,
2690         ]));
2691         assert!(is_utf16_bidi(&[
2692             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66,
2693             0x67, 0x68, 0x69,
2694         ]));
2695         assert!(is_utf16_bidi(&[
2696             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66,
2697             0x67, 0x68, 0x69,
2698         ]));
2699         assert!(is_utf16_bidi(&[
2700             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66,
2701             0x67, 0x68, 0x69,
2702         ]));
2703         assert!(is_utf16_bidi(&[
2704             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66,
2705             0x67, 0x68, 0x69,
2706         ]));
2707         assert!(is_utf16_bidi(&[
2708             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66,
2709             0x67, 0x68, 0x69,
2710         ]));
2711         assert!(is_utf16_bidi(&[
2712             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2713             0x67, 0x68, 0x69,
2714         ]));
2715         assert!(is_utf16_bidi(&[
2716             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66,
2717             0x67, 0x68, 0x69,
2718         ]));
2719         assert!(is_utf16_bidi(&[
2720             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66,
2721             0x67, 0x68, 0x69,
2722         ]));
2723         assert!(is_utf16_bidi(&[
2724             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66,
2725             0x67, 0x68, 0x69,
2726         ]));
2727         assert!(is_utf16_bidi(&[
2728             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66,
2729             0x67, 0x68, 0x69,
2730         ]));
2731         assert!(is_utf16_bidi(&[
2732             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66,
2733             0x67, 0x68, 0x69,
2734         ]));
2735         assert!(is_utf16_bidi(&[
2736             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66,
2737             0x67, 0x68, 0x69,
2738         ]));
2739         assert!(is_utf16_bidi(&[
2740             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66,
2741             0x67, 0x68, 0x69,
2742         ]));
2743         assert!(is_utf16_bidi(&[
2744             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66,
2745             0x67, 0x68, 0x69,
2746         ]));
2747         assert!(is_utf16_bidi(&[
2748             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66,
2749             0x67, 0x68, 0x69,
2750         ]));
2751         assert!(is_utf16_bidi(&[
2752             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66,
2753             0x67, 0x68, 0x69,
2754         ]));
2755 
2756         assert!(is_utf16_bidi(&[
2757             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
2758             0x66, 0x67, 0x68, 0x69,
2759         ]));
2760     }
2761 
2762     #[test]
test_check_str_for_latin1_and_bidi()2763     fn test_check_str_for_latin1_and_bidi() {
2764         assert_ne!(
2765             check_str_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop"),
2766             Latin1Bidi::Bidi
2767         );
2768         assert_ne!(
2769             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"),
2770             Latin1Bidi::Bidi
2771         );
2772         assert_ne!(
2773             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"),
2774             Latin1Bidi::Bidi
2775         );
2776         assert_ne!(
2777             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"),
2778             Latin1Bidi::Bidi
2779         );
2780         assert_ne!(
2781             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"),
2782             Latin1Bidi::Bidi
2783         );
2784         assert_ne!(
2785             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"),
2786             Latin1Bidi::Bidi
2787         );
2788         assert_ne!(
2789             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"),
2790             Latin1Bidi::Bidi
2791         );
2792         assert_eq!(
2793             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"),
2794             Latin1Bidi::Bidi
2795         );
2796         assert_eq!(
2797             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"),
2798             Latin1Bidi::Bidi
2799         );
2800         assert_eq!(
2801             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"),
2802             Latin1Bidi::Bidi
2803         );
2804         assert_eq!(
2805             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"),
2806             Latin1Bidi::Bidi
2807         );
2808         assert_eq!(
2809             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"),
2810             Latin1Bidi::Bidi
2811         );
2812         assert_eq!(
2813             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"),
2814             Latin1Bidi::Bidi
2815         );
2816         assert_eq!(
2817             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"),
2818             Latin1Bidi::Bidi
2819         );
2820         assert_eq!(
2821             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"),
2822             Latin1Bidi::Bidi
2823         );
2824         assert_eq!(
2825             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"),
2826             Latin1Bidi::Bidi
2827         );
2828         assert_eq!(
2829             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"),
2830             Latin1Bidi::Bidi
2831         );
2832         assert_eq!(
2833             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"),
2834             Latin1Bidi::Bidi
2835         );
2836         assert_eq!(
2837             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"),
2838             Latin1Bidi::Bidi
2839         );
2840         assert_eq!(
2841             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"),
2842             Latin1Bidi::Bidi
2843         );
2844         assert_eq!(
2845             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"),
2846             Latin1Bidi::Bidi
2847         );
2848         assert_eq!(
2849             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"),
2850             Latin1Bidi::Bidi
2851         );
2852     }
2853 
2854     #[test]
test_check_utf8_for_latin1_and_bidi()2855     fn test_check_utf8_for_latin1_and_bidi() {
2856         assert_ne!(
2857             check_utf8_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop".as_bytes()),
2858             Latin1Bidi::Bidi
2859         );
2860         assert_ne!(
2861             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()),
2862             Latin1Bidi::Bidi
2863         );
2864         assert_ne!(
2865             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()),
2866             Latin1Bidi::Bidi
2867         );
2868         assert_ne!(
2869             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()),
2870             Latin1Bidi::Bidi
2871         );
2872         assert_ne!(
2873             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()),
2874             Latin1Bidi::Bidi
2875         );
2876         assert_ne!(
2877             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()),
2878             Latin1Bidi::Bidi
2879         );
2880         assert_ne!(
2881             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()),
2882             Latin1Bidi::Bidi
2883         );
2884         assert_eq!(
2885             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()),
2886             Latin1Bidi::Bidi
2887         );
2888         assert_eq!(
2889             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()),
2890             Latin1Bidi::Bidi
2891         );
2892         assert_eq!(
2893             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()),
2894             Latin1Bidi::Bidi
2895         );
2896         assert_eq!(
2897             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()),
2898             Latin1Bidi::Bidi
2899         );
2900         assert_eq!(
2901             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()),
2902             Latin1Bidi::Bidi
2903         );
2904         assert_eq!(
2905             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()),
2906             Latin1Bidi::Bidi
2907         );
2908         assert_eq!(
2909             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()),
2910             Latin1Bidi::Bidi
2911         );
2912         assert_eq!(
2913             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()),
2914             Latin1Bidi::Bidi
2915         );
2916         assert_eq!(
2917             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()),
2918             Latin1Bidi::Bidi
2919         );
2920         assert_eq!(
2921             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()),
2922             Latin1Bidi::Bidi
2923         );
2924         assert_eq!(
2925             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()),
2926             Latin1Bidi::Bidi
2927         );
2928         assert_eq!(
2929             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()),
2930             Latin1Bidi::Bidi
2931         );
2932         assert_eq!(
2933             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()),
2934             Latin1Bidi::Bidi
2935         );
2936         assert_eq!(
2937             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()),
2938             Latin1Bidi::Bidi
2939         );
2940         assert_eq!(
2941             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()),
2942             Latin1Bidi::Bidi
2943         );
2944     }
2945 
2946     #[test]
test_check_utf16_for_latin1_and_bidi()2947     fn test_check_utf16_for_latin1_and_bidi() {
2948         assert_ne!(
2949             check_utf16_for_latin1_and_bidi(&[
2950                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65,
2951                 0x66, 0x67, 0x68, 0x69,
2952             ]),
2953             Latin1Bidi::Bidi
2954         );
2955         assert_ne!(
2956             check_utf16_for_latin1_and_bidi(&[
2957                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65,
2958                 0x66, 0x67, 0x68, 0x69,
2959             ]),
2960             Latin1Bidi::Bidi
2961         );
2962         assert_ne!(
2963             check_utf16_for_latin1_and_bidi(&[
2964                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65,
2965                 0x66, 0x67, 0x68, 0x69,
2966             ]),
2967             Latin1Bidi::Bidi
2968         );
2969         assert_ne!(
2970             check_utf16_for_latin1_and_bidi(&[
2971                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65,
2972                 0x66, 0x67, 0x68, 0x69,
2973             ]),
2974             Latin1Bidi::Bidi
2975         );
2976         assert_ne!(
2977             check_utf16_for_latin1_and_bidi(&[
2978                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65,
2979                 0x66, 0x67, 0x68, 0x69,
2980             ]),
2981             Latin1Bidi::Bidi
2982         );
2983         assert_ne!(
2984             check_utf16_for_latin1_and_bidi(&[
2985                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65,
2986                 0x66, 0x67, 0x68, 0x69,
2987             ]),
2988             Latin1Bidi::Bidi
2989         );
2990         assert_ne!(
2991             check_utf16_for_latin1_and_bidi(&[
2992                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65,
2993                 0x66, 0x67, 0x68, 0x69,
2994             ]),
2995             Latin1Bidi::Bidi
2996         );
2997         assert_eq!(
2998             check_utf16_for_latin1_and_bidi(&[
2999                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65,
3000                 0x66, 0x67, 0x68, 0x69,
3001             ]),
3002             Latin1Bidi::Bidi
3003         );
3004         assert_eq!(
3005             check_utf16_for_latin1_and_bidi(&[
3006                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65,
3007                 0x66, 0x67, 0x68, 0x69,
3008             ]),
3009             Latin1Bidi::Bidi
3010         );
3011         assert_eq!(
3012             check_utf16_for_latin1_and_bidi(&[
3013                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65,
3014                 0x66, 0x67, 0x68, 0x69,
3015             ]),
3016             Latin1Bidi::Bidi
3017         );
3018         assert_eq!(
3019             check_utf16_for_latin1_and_bidi(&[
3020                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65,
3021                 0x66, 0x67, 0x68, 0x69,
3022             ]),
3023             Latin1Bidi::Bidi
3024         );
3025         assert_eq!(
3026             check_utf16_for_latin1_and_bidi(&[
3027                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65,
3028                 0x66, 0x67, 0x68, 0x69,
3029             ]),
3030             Latin1Bidi::Bidi
3031         );
3032         assert_eq!(
3033             check_utf16_for_latin1_and_bidi(&[
3034                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65,
3035                 0x66, 0x67, 0x68, 0x69,
3036             ]),
3037             Latin1Bidi::Bidi
3038         );
3039         assert_eq!(
3040             check_utf16_for_latin1_and_bidi(&[
3041                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65,
3042                 0x66, 0x67, 0x68, 0x69,
3043             ]),
3044             Latin1Bidi::Bidi
3045         );
3046         assert_eq!(
3047             check_utf16_for_latin1_and_bidi(&[
3048                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65,
3049                 0x66, 0x67, 0x68, 0x69,
3050             ]),
3051             Latin1Bidi::Bidi
3052         );
3053         assert_eq!(
3054             check_utf16_for_latin1_and_bidi(&[
3055                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65,
3056                 0x66, 0x67, 0x68, 0x69,
3057             ]),
3058             Latin1Bidi::Bidi
3059         );
3060         assert_eq!(
3061             check_utf16_for_latin1_and_bidi(&[
3062                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65,
3063                 0x66, 0x67, 0x68, 0x69,
3064             ]),
3065             Latin1Bidi::Bidi
3066         );
3067         assert_eq!(
3068             check_utf16_for_latin1_and_bidi(&[
3069                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65,
3070                 0x66, 0x67, 0x68, 0x69,
3071             ]),
3072             Latin1Bidi::Bidi
3073         );
3074         assert_eq!(
3075             check_utf16_for_latin1_and_bidi(&[
3076                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65,
3077                 0x66, 0x67, 0x68, 0x69,
3078             ]),
3079             Latin1Bidi::Bidi
3080         );
3081         assert_eq!(
3082             check_utf16_for_latin1_and_bidi(&[
3083                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65,
3084                 0x66, 0x67, 0x68, 0x69,
3085             ]),
3086             Latin1Bidi::Bidi
3087         );
3088         assert_eq!(
3089             check_utf16_for_latin1_and_bidi(&[
3090                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65,
3091                 0x66, 0x67, 0x68, 0x69,
3092             ]),
3093             Latin1Bidi::Bidi
3094         );
3095         assert_eq!(
3096             check_utf16_for_latin1_and_bidi(&[
3097                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65,
3098                 0x66, 0x67, 0x68, 0x69,
3099             ]),
3100             Latin1Bidi::Bidi
3101         );
3102         assert_eq!(
3103             check_utf16_for_latin1_and_bidi(&[
3104                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65,
3105                 0x66, 0x67, 0x68, 0x69,
3106             ]),
3107             Latin1Bidi::Bidi
3108         );
3109 
3110         assert_eq!(
3111             check_utf16_for_latin1_and_bidi(&[
3112                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64,
3113                 0x65, 0x66, 0x67, 0x68, 0x69,
3114             ]),
3115             Latin1Bidi::Bidi
3116         );
3117     }
3118 
3119     #[inline(always)]
reference_is_char_bidi(c: char) -> bool3120     pub fn reference_is_char_bidi(c: char) -> bool {
3121         match c {
3122             '\u{0590}'..='\u{08FF}'
3123             | '\u{FB1D}'..='\u{FDFF}'
3124             | '\u{FE70}'..='\u{FEFE}'
3125             | '\u{10800}'..='\u{10FFF}'
3126             | '\u{1E800}'..='\u{1EFFF}'
3127             | '\u{200F}'
3128             | '\u{202B}'
3129             | '\u{202E}'
3130             | '\u{2067}' => true,
3131             _ => false,
3132         }
3133     }
3134 
3135     #[inline(always)]
reference_is_utf16_code_unit_bidi(u: u16) -> bool3136     pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
3137         match u {
3138             0x0590..=0x08FF
3139             | 0xFB1D..=0xFDFF
3140             | 0xFE70..=0xFEFE
3141             | 0xD802
3142             | 0xD803
3143             | 0xD83A
3144             | 0xD83B
3145             | 0x200F
3146             | 0x202B
3147             | 0x202E
3148             | 0x2067 => true,
3149             _ => false,
3150         }
3151     }
3152 
3153     #[test]
3154     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_char_bidi_thoroughly()3155     fn test_is_char_bidi_thoroughly() {
3156         for i in 0..0xD800u32 {
3157             let c: char = ::core::char::from_u32(i).unwrap();
3158             assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3159         }
3160         for i in 0xE000..0x110000u32 {
3161             let c: char = ::core::char::from_u32(i).unwrap();
3162             assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3163         }
3164     }
3165 
3166     #[test]
3167     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_utf16_code_unit_bidi_thoroughly()3168     fn test_is_utf16_code_unit_bidi_thoroughly() {
3169         for i in 0..0x10000u32 {
3170             let u = i as u16;
3171             assert_eq!(
3172                 is_utf16_code_unit_bidi(u),
3173                 reference_is_utf16_code_unit_bidi(u)
3174             );
3175         }
3176     }
3177 
3178     #[test]
3179     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_str_bidi_thoroughly()3180     fn test_is_str_bidi_thoroughly() {
3181         let mut buf = [0; 4];
3182         for i in 0..0xD800u32 {
3183             let c: char = ::core::char::from_u32(i).unwrap();
3184             assert_eq!(
3185                 is_str_bidi(c.encode_utf8(&mut buf[..])),
3186                 reference_is_char_bidi(c)
3187             );
3188         }
3189         for i in 0xE000..0x110000u32 {
3190             let c: char = ::core::char::from_u32(i).unwrap();
3191             assert_eq!(
3192                 is_str_bidi(c.encode_utf8(&mut buf[..])),
3193                 reference_is_char_bidi(c)
3194             );
3195         }
3196     }
3197 
3198     #[test]
3199     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_utf8_bidi_thoroughly()3200     fn test_is_utf8_bidi_thoroughly() {
3201         let mut buf = [0; 8];
3202         for i in 0..0xD800u32 {
3203             let c: char = ::core::char::from_u32(i).unwrap();
3204             let expect = reference_is_char_bidi(c);
3205             {
3206                 let len = {
3207                     let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3208                     assert_eq!(is_utf8_bidi(bytes), expect);
3209                     bytes.len()
3210                 };
3211                 {
3212                     let tail = &mut buf[len..];
3213                     for b in tail.iter_mut() {
3214                         *b = 0;
3215                     }
3216                 }
3217             }
3218             assert_eq!(is_utf8_bidi(&buf[..]), expect);
3219         }
3220         for i in 0xE000..0x110000u32 {
3221             let c: char = ::core::char::from_u32(i).unwrap();
3222             let expect = reference_is_char_bidi(c);
3223             {
3224                 let len = {
3225                     let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3226                     assert_eq!(is_utf8_bidi(bytes), expect);
3227                     bytes.len()
3228                 };
3229                 {
3230                     let tail = &mut buf[len..];
3231                     for b in tail.iter_mut() {
3232                         *b = 0;
3233                     }
3234                 }
3235             }
3236             assert_eq!(is_utf8_bidi(&buf[..]), expect);
3237         }
3238     }
3239 
3240     #[test]
3241     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_utf16_bidi_thoroughly()3242     fn test_is_utf16_bidi_thoroughly() {
3243         let mut buf = [0; 32];
3244         for i in 0..0x10000u32 {
3245             let u = i as u16;
3246             buf[15] = u;
3247             assert_eq!(
3248                 is_utf16_bidi(&buf[..]),
3249                 reference_is_utf16_code_unit_bidi(u)
3250             );
3251         }
3252     }
3253 
3254     #[test]
test_is_utf8_bidi_edge_cases()3255     fn test_is_utf8_bidi_edge_cases() {
3256         assert!(!is_utf8_bidi(b"\xD5\xBF\x61"));
3257         assert!(!is_utf8_bidi(b"\xD6\x80\x61"));
3258         assert!(!is_utf8_bidi(b"abc"));
3259         assert!(is_utf8_bidi(b"\xD5\xBF\xC2"));
3260         assert!(is_utf8_bidi(b"\xD6\x80\xC2"));
3261         assert!(is_utf8_bidi(b"ab\xC2"));
3262     }
3263 
3264     #[test]
test_decode_latin1()3265     fn test_decode_latin1() {
3266         match decode_latin1(b"ab") {
3267             Cow::Borrowed(s) => {
3268                 assert_eq!(s, "ab");
3269             }
3270             Cow::Owned(_) => {
3271                 unreachable!("Should have borrowed");
3272             }
3273         }
3274         assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
3275     }
3276 
3277     #[test]
test_encode_latin1_lossy()3278     fn test_encode_latin1_lossy() {
3279         match encode_latin1_lossy("ab") {
3280             Cow::Borrowed(s) => {
3281                 assert_eq!(s, b"ab");
3282             }
3283             Cow::Owned(_) => {
3284                 unreachable!("Should have borrowed");
3285             }
3286         }
3287         assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
3288     }
3289 
3290     #[test]
test_convert_utf8_to_utf16_without_replacement()3291     fn test_convert_utf8_to_utf16_without_replacement() {
3292         let mut buf = [0u16; 5];
3293         assert_eq!(
3294             convert_utf8_to_utf16_without_replacement(b"ab", &mut buf[..2]),
3295             Some(2)
3296         );
3297         assert_eq!(buf[0], u16::from(b'a'));
3298         assert_eq!(buf[1], u16::from(b'b'));
3299         assert_eq!(buf[2], 0);
3300         assert_eq!(
3301             convert_utf8_to_utf16_without_replacement(b"\xC3\xA4c", &mut buf[..3]),
3302             Some(2)
3303         );
3304         assert_eq!(buf[0], 0xE4);
3305         assert_eq!(buf[1], u16::from(b'c'));
3306         assert_eq!(buf[2], 0);
3307         assert_eq!(
3308             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83", &mut buf[..3]),
3309             Some(1)
3310         );
3311         assert_eq!(buf[0], 0x2603);
3312         assert_eq!(buf[1], u16::from(b'c'));
3313         assert_eq!(buf[2], 0);
3314         assert_eq!(
3315             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83d", &mut buf[..4]),
3316             Some(2)
3317         );
3318         assert_eq!(buf[0], 0x2603);
3319         assert_eq!(buf[1], u16::from(b'd'));
3320         assert_eq!(buf[2], 0);
3321         assert_eq!(
3322             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83\xC3\xA4", &mut buf[..5]),
3323             Some(2)
3324         );
3325         assert_eq!(buf[0], 0x2603);
3326         assert_eq!(buf[1], 0xE4);
3327         assert_eq!(buf[2], 0);
3328         assert_eq!(
3329             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8E", &mut buf[..4]),
3330             Some(2)
3331         );
3332         assert_eq!(buf[0], 0xD83D);
3333         assert_eq!(buf[1], 0xDCCE);
3334         assert_eq!(buf[2], 0);
3335         assert_eq!(
3336             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8Ee", &mut buf[..5]),
3337             Some(3)
3338         );
3339         assert_eq!(buf[0], 0xD83D);
3340         assert_eq!(buf[1], 0xDCCE);
3341         assert_eq!(buf[2], u16::from(b'e'));
3342         assert_eq!(
3343             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
3344             None
3345         );
3346     }
3347 }
3348