1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! Functions for converting between different in-RAM representations of text
11 //! and for quickly checking if the Unicode Bidirectional Algorithm can be
12 //! avoided.
13 //!
14 //! By using slices for output, the functions here seek to enable by-register
15 //! (ALU register or SIMD register as available) operations in order to
16 //! outperform iterator-based conversions available in the Rust standard
17 //! library.
18 //!
19 //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20 //! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21 //! in-memory encoding is sometimes used as a storage optimization of text
22 //! when UTF-16 indexing and length semantics are exposed.
23 //!
//! The FFI bindings for this module are in the
25 //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26 
27 use std::borrow::Cow;
28 
29 use super::in_inclusive_range16;
30 use super::in_inclusive_range32;
31 use super::in_inclusive_range8;
32 use super::in_range16;
33 use super::in_range32;
34 use super::DecoderResult;
35 use crate::ascii::*;
36 use crate::utf_8::*;
37 
/// Forwards its arguments to `debug_assert!`, except that the assertion is
/// skipped when the crate is compiled with `cfg(fuzzing)`.
macro_rules! non_fuzz_debug_assert {
    ($($tokens:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($tokens)*);
        }
    };
}
41 
// Branch-prediction hints. With the `simd-accel` feature the compiler
// intrinsics are imported; otherwise functions with the same names and the
// same `unsafe fn(bool) -> bool` shape are defined locally.
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::std::intrinsics::likely;
        use ::std::intrinsics::unlikely;
    } else {
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        // This fallback is the identity function: it returns `b` unchanged.
        unsafe fn likely(b: bool) -> bool {
            b
        }
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        // This fallback is the identity function: it returns `b` unchanged.
        unsafe fn unlikely(b: bool) -> bool {
            b
        }
    }
}
59 
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The type is `#[repr(C)]` and every variant has an explicit discriminant
/// (0, 1, 2). It is also `#[must_use]`, so discarding a value of this type
/// produces a compiler warning.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
75 
// Mask for testing a `usize` word made of packed `u16` code units: the high
// byte of every 16-bit lane is set, so `word & LATIN1_MASK != 0` means that at
// least one lane holds a value above 0xFF.
//
// `as` truncates, so works on 32-bit, too.
//
// Only the non-SIMD code paths in this file reference the constant, hence
// `allow(dead_code)`.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
79 
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff
// every unit of `buffer` is below `$bound`. Once the read position has been
// aligned, units are examined one `usize` word at a time.
//
// * `$unit`  - element type of the slice (`u8` or `u16` at the call sites in
//   this file).
// * `$bound` - first value that fails the check. The OR-accumulation below
//   relies on it being a power of two (0x80 and 0x100 at the call sites).
// * `$mask`  - `usize` constant AND-ed with accumulated words; it is expected
//   to select, in every unit-sized lane, the bits that are set only in units
//   `>= $bound` (e.g. `LATIN1_MASK` for `0x100`).
//
// An empty buffer yields `true`.
//
// NOTE(review): `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK` are not defined in
// this part of the file; the code presumes they are the size of `usize` in
// bytes and that value minus one - confirm against `crate::ascii`.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index of the next unit to examine.
            let mut offset = 0usize;
            // Bitwise OR of everything examined so far that has not already
            // been checked (single units land in the lowest lane).
            let mut accu = 0usize;
            let unit_size = ::std::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only bother with word reads if at least one word fits.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units between `src` and the next word boundary
                // (zero if `src` is already on one).
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a whole word remains after the unaligned prefix.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Unit-by-unit over the unaligned prefix. The first
                        // iteration is peeled out of the `while` loop.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // `accu` holds the OR of single units here; with a
                        // power-of-two `$bound`, the OR reaches `$bound` iff
                        // some unit did.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest `offset` from which a whole word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    // Unrolled loop: four words per iteration with an early exit.
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Largest `offset` from which four whole words can still be read.
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            // SAFETY (reviewer note): `offset + 4 words <= len`
                            // holds on entry (checked above) and on every later
                            // iteration (the loop exits once
                            // `offset > len_minus_unroll`), so the four reads
                            // stay inside `buffer`. `offset` is the aligned
                            // position plus a whole number of words, so the
                            // reads are word-aligned provided the NOTE on
                            // `ALU_ALIGNMENT` above holds.
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole words (fewer than four), accumulated
                    // without an early exit.
                    while offset <= len_minus_stride {
                        // SAFETY (reviewer note): `offset <= len_minus_stride`
                        // means one whole word fits; alignment as above.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Trailing units that do not fill a word (or the whole buffer if
            // the word path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            // One final test covers every word and unit OR-ed into `accu`.
            accu & $mask == 0
        }
    };
}
152 
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff
// every unit of `buffer` is below `$bound`. Once the read position has been
// aligned, units are examined one SIMD vector at a time.
//
// * `$unit`    - element type of the slice.
// * `$splat`   - expression producing the initial (all-zero at the call sites
//   in this file) vector accumulator.
// * `$simd_ty` - vector type that is loaded from the buffer.
// * `$bound`   - first value that fails the check. The OR-accumulation of
//   single units relies on it being a power of two.
// * `$func`    - predicate on `$simd_ty`; the generated code returns `false`
//   as soon as `$func` returns `false` for an OR-ed vector.
//
// An empty buffer yields `true`.
//
// NOTE(review): `SIMD_STRIDE_SIZE` is not defined in this part of the file;
// the code presumes it is the size in bytes of `$simd_ty` - confirm.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index of the next unit to examine.
            let mut offset = 0usize;
            // Bitwise OR of the units examined one at a time.
            let mut accu = 0usize;
            let unit_size = ::std::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only bother with vector loads if at least one stride fits.
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units between `src` and the next `SIMD_ALIGNMENT`
                // boundary (zero if `src` is already on one).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a whole stride remains after the unaligned prefix.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Unit-by-unit over the unaligned prefix. The first
                        // iteration is peeled out of the `while` loop.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // With a power-of-two `$bound`, the OR of the units
                        // reaches `$bound` iff some unit did.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest `offset` from which a whole stride can still be read.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    // Unrolled loop: four vectors per iteration with an early exit.
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Largest `offset` from which four whole strides can still be read.
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            // SAFETY (reviewer note): `offset + 4 strides <= len`
                            // holds on entry (checked above) and on every later
                            // iteration (the loop exits once
                            // `offset > len_minus_unroll`), so the four loads
                            // stay inside `buffer`. `offset` is the aligned
                            // position plus a whole number of strides.
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides (fewer than four), OR-ed into a
                    // vector accumulator and tested once after the loop.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // SAFETY (reviewer note): `offset <= len_minus_stride`
                        // means one whole stride fits; alignment as above.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Trailing units that do not fill a stride (or the whole buffer if
            // the vector path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            // `accu` only ever holds single units, so a plain comparison works.
            accu < $bound
        }
    };
}
230 
// Instantiates `is_ascii_impl`, `is_basic_latin_impl` and
// `is_utf16_latin1_impl`, and defines `utf16_valid_up_to_impl`, either with
// SIMD (first branch) or with plain ALU code (second branch).
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Byte alignment required before vector loads are attempted.
        const SIMD_ALIGNMENT: usize = 16;

        // `SIMD_ALIGNMENT - 1`: extracts the misalignment of an address.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the length, in code units, of the longest prefix of
        // `buffer` that is valid UTF-16 (as judged by `utf16_valid_up_to_alu`
        // on the parts that contain surrogates).
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::std::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            // Each iteration of the outer loop (re-)establishes alignment and
            // then runs the aligned inner loop. Breaking out of it hands the
            // rest of the buffer to the scalar code at the bottom.
            'outer: loop {
                // Code units between the current position and the next
                // `SIMD_ALIGNMENT` boundary.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                                        SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: a whole stride must remain.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    // One unit past the alignment boundary is included in the
                    // scalar check so that a high surrogate just before the
                    // boundary can be matched with its low surrogate.
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    // An error inside the unaligned prefix ends the scan.
                    if up_to < until_alignment {
                        return offset + up_to;
                    }
                    // The extra unit completed a pair that straddles the
                    // boundary: skip it and recompute the alignment.
                    if last_valid_low {
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                // Largest `offset` from which a whole stride can still be read.
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    // SAFETY (reviewer note): every path into this loop has
                    // established that a whole stride fits at `offset` and
                    // that `offset` sits on a `SIMD_ALIGNMENT` boundary.
                    // `contains_surrogates` is defined elsewhere; presumably
                    // it reports whether any lane is a surrogate - confirm.
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        // The stride ends the buffer: let the scalar tail
                        // below examine it.
                        if offset_plus_stride == len {
                            break 'outer;
                        }
                        // Re-examine the stride plus one extra unit with
                        // scalar code, for the same straddling-pair reason as
                        // above.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            return offset + up_to;
                        }
                        // A pair straddles the stride boundary: skip the low
                        // surrogate, which breaks alignment, so start over in
                        // the outer loop.
                        if last_valid_low {
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar check of whatever is left.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Non-SIMD variant: delegates to the scalar scan and drops the
        // "ended on a low surrogate" flag.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
316 
/// Scans `buffer` one code unit at a time and returns, as the first value,
/// the index of the first unpaired surrogate, or `buffer.len()` if there is
/// none.
///
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let mut pos = 0usize;
    // Whether the most recently consumed code unit was the low half of a pair.
    let mut ended_on_pair = false;
    while let Some(&unit) = buffer.get(pos) {
        match unit {
            // High surrogate: valid only when directly followed by a low one.
            0xD800..=0xDBFF => match buffer.get(pos + 1) {
                Some(second) if (0xDC00..=0xDFFF).contains(second) => {
                    pos += 2;
                    ended_on_pair = true;
                }
                // End of input or a non-low-surrogate follower: unpaired.
                _ => return (pos, false),
            },
            // Low surrogate without a preceding high surrogate: unpaired.
            0xDC00..=0xDFFF => return (pos, false),
            // Any other code unit stands on its own.
            _ => {
                pos += 1;
                ended_on_pair = false;
            }
        }
    }
    (pos, ended_on_pair)
}
362 
// `is_str_latin1_impl` returns `None` when no byte above 0xC3 occurs in the
// UTF-8 of `buffer`, and `Some(index)` otherwise. Because the input is a
// `&str` (valid UTF-8), a byte above 0xC3 is a lead byte of a code point above
// U+00FF, and continuation bytes (0x80..=0xBF) never exceed 0xC3.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                // Bytes between `src` and the next `SIMD_ALIGNMENT` boundary.
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK;
                // Proceed only if a whole stride remains after the unaligned prefix.
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    // Byte-by-byte over the unaligned prefix.
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest `offset` from which a whole stride can still be read.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        // SAFETY (reviewer note): a whole stride fits at
                        // `offset` (checked before the loop and by the exit
                        // condition below) and `offset` is on a
                        // `SIMD_ALIGNMENT` boundary.
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // Skip continuation bytes at the start of the
                            // failing stride so that the reported index is not
                            // in the middle of a character.
                            // NOTE(review): the index reported here is the
                            // first non-continuation byte of the stride, which
                            // may precede the offending byte; the only caller
                            // visible in this part of the file tests
                            // `is_none()` only.
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Byte-by-byte over the tail (or the whole input if the vector
            // path was not taken).
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            // Unexamined remainder of the input.
            let mut bytes = buffer.as_bytes();
            // Number of bytes already consumed from the front of `buffer`.
            let mut total = 0;
            loop {
                // `validate_ascii` is defined elsewhere; as used here it
                // yields the first non-ASCII byte and its offset, or `None`
                // when the rest is ASCII.
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    // The lead byte is at most 0xC3, so (the input being valid
                    // UTF-8) it starts a two-byte sequence: skip both bytes.
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}
425 
426 #[inline(always)]
is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize>427 fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
428     let mut bytes = buffer;
429     let mut total = 0;
430     loop {
431         if let Some((byte, offset)) = validate_ascii(bytes) {
432             total += offset;
433             if in_inclusive_range8(byte, 0xC2, 0xC3) {
434                 let next = offset + 1;
435                 if next == bytes.len() {
436                     return Some(total);
437                 }
438                 if bytes[next] & 0xC0 != 0x80 {
439                     return Some(total);
440                 }
441                 bytes = &bytes[offset + 2..];
442                 total += 2;
443             } else {
444                 return Some(total);
445             }
446         } else {
447             return None;
448         }
449     }
450 }
451 
// `is_utf16_bidi_impl` returns `true` iff at least one code unit of `buffer`
// is flagged by `is_utf16_code_unit_bidi` (or, on the vector path, a stride is
// flagged by `is_u16x8_bidi`). Both predicates are defined elsewhere.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            let mut offset = 0usize;
            let len = buffer.len();
            // `SIMD_STRIDE_SIZE / 2` is the number of `u16` units per stride.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units between `src` and the next `SIMD_ALIGNMENT` boundary.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                // Proceed only if a whole stride remains after the unaligned prefix.
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    // Unit-by-unit over the unaligned prefix.
                    while until_alignment != 0 {
                        if is_utf16_code_unit_bidi(buffer[offset]) {
                            return true;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest `offset` from which a whole stride can still be read.
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        // SAFETY (reviewer note): a whole stride fits at
                        // `offset` (checked before the loop and by the exit
                        // condition below) and `offset` is on a
                        // `SIMD_ALIGNMENT` boundary.
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
                            return true;
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Unit-by-unit over the tail (or the whole input if the vector
            // path was not taken).
            for &u in &buffer[offset..] {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    } else {
        // Non-SIMD variant: plain scan, returning at the first flagged unit.
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            for &u in buffer {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    }
}
501 
// `check_utf16_for_latin1_and_bidi_impl` classifies `buffer` in one pass:
// it scans for a code unit above 0xFF and, once one is found, switches to
// scanning the rest of the buffer for bidi code units only.
//
// * `Latin1Bidi::Latin1`      - no code unit above 0xFF was found.
// * `Latin1Bidi::Bidi`        - a bidi code unit was found at or after the
//   first non-Latin1 position.
// * `Latin1Bidi::LeftToRight` - otherwise.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `SIMD_STRIDE_SIZE / 2` is the number of `u16` units per stride.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units between `src` and the next `SIMD_ALIGNMENT` boundary.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                // Proceed only if a whole stride remains after the unaligned prefix.
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    // Unit-by-unit over the unaligned prefix.
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment is recomputed,
                            // but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest `offset` from which a whole stride can still be read.
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        // SAFETY (reviewer note): a whole stride fits at
                        // `offset` (checked before the loop and by the exit
                        // condition below) and `offset` is on a
                        // `SIMD_ALIGNMENT` boundary.
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // First non-Latin1 stride: from here on only bidi
                            // matters. This inner loop never returns to the
                            // Latin1 scan; it always ends with a `return`.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // No whole stride left: finish the bidi
                                    // scan unit by unit.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                // SAFETY (reviewer note): `offset <=
                                // len_minus_stride` was just established and
                                // `offset` advanced by a whole stride.
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input if the vector path was not taken):
            // Latin1 scan that switches to a bidi scan at the first unit
            // above 0xFF.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        // Bidi-only scan over the rest of the iterator,
                        // starting with the unit that just failed the Latin1
                        // test.
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `ALU_ALIGNMENT / 2` is the number of `u16` units per word read.
            // NOTE(review): `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK` are not
            // defined in this part of the file; presumably the size of `usize`
            // in bytes and that value minus one - confirm.
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                // Code units between `src` and the next word boundary.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                                           ALU_ALIGNMENT_MASK) / 2;
                // Proceed only if a whole word remains after the unaligned prefix.
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    // Unit-by-unit over the unaligned prefix.
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // First non-Latin1 unit: the rest only needs the
                            // bidi scan.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest `offset` from which a whole word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        // SAFETY (reviewer note): a whole word fits at
                        // `offset` (checked before the loop and by the exit
                        // condition below); alignment holds provided the NOTE
                        // above holds. A nonzero result means some 16-bit
                        // lane of the word is above 0xFF.
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input if the word path was not taken):
            // Latin1 scan that switches to a bidi scan at the first unit
            // above 0xFF.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        // Bidi-only scan over the rest of the iterator,
                        // starting with the unit that just failed the Latin1
                        // test.
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}
632 
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` iff every byte of `buffer` is below 0x80. An empty buffer
/// is all-ASCII.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
640 
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` iff every code unit of `buffer` is below 0x80. An empty
/// buffer is all-Basic Latin.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
649 
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation reports the position of the first problem; only its
    // absence matters here.
    is_utf8_latin1_impl(buffer).is_none()
}
658 
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation reports a position when a code point above U+00FF is
    // present; only its absence matters here.
    is_str_latin1_impl(buffer).is_none()
}
667 
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` iff every code unit of `buffer` is below 0x100. An empty
/// buffer passes the check.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
676 
677 /// Checks whether a potentially-invalid UTF-8 buffer contains code points
678 /// that trigger right-to-left processing.
679 ///
680 /// The check is done on a Unicode block basis without regard to assigned
681 /// vs. unassigned code points in the block. Hebrew presentation forms in
682 /// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
684 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
685 /// for. Control characters that are technically bidi controls but do not
686 /// cause right-to-left behavior without the presence of right-to-left
687 /// characters or right-to-left controls are not checked for. As a special
688 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
689 ///
690 /// Returns `true` if the input is invalid UTF-8 or the input contains an
691 /// RTL character. Returns `false` if the input is valid UTF-8 and contains
692 /// no RTL characters.
693 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
694 #[inline]
is_utf8_bidi(buffer: &[u8]) -> bool695 pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
696     // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
697     // than UTF-8 validation followed by `is_str_bidi()` for German,
698     // Russian and Japanese. However, this is considerably slower for Thai.
699     // Chances are that the compiler makes some branch predictions that are
700     // unfortunate for Thai. Not spending the time to manually optimize
701     // further at this time, since it's unclear if this variant even has
702     // use cases. However, this is worth revisiting once Rust gets the
703     // ability to annotate relative priorities of match arms.
704 
705     // U+058F: D6 8F
706     // U+0590: D6 90
707     // U+08FF: E0 A3 BF
708     // U+0900: E0 A4 80
709     //
710     // U+200F: E2 80 8F
711     // U+202B: E2 80 AB
712     // U+202E: E2 80 AE
713     // U+2067: E2 81 A7
714     //
715     // U+FB1C: EF AC 9C
716     // U+FB1D: EF AC 9D
717     // U+FDFF: EF B7 BF
718     // U+FE00: EF B8 80
719     //
720     // U+FE6F: EF B9 AF
721     // U+FE70: EF B9 B0
722     // U+FEFE: EF BB BE
723     // U+FEFF: EF BB BF
724     //
725     // U+107FF: F0 90 9F BF
726     // U+10800: F0 90 A0 80
727     // U+10FFF: F0 90 BF BF
728     // U+11000: F0 91 80 80
729     //
730     // U+1E7FF: F0 9E 9F BF
731     // U+1E800: F0 9E A0 80
732     // U+1EFFF: F0 9E BF BF
733     // U+1F000: F0 9F 80 80
734     let mut src = buffer;
735     'outer: loop {
736         if let Some((mut byte, mut read)) = validate_ascii(src) {
737             // Check for the longest sequence to avoid checking twice for the
738             // multi-byte sequences.
739             if read + 4 <= src.len() {
740                 'inner: loop {
741                     // At this point, `byte` is not included in `read`.
742                     match byte {
743                         0..=0x7F => {
744                             // ASCII: go back to SIMD.
745                             read += 1;
746                             src = &src[read..];
747                             continue 'outer;
748                         }
749                         0xC2..=0xD5 => {
750                             // Two-byte
751                             let second = unsafe { *(src.get_unchecked(read + 1)) };
752                             if !in_inclusive_range8(second, 0x80, 0xBF) {
753                                 return true;
754                             }
755                             read += 2;
756                         }
757                         0xD6 => {
758                             // Two-byte
759                             let second = unsafe { *(src.get_unchecked(read + 1)) };
760                             if !in_inclusive_range8(second, 0x80, 0xBF) {
761                                 return true;
762                             }
763                             // XXX consider folding the above and below checks
764                             if second > 0x8F {
765                                 return true;
766                             }
767                             read += 2;
768                         }
769                         // two-byte starting with 0xD7 and above is bidi
770                         0xE1 | 0xE3..=0xEC | 0xEE => {
771                             // Three-byte normal
772                             let second = unsafe { *(src.get_unchecked(read + 1)) };
773                             let third = unsafe { *(src.get_unchecked(read + 2)) };
774                             if ((UTF8_DATA.table[usize::from(second)]
775                                 & unsafe {
776                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
777                                 })
778                                 | (third >> 6))
779                                 != 2
780                             {
781                                 return true;
782                             }
783                             read += 3;
784                         }
785                         0xE2 => {
786                             // Three-byte normal, potentially bidi
787                             let second = unsafe { *(src.get_unchecked(read + 1)) };
788                             let third = unsafe { *(src.get_unchecked(read + 2)) };
789                             if ((UTF8_DATA.table[usize::from(second)]
790                                 & unsafe {
791                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
792                                 })
793                                 | (third >> 6))
794                                 != 2
795                             {
796                                 return true;
797                             }
798                             if second == 0x80 {
799                                 if third == 0x8F || third == 0xAB || third == 0xAE {
800                                     return true;
801                                 }
802                             } else if second == 0x81 {
803                                 if third == 0xA7 {
804                                     return true;
805                                 }
806                             }
807                             read += 3;
808                         }
809                         0xEF => {
810                             // Three-byte normal, potentially bidi
811                             let second = unsafe { *(src.get_unchecked(read + 1)) };
812                             let third = unsafe { *(src.get_unchecked(read + 2)) };
813                             if ((UTF8_DATA.table[usize::from(second)]
814                                 & unsafe {
815                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
816                                 })
817                                 | (third >> 6))
818                                 != 2
819                             {
820                                 return true;
821                             }
822                             if in_inclusive_range8(second, 0xAC, 0xB7) {
823                                 if second == 0xAC {
824                                     if third > 0x9C {
825                                         return true;
826                                     }
827                                 } else {
828                                     return true;
829                                 }
830                             } else if in_inclusive_range8(second, 0xB9, 0xBB) {
831                                 if second == 0xB9 {
832                                     if third > 0xAF {
833                                         return true;
834                                     }
835                                 } else if second == 0xBB {
836                                     if third != 0xBF {
837                                         return true;
838                                     }
839                                 } else {
840                                     return true;
841                                 }
842                             }
843                             read += 3;
844                         }
845                         0xE0 => {
846                             // Three-byte special lower bound, potentially bidi
847                             let second = unsafe { *(src.get_unchecked(read + 1)) };
848                             let third = unsafe { *(src.get_unchecked(read + 2)) };
849                             if ((UTF8_DATA.table[usize::from(second)]
850                                 & unsafe {
851                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
852                                 })
853                                 | (third >> 6))
854                                 != 2
855                             {
856                                 return true;
857                             }
858                             // XXX can this be folded into the above validity check
859                             if second < 0xA4 {
860                                 return true;
861                             }
862                             read += 3;
863                         }
864                         0xED => {
865                             // Three-byte special upper bound
866                             let second = unsafe { *(src.get_unchecked(read + 1)) };
867                             let third = unsafe { *(src.get_unchecked(read + 2)) };
868                             if ((UTF8_DATA.table[usize::from(second)]
869                                 & unsafe {
870                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
871                                 })
872                                 | (third >> 6))
873                                 != 2
874                             {
875                                 return true;
876                             }
877                             read += 3;
878                         }
879                         0xF1..=0xF4 => {
880                             // Four-byte normal
881                             let second = unsafe { *(src.get_unchecked(read + 1)) };
882                             let third = unsafe { *(src.get_unchecked(read + 2)) };
883                             let fourth = unsafe { *(src.get_unchecked(read + 3)) };
884                             if (u16::from(
885                                 UTF8_DATA.table[usize::from(second)]
886                                     & unsafe {
887                                         *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
888                                     },
889                             ) | u16::from(third >> 6)
890                                 | (u16::from(fourth & 0xC0) << 2))
891                                 != 0x202
892                             {
893                                 return true;
894                             }
895                             read += 4;
896                         }
897                         0xF0 => {
898                             // Four-byte special lower bound, potentially bidi
899                             let second = unsafe { *(src.get_unchecked(read + 1)) };
900                             let third = unsafe { *(src.get_unchecked(read + 2)) };
901                             let fourth = unsafe { *(src.get_unchecked(read + 3)) };
902                             if (u16::from(
903                                 UTF8_DATA.table[usize::from(second)]
904                                     & unsafe {
905                                         *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
906                                     },
907                             ) | u16::from(third >> 6)
908                                 | (u16::from(fourth & 0xC0) << 2))
909                                 != 0x202
910                             {
911                                 return true;
912                             }
913                             if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
914                                 let third = src[read + 2];
915                                 if third >= 0xA0 {
916                                     return true;
917                                 }
918                             }
919                             read += 4;
920                         }
921                         _ => {
922                             // Invalid lead or bidi-only lead
923                             return true;
924                         }
925                     }
926                     if read + 4 > src.len() {
927                         if read == src.len() {
928                             return false;
929                         }
930                         byte = src[read];
931                         break 'inner;
932                     }
933                     byte = src[read];
934                     continue 'inner;
935                 }
936             }
937             // We can't have a complete 4-byte sequence, but we could still have
938             // a complete shorter sequence.
939 
940             // At this point, `byte` is not included in `read`.
941             match byte {
942                 0..=0x7F => {
943                     // ASCII: go back to SIMD.
944                     read += 1;
945                     src = &src[read..];
946                     continue 'outer;
947                 }
948                 0xC2..=0xD5 => {
949                     // Two-byte
950                     let new_read = read + 2;
951                     if new_read > src.len() {
952                         return true;
953                     }
954                     let second = unsafe { *(src.get_unchecked(read + 1)) };
955                     if !in_inclusive_range8(second, 0x80, 0xBF) {
956                         return true;
957                     }
958                     read = new_read;
959                     // We need to deal with the case where we came here with 3 bytes
960                     // left, so we need to take a look at the last one.
961                     src = &src[read..];
962                     continue 'outer;
963                 }
964                 0xD6 => {
965                     // Two-byte, potentially bidi
966                     let new_read = read + 2;
967                     if new_read > src.len() {
968                         return true;
969                     }
970                     let second = unsafe { *(src.get_unchecked(read + 1)) };
971                     if !in_inclusive_range8(second, 0x80, 0xBF) {
972                         return true;
973                     }
974                     // XXX consider folding the above and below checks
975                     if second > 0x8F {
976                         return true;
977                     }
978                     read = new_read;
979                     // We need to deal with the case where we came here with 3 bytes
980                     // left, so we need to take a look at the last one.
981                     src = &src[read..];
982                     continue 'outer;
983                 }
984                 // two-byte starting with 0xD7 and above is bidi
985                 0xE1 | 0xE3..=0xEC | 0xEE => {
986                     // Three-byte normal
987                     let new_read = read + 3;
988                     if new_read > src.len() {
989                         return true;
990                     }
991                     let second = unsafe { *(src.get_unchecked(read + 1)) };
992                     let third = unsafe { *(src.get_unchecked(read + 2)) };
993                     if ((UTF8_DATA.table[usize::from(second)]
994                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
995                         | (third >> 6))
996                         != 2
997                     {
998                         return true;
999                     }
1000                 }
1001                 0xE2 => {
1002                     // Three-byte normal, potentially bidi
1003                     let new_read = read + 3;
1004                     if new_read > src.len() {
1005                         return true;
1006                     }
1007                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1008                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1009                     if ((UTF8_DATA.table[usize::from(second)]
1010                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1011                         | (third >> 6))
1012                         != 2
1013                     {
1014                         return true;
1015                     }
1016                     if second == 0x80 {
1017                         if third == 0x8F || third == 0xAB || third == 0xAE {
1018                             return true;
1019                         }
1020                     } else if second == 0x81 {
1021                         if third == 0xA7 {
1022                             return true;
1023                         }
1024                     }
1025                 }
1026                 0xEF => {
1027                     // Three-byte normal, potentially bidi
1028                     let new_read = read + 3;
1029                     if new_read > src.len() {
1030                         return true;
1031                     }
1032                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1033                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1034                     if ((UTF8_DATA.table[usize::from(second)]
1035                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1036                         | (third >> 6))
1037                         != 2
1038                     {
1039                         return true;
1040                     }
1041                     if in_inclusive_range8(second, 0xAC, 0xB7) {
1042                         if second == 0xAC {
1043                             if third > 0x9C {
1044                                 return true;
1045                             }
1046                         } else {
1047                             return true;
1048                         }
1049                     } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1050                         if second == 0xB9 {
1051                             if third > 0xAF {
1052                                 return true;
1053                             }
1054                         } else if second == 0xBB {
1055                             if third != 0xBF {
1056                                 return true;
1057                             }
1058                         } else {
1059                             return true;
1060                         }
1061                     }
1062                 }
1063                 0xE0 => {
1064                     // Three-byte special lower bound, potentially bidi
1065                     let new_read = read + 3;
1066                     if new_read > src.len() {
1067                         return true;
1068                     }
1069                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1070                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1071                     if ((UTF8_DATA.table[usize::from(second)]
1072                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1073                         | (third >> 6))
1074                         != 2
1075                     {
1076                         return true;
1077                     }
1078                     // XXX can this be folded into the above validity check
1079                     if second < 0xA4 {
1080                         return true;
1081                     }
1082                 }
1083                 0xED => {
1084                     // Three-byte special upper bound
1085                     let new_read = read + 3;
1086                     if new_read > src.len() {
1087                         return true;
1088                     }
1089                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1090                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1091                     if ((UTF8_DATA.table[usize::from(second)]
1092                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1093                         | (third >> 6))
1094                         != 2
1095                     {
1096                         return true;
1097                     }
1098                 }
1099                 _ => {
1100                     // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1101                     return true;
1102                 }
1103             }
1104             return false;
1105         } else {
1106             return false;
1107         }
1108     }
1109 }
1110 
1111 /// Checks whether a valid UTF-8 buffer contains code points that trigger
1112 /// right-to-left processing.
1113 ///
1114 /// The check is done on a Unicode block basis without regard to assigned
1115 /// vs. unassigned code points in the block. Hebrew presentation forms in
1116 /// the Alphabetic Presentation Forms block are treated as if they formed
1117 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1118 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1119 /// for. Control characters that are technically bidi controls but do not
1120 /// cause right-to-left behavior without the presence of right-to-left
1121 /// characters or right-to-left controls are not checked for. As a special
1122 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1123 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1124 #[inline]
is_str_bidi(buffer: &str) -> bool1125 pub fn is_str_bidi(buffer: &str) -> bool {
1126     // U+058F: D6 8F
1127     // U+0590: D6 90
1128     // U+08FF: E0 A3 BF
1129     // U+0900: E0 A4 80
1130     //
1131     // U+200F: E2 80 8F
1132     // U+202B: E2 80 AB
1133     // U+202E: E2 80 AE
1134     // U+2067: E2 81 A7
1135     //
1136     // U+FB1C: EF AC 9C
1137     // U+FB1D: EF AC 9D
1138     // U+FDFF: EF B7 BF
1139     // U+FE00: EF B8 80
1140     //
1141     // U+FE6F: EF B9 AF
1142     // U+FE70: EF B9 B0
1143     // U+FEFE: EF BB BE
1144     // U+FEFF: EF BB BF
1145     //
1146     // U+107FF: F0 90 9F BF
1147     // U+10800: F0 90 A0 80
1148     // U+10FFF: F0 90 BF BF
1149     // U+11000: F0 91 80 80
1150     //
1151     // U+1E7FF: F0 9E 9F BF
1152     // U+1E800: F0 9E A0 80
1153     // U+1EFFF: F0 9E BF BF
1154     // U+1F000: F0 9F 80 80
1155     let mut bytes = buffer.as_bytes();
1156     'outer: loop {
1157         // TODO: Instead of just validating ASCII using SIMD, use SIMD
1158         // to check for non-ASCII lead bytes, too, to quickly conclude
1159         // that the vector consist entirely of CJK and below-Hebrew
1160         // code points.
1161         // Unfortunately, scripts above Arabic but below CJK share
1162         // lead bytes with RTL.
1163         if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1164             'inner: loop {
1165                 // At this point, `byte` is not included in `read`.
1166                 if byte < 0xE0 {
1167                     if byte >= 0x80 {
1168                         // Two-byte
1169                         // Adding `unlikely` here improved throughput on
1170                         // Russian plain text by 33%!
1171                         if unsafe { unlikely(byte >= 0xD6) } {
1172                             if byte == 0xD6 {
1173                                 let second = bytes[read + 1];
1174                                 if second > 0x8F {
1175                                     return true;
1176                                 }
1177                             } else {
1178                                 return true;
1179                             }
1180                         }
1181                         read += 2;
1182                     } else {
1183                         // ASCII: write and go back to SIMD.
1184                         read += 1;
1185                         // Intuitively, we should go back to the outer loop only
1186                         // if byte is 0x30 or above, so as to avoid trashing on
1187                         // ASCII space, comma and period in non-Latin context.
1188                         // However, the extra branch seems to cost more than it's
1189                         // worth.
1190                         bytes = &bytes[read..];
1191                         continue 'outer;
1192                     }
1193                 } else if byte < 0xF0 {
1194                     // Three-byte
1195                     if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
1196                         let second = bytes[read + 1];
1197                         if byte == 0xE0 {
1198                             if second < 0xA4 {
1199                                 return true;
1200                             }
1201                         } else if byte == 0xE2 {
1202                             let third = bytes[read + 2];
1203                             if second == 0x80 {
1204                                 if third == 0x8F || third == 0xAB || third == 0xAE {
1205                                     return true;
1206                                 }
1207                             } else if second == 0x81 {
1208                                 if third == 0xA7 {
1209                                     return true;
1210                                 }
1211                             }
1212                         } else {
1213                             debug_assert_eq!(byte, 0xEF);
1214                             if in_inclusive_range8(second, 0xAC, 0xB7) {
1215                                 if second == 0xAC {
1216                                     let third = bytes[read + 2];
1217                                     if third > 0x9C {
1218                                         return true;
1219                                     }
1220                                 } else {
1221                                     return true;
1222                                 }
1223                             } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1224                                 if second == 0xB9 {
1225                                     let third = bytes[read + 2];
1226                                     if third > 0xAF {
1227                                         return true;
1228                                     }
1229                                 } else if second == 0xBB {
1230                                     let third = bytes[read + 2];
1231                                     if third != 0xBF {
1232                                         return true;
1233                                     }
1234                                 } else {
1235                                     return true;
1236                                 }
1237                             }
1238                         }
1239                     }
1240                     read += 3;
1241                 } else {
1242                     // Four-byte
1243                     let second = bytes[read + 1];
1244                     if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
1245                         let third = bytes[read + 2];
1246                         if third >= 0xA0 {
1247                             return true;
1248                         }
1249                     }
1250                     read += 4;
1251                 }
1252                 // The comparison is always < or == and never >, but including
1253                 // > here to let the compiler assume that < is true if this
1254                 // comparison is false.
1255                 if read >= bytes.len() {
1256                     return false;
1257                 }
1258                 byte = bytes[read];
1259                 continue 'inner;
1260             }
1261         } else {
1262             return false;
1263         }
1264     }
1265 }
1266 
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
///
/// This is a thin public wrapper: the answer is whatever
/// `is_utf16_bidi_impl` returns for the same buffer.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    is_utf16_bidi_impl(buffer)
}
1287 
/// Tells whether a single scalar value triggers right-to-left processing.
///
/// Classification works on whole Unicode blocks and does not distinguish
/// assigned from unassigned code points. The Hebrew presentation forms in
/// the Alphabetic Presentation Forms block count as a right-to-left block
/// of their own. The four RIGHT-TO-LEFT FOO controls of General Punctuation
/// are detected as well. Bidi controls that cannot cause right-to-left
/// behavior unless right-to-left characters or controls are also present
/// are ignored. U+FEFF is carved out of Arabic Presentation Forms-B and is
/// not reported.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    let scalar = u32::from(c);
    // Fast path: everything below the Hebrew block.
    if scalar < 0x0590 {
        return false;
    }
    match scalar {
        // BMP RTL (https://www.unicode.org/roadmaps/bmp/):
        // Hebrew through Arabic Extended-A.
        0x0590..=0x08FF => true,
        // Every control with RIGHT-TO-LEFT in its name in
        // https://www.unicode.org/charts/PDF/U2000.pdf:
        // RLM, RLE, RLO and RLI.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // Hebrew presentation forms and Arabic Presentation Forms A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms B, excluding the BOM (U+FEFF).
        0xFE70..=0xFEFE => true,
        // Supplementary RTL (https://www.unicode.org/roadmaps/smp/):
        // lead surrogate U+D802 or U+D803.
        0x10800..=0x10FFF => true,
        // Lead surrogate U+D83A or U+D83B.
        0x1E800..=0x1EFFF => true,
        // Everything else, including emoji above the second astral range.
        _ => false,
    }
}
1355 
1356 /// Checks whether a UTF-16 code unit triggers right-to-left processing.
1357 ///
1358 /// The check is done on a Unicode block basis without regard to assigned
1359 /// vs. unassigned code points in the block. Hebrew presentation forms in
1360 /// the Alphabetic Presentation Forms block are treated as if they formed
1361 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1362 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1363 /// for. Control characters that are technically bidi controls but do not
1364 /// cause right-to-left behavior without the presence of right-to-left
1365 /// characters or right-to-left controls are not checked for. As a special
1366 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1367 ///
1368 /// Since supplementary-plane right-to-left blocks are identifiable from the
1369 /// high surrogate without examining the low surrogate, this function returns
1370 /// `true` for such high surrogates making the function suitable for handling
1371 /// supplementary-plane text without decoding surrogate pairs to scalar
1372 /// values. Obviously, such high surrogates are then reported as right-to-left
1373 /// even if actually unpaired.
1374 #[inline(always)]
is_utf16_code_unit_bidi(u: u16) -> bool1375 pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
1376     if u < 0x0590 {
1377         // Below Hebrew
1378         return false;
1379     }
1380     if in_range16(u, 0x0900, 0xD802) {
1381         // Above Arabic Extended-A and below first RTL surrogate
1382         if in_inclusive_range16(u, 0x200F, 0x2067) {
1383             // In the range that contains the RTL controls
1384             return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
1385         }
1386         return false;
1387     }
1388     if in_range16(u, 0xD83C, 0xFB1D) {
1389         // Between astral RTL high surrogates and Hebrew presentation forms
1390         // (Emoji is here)
1391         return false;
1392     }
1393     if in_range16(u, 0xD804, 0xD83A) {
1394         // Between RTL high surragates
1395         return false;
1396     }
1397     if u > 0xFEFE {
1398         // Above Arabic Presentation Forms (excl. BOM)
1399         return false;
1400     }
1401     if in_range16(u, 0xFE00, 0xFE70) {
1402         // Between Arabic Presentations Forms
1403         return false;
1404     }
1405     true
1406 }
1407 
1408 /// Checks whether a potentially invalid UTF-8 buffer contains code points
1409 /// that trigger right-to-left processing or is all-Latin1.
1410 ///
1411 /// Possibly more efficient than performing the checks separately.
1412 ///
1413 /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1414 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1415 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi1416 pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1417     if let Some(offset) = is_utf8_latin1_impl(buffer) {
1418         if is_utf8_bidi(&buffer[offset..]) {
1419             Latin1Bidi::Bidi
1420         } else {
1421             Latin1Bidi::LeftToRight
1422         }
1423     } else {
1424         Latin1Bidi::Latin1
1425     }
1426 }
1427 
1428 /// Checks whether a valid UTF-8 buffer contains code points
1429 /// that trigger right-to-left processing or is all-Latin1.
1430 ///
1431 /// Possibly more efficient than performing the checks separately.
1432 ///
1433 /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1434 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1435 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi1436 pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1437     // The transition from the latin1 check to the bidi check isn't
1438     // optimal but not tweaking it to perfection today.
1439     if let Some(offset) = is_str_latin1_impl(buffer) {
1440         if is_str_bidi(&buffer[offset..]) {
1441             Latin1Bidi::Bidi
1442         } else {
1443             Latin1Bidi::LeftToRight
1444         }
1445     } else {
1446         Latin1Bidi::Latin1
1447     }
1448 }
1449 
/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // Thin public entry point; the actual scan lives in the `_impl` function.
    check_utf16_for_latin1_and_bidi_impl(buffer)
}
1461 
1462 /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1463 /// with the REPLACEMENT CHARACTER.
1464 ///
1465 /// The length of the destination buffer must be at least the length of the
1466 /// source buffer _plus one_.
1467 ///
1468 /// Returns the number of `u16`s written.
1469 ///
1470 /// # Panics
1471 ///
1472 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize1473 pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1474     // TODO: Can the requirement for dst to be at least one unit longer
1475     // be eliminated?
1476     assert!(dst.len() > src.len());
1477     let mut decoder = Utf8Decoder::new_inner();
1478     let mut total_read = 0usize;
1479     let mut total_written = 0usize;
1480     loop {
1481         let (result, read, written) =
1482             decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1483         total_read += read;
1484         total_written += written;
1485         match result {
1486             DecoderResult::InputEmpty => {
1487                 return total_written;
1488             }
1489             DecoderResult::OutputFull => {
1490                 unreachable!("The assert at the top of the function should have caught this.");
1491             }
1492             DecoderResult::Malformed(_, _) => {
1493                 // There should always be space for the U+FFFD, because
1494                 // otherwise we'd have gotten OutputFull already.
1495                 dst[total_written] = 0xFFFD;
1496                 total_written += 1;
1497             }
1498         }
1499     }
1500 }
1501 
/// Converts valid UTF-8 to valid UTF-16.
///
/// The conversion alternates between a bulk ASCII copy (via
/// `ascii_to_basic_latin`) and a byte-by-byte decoder for multi-byte
/// sequences. Because `src` is a `&str`, the decoder does not validate the
/// input and reads continuation bytes without bounds checks.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // `read` indexes into `bytes`, `written` into `dst`. Every branch below
    // advances `written` by no more than it advances `read`, so together
    // with the assert above `written` stays within `dst`.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk phase: copy ASCII until the input ends or a non-ASCII byte
        // shows up. `byte` becomes that first non-ASCII byte.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    // The whole remainder was ASCII and has been copied.
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // Scalar phase: decode one sequence per iteration, classifying it
        // by its lead byte.
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    // The unchecked read relies on `src` being valid UTF-8,
                    // so the continuation byte is present.
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid thrashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                // Two continuation bytes follow the lead byte in valid UTF-8.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                // Three continuation bytes follow the lead byte in valid
                // UTF-8; the result is an astral code point that needs a
                // surrogate pair.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // High surrogate: 0xD7C0 is 0xD800 - (0x10000 >> 10), which
                // folds the subtraction of 0x10000 into the base constant.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                // Low surrogate: the low ten bits of the code point.
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            // Stay in the scalar loop; only an ASCII byte sends control
            // back to the bulk phase.
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1598 
1599 /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1600 ///
1601 /// The length of the destination buffer must be at least the length of the
1602 /// source buffer.
1603 ///
1604 /// Returns the number of `u16`s written or `None` if the input was invalid.
1605 ///
1606 /// When the input was invalid, some output may have been written.
1607 ///
1608 /// # Panics
1609 ///
1610 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize>1611 pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1612     assert!(
1613         dst.len() >= src.len(),
1614         "Destination must not be shorter than the source."
1615     );
1616     let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1617     if read == src.len() {
1618         return Some(written);
1619     }
1620     None
1621 }
1622 
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
/// with the REPLACEMENT CHARACTER with potentially insufficient output
/// space.
///
/// Returns the number of code units read and the number of bytes written.
///
/// Guarantees that the bytes in the destination beyond the number of
/// bytes claimed as written by the second item of the return tuple
/// are left unmodified.
///
/// Not all code units are read if there isn't enough output space.
///
/// Note that this method isn't designed for general streamability but for
/// not allocating memory for the worst case up front. Specifically,
/// if the input starts with or ends with an unpaired surrogate, those are
/// replaced with the REPLACEMENT CHARACTER.
///
/// Matches the semantics of `TextEncoder.encodeInto()` from the
/// Encoding Standard.
///
/// # Safety
///
/// If you want to convert into a `&mut str`, use
/// `convert_utf16_to_str_partial()` instead of using this function
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
#[inline(always)]
pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    // The two functions called below are marked `inline(never)` to make
    // transitions from the hot part (first function) into the cold part
    // (second function) go through a return and another call to discourage
    // the CPU from speculating from the hot code into the cold code.
    // Letting the transitions be mere intra-function jumps, even to
    // basic blocks out-of-lined to the end of the function would wipe
    // away a quarter of Arabic encode performance on Haswell!
    let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
    // Expected case: the hot function consumed all of the input.
    if unsafe { likely(read == src.len()) } {
        return (read, written);
    }
    // Otherwise hand the unread remainder, and the unused part of the
    // destination, to the cold function and add up the totals.
    let (tail_read, tail_written) =
        convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
    (read + tail_read, written + tail_written)
}
1665 
1666 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1667 /// with the REPLACEMENT CHARACTER.
1668 ///
1669 /// The length of the destination buffer must be at least the length of the
1670 /// source buffer times three.
1671 ///
1672 /// Returns the number of bytes written.
1673 ///
1674 /// # Panics
1675 ///
1676 /// Panics if the destination buffer is shorter than stated above.
1677 ///
1678 /// # Safety
1679 ///
1680 /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1681 /// instead of using this function together with the `unsafe` method
1682 /// `as_bytes_mut()` on `&mut str`.
1683 #[inline(always)]
convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize1684 pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1685     assert!(dst.len() >= src.len() * 3);
1686     let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1687     debug_assert_eq!(read, src.len());
1688     written
1689 }
1690 
1691 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1692 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1693 /// signaled using the Rust type system with potentially insufficient output
1694 /// space.
1695 ///
1696 /// Returns the number of code units read and the number of bytes written.
1697 ///
1698 /// Not all code units are read if there isn't enough output space.
1699 ///
1700 /// Note  that this method isn't designed for general streamability but for
1701 /// not allocating memory for the worst case up front. Specifically,
1702 /// if the input starts with or ends with an unpaired surrogate, those are
1703 /// replaced with the REPLACEMENT CHARACTER.
convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize)1704 pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1705     let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1706     let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1707     let len = bytes.len();
1708     let mut trail = written;
1709     while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1710         bytes[trail] = 0;
1711         trail += 1;
1712     }
1713     (read, written)
1714 }
1715 
1716 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1717 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1718 /// signaled using the Rust type system.
1719 ///
1720 /// The length of the destination buffer must be at least the length of the
1721 /// source buffer times three.
1722 ///
1723 /// Returns the number of bytes written.
1724 ///
1725 /// # Panics
1726 ///
1727 /// Panics if the destination buffer is shorter than stated above.
1728 #[inline(always)]
convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize1729 pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1730     assert!(dst.len() >= src.len() * 3);
1731     let (read, written) = convert_utf16_to_str_partial(src, dst);
1732     debug_assert_eq!(read, src.len());
1733     written
1734 }
1735 
1736 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1737 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1738 ///
1739 /// The length of the destination buffer must be at least the length of the
1740 /// source buffer.
1741 ///
1742 /// The number of `u16`s written equals the length of the source buffer.
1743 ///
1744 /// # Panics
1745 ///
1746 /// Panics if the destination buffer is shorter than stated above.
convert_latin1_to_utf16(src: &[u8], dst: &mut [u16])1747 pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1748     assert!(
1749         dst.len() >= src.len(),
1750         "Destination must not be shorter than the source."
1751     );
1752     // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1753     // instructions and this code, but, yet, the autovectorized version is
1754     // faster.
1755     unsafe {
1756         unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1757     }
1758 }
1759 
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
/// output space.
///
/// ASCII runs are copied in bulk; each non-ASCII byte is expanded into a
/// two-byte UTF-8 sequence.
///
/// Returns the number of bytes read and the number of bytes written.
///
/// If the output isn't large enough, not all input is consumed.
///
/// # Safety
///
/// If you want to convert into a `&mut str`, use
/// `convert_latin1_to_str_partial()` instead of using this function
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
    let src_len = src.len();
    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();
    let dst_len = dst.len();
    let mut total_read = 0usize;
    let mut total_written = 0usize;
    loop {
        // src can't advance more than dst
        let src_left = src_len - total_read;
        let dst_left = dst_len - total_written;
        // Limit the bulk ASCII copy to what fits in both buffers.
        let min_left = ::std::cmp::min(src_left, dst_left);
        if let Some((non_ascii, consumed)) = unsafe {
            ascii_to_ascii(
                src_ptr.add(total_read),
                dst_ptr.add(total_written),
                min_left,
            )
        } {
            total_read += consumed;
            total_written += consumed;
            // The non-ASCII byte needs two output bytes. If they don't fit,
            // stop here and leave the byte unconsumed.
            if total_written.checked_add(2).unwrap() > dst_len {
                return (total_read, total_written);
            }

            total_read += 1; // consume `non_ascii`

            // Two-byte UTF-8 sequence: the lead byte carries the bits above
            // the low six, the trail byte carries the low six bits.
            dst[total_written] = (non_ascii >> 6) | 0xC0;
            total_written += 1;
            dst[total_written] = (non_ascii & 0x3F) | 0x80;
            total_written += 1;
            continue;
        }
        // No non-ASCII byte among the `min_left` bytes: all of them were
        // copied. If `dst` was the limiting buffer, input remains unread.
        return (total_read + min_left, total_written + min_left);
    }
}
1809 
1810 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1811 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1812 ///
1813 /// The length of the destination buffer must be at least the length of the
1814 /// source buffer times two.
1815 ///
1816 /// Returns the number of bytes written.
1817 ///
1818 /// # Panics
1819 ///
1820 /// Panics if the destination buffer is shorter than stated above.
1821 ///
1822 /// # Safety
1823 ///
1824 /// Note that this function may write garbage beyond the number of bytes
1825 /// indicated by the return value, so using a `&mut str` interpreted as
1826 /// `&mut [u8]` as the destination is not safe. If you want to convert into
1827 /// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1828 #[inline]
convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize1829 pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1830     assert!(
1831         dst.len() >= src.len() * 2,
1832         "Destination must not be shorter than the source times two."
1833     );
1834     let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1835     debug_assert_eq!(read, src.len());
1836     written
1837 }
1838 
1839 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1840 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1841 /// output is signaled using the Rust type system with potentially insufficient
1842 /// output space.
1843 ///
1844 /// Returns the number of bytes read and the number of bytes written.
1845 ///
1846 /// If the output isn't large enough, not all input is consumed.
1847 #[inline]
convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize)1848 pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1849     let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1850     let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1851     let len = bytes.len();
1852     let mut trail = written;
1853     let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
1854     while trail < max {
1855         bytes[trail] = 0;
1856         trail += 1;
1857     }
1858     while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1859         bytes[trail] = 0;
1860         trail += 1;
1861     }
1862     (read, written)
1863 }
1864 
1865 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1866 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1867 /// output is signaled using the Rust type system.
1868 ///
1869 /// The length of the destination buffer must be at least the length of the
1870 /// source buffer times two.
1871 ///
1872 /// Returns the number of bytes written.
1873 ///
1874 /// # Panics
1875 ///
1876 /// Panics if the destination buffer is shorter than stated above.
1877 #[inline]
convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize1878 pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1879     assert!(
1880         dst.len() >= src.len() * 2,
1881         "Destination must not be shorter than the source times two."
1882     );
1883     let (read, written) = convert_latin1_to_str_partial(src, dst);
1884     debug_assert_eq!(read, src.len());
1885     written
1886 }
1887 
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of bytes written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
///
/// If debug assertions are enabled (and not fuzzing) and the input is
/// not in the range U+0000 to U+00FF, inclusive.
pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    non_fuzz_debug_assert!(is_utf8_latin1(src));
    let src_len = src.len();
    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();
    let mut total_read = 0usize;
    let mut total_written = 0usize;
    loop {
        // dst can't advance more than src
        let src_left = src_len - total_read;
        // Bulk-copy ASCII until the input ends or a non-ASCII byte is found.
        if let Some((non_ascii, consumed)) = unsafe {
            ascii_to_ascii(
                src_ptr.add(total_read),
                dst_ptr.add(total_written),
                src_left,
            )
        } {
            // `+ 1` also consumes `non_ascii`, which is treated as the lead
            // byte of a two-byte sequence.
            total_read += consumed + 1;
            total_written += consumed;

            // The input ended right after the lead byte (only possible with
            // input that violates the precondition): stop without reading
            // past the end.
            if total_read == src_len {
                return total_written;
            }

            let trail = src[total_read];
            total_read += 1;

            // Merge the payload bits of lead and trail into one byte. The
            // shift happens in `u8`, so lead bits that don't fit in the top
            // two positions of the result are discarded.
            dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
            total_written += 1;
            continue;
        }
        // The rest of the input was ASCII and has been copied as-is.
        return total_written + src_left;
    }
}
1949 
1950 /// If the input is valid UTF-16 representing only Unicode code points from
1951 /// U+0000 to U+00FF, inclusive, converts the input into output that
1952 /// represents the value of each code point as the unsigned byte value of
1953 /// each output byte.
1954 ///
1955 /// If the input does not fulfill the condition stated above, does something
1956 /// that is memory-safe without any promises about any properties of the
1957 /// output and will probably assert in debug builds in future versions.
1958 /// In particular, callers shouldn't assume the output to be the same across
1959 /// crate versions or CPU architectures and should not assume that non-ASCII
1960 /// input can't map to ASCII output.
1961 ///
1962 /// The length of the destination buffer must be at least the length of the
1963 /// source buffer.
1964 ///
1965 /// The number of bytes written equals the length of the source buffer.
1966 ///
1967 /// # Panics
1968 ///
1969 /// Panics if the destination buffer is shorter than stated above.
1970 ///
1971 /// (Probably in future versions if debug assertions are enabled (and not
1972 /// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8])1973 pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1974     assert!(
1975         dst.len() >= src.len(),
1976         "Destination must not be shorter than the source."
1977     );
1978     // non_fuzz_debug_assert!(is_utf16_latin1(src));
1979     unsafe {
1980         pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1981     }
1982 }
1983 
1984 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1985 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1986 ///
1987 /// Borrows if input is ASCII-only. Performs a single heap allocation
1988 /// otherwise.
decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str>1989 pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
1990     let up_to = ascii_valid_up_to(bytes);
1991     // >= makes later things optimize better than ==
1992     if up_to >= bytes.len() {
1993         debug_assert_eq!(up_to, bytes.len());
1994         let s: &str = unsafe { ::std::str::from_utf8_unchecked(bytes) };
1995         return Cow::Borrowed(s);
1996     }
1997     let (head, tail) = bytes.split_at(up_to);
1998     let capacity = head.len() + tail.len() * 2;
1999     let mut vec = Vec::with_capacity(capacity);
2000     unsafe {
2001         vec.set_len(capacity);
2002     }
2003     (&mut vec[..up_to]).copy_from_slice(head);
2004     let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
2005     vec.truncate(up_to + written);
2006     Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
2007 }
2008 
2009 /// If the input is valid UTF-8 representing only Unicode code points from
2010 /// U+0000 to U+00FF, inclusive, converts the input into output that
2011 /// represents the value of each code point as the unsigned byte value of
2012 /// each output byte.
2013 ///
2014 /// If the input does not fulfill the condition stated above, this function
2015 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
2016 /// does something that is memory-safe without any promises about any
2017 /// properties of the output. In particular, callers shouldn't assume the
2018 /// output to be the same across crate versions or CPU architectures and
2019 /// should not assume that non-ASCII input can't map to ASCII output.
2020 ///
2021 /// Borrows if input is ASCII-only. Performs a single heap allocation
2022 /// otherwise.
encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]>2023 pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
2024     let bytes = string.as_bytes();
2025     let up_to = ascii_valid_up_to(bytes);
2026     // >= makes later things optimize better than ==
2027     if up_to >= bytes.len() {
2028         debug_assert_eq!(up_to, bytes.len());
2029         return Cow::Borrowed(bytes);
2030     }
2031     let (head, tail) = bytes.split_at(up_to);
2032     let capacity = bytes.len();
2033     let mut vec = Vec::with_capacity(capacity);
2034     unsafe {
2035         vec.set_len(capacity);
2036     }
2037     (&mut vec[..up_to]).copy_from_slice(head);
2038     let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
2039     vec.truncate(up_to + written);
2040     Cow::Owned(vec)
2041 }
2042 
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
    // Thin public entry point; the actual scan lives in the `_impl` function.
    utf16_valid_up_to_impl(buffer)
}
2048 
2049 /// Returns the index of first byte that starts an invalid byte
2050 /// sequence or a non-Latin1 byte sequence, or the length of the
2051 /// string if there are neither.
utf8_latin1_up_to(buffer: &[u8]) -> usize2052 pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2053     is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2054 }
2055 
2056 /// Returns the index of first byte that starts a non-Latin1 byte
2057 /// sequence, or the length of the string if there are none.
str_latin1_up_to(buffer: &str) -> usize2058 pub fn str_latin1_up_to(buffer: &str) -> usize {
2059     is_str_latin1_impl(buffer).unwrap_or(buffer.len())
2060 }
2061 
2062 /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2063 #[inline]
ensure_utf16_validity(buffer: &mut [u16])2064 pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2065     let mut offset = 0;
2066     loop {
2067         offset += utf16_valid_up_to(&buffer[offset..]);
2068         if offset == buffer.len() {
2069             return;
2070         }
2071         buffer[offset] = 0xFFFD;
2072         offset += 1;
2073     }
2074 }
2075 
2076 /// Copies ASCII from source to destination up to the first non-ASCII byte
2077 /// (or the end of the input if it is ASCII in its entirety).
2078 ///
2079 /// The length of the destination buffer must be at least the length of the
2080 /// source buffer.
2081 ///
2082 /// Returns the number of bytes written.
2083 ///
2084 /// # Panics
2085 ///
2086 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize2087 pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2088     assert!(
2089         dst.len() >= src.len(),
2090         "Destination must not be shorter than the source."
2091     );
2092     if let Some((_, consumed)) =
2093         unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2094     {
2095         consumed
2096     } else {
2097         src.len()
2098     }
2099 }
2100 
2101 /// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2102 /// the first non-ASCII byte (or the end of the input if it is ASCII in its
2103 /// entirety).
2104 ///
2105 /// The length of the destination buffer must be at least the length of the
2106 /// source buffer.
2107 ///
2108 /// Returns the number of `u16`s written.
2109 ///
2110 /// # Panics
2111 ///
2112 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize2113 pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2114     assert!(
2115         dst.len() >= src.len(),
2116         "Destination must not be shorter than the source."
2117     );
2118     if let Some((_, consumed)) =
2119         unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2120     {
2121         consumed
2122     } else {
2123         src.len()
2124     }
2125 }
2126 
2127 /// Copies Basic Latin from source to destination narrowing it to ASCII up to
2128 /// the first non-Basic Latin code unit (or the end of the input if it is
2129 /// Basic Latin in its entirety).
2130 ///
2131 /// The length of the destination buffer must be at least the length of the
2132 /// source buffer.
2133 ///
2134 /// Returns the number of bytes written.
2135 ///
2136 /// # Panics
2137 ///
2138 /// Panics if the destination buffer is shorter than stated above.
copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize2139 pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2140     assert!(
2141         dst.len() >= src.len(),
2142         "Destination must not be shorter than the source."
2143     );
2144     if let Some((_, consumed)) =
2145         unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2146     {
2147         consumed
2148     } else {
2149         src.len()
2150     }
2151 }
2152 
2153 // Any copyright to the test code below this comment is dedicated to the
2154 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2155 
2156 #[cfg(test)]
2157 mod tests {
2158     use super::*;
2159 
2160     #[test]
test_is_ascii_success()2161     fn test_is_ascii_success() {
2162         let mut src: Vec<u8> = Vec::with_capacity(128);
2163         src.resize(128, 0);
2164         for i in 0..src.len() {
2165             src[i] = i as u8;
2166         }
2167         for i in 0..src.len() {
2168             assert!(is_ascii(&src[i..]));
2169         }
2170     }
2171 
2172     #[test]
test_is_ascii_fail()2173     fn test_is_ascii_fail() {
2174         let mut src: Vec<u8> = Vec::with_capacity(128);
2175         src.resize(128, 0);
2176         for i in 0..src.len() {
2177             src[i] = i as u8;
2178         }
2179         for i in 0..src.len() {
2180             let tail = &mut src[i..];
2181             for j in 0..tail.len() {
2182                 tail[j] = 0xA0;
2183                 assert!(!is_ascii(tail));
2184             }
2185         }
2186     }
2187 
2188     #[test]
test_is_basic_latin_success()2189     fn test_is_basic_latin_success() {
2190         let mut src: Vec<u16> = Vec::with_capacity(128);
2191         src.resize(128, 0);
2192         for i in 0..src.len() {
2193             src[i] = i as u16;
2194         }
2195         for i in 0..src.len() {
2196             assert!(is_basic_latin(&src[i..]));
2197         }
2198     }
2199 
2200     #[test]
test_is_basic_latin_fail()2201     fn test_is_basic_latin_fail() {
2202         let mut src: Vec<u16> = Vec::with_capacity(128);
2203         src.resize(128, 0);
2204         for i in 0..src.len() {
2205             src[i] = i as u16;
2206         }
2207         for i in 0..src.len() {
2208             let tail = &mut src[i..];
2209             for j in 0..tail.len() {
2210                 tail[j] = 0xA0;
2211                 assert!(!is_basic_latin(tail));
2212             }
2213         }
2214     }
2215 
2216     #[test]
test_is_utf16_latin1_success()2217     fn test_is_utf16_latin1_success() {
2218         let mut src: Vec<u16> = Vec::with_capacity(256);
2219         src.resize(256, 0);
2220         for i in 0..src.len() {
2221             src[i] = i as u16;
2222         }
2223         for i in 0..src.len() {
2224             assert!(is_utf16_latin1(&src[i..]));
2225             assert_eq!(
2226                 check_utf16_for_latin1_and_bidi(&src[i..]),
2227                 Latin1Bidi::Latin1
2228             );
2229         }
2230     }
2231 
2232     #[test]
test_is_utf16_latin1_fail()2233     fn test_is_utf16_latin1_fail() {
2234         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2235         let mut src: Vec<u16> = Vec::with_capacity(len);
2236         src.resize(len, 0);
2237         for i in 0..src.len() {
2238             src[i] = i as u16;
2239         }
2240         for i in 0..src.len() {
2241             let tail = &mut src[i..];
2242             for j in 0..tail.len() {
2243                 tail[j] = 0x100 + j as u16;
2244                 assert!(!is_utf16_latin1(tail));
2245                 assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
2246             }
2247         }
2248     }
2249 
2250     #[test]
test_is_str_latin1_success()2251     fn test_is_str_latin1_success() {
2252         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2253         let mut src: Vec<u16> = Vec::with_capacity(len);
2254         src.resize(len, 0);
2255         for i in 0..src.len() {
2256             src[i] = i as u16;
2257         }
2258         for i in 0..src.len() {
2259             let s = String::from_utf16(&src[i..]).unwrap();
2260             assert!(is_str_latin1(&s[..]));
2261             assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2262         }
2263     }
2264 
2265     #[test]
test_is_str_latin1_fail()2266     fn test_is_str_latin1_fail() {
2267         let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2268         let mut src: Vec<u16> = Vec::with_capacity(len);
2269         src.resize(len, 0);
2270         for i in 0..src.len() {
2271             src[i] = i as u16;
2272         }
2273         for i in 0..src.len() {
2274             let tail = &mut src[i..];
2275             for j in 0..tail.len() {
2276                 tail[j] = 0x100 + j as u16;
2277                 let s = String::from_utf16(tail).unwrap();
2278                 assert!(!is_str_latin1(&s[..]));
2279                 assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2280             }
2281         }
2282     }
2283 
2284     #[test]
test_is_utf8_latin1_success()2285     fn test_is_utf8_latin1_success() {
2286         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2287         let mut src: Vec<u16> = Vec::with_capacity(len);
2288         src.resize(len, 0);
2289         for i in 0..src.len() {
2290             src[i] = i as u16;
2291         }
2292         for i in 0..src.len() {
2293             let s = String::from_utf16(&src[i..]).unwrap();
2294             assert!(is_utf8_latin1(s.as_bytes()));
2295             assert_eq!(
2296                 check_utf8_for_latin1_and_bidi(s.as_bytes()),
2297                 Latin1Bidi::Latin1
2298             );
2299         }
2300     }
2301 
2302     #[test]
test_is_utf8_latin1_fail()2303     fn test_is_utf8_latin1_fail() {
2304         let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2305         let mut src: Vec<u16> = Vec::with_capacity(len);
2306         src.resize(len, 0);
2307         for i in 0..src.len() {
2308             src[i] = i as u16;
2309         }
2310         for i in 0..src.len() {
2311             let tail = &mut src[i..];
2312             for j in 0..tail.len() {
2313                 tail[j] = 0x100 + j as u16;
2314                 let s = String::from_utf16(tail).unwrap();
2315                 assert!(!is_utf8_latin1(s.as_bytes()));
2316                 assert_ne!(
2317                     check_utf8_for_latin1_and_bidi(s.as_bytes()),
2318                     Latin1Bidi::Latin1
2319                 );
2320             }
2321         }
2322     }
2323 
2324     #[test]
test_is_utf8_latin1_invalid()2325     fn test_is_utf8_latin1_invalid() {
2326         assert!(!is_utf8_latin1(b"\xC3"));
2327         assert!(!is_utf8_latin1(b"a\xC3"));
2328         assert!(!is_utf8_latin1(b"\xFF"));
2329         assert!(!is_utf8_latin1(b"a\xFF"));
2330         assert!(!is_utf8_latin1(b"\xC3\xFF"));
2331         assert!(!is_utf8_latin1(b"a\xC3\xFF"));
2332     }
2333 
2334     #[test]
test_convert_utf8_to_utf16()2335     fn test_convert_utf8_to_utf16() {
2336         let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2337         let mut dst: Vec<u16> = Vec::with_capacity(src.len() + 1);
2338         dst.resize(src.len() + 1, 0);
2339         let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
2340         dst.truncate(len);
2341         let reference: Vec<u16> = src.encode_utf16().collect();
2342         assert_eq!(dst, reference);
2343     }
2344 
2345     #[test]
test_convert_str_to_utf16()2346     fn test_convert_str_to_utf16() {
2347         let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2348         let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2349         dst.resize(src.len(), 0);
2350         let len = convert_str_to_utf16(src, &mut dst[..]);
2351         dst.truncate(len);
2352         let reference: Vec<u16> = src.encode_utf16().collect();
2353         assert_eq!(dst, reference);
2354     }
2355 
2356     #[test]
test_convert_utf16_to_utf8_partial()2357     fn test_convert_utf16_to_utf8_partial() {
2358         let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2359         let src: Vec<u16> = reference.encode_utf16().collect();
2360         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2361         dst.resize(src.len() * 3 + 1, 0);
2362         let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
2363         let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
2364         dst.truncate(len);
2365         assert_eq!(dst, reference.as_bytes());
2366     }
2367 
2368     #[test]
test_convert_utf16_to_utf8()2369     fn test_convert_utf16_to_utf8() {
2370         let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2371         let src: Vec<u16> = reference.encode_utf16().collect();
2372         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2373         dst.resize(src.len() * 3 + 1, 0);
2374         let len = convert_utf16_to_utf8(&src[..], &mut dst[..]);
2375         dst.truncate(len);
2376         assert_eq!(dst, reference.as_bytes());
2377     }
2378 
2379     #[test]
test_convert_latin1_to_utf16()2380     fn test_convert_latin1_to_utf16() {
2381         let mut src: Vec<u8> = Vec::with_capacity(256);
2382         src.resize(256, 0);
2383         let mut reference: Vec<u16> = Vec::with_capacity(256);
2384         reference.resize(256, 0);
2385         for i in 0..256 {
2386             src[i] = i as u8;
2387             reference[i] = i as u16;
2388         }
2389         let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2390         dst.resize(src.len(), 0);
2391         convert_latin1_to_utf16(&src[..], &mut dst[..]);
2392         assert_eq!(dst, reference);
2393     }
2394 
2395     #[test]
test_convert_latin1_to_utf8_partial()2396     fn test_convert_latin1_to_utf8_partial() {
2397         let mut dst = [0u8, 2];
2398         let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
2399         assert_eq!(read, 1);
2400         assert_eq!(written, 1);
2401     }
2402 
2403     #[test]
test_convert_latin1_to_utf8()2404     fn test_convert_latin1_to_utf8() {
2405         let mut src: Vec<u8> = Vec::with_capacity(256);
2406         src.resize(256, 0);
2407         let mut reference: Vec<u16> = Vec::with_capacity(256);
2408         reference.resize(256, 0);
2409         for i in 0..256 {
2410             src[i] = i as u8;
2411             reference[i] = i as u16;
2412         }
2413         let s = String::from_utf16(&reference[..]).unwrap();
2414         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 2);
2415         dst.resize(src.len() * 2, 0);
2416         let len = convert_latin1_to_utf8(&src[..], &mut dst[..]);
2417         dst.truncate(len);
2418         assert_eq!(&dst[..], s.as_bytes());
2419     }
2420 
2421     #[test]
test_convert_utf8_to_latin1_lossy()2422     fn test_convert_utf8_to_latin1_lossy() {
2423         let mut reference: Vec<u8> = Vec::with_capacity(256);
2424         reference.resize(256, 0);
2425         let mut src16: Vec<u16> = Vec::with_capacity(256);
2426         src16.resize(256, 0);
2427         for i in 0..256 {
2428             src16[i] = i as u16;
2429             reference[i] = i as u8;
2430         }
2431         let src = String::from_utf16(&src16[..]).unwrap();
2432         let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2433         dst.resize(src.len(), 0);
2434         let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
2435         dst.truncate(len);
2436         assert_eq!(dst, reference);
2437     }
2438 
2439     #[cfg(all(debug_assertions, not(fuzzing)))]
2440     #[test]
2441     #[should_panic]
test_convert_utf8_to_latin1_lossy_panics()2442     fn test_convert_utf8_to_latin1_lossy_panics() {
2443         let mut dst = [0u8; 16];
2444         let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]);
2445     }
2446 
2447     #[test]
test_convert_utf16_to_latin1_lossy()2448     fn test_convert_utf16_to_latin1_lossy() {
2449         let mut src: Vec<u16> = Vec::with_capacity(256);
2450         src.resize(256, 0);
2451         let mut reference: Vec<u8> = Vec::with_capacity(256);
2452         reference.resize(256, 0);
2453         for i in 0..256 {
2454             src[i] = i as u16;
2455             reference[i] = i as u8;
2456         }
2457         let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2458         dst.resize(src.len(), 0);
2459         convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
2460         assert_eq!(dst, reference);
2461     }
2462 
2463     #[test]
2464     // #[should_panic]
test_convert_utf16_to_latin1_lossy_panics()2465     fn test_convert_utf16_to_latin1_lossy_panics() {
2466         let mut dst = [0u8; 16];
2467         let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]);
2468     }
2469 
2470     #[test]
test_utf16_valid_up_to()2471     fn test_utf16_valid_up_to() {
2472         let valid = vec![
2473             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16,
2474             0xD83Du16, 0xDCA9u16, 0x00B6u16,
2475         ];
2476         assert_eq!(utf16_valid_up_to(&valid[..]), 16);
2477         let lone_high = vec![
2478             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2479             0x2603u16, 0xD83Du16, 0x00B6u16,
2480         ];
2481         assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);
2482         let lone_low = vec![
2483             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2484             0x2603u16, 0xDCA9u16, 0x00B6u16,
2485         ];
2486         assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);
2487         let lone_high_at_end = vec![
2488             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2489             0x2603u16, 0x00B6u16, 0xD83Du16,
2490         ];
2491         assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
2492     }
2493 
2494     #[test]
test_ensure_utf16_validity()2495     fn test_ensure_utf16_validity() {
2496         let mut src = vec![
2497             0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2498             0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2499             0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2500         ];
2501         let reference = vec![
2502             0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2503             0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2504             0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2505         ];
2506         ensure_utf16_validity(&mut src[..]);
2507         assert_eq!(src, reference);
2508     }
2509 
2510     #[test]
test_is_char_bidi()2511     fn test_is_char_bidi() {
2512         assert!(!is_char_bidi('a'));
2513         assert!(!is_char_bidi('\u{03B1}'));
2514         assert!(!is_char_bidi('\u{3041}'));
2515         assert!(!is_char_bidi('\u{1F4A9}'));
2516         assert!(!is_char_bidi('\u{FE00}'));
2517         assert!(!is_char_bidi('\u{202C}'));
2518         assert!(!is_char_bidi('\u{FEFF}'));
2519         assert!(is_char_bidi('\u{0590}'));
2520         assert!(is_char_bidi('\u{08FF}'));
2521         assert!(is_char_bidi('\u{061C}'));
2522         assert!(is_char_bidi('\u{FB50}'));
2523         assert!(is_char_bidi('\u{FDFF}'));
2524         assert!(is_char_bidi('\u{FE70}'));
2525         assert!(is_char_bidi('\u{FEFE}'));
2526         assert!(is_char_bidi('\u{200F}'));
2527         assert!(is_char_bidi('\u{202B}'));
2528         assert!(is_char_bidi('\u{202E}'));
2529         assert!(is_char_bidi('\u{2067}'));
2530         assert!(is_char_bidi('\u{10800}'));
2531         assert!(is_char_bidi('\u{10FFF}'));
2532         assert!(is_char_bidi('\u{1E800}'));
2533         assert!(is_char_bidi('\u{1EFFF}'));
2534     }
2535 
2536     #[test]
test_is_utf16_code_unit_bidi()2537     fn test_is_utf16_code_unit_bidi() {
2538         assert!(!is_utf16_code_unit_bidi(0x0062));
2539         assert!(!is_utf16_code_unit_bidi(0x03B1));
2540         assert!(!is_utf16_code_unit_bidi(0x3041));
2541         assert!(!is_utf16_code_unit_bidi(0xD801));
2542         assert!(!is_utf16_code_unit_bidi(0xFE00));
2543         assert!(!is_utf16_code_unit_bidi(0x202C));
2544         assert!(!is_utf16_code_unit_bidi(0xFEFF));
2545         assert!(is_utf16_code_unit_bidi(0x0590));
2546         assert!(is_utf16_code_unit_bidi(0x08FF));
2547         assert!(is_utf16_code_unit_bidi(0x061C));
2548         assert!(is_utf16_code_unit_bidi(0xFB1D));
2549         assert!(is_utf16_code_unit_bidi(0xFB50));
2550         assert!(is_utf16_code_unit_bidi(0xFDFF));
2551         assert!(is_utf16_code_unit_bidi(0xFE70));
2552         assert!(is_utf16_code_unit_bidi(0xFEFE));
2553         assert!(is_utf16_code_unit_bidi(0x200F));
2554         assert!(is_utf16_code_unit_bidi(0x202B));
2555         assert!(is_utf16_code_unit_bidi(0x202E));
2556         assert!(is_utf16_code_unit_bidi(0x2067));
2557         assert!(is_utf16_code_unit_bidi(0xD802));
2558         assert!(is_utf16_code_unit_bidi(0xD803));
2559         assert!(is_utf16_code_unit_bidi(0xD83A));
2560         assert!(is_utf16_code_unit_bidi(0xD83B));
2561     }
2562 
2563     #[test]
test_is_str_bidi()2564     fn test_is_str_bidi() {
2565         assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop"));
2566         assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"));
2567         assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"));
2568         assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"));
2569         assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"));
2570         assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"));
2571         assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"));
2572         assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"));
2573         assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"));
2574         assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"));
2575         assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"));
2576         assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"));
2577         assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"));
2578         assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"));
2579         assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"));
2580         assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"));
2581         assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"));
2582         assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"));
2583         assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"));
2584         assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"));
2585         assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"));
2586         assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"));
2587     }
2588 
2589     #[test]
test_is_utf8_bidi()2590     fn test_is_utf8_bidi() {
2591         assert!(!is_utf8_bidi(
2592             "abcdefghijklmnopaabcdefghijklmnop".as_bytes()
2593         ));
2594         assert!(!is_utf8_bidi(
2595             "abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()
2596         ));
2597         assert!(!is_utf8_bidi(
2598             "abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()
2599         ));
2600         assert!(!is_utf8_bidi(
2601             "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()
2602         ));
2603         assert!(!is_utf8_bidi(
2604             "abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()
2605         ));
2606         assert!(!is_utf8_bidi(
2607             "abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()
2608         ));
2609         assert!(!is_utf8_bidi(
2610             "abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()
2611         ));
2612         assert!(is_utf8_bidi(
2613             "abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()
2614         ));
2615         assert!(is_utf8_bidi(
2616             "abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()
2617         ));
2618         assert!(is_utf8_bidi(
2619             "abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()
2620         ));
2621         assert!(is_utf8_bidi(
2622             "abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()
2623         ));
2624         assert!(is_utf8_bidi(
2625             "abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()
2626         ));
2627         assert!(is_utf8_bidi(
2628             "abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()
2629         ));
2630         assert!(is_utf8_bidi(
2631             "abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()
2632         ));
2633         assert!(is_utf8_bidi(
2634             "abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()
2635         ));
2636         assert!(is_utf8_bidi(
2637             "abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()
2638         ));
2639         assert!(is_utf8_bidi(
2640             "abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()
2641         ));
2642         assert!(is_utf8_bidi(
2643             "abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()
2644         ));
2645         assert!(is_utf8_bidi(
2646             "abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()
2647         ));
2648         assert!(is_utf8_bidi(
2649             "abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()
2650         ));
2651         assert!(is_utf8_bidi(
2652             "abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()
2653         ));
2654         assert!(is_utf8_bidi(
2655             "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()
2656         ));
2657     }
2658 
2659     #[test]
test_is_utf16_bidi()2660     fn test_is_utf16_bidi() {
2661         assert!(!is_utf16_bidi(&[
2662             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66,
2663             0x67, 0x68, 0x69,
2664         ]));
2665         assert!(!is_utf16_bidi(&[
2666             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66,
2667             0x67, 0x68, 0x69,
2668         ]));
2669         assert!(!is_utf16_bidi(&[
2670             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66,
2671             0x67, 0x68, 0x69,
2672         ]));
2673         assert!(!is_utf16_bidi(&[
2674             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66,
2675             0x67, 0x68, 0x69,
2676         ]));
2677         assert!(!is_utf16_bidi(&[
2678             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66,
2679             0x67, 0x68, 0x69,
2680         ]));
2681         assert!(!is_utf16_bidi(&[
2682             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66,
2683             0x67, 0x68, 0x69,
2684         ]));
2685         assert!(!is_utf16_bidi(&[
2686             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2687             0x67, 0x68, 0x69,
2688         ]));
2689         assert!(is_utf16_bidi(&[
2690             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66,
2691             0x67, 0x68, 0x69,
2692         ]));
2693         assert!(is_utf16_bidi(&[
2694             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66,
2695             0x67, 0x68, 0x69,
2696         ]));
2697         assert!(is_utf16_bidi(&[
2698             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66,
2699             0x67, 0x68, 0x69,
2700         ]));
2701         assert!(is_utf16_bidi(&[
2702             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66,
2703             0x67, 0x68, 0x69,
2704         ]));
2705         assert!(is_utf16_bidi(&[
2706             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66,
2707             0x67, 0x68, 0x69,
2708         ]));
2709         assert!(is_utf16_bidi(&[
2710             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2711             0x67, 0x68, 0x69,
2712         ]));
2713         assert!(is_utf16_bidi(&[
2714             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66,
2715             0x67, 0x68, 0x69,
2716         ]));
2717         assert!(is_utf16_bidi(&[
2718             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66,
2719             0x67, 0x68, 0x69,
2720         ]));
2721         assert!(is_utf16_bidi(&[
2722             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66,
2723             0x67, 0x68, 0x69,
2724         ]));
2725         assert!(is_utf16_bidi(&[
2726             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66,
2727             0x67, 0x68, 0x69,
2728         ]));
2729         assert!(is_utf16_bidi(&[
2730             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66,
2731             0x67, 0x68, 0x69,
2732         ]));
2733         assert!(is_utf16_bidi(&[
2734             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66,
2735             0x67, 0x68, 0x69,
2736         ]));
2737         assert!(is_utf16_bidi(&[
2738             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66,
2739             0x67, 0x68, 0x69,
2740         ]));
2741         assert!(is_utf16_bidi(&[
2742             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66,
2743             0x67, 0x68, 0x69,
2744         ]));
2745         assert!(is_utf16_bidi(&[
2746             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66,
2747             0x67, 0x68, 0x69,
2748         ]));
2749         assert!(is_utf16_bidi(&[
2750             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66,
2751             0x67, 0x68, 0x69,
2752         ]));
2753 
2754         assert!(is_utf16_bidi(&[
2755             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
2756             0x66, 0x67, 0x68, 0x69,
2757         ]));
2758     }
2759 
2760     #[test]
test_check_str_for_latin1_and_bidi()2761     fn test_check_str_for_latin1_and_bidi() {
2762         assert_ne!(
2763             check_str_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop"),
2764             Latin1Bidi::Bidi
2765         );
2766         assert_ne!(
2767             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"),
2768             Latin1Bidi::Bidi
2769         );
2770         assert_ne!(
2771             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"),
2772             Latin1Bidi::Bidi
2773         );
2774         assert_ne!(
2775             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"),
2776             Latin1Bidi::Bidi
2777         );
2778         assert_ne!(
2779             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"),
2780             Latin1Bidi::Bidi
2781         );
2782         assert_ne!(
2783             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"),
2784             Latin1Bidi::Bidi
2785         );
2786         assert_ne!(
2787             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"),
2788             Latin1Bidi::Bidi
2789         );
2790         assert_eq!(
2791             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"),
2792             Latin1Bidi::Bidi
2793         );
2794         assert_eq!(
2795             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"),
2796             Latin1Bidi::Bidi
2797         );
2798         assert_eq!(
2799             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"),
2800             Latin1Bidi::Bidi
2801         );
2802         assert_eq!(
2803             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"),
2804             Latin1Bidi::Bidi
2805         );
2806         assert_eq!(
2807             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"),
2808             Latin1Bidi::Bidi
2809         );
2810         assert_eq!(
2811             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"),
2812             Latin1Bidi::Bidi
2813         );
2814         assert_eq!(
2815             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"),
2816             Latin1Bidi::Bidi
2817         );
2818         assert_eq!(
2819             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"),
2820             Latin1Bidi::Bidi
2821         );
2822         assert_eq!(
2823             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"),
2824             Latin1Bidi::Bidi
2825         );
2826         assert_eq!(
2827             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"),
2828             Latin1Bidi::Bidi
2829         );
2830         assert_eq!(
2831             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"),
2832             Latin1Bidi::Bidi
2833         );
2834         assert_eq!(
2835             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"),
2836             Latin1Bidi::Bidi
2837         );
2838         assert_eq!(
2839             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"),
2840             Latin1Bidi::Bidi
2841         );
2842         assert_eq!(
2843             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"),
2844             Latin1Bidi::Bidi
2845         );
2846         assert_eq!(
2847             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"),
2848             Latin1Bidi::Bidi
2849         );
2850     }
2851 
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Characters that, embedded in ASCII text, must NOT yield `Latin1Bidi::Bidi`.
    const NOT_BIDI: [char; 7] = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    // Characters that, embedded in ASCII text, must yield `Latin1Bidi::Bidi`.
    const BIDI: [char; 15] = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];

    // Each probe character is surrounded by 16 ASCII letters on both sides.
    for c in NOT_BIDI.iter() {
        let text = format!("abcdefghijklmnop{}abcdefghijklmnop", c);
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi
        );
    }
    for c in BIDI.iter() {
        let text = format!("abcdefghijklmnop{}abcdefghijklmnop", c);
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi
        );
    }
}
2943 
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // Code units that, placed between ASCII units, must NOT yield `Latin1Bidi::Bidi`.
    const NOT_BIDI: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    // Code units that, placed between ASCII units, must yield `Latin1Bidi::Bidi`.
    const BIDI: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    // Index of the slot that receives the code unit under test.
    const SLOT: usize = 8;

    // Eight ASCII units ('b' through 'i') on each side of the probed slot.
    let mut units: [u16; 17] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0000, 0x62, 0x63, 0x64, 0x65, 0x66,
        0x67, 0x68, 0x69,
    ];

    for &middle in NOT_BIDI.iter() {
        units[SLOT] = middle;
        assert_ne!(
            check_utf16_for_latin1_and_bidi(&units[..]),
            Latin1Bidi::Bidi
        );
    }
    for &middle in BIDI.iter() {
        units[SLOT] = middle;
        assert_eq!(
            check_utf16_for_latin1_and_bidi(&units[..]),
            Latin1Bidi::Bidi
        );
    }

    // Two non-ASCII units in a row: 0x0590 (expected Bidi above) directly
    // followed by 0x3041 (expected not Bidi above) must still report Bidi.
    let two_in_the_middle: [u16; 18] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ];
    assert_eq!(
        check_utf16_for_latin1_and_bidi(&two_in_the_middle[..]),
        Latin1Bidi::Bidi
    );
}
3116 
/// Straightforward reference predicate that the exhaustive tests compare
/// the optimized character-level bidi checks against.
///
/// Returns `true` when `c` falls in one of the listed inclusive code point
/// ranges or equals one of the listed individual code points; `false`
/// otherwise.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // Inclusive (first, last) bounds of the ranges that count as bidi.
    const RANGES: [(char, char); 5] = [
        ('\u{0590}', '\u{08FF}'),
        ('\u{FB1D}', '\u{FDFF}'),
        ('\u{FE70}', '\u{FEFE}'),
        ('\u{10800}', '\u{10FFF}'),
        ('\u{1E800}', '\u{1EFFF}'),
    ];
    // Individual code points outside those ranges that also count as bidi.
    const SINGLES: [char; 4] = ['\u{200F}', '\u{202B}', '\u{202E}', '\u{2067}'];

    SINGLES.contains(&c) || RANGES.iter().any(|&(first, last)| first <= c && c <= last)
}
3132 
/// Straightforward reference predicate that the exhaustive tests compare
/// the optimized UTF-16 code-unit bidi checks against.
///
/// Returns `true` when `u` falls in one of the listed inclusive ranges or
/// equals one of the listed individual code units; `false` otherwise.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    // Inclusive (first, last) bounds of the ranges that count as bidi.
    const RANGES: [(u16, u16); 3] = [(0x0590, 0x08FF), (0xFB1D, 0xFDFF), (0xFE70, 0xFEFE)];
    // Individual code units that count as bidi. The four 0xD8xx values are
    // the high surrogates whose pairs encode U+10800..=U+10FFF and
    // U+1E800..=U+1EFFF.
    const SINGLES: [u16; 8] = [
        0xD802, 0xD803, 0xD83A, 0xD83B, 0x200F, 0x202B, 0x202E, 0x2067,
    ];

    SINGLES.contains(&u) || RANGES.iter().any(|&(first, last)| first <= u && u <= last)
}
3150 
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    // `from_u32` yields `None` exactly for the surrogate range
    // 0xD800..=0xDFFF here, so this visits every `char` from U+0000 through
    // U+10FFFF in ascending order.
    for c in (0..0x110000u32).filter_map(::std::char::from_u32) {
        assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
    }
}
3163 
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    // Compare against the reference predicate for every possible code unit.
    for unit in 0..=0xFFFFu16 {
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_code_unit_bidi(unit), expected);
    }
}
3175 
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Scratch space for the UTF-8 form of one character (at most 4 bytes).
    let mut scratch = [0u8; 4];
    // `from_u32` skips the surrogate range, so every `char` is visited once.
    for c in (0..0x110000u32).filter_map(::std::char::from_u32) {
        let expected = reference_is_char_bidi(c);
        let encoded: &str = c.encode_utf8(&mut scratch[..]);
        assert_eq!(is_str_bidi(encoded), expected);
    }
}
3195 
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    let mut scratch = [0u8; 8];
    // `from_u32` skips the surrogate range, so every `char` is visited once.
    for c in (0..0x110000u32).filter_map(::std::char::from_u32) {
        let expected = reference_is_char_bidi(c);

        // First check: exactly the bytes of the encoded character.
        let encoded_len = c.encode_utf8(&mut scratch[..]).len();
        assert_eq!(is_utf8_bidi(&scratch[..encoded_len]), expected);

        // The buffer is reused, so clear whatever a previous (possibly
        // longer) encoding left behind the current character.
        scratch[encoded_len..].iter_mut().for_each(|byte| *byte = 0);

        // Second check: the character followed by zero bytes, filling the
        // whole 8-byte buffer.
        assert_eq!(is_utf8_bidi(&scratch[..]), expected);
    }
}
3237 
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    // 32 zeroed code units; only index 15 changes between iterations.
    let mut units = [0u16; 32];
    for unit in 0..=0xFFFFu16 {
        units[15] = unit;
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_bidi(&units[..]), expected);
    }
}
3251 
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // U+057F and U+0580 (just below U+0590) followed by ASCII, and plain ASCII.
    let expect_false: [&[u8]; 3] = [b"\xD5\xBF\x61", b"\xD6\x80\x61", b"abc"];
    // The same prefixes, but ending in a lone 0xC2 lead byte, i.e. an
    // incomplete sequence; these are expected to report `true`.
    let expect_true: [&[u8]; 3] = [b"\xD5\xBF\xC2", b"\xD6\x80\xC2", b"ab\xC2"];

    for &bytes in expect_false.iter() {
        assert!(!is_utf8_bidi(bytes));
    }
    for &bytes in expect_true.iter() {
        assert!(is_utf8_bidi(bytes));
    }
}
3261 
#[test]
fn test_decode_latin1() {
    // ASCII-only input is expected to come back as a borrow, not a copy.
    let ascii = decode_latin1(b"ab");
    if let Cow::Borrowed(s) = ascii {
        assert_eq!(s, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // Byte 0xE4 must decode to U+00E4.
    let non_ascii = decode_latin1(b"a\xE4");
    assert_eq!(non_ascii, "a\u{E4}");
}
3274 
#[test]
fn test_encode_latin1_lossy() {
    // ASCII-only input is expected to come back as a borrow, not a copy.
    let ascii = encode_latin1_lossy("ab");
    if let Cow::Borrowed(s) = ascii {
        assert_eq!(s, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // U+00E4 must encode to the single byte 0xE4.
    let expected: &[u8] = b"a\xE4";
    assert_eq!(encode_latin1_lossy("a\u{E4}"), expected);
}
3287 
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    /// Converts `src` into the first `dst_len` units of `buf`, asserts the
    /// returned value equals `outcome`, then asserts that `buf` starts with
    /// the code units in `head`.
    fn check(
        buf: &mut [u16; 5],
        src: &[u8],
        dst_len: usize,
        outcome: Option<usize>,
        head: &[u16],
    ) {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(src, &mut buf[..dst_len]),
            outcome
        );
        assert_eq!(&buf[..head.len()], head);
    }

    // The buffer is shared by all cases on purpose: units beyond what a
    // call reports as written are expected to keep their earlier contents.
    let mut buf = [0u16; 5];

    // ASCII only.
    check(
        &mut buf,
        b"ab",
        2,
        Some(2),
        &[u16::from(b'a'), u16::from(b'b'), 0],
    );
    // Two-byte sequence followed by ASCII.
    check(
        &mut buf,
        b"\xC3\xA4c",
        3,
        Some(2),
        &[0xE4, u16::from(b'c'), 0],
    );
    // Three-byte sequence alone; index 1 still holds the 'c' from the
    // previous case.
    check(
        &mut buf,
        b"\xE2\x98\x83",
        3,
        Some(1),
        &[0x2603, u16::from(b'c'), 0],
    );
    // Three-byte sequence followed by ASCII.
    check(
        &mut buf,
        b"\xE2\x98\x83d",
        4,
        Some(2),
        &[0x2603, u16::from(b'd'), 0],
    );
    // Three-byte sequence followed by a two-byte sequence.
    check(
        &mut buf,
        b"\xE2\x98\x83\xC3\xA4",
        5,
        Some(2),
        &[0x2603, 0xE4, 0],
    );
    // Four-byte sequence: becomes a surrogate pair.
    check(
        &mut buf,
        b"\xF0\x9F\x93\x8E",
        4,
        Some(2),
        &[0xD83D, 0xDCCE, 0],
    );
    // Four-byte sequence followed by ASCII.
    check(
        &mut buf,
        b"\xF0\x9F\x93\x8Ee",
        5,
        Some(3),
        &[0xD83D, 0xDCCE, u16::from(b'e')],
    );
    // Truncated four-byte sequence: conversion must report failure. The
    // buffer contents are not inspected in this case.
    check(&mut buf, b"\xF0\x9F\x93", 5, None, &[]);
}
3345 }
3346